### Bitcoin news corpus
- English news about "bitcoin" since Oct 01 2016

In [149]:
import codecs
import pickle
import os
import nltk


def save_or_retrieve(file_pkz, func, *args, **kwargs):
    """
    will save or retrieve the contents returned by func
    :param file_pkz: path
    :param func: function or lambda for easy use
    :param args: args for func
    :param kwargs: args for func
    :return:
    """
    if not os.path.exists(file_pkz):
        print('saving "%s"' % file_pkz)
        content = func(*args, **kwargs)
        compressed_content = codecs.encode(pickle.dumps(content), 'zlib_codec')
        with open(file_pkz, 'wb') as f:
            f.write(compressed_content)
        return content
    else:
        print('retrieving "%s"' % file_pkz)
        with open(file_pkz, 'rb') as f:
            compressed_content = f.read()
        return pickle.loads(codecs.decode(compressed_content, 'zlib_codec'))

corpus_dir = 'links_contents'
working_dir = 'data'

english_stopwords = nltk.corpus.stopwords.words('english')
wnl = nltk.WordNetLemmatizer()

corpus_reader = nltk.corpus.CategorizedPlaintextCorpusReader(corpus_dir, r'.*.txt', cat_pattern=r'([0-9]+)-.*')
categories = corpus_reader.categories()
print('categories:', categories[:5], '...')

all_words_pkz = os.path.join(working_dir, 'all_words.pkz')
all_words = save_or_retrieve(
    all_words_pkz,
    lambda: nltk.Text(w.lower() for w in corpus_reader.words() if w.lower() not in english_stopwords))

all_lemmatized_words_pkz = os.path.join(working_dir, 'all_lemmatized_words.pkz')
all_lemmatized_words = save_or_retrieve(
    all_lemmatized_words_pkz,
    lambda: nltk.Text(wnl.lemmatize(w.lower()) for w in corpus_reader.words() if w.lower() not in english_stopwords))

words = ['bitcoin', 'china']  # lowercase
for word in words:
    # print('CONCORDANCE: ', word.upper())
    # all_text.concordance(word, width=50)
    # print()
    # Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first.
    print('similarity for word "%s"' % word)
    print('in original words:')
    all_words.similar(word)
    print('in lemmatized words:')
    all_lemmatized_words.similar(word)

categories: ['20161001', '20161002', '20161003', '20161004', '20161005'] ...
retrieving "data\all_words.pkz"


retrieving "data\all_lemmatized_words.pkz"


similarity for word "bitcoin"
in original words:


blockchain cryptocurrency bitcoins technology ethereum news digital
exchange new use price financial transactions based network zcash
market online currency china
in lemmatized words:


blockchain cryptocurrency technology exchange news bitcoins digital
new ethereum payment transaction price market use financial user like
network currency mining
similarity for word "china"
in original words:


bitcoin chinese blockchain price india u exchanges adoption technology
exchange network cryptocurrency mining community company japan news
korea first trading
in lemmatized words:


bitcoin chinese blockchain u price exchange india technology company
network adoption wallet mining startup country ecosystem one currency
cryptocurrency community


### Cleaning corpus
- Deleting lines with less than 10 words

In [148]:
import re

def clean_lines(lines, preprocess=True):
    if not preprocess:
        return lines
    else:
        preprocessed_lines = []
        for line in lines:
            # TODO a NN to recognize the type of word? NUMBER, PERCENT, HOUR, DATE, ABBR, (y hellooo!)

            # standardize perc values  --  remember ?: non capturing group
            line = re.sub(r'[+\-]?(?:\d*[.,]?\d+|\d+[.,]?\d*)%', 'PERCENTAGE', line)

            # standardize monetary values $1.5M, $1., $.3
            line = re.sub(r'\$\d*(?:\d*[.,]?\d+|\d+[.,]?\d*)[Mm]?', 'MONEY', line)

            # standardize certain decimal values like "2.0" or ".2" not "2016."
            line = re.sub('\d*[.,]\d+', 'DECIMAL', line)

            # standardize num values - not by now too many false positives
            # line = re.sub('\d*(?:\d*[.,]?\d+|\d+[.,]?\d*)', 'NUMBER', line)

            # standardize abbreviations i.e. -> ie, u.s.a. -> usa, but not ee.uu.
            # there is no way to multiple re group captures http://stackoverflow.com/a/464879/2692914
            p = re.compile(r'\b((?:[a-zA-Z]\.)+)')
            for m in p.finditer(line):
                line = line.replace(m.group(), ''.join(m.group(1).split('.')))

            # separate in sentences
            # sentences = re.split(r'(?:.\s+)', line)
            sentences = [s for s in re.split(r'\.+', line) if len(s.split()) > 2]
            for s in sentences:
                preprocessed_lines.append(s)
        return preprocessed_lines


def clean_file(in_file, out_file, preprocess=True):
    # min words per paragraph
    min_words_per_paragraph = 7
    with open(in_file, 'r', encoding='utf8') as f:
        lines = [line.rstrip('\n') for line in f if len(line.split()) > min_words_per_paragraph]

    # cleaning and preprocessing files
    if preprocess:
        lines = clean_lines(lines)
        
    # QUITAR ESPACIOS DE LAS LINEAS! 
    # USAR EL SERVIDOR DEDICADO CON UN POS TAGGER! http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#dedicated-server 
    
    content = '\n'.join(lines)
    # doenst work as expected, for semantic analysis we need some stopwords (i.e. have)
    # content = content.lower()
    # english_stopwords = nltk.corpus.stopwords.words('english')
    # wnl = nltk.WordNetLemmatizer()
    # words = list(set(re.split(r'[^\w]+', content)))
    # for word in words:
    #     if word in english_stopwords:
    #         # ignore english stopwords
    #         new_word = ''
    #     else:
    #         # lemmatize words
    #         new_word = wnl.lemmatize(word)
    #     content = re.sub(r'\b%s\b' % word, new_word, content)

    with open(out_file, 'wb') as f:
        f.write(bytes(content, encoding='utf8'))

in_dir = 'links_contents'
out_dir = 'links_contents_clean'
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt',
    '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt',
    '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt',
    '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt',
    '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt',
    '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.txt'
]

for file in files:
    in_file = os.path.join(in_dir, file)
    # out_file = os.path.join(out_dir, file)
    # clean_file(in_file, out_file, False)
    out_file = os.path.join(out_dir, file + ".clean.txt")
    clean_file(in_file, out_file, True)

#### Using Stanford's CoreNLP for sentiment analysis and entity recognition

In [198]:
# http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
from pycorenlp import StanfordCoreNLP

def corenlp_analysis_all(in_file):
    with open(in_file, 'r', encoding='utf8') as f:
        content = "".join(f.readlines())

    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    output = nlp.annotate(
        content,
        properties={
            'annotators': 'sentiment',
            'outputFormat': 'json'
        }
    )

    for i, _ in enumerate(output['sentences']):
        print("Sentence:", [t['word'] for t in output['sentences'][i]['tokens']])
        print("Sentiment:", output['sentences'][i]['sentiment'])
        print("")


def corenlp_analysis(in_file):
    with open(in_file, 'r', encoding='utf8') as f:
        # content = f.readlines()
        lines = [line.rstrip('\n') for line in f]

    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    for line in lines:
        output = nlp.annotate(
            line,
            properties={
                'annotators': 'sentiment,ner,pos',
                'outputFormat': 'json'
            }
        )
        # http://stanfordnlp.github.io/CoreNLP/ner.html
        ner_names = ["PERSON", "LOCATION", "ORGANIZATION", "MISC"]
        ner_numerical = ["MONEY", "NUMBER", "ORDINAL", "PERCENT"]
        ner_temporal = ["DATE", "TIME", "DURATION", "SET"]
        for i, _ in enumerate(output['sentences']):
            # print([t['word'] + ("/" + t["pos"]) + (":" + t["ner"] if t["ner"] != "O" else "") for t in output['sentences'][i]['tokens']])
            print("Sentiment:", output['sentences'][i]['sentiment'])
            # print("")


in_dir = 'links_contents_clean'
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt.clean.txt',
    # '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt.clean.txt',
    # '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt.clean.txt',
    # '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt.clean.txt',
    # '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt.clean.txt',
    # '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.tx.clean.txtt'
]

for file in files:
    in_file = os.path.join(in_dir, file)
    corenlp_analysis(in_file)

Sentiment: Negative


Sentiment: Negative


Sentiment: Positive


Sentiment: Negative


Sentiment: Negative


Sentiment: Negative
Sentiment: Neutral


Sentiment: Negative


Sentiment: Neutral
Sentiment: Negative


Sentiment: Negative
Sentiment: Negative
Sentiment: Negative


Sentiment: Negative


Sentiment: Negative
Sentiment: Neutral
Sentiment: Negative


Sentiment: Negative
Sentiment: Negative


Sentiment: Negative


Sentiment: Negative


Sentiment: Negative


Sentiment: Negative
Sentiment: Positive


Sentiment: Negative
Sentiment: Neutral


Sentiment: Negative
Sentiment: Negative


Sentiment: Negative


Sentiment: Negative
Sentiment: Negative


Sentiment: Positive
Sentiment: Negative
Sentiment: Negative
Sentiment: Neutral
Sentiment: Negative


Problema: los títulos están en mayúsculas, debemos reconocer las entidades para ello ignoramos sólo tomaremos en cuenta oraciones que contengan como mínimo un 10% de palabras en minúscula

In [188]:
# para poner en proper-case
# - reconocimiento de entidades
# - y sólo los nnp pueden serlo
#
# el número de menciones de la palabra según "coreference"
# - "coreference" only works when words are in capitalized and not capitalized
#
# el sentimiento positivo, negativo, neutral
# - de sólo el título
# - de sólo las sentencias con menciones


def retrieve_named_entities(lines, min_lower_words_per_sentence_rate=0.10):
    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    entity_words = {}
    
    # ignore the lines with more than 90% of uppercase
    content = ""
    for line in lines:
        lower_words = 0
        words = line.split()
        # todo bad performance
        for word in words:
            if word[0].isalpha() and word[0].islower():
                lower_words += 1
        if lower_words > len(words) * min_lower_words_per_sentence_rate:
            content += line + "\n"

    output = nlp.annotate(
        content,
        properties={
            'annotators': 'ner',
            'outputFormat': 'json'
        }
    )
    # http://stanfordnlp.github.io/CoreNLP/ner.html
    entity = ""
    entity_words = {}
    for i, _ in enumerate(output['sentences']):
        words = []
        for t in output['sentences'][i]['tokens']:
            if entity != t["ner"]:
                if len(words) > 0:
                    entity_words[" ".join(words)] = entity
                    words = []

            entity = t["ner"]
            if entity != 'O':
                words.append(t["word"])
                # print([t["word"] for t in output['sentences'][i]['tokens']])
                # print(entity_words)
    return entity_words


in_dir = 'links_contents_clean'
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt.clean.txt',
    # '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt.clean.txt',
    # '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt.clean.txt',
    # '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt.clean.txt',
    # '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt.clean.txt',
    # '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.tx.clean.txtt'
]

for file in files:
    in_file = os.path.join(in_dir, file)
    with open(in_file, 'r', encoding='utf8') as f:
        # content = f.readlines()
        lines = [line.rstrip('\n') for line in f]
    print(retrieve_named_entities(lines))

{'J. Christina Wang': 'PERSON', 'Federal Reserve Bank of Boston': 'ORGANIZATION', '41 %': 'PERCENT', 'Foundation': 'ORGANIZATION', '37 %': 'PERCENT', '18 %': 'PERCENT', 'currently': 'DATE', 'one': 'NUMBER', '36 %': 'PERCENT', '11 %': 'PERCENT', 'Stephanie Lo': 'PERSON', '12 %': 'PERCENT', '20 %': 'PERCENT', '27 %': 'PERCENT', 'end 2017': 'DATE', '28 %': 'PERCENT', 'Harvard University': 'ORGANIZATION', 'Facebook': 'ORGANIZATION', 'Matthew R. Silver of Pepper Hamilton LLP': 'ORGANIZATION', 'Twitter': 'ORGANIZATION', 'Bitcoin': 'PERSON', 'Board': 'ORGANIZATION', 'the next 12 months': 'DURATION', '19 %': 'PERCENT', 'Timothy R. McTaggart': 'PERSON', '15 %': 'PERCENT', 'Llew Claasen': 'PERSON', '62 %': 'PERCENT', 'the past': 'DATE', '35 %': 'PERCENT', 'Claasen': 'PERSON', 'Cointelegraph': 'PERSON', '14 %': 'PERCENT', 'Bitcoin Foundation': 'ORGANIZATION', '42 %': 'PERCENT', 'The Foundation': 'ORGANIZATION', 'quarterly': 'SET', '2016/2017': 'NUMBER', 'Reddit': 'LOCATION', '10 %': 'PERCENT'}


In [197]:
# ** ojo podrìamos usar las named entities para encontrar el tf-idf like de cada uno!
# 
# detectar fecha y hora de artìculo
# 
# podemos buscar caracterìsticas
# - nùmero de sentencias positivas
# - nùmero de sentencias negativas
# - nùmero de sentencias neutras
# - nùmero de menciones de named entities 
# - tf-idf
# 
# y con esto decimos si es positivo o negativo segùn el precio de la bolsa!!!
# 
# 
# para poner en proper-case
# - reconocimiento de entidades
# - y sólo los nnp pueden serlo
# 
# el número de menciones de la palabra según "coreference"
# - "coreference" only works when words are in capitalized and not capitalized
# 
# el sentimiento positivo, negativo, neutral
# - de sólo el título
# - de sólo las sentencias con menciones
# 
from collections import Counter

def analize_sentiment(content):
    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    output = nlp.annotate(
        content,
        properties={
            'annotators': 'sentiment',
            'outputFormat': 'json'
        }
    )
    # http://stanfordnlp.github.io/CoreNLP/sentiment.html#options
    sentiment = [s['sentiment'] for s in output['sentences']]
    return Counter(sentiment)
    

in_dir = 'links_contents_clean'
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt.clean.txt',
    # '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt.clean.txt',
    # '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt.clean.txt',
    # '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt.clean.txt',
    # '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt.clean.txt',
    # '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.tx.clean.txtt'
]

for file in files:
    in_file = os.path.join(in_dir, file)
    with open(in_file, 'r', encoding='utf8') as f:
        lines = f.readlines()
    sentiment = analize_sentiment("\n".join(lines))
    print(sentiment)

Counter({'Negative': 27, 'Neutral': 3, 'Positive': 2})


In [81]:
# import re
# # p = re.compile(r'\\b+.*\\b+')
p = re.compile(r'(?:\w+\b|\b\w+\b|\b\w+)')
for m in p.finditer(' a1b2c3d4 sd. sd;abc'):
    print(m.start(), m.group())
s = re.sub(r'\b\w+\b', 'w', '---aaa---word---word...word,wprd-w*2016/2017')
print('\"%s\"' % s)

1 a1b2c3d4
10 sd
14 sd
17 abc
"---w---w---w...w,w-w*w/w"


In [87]:
words = re.split(r'[^\w]+', 'aa bc de--aa ---aaa---word---word...word,wprd-w*2016/2017')
print(words)

['aa', 'bc', 'de', 'aa', 'aaa', 'word', 'word', 'word', 'wprd', 'w', '2016', '2017']


In [89]:
print('have' in english_stopwords)

True


In [141]:
line = "u.s.a. ee.uu. write i.e. Timothy R. McTaggart and Matthew R. Silver of. my example .1 MegaUpload 2.0 and BitCache 1. Read more 2016..."
line = re.sub('\d*[.,]\d+', 'DECIMAL', line)

# print(re.sub(r'\b(([a-zA-Z])\.)+', '\g<2>', line))
# print(re.sub(r'\b(?:([a-zA-Z])\.)+', '*', line))
p = re.compile(r'\b((?:[a-zA-Z]\.)+)')
for m in p.finditer(line):
    line = line.replace(m.group(), ''.join(m.group(1).split('.')))
    # print(m.start(), m.group(), ''.join(m.group(1).split('.')))
print(line)

usa ee.uu. write ie Timothy R McTaggart and Matthew R Silver of. my example DECIMAL MegaUpload DECIMAL and BitCache 1. Read more 2016...


In [142]:
line = 'it usa ee.uu. write ie Timothy R McTaggart and Matthew. Inc (NYSEMKT:MGT). my example MegaUpload 2.0 and BitCache. Read more... "hello worl."'
#FAILS WITH: MGT Capital intends to change its corporate name to “John McAfee Global Technologies, Inc.”
print([s for s in re.split(r'\.', line) if s])
print([s for s in re.split(r'\.+', line) if len(s.split()) > 2])

['it usa ee', 'uu', ' write ie Timothy R McTaggart and Matthew', ' Inc (NYSEMKT:MGT)', ' my example MegaUpload 2', '0 and BitCache', ' Read more', ' "hello worl', '"']
['it usa ee', ' write ie Timothy R McTaggart and Matthew', ' my example MegaUpload 2', '0 and BitCache']


In [187]:
lines = [
"Many in the Bitcoin community have expressed that the most compelling function they want the Bitcoin Foundation to focus its advocacy programs on in the next 12 months is; serving as a medium of exchange.",
"Community Prefers Bitcoin As Exchange Medium Over Store of Value, Survey Shows", 
"Community Prefers Bitcoin As Exchange Medium Over Store of Value, Survey Shows", 
"According to the outcomes of a community survey released by the Foundation, 37% of the respondents choose to focus on Bitcoin as a medium of exchange - i.e. used to", 
]
content = ""
for line in lines:
    lower_words = 0
    words = line.split()
    for word in words:
        if word[0].isalpha() and word[0].islower():
            lower_words += 1
    if lower_words > len(words) * 0.10:
        content += line + "\n"
print(content)

Many in the Bitcoin community have expressed that the most compelling function they want the Bitcoin Foundation to focus its advocacy programs on in the next 12 months is; serving as a medium of exchange.
According to the outcomes of a community survey released by the Foundation, 37% of the respondents choose to focus on Bitcoin as a medium of exchange - i.e. used to

