### Bitcoin news corpus
- English news about "bitcoin" since Oct 01 2016

In [149]:
import codecs
import os
import pickle

import nltk

def save_or_retrieve(file_pkz, func, *args, **kwargs):
    """
    will save or retrieve the contents returned by func
    :param file_pkz: path
    :param func: function or lambda for easy use
    :param args: args for func
    :param kwargs: args for func
    :return:
    """
    if not os.path.exists(file_pkz):
        print('saving "%s"' % file_pkz)
        content = func(*args, **kwargs)
        compressed_content = codecs.encode(pickle.dumps(content), 'zlib_codec')
        with open(file_pkz, 'wb') as f:
            f.write(compressed_content)
        return content
    else:
        print('retrieving "%s"' % file_pkz)
        with open(file_pkz, 'rb') as f:
            compressed_content = f.read()
        return pickle.loads(codecs.decode(compressed_content, 'zlib_codec'))

corpus_dir = 'links_contents'
working_dir = 'data'

english_stopwords = nltk.corpus.stopwords.words('english')
wnl = nltk.WordNetLemmatizer()

corpus_reader = nltk.corpus.CategorizedPlaintextCorpusReader(corpus_dir, r'.*.txt', cat_pattern=r'([0-9]+)-.*')
categories = corpus_reader.categories()
print('categories:', categories[:5], '...')

all_words_pkz = os.path.join(working_dir, 'all_words.pkz')
all_words = save_or_retrieve(
    all_words_pkz,
    lambda: nltk.Text(w.lower() for w in corpus_reader.words() if w.lower() not in english_stopwords))

all_lemmatized_words_pkz = os.path.join(working_dir, 'all_lemmatized_words.pkz')
all_lemmatized_words = save_or_retrieve(
    all_lemmatized_words_pkz,
    lambda: nltk.Text(wnl.lemmatize(w.lower()) for w in corpus_reader.words() if w.lower() not in english_stopwords))

words = ['bitcoin', 'china']  # lowercase
for word in words:
    # print('CONCORDANCE: ', word.upper())
    # all_text.concordance(word, width=50)
    # print()

    # Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first.
    print('similarity for word "%s"' % word)
    print('in original words:')
    all_words.similar(word)
    print('in lemmatized words:')
    all_lemmatized_words.similar(word)


categories: ['20161001', '20161002', '20161003', '20161004', '20161005'] ...
retrieving "data\all_words.pkz"


retrieving "data\all_lemmatized_words.pkz"


similarity for word "bitcoin"
in original words:


blockchain cryptocurrency bitcoins technology ethereum news digital
exchange new use price financial transactions based network zcash
market online currency china
in lemmatized words:


blockchain cryptocurrency technology exchange news bitcoins digital
new ethereum payment transaction price market use financial user like
network currency mining
similarity for word "china"
in original words:


bitcoin chinese blockchain price india u exchanges adoption technology
exchange network cryptocurrency mining community company japan news
korea first trading
in lemmatized words:


bitcoin chinese blockchain u price exchange india technology company
network adoption wallet mining startup country ecosystem one currency
cryptocurrency community


### Cleaning corpus
- Deleting lines with less than 10 words
- Files with less than 1KB or more than 15KB

#### Using Stanford's CoreNLP for sentiment analysis and entity recognition
Problema: los títulos están en mayúsculas, debemos reconocer las entidades para ello ignoramos sólo tomaremos en cuenta oraciones que contengan como mínimo un 10% de palabras en minúscula

In [188]:
from pycorenlp import StanfordCoreNLP

def retrieve_named_entities(lines, min_lower_words_per_sentence_rate=0.10):
    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    entity_words = {}
    
    # ignore the lines with more than 90% of uppercase
    content = ""
    for line in lines:
        lower_words = 0
        words = line.split()
        # todo bad performance
        for word in words:
            if word[0].isalpha() and word[0].islower():
                lower_words += 1
        if lower_words > len(words) * min_lower_words_per_sentence_rate:
            content += line + "\n"

    output = nlp.annotate(
        content,
        properties={
            'annotators': 'ner',
            'outputFormat': 'json'
        }
    )
    # http://stanfordnlp.github.io/CoreNLP/ner.html
    entity = ""
    entity_words = {}
    for i, _ in enumerate(output['sentences']):
        words = []
        for t in output['sentences'][i]['tokens']:
            if entity != t["ner"]:
                if len(words) > 0:
                    entity_words[" ".join(words)] = entity
                    words = []

            entity = t["ner"]
            if entity != 'O':
                words.append(t["word"])
                # print([t["word"] for t in output['sentences'][i]['tokens']])
                # print(entity_words)
    return entity_words


in_dir = 'links_contents_clean'
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt.clean.txt',
    # '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt.clean.txt',
    # '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt.clean.txt',
    # '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt.clean.txt',
    # '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt.clean.txt',
    # '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.tx.clean.txtt'
]

for file in files:
    in_file = os.path.join(in_dir, file)
    with open(in_file, 'r', encoding='utf8') as f:
        # content = f.readlines()
        lines = [line.rstrip('\n') for line in f]
    print(retrieve_named_entities(lines))

{'J. Christina Wang': 'PERSON', 'Federal Reserve Bank of Boston': 'ORGANIZATION', '41 %': 'PERCENT', 'Foundation': 'ORGANIZATION', '37 %': 'PERCENT', '18 %': 'PERCENT', 'currently': 'DATE', 'one': 'NUMBER', '36 %': 'PERCENT', '11 %': 'PERCENT', 'Stephanie Lo': 'PERSON', '12 %': 'PERCENT', '20 %': 'PERCENT', '27 %': 'PERCENT', 'end 2017': 'DATE', '28 %': 'PERCENT', 'Harvard University': 'ORGANIZATION', 'Facebook': 'ORGANIZATION', 'Matthew R. Silver of Pepper Hamilton LLP': 'ORGANIZATION', 'Twitter': 'ORGANIZATION', 'Bitcoin': 'PERSON', 'Board': 'ORGANIZATION', 'the next 12 months': 'DURATION', '19 %': 'PERCENT', 'Timothy R. McTaggart': 'PERSON', '15 %': 'PERCENT', 'Llew Claasen': 'PERSON', '62 %': 'PERCENT', 'the past': 'DATE', '35 %': 'PERCENT', 'Claasen': 'PERSON', 'Cointelegraph': 'PERSON', '14 %': 'PERCENT', 'Bitcoin Foundation': 'ORGANIZATION', '42 %': 'PERCENT', 'The Foundation': 'ORGANIZATION', 'quarterly': 'SET', '2016/2017': 'NUMBER', 'Reddit': 'LOCATION', '10 %': 'PERCENT'}


In [4]:
import os
from collections import Counter
from pycorenlp import StanfordCoreNLP

def analize_sentiment(content):
    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    output = nlp.annotate(
        content,
        properties={
            'annotators': 'sentiment',
            'outputFormat': 'json'
        }
    )
    # http://stanfordnlp.github.io/CoreNLP/sentiment.html#options
    sentiment = [s['sentiment'] for s in output['sentences']]
    return Counter(sentiment)
    

in_dir = 'data/corpus/links_contents_clean'
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt',
    # '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt.clean.txt',
    # '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt.clean.txt',
    # '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt.clean.txt',
    # '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt.clean.txt',
    # '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.tx.clean.txtt'
]

for file in files:
    in_file = os.path.join(in_dir, file)
    with open(in_file, 'r', encoding='utf8') as f:
        lines = f.readlines()
    sentiment = analize_sentiment("\n".join(lines))
    print(sentiment)

Counter({'Negative': 25, 'Positive': 3, 'Neutral': 3})
