### Bitcoin news corpus
- English news about "bitcoin" since Oct 01 2016

In [149]:
import codecs
import pickle
import os
import nltk


def save_or_retrieve(file_pkz, func, *args, **kwargs):
    """
    Return the result of ``func(*args, **kwargs)``, cached on disk at ``file_pkz``.

    When ``file_pkz`` does not exist, ``func`` is called and its result is
    pickled, zlib-compressed and written to ``file_pkz``. When the file already
    exists, it is read, decompressed and unpickled instead, and ``func`` is not
    called at all.

    :param file_pkz: path of the compressed pickle cache file
    :param func: function or lambda producing the content to cache
    :param args: positional args for func
    :param kwargs: keyword args for func
    :return: the content returned by func (freshly computed or restored from the cache)
    """
    if os.path.exists(file_pkz):
        print('retrieving "%s"' % file_pkz)
        with open(file_pkz, 'rb') as f:
            compressed_content = f.read()
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were written by this function.
        return pickle.loads(codecs.decode(compressed_content, 'zlib_codec'))

    print('saving "%s"' % file_pkz)
    # Create the target directory before the (possibly expensive) computation
    # so that an unusable path fails fast instead of after the work is done.
    parent_dir = os.path.dirname(file_pkz)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    content = func(*args, **kwargs)
    compressed_content = codecs.encode(pickle.dumps(content), 'zlib_codec')

    # Write to a temporary sibling and rename it into place, so an interrupted
    # write never leaves a truncated cache file that a later call would load.
    tmp_pkz = file_pkz + '.tmp'
    with open(tmp_pkz, 'wb') as f:
        f.write(compressed_content)
    os.replace(tmp_pkz, file_pkz)
    return content

corpus_dir = 'links_contents'
working_dir = 'data'

english_stopwords = nltk.corpus.stopwords.words('english')
# Set copy used for the per-token membership test below (constant-time lookup);
# the list itself is kept under its original name for other cells.
english_stopwords_set = set(english_stopwords)
wnl = nltk.WordNetLemmatizer()

# Every "*.txt" file of the corpus directory is read; the leading run of digits
# of the file name (before the first "-") is used as the document category.
corpus_reader = nltk.corpus.CategorizedPlaintextCorpusReader(corpus_dir, r'.*\.txt', cat_pattern=r'([0-9]+)-.*')
categories = corpus_reader.categories()
print('categories:', categories[:5], '...')


def _lowercased_content_words():
    """Yield every corpus token lowercased, skipping English stopwords."""
    for w in corpus_reader.words():
        lowered = w.lower()
        if lowered not in english_stopwords_set:
            yield lowered


# Both texts are cached on disk by save_or_retrieve, so the corpus is only
# tokenized when the corresponding cache file does not exist yet.
all_words_pkz = os.path.join(working_dir, 'all_words.pkz')
all_words = save_or_retrieve(
    all_words_pkz,
    lambda: nltk.Text(_lowercased_content_words()))

all_lemmatized_words_pkz = os.path.join(working_dir, 'all_lemmatized_words.pkz')
all_lemmatized_words = save_or_retrieve(
    all_lemmatized_words_pkz,
    lambda: nltk.Text(wnl.lemmatize(w) for w in _lowercased_content_words()))

words = ['bitcoin', 'china']  # lowercase
for word in words:
    # print('CONCORDANCE: ', word.upper())
    # all_words.concordance(word, width=50)
    # print()
    # Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first.
    print('similarity for word "%s"' % word)
    print('in original words:')
    all_words.similar(word)
    print('in lemmatized words:')
    all_lemmatized_words.similar(word)

categories: ['20161001', '20161002', '20161003', '20161004', '20161005'] ...
retrieving "data\all_words.pkz"


retrieving "data\all_lemmatized_words.pkz"


similarity for word "bitcoin"
in original words:


blockchain cryptocurrency bitcoins technology ethereum news digital
exchange new use price financial transactions based network zcash
market online currency china
in lemmatized words:


blockchain cryptocurrency technology exchange news bitcoins digital
new ethereum payment transaction price market use financial user like
network currency mining
similarity for word "china"
in original words:


bitcoin chinese blockchain price india u exchanges adoption technology
exchange network cryptocurrency mining community company japan news
korea first trading
in lemmatized words:


bitcoin chinese blockchain u price exchange india technology company
network adoption wallet mining startup country ecosystem one currency
cryptocurrency community


### Cleaning corpus
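- Dropping short lines (7 words or fewer), then normalizing percentages, monetary amounts, decimals and dotted abbreviations, and splitting the remaining text into sentences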

In [148]:
import re

def clean_lines(lines, preprocess=True):
    """
    Normalize raw text lines and split them into sentence-like fragments.

    For every line, in this order:

    1. percentages (``37%``, ``+1.5%``, ``-.3%``) become ``PERCENTAGE``;
    2. dollar amounts (``$1.5M``, ``$1.``, ``$.3``) become ``MONEY``;
    3. decimals such as ``2.0`` or ``.2`` (but not ``2016.``) become ``DECIMAL``;
    4. dotted single-letter abbreviations lose their dots
       (``i.e.`` -> ``ie``, ``u.s.a.`` -> ``usa``; ``ee.uu.`` is not matched);
    5. the line is split on runs of ``.`` and only fragments with more than
       two whitespace-separated words are kept (fragments are not stripped).

    :param lines: iterable of text lines
    :param preprocess: when false, ``lines`` is returned untouched
    :return: ``lines`` itself when ``preprocess`` is false, otherwise a new
             list with the kept fragments of all lines, in order
    """
    if not preprocess:
        return lines

    # TODO a NN to recognize the type of word? NUMBER, PERCENT, HOUR, DATE, ABBR, (y hellooo!)

    # Patterns are compiled once, outside the per-line loop.
    # (?: ... ) is a non-capturing group.
    percentage_re = re.compile(r'[+\-]?(?:\d*[.,]?\d+|\d+[.,]?\d*)%')
    money_re = re.compile(r'\$\d*(?:\d*[.,]?\d+|\d+[.,]?\d*)[Mm]?')
    decimal_re = re.compile(r'\d*[.,]\d+')
    # A generic NUMBER replacement was tried and dropped: too many false positives.
    #   r'\d*(?:\d*[.,]?\d+|\d+[.,]?\d*)' -> 'NUMBER'
    abbreviation_re = re.compile(r'\b((?:[a-zA-Z]\.)+)')
    sentence_break_re = re.compile(r'\.+')

    def strip_dots(match):
        # Only the matched span is rewritten. Replacing the matched *text*
        # everywhere in the line (str.replace) would also hit the same
        # characters inside longer abbreviations, e.g. "U.S." inside "U.S.A.".
        return match.group(1).replace('.', '')

    preprocessed_lines = []
    for line in lines:
        line = percentage_re.sub('PERCENTAGE', line)
        line = money_re.sub('MONEY', line)
        line = decimal_re.sub('DECIMAL', line)
        line = abbreviation_re.sub(strip_dots, line)

        # separate in sentences, dropping fragments of two words or fewer
        preprocessed_lines.extend(
            s for s in sentence_break_re.split(line) if len(s.split()) > 2)
    return preprocessed_lines


def clean_file(in_file, out_file, preprocess=True, min_words_per_paragraph=7):
    """
    Filter and optionally preprocess a UTF-8 text file into ``out_file``.

    Lines of ``in_file`` with ``min_words_per_paragraph`` or fewer
    whitespace-separated words are discarded. When ``preprocess`` is true the
    remaining lines are passed through ``clean_lines``. The result is joined
    with ``'\\n'`` (no trailing newline) and written to ``out_file`` as UTF-8.

    :param in_file: path of the UTF-8 text file to read
    :param out_file: path of the file to write; its directory is created if missing
    :param preprocess: whether to run ``clean_lines`` on the kept lines
    :param min_words_per_paragraph: a line is kept only if it has MORE words than this
    """
    with open(in_file, 'r', encoding='utf8') as f:
        lines = [line.rstrip('\n') for line in f if len(line.split()) > min_words_per_paragraph]

    # cleaning and preprocessing files
    if preprocess:
        lines = clean_lines(lines)

    # TODO: strip leading/trailing whitespace from the lines
    # TODO: use the dedicated CoreNLP server with a POS tagger:
    #       http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#dedicated-server

    content = '\n'.join(lines)
    # Lowercasing, stopword removal and lemmatization are deliberately NOT applied
    # here: it did not work as expected, because semantic analysis needs some of
    # the stopwords (i.e. "have").

    # The output directory may not exist yet; create it instead of failing on open().
    out_parent = os.path.dirname(out_file)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    # Binary mode keeps '\n' line endings untouched on every platform.
    with open(out_file, 'wb') as f:
        f.write(content.encode('utf8'))

# Raw article files live in in_dir; their cleaned counterparts go to out_dir.
in_dir = 'links_contents'
out_dir = 'links_contents_clean'

# Small sample of raw articles used to try out the cleaning step.
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt',
    '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt',
    '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt',
    '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt',
    '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt',
    '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.txt'
]

# Each sample is written to out_dir as "<original name>.clean.txt" with
# preprocessing enabled. (For an unprocessed copy, use the bare file name as
# the output name and pass preprocess=False.)
for file in files:
    in_file = os.path.join(in_dir, file)
    out_file = os.path.join(out_dir, '{}.clean.txt'.format(file))
    clean_file(in_file, out_file, preprocess=True)

#### Using Stanford's CoreNLP for sentiment analysis and entity recognition

In [158]:
# http://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
from pycorenlp import StanfordCoreNLP

def corenlp_analysis_all(in_file, server_url='http://127.0.0.1:9000'):
    """
    Run CoreNLP sentiment analysis over a whole file in a single request.

    The complete content of ``in_file`` is sent to the CoreNLP server at
    ``server_url`` with the ``sentiment`` annotator; for every sentence of the
    response its tokens and its sentiment label are printed.

    :param in_file: path of the UTF-8 text file to analyse
    :param server_url: base URL of a running CoreNLP server
    """
    with open(in_file, 'r', encoding='utf8') as f:
        content = f.read()

    nlp = StanfordCoreNLP(server_url)
    # NOTE(review): the code below assumes a parsed JSON response (a dict with a
    # 'sentences' list) -- confirm what annotate() returns when the server errors
    # out or times out on a large document.
    output = nlp.annotate(
        content,
        properties={
            'annotators': 'sentiment',
            'outputFormat': 'json'
        }
    )

    for sentence in output['sentences']:
        print("Sentence:", [t['word'] for t in sentence['tokens']])
        print("Sentiment:", sentence['sentiment'])
        print("")


def corenlp_analysis(in_file, server_url='http://127.0.0.1:9000'):
    """
    Run CoreNLP sentiment analysis and NER over a file, one request per line.

    Every line of ``in_file`` is sent separately to the CoreNLP server at
    ``server_url`` with the ``sentiment`` and ``ner`` annotators. For every
    sentence of each response, the tokens are printed (suffixed with
    ``:<NER label>`` when the label is not ``"O"``), followed by the sentence's
    sentiment label.

    :param in_file: path of the UTF-8 text file to analyse
    :param server_url: base URL of a running CoreNLP server
    """
    with open(in_file, 'r', encoding='utf8') as f:
        lines = [line.rstrip('\n') for line in f]

    # NER label families, for reference (http://stanfordnlp.github.io/CoreNLP/ner.html):
    #   names:     PERSON, LOCATION, ORGANIZATION, MISC
    #   numerical: MONEY, NUMBER, ORDINAL, PERCENT
    #   temporal:  DATE, TIME, DURATION, SET
    properties = {
        'annotators': 'sentiment,ner',
        'outputFormat': 'json'
    }

    nlp = StanfordCoreNLP(server_url)
    for line in lines:
        # NOTE(review): assumes a parsed JSON response (a dict with a 'sentences'
        # list) -- confirm what annotate() returns on a server error or timeout.
        output = nlp.annotate(line, properties=properties)
        for sentence in output['sentences']:
            print([t['word'] + (":" + t["ner"] if t["ner"] != "O" else "") for t in sentence['tokens']])
            print("Sentiment:", sentence['sentiment'])
            print("")


# Cleaned article files to annotate with CoreNLP.
in_dir = 'links_contents_clean'

# Only the first sample is active; uncomment further entries to analyse them too.
files = [
    '20161001-2fc38645b2f1075b-Community_Prefers_Bitcoin_As_Exchange_Medium_Over_Store_of_Value%2C.txt.clean.txt',
    # '20161001-51c397b0cd8bdbc6-Why_Blockchain_Won%E2%80%99t_Disrupt_Banks_First_-_CoinDesk.txt.clean.txt',
    # '20161001-adb0edbb66d3fb0d-Kim_Dotcom_Reiterates_His_Bitcoin_Price_Forecast%2C_%242000_in_2.txt.clean.txt',
    # '20161002-2c276484c2156687-Bitcoin_Can_Buy_You_a_Biometric_Data_Skimmer_on_the.txt.clean.txt',
    # '20161002-c2b624fa4e45d409-Croatian_Law_Enforcement_Completes_Another_Bitcoin-related_Darknet_Drug_Bust_-.txt.clean.txt',
    # '20161003-1eca38f66bf45f56-MGT_Capital_Investments_Inc_%28NYSEMKT%3AMGT%29%3A_Opportunity_Through_Uncertainty_%7C_Insider.txt.clean.txt'
]

# Per-line sentiment and NER output is printed for every selected file.
for file in files:
    in_file = os.path.join(in_dir, file)
    corenlp_analysis(in_file)

['Community', 'Prefers', 'Bitcoin', 'As', 'Exchange', 'Medium', 'Over', 'Store', 'of', 'Value', ',', 'Survey', 'Shows']
Sentiment: Negative

['Community', 'Prefers', 'Bitcoin', 'As', 'Exchange', 'Medium', 'Over', 'Store', 'of', 'Value', ',', 'Survey', 'Shows']
Sentiment: Negative



['Many', 'in', 'the', 'Bitcoin:LOCATION', 'community', 'have', 'expressed', 'that', 'the', 'most', 'compelling', 'function', 'they', 'want', 'the', 'Bitcoin:ORGANIZATION', 'Foundation:ORGANIZATION', 'to', 'focus', 'its', 'advocacy', 'programs', 'on', 'in', 'the:DURATION', 'next:DURATION', '12:DURATION', 'months:DURATION', 'is', ';', 'serving', 'as', 'a', 'medium', 'of', 'exchange', '.']
Sentiment: Positive



['According', 'to', 'the', 'outcomes', 'of', 'a', 'community', 'survey', 'released', 'by', 'the', 'Foundation:ORGANIZATION', ',', '37:PERCENT', '%:PERCENT', 'of', 'the', 'respondents', 'choose', 'to', 'focus', 'on', 'Bitcoin', 'as', 'a', 'medium', 'of', 'exchange', '-', 'i.e.', 'used', 'to', 'actually', 'buy', 'things', '-', 'over', 'its', 'use', 'as', 'a', 'store', 'of', 'value', '-LRB-', '18:PERCENT', '%:PERCENT', '-RRB-', ',', '10:PERCENT', '%:PERCENT', 'who', 'do', 'not', 'know', 'and', '36:PERCENT', '%:PERCENT', 'that', 'opted', 'for', 'other', 'uses', '.']
Sentiment: Negative

['Commenting', 'on', 'the', 'survey', ',', 'the', 'Executive', 'Director', 'of', 'The', 'Bitcoin:ORGANIZATION', 'Foundation:ORGANIZATION', ',', 'Llew:PERSON', 'Claasen:PERSON', ',', 'told', 'Cointelegraph:PERSON', ':']
Sentiment: Negative



['``', 'The:ORGANIZATION', 'Foundation:ORGANIZATION', 'needs', 'to', 're-establish', 'legitimacy', 'amongst', 'long-time', 'Bitcoin:LOCATION', 'community', 'members', 'because', 'of', 'things', 'that', 'may', 'or', 'may', 'not', 'have', 'happened', 'in', 'the:DATE', 'past:DATE', 'through', 'predecessors', '.']
Sentiment: Negative

['``']
Sentiment: Neutral



['When', 'used', 'as', 'a', 'medium', 'of', 'exchange', ',', 'rather', 'than', 'as', 'an', 'investment', 'vehicle', ',', 'Bitcoin:PERSON', 'shows', 'users', 'many', 'potential', 'benefits', 'as', 'it', 'can', 'not', 'be', 'created', 'at', 'will', 'and', 'its', 'supply', 'is', 'finite', ',', 'write', 'Timothy:PERSON', 'R.:PERSON', 'McTaggart:PERSON', 'and', 'Matthew:ORGANIZATION', 'R.:ORGANIZATION', 'Silver:ORGANIZATION', 'of:ORGANIZATION', 'Pepper:ORGANIZATION', 'Hamilton:ORGANIZATION', 'LLP:ORGANIZATION', '.']
Sentiment: Negative



['However', ',', 'Harvard:ORGANIZATION', 'University:ORGANIZATION', "'s", 'Stephanie:PERSON', 'Lo:PERSON', 'and', 'J.:PERSON', 'Christina:PERSON', 'Wang:PERSON', 'of', 'the', 'Federal:ORGANIZATION', 'Reserve:ORGANIZATION', 'Bank:ORGANIZATION', 'of:ORGANIZATION', 'Boston:ORGANIZATION', 'in', 'Bitcoin:LOCATION', 'as', 'Money', '?']
Sentiment: Neutral

[',', 'noted', 'that', 'Bitcoin:PERSON', 'must', 'be', 'accepted', 'as', 'payment', 'for', 'a', 'sufficiently', 'large', 'set', 'of', 'goods', 'or', 'services', ',', 'or', 'other', 'assets', 'to', 'serve', 'as', 'a', 'medium', 'of', 'exchange', '.']
Sentiment: Negative



['``', 'A', 'user', 'is', 'willing', 'to', 'accept', 'a', 'fiat', 'money', 'as', 'payment', 'for', 'other', 'objects', 'of', 'value', 'only', 'if', 'she', 'is', 'confident', 'that', 'enough', 'others', 'will', 'be', 'willing', 'to', 'accept', 'it', 'in', 'turn', 'from', 'her', '.']
Sentiment: Negative

['Unlike', 'the', 'regular', 'fiat', 'money', ',', 'however', ',', 'Bitcoin:PERSON', 'is', 'not', 'backed', 'by', 'any', 'sovereign', 'entity', 'that', 'can', 'compel', 'the', 'acceptance', 'of', 'its', 'affiliated', 'fiat', 'money', 'within', 'a', 'certain', 'realm', '.']
Sentiment: Negative

['Therefore', ',', 'in', 'order', 'to', 'serve', 'as', 'a', 'medium', 'of', 'exchange', ',', 'Bitcoin:LOCATION', 'has', 'to', 'rely', 'solely', 'on', 'the', 'self-fulfilling', 'expectation', 'on', 'the', 'part', 'of', 'private', 'agents', 'that', 'it', 'will', 'be', 'accepted', '.', "''"]
Sentiment: Negative

['Claasen:PERSON', 'pointed', 'out', 'that', 'respondents', 'were', 'encouraged', 'to', 'p

['``', 'We', "'re", 'not', 'under', 'any', 'illusion', 'that', 'we', "'re", 'going', 'to', 'directly', 'influence', 'the', 'protocol', 'product', 'roadmap', 'during', 'this', 'plan', 'to', 'end:DATE', '2017:DATE', '.']
Sentiment: Negative

['That', 'said', ',', 'we', 'have', 'community', 'support', 'for', 'getting', 'involved', 'in', 'key', 'areas', 'that', 'are', 'either', 'currently:DATE', 'unserved', 'or', 'underserved', '.']
Sentiment: Neutral

['We', 'have', 'no', 'desire', 'to', 'compete', 'with', 'anyone', 'else', 'in', 'the', 'Bitcoin', 'community', '.']
Sentiment: Negative



['``', 'It', "'s", 'clear', 'to', 'me', 'that', 'the', 'foundation', 'should', 'focus', 'its', 'advocacy', 'programs', 'on', 'Bitcoin', 'as', 'a', 'store', 'of', 'value', 'and', 'medium', 'of', 'exchange', 'during', '2016/2017:NUMBER', '.']
Sentiment: Negative

['People', 'are', 'looking', 'for', 'another', 'option', 'in', 'key', 'areas', 'like', 'international', 'remittances', ',', 'micro', '-', 'and', 'peer-to-peer', 'payments', 'and', 'a', 'fiat', 'currency', 'hedge', 'and', 'they', 'need', 'to', 'know', 'more', 'about', 'Bitcoin:PERSON', 'in', 'this', 'context', '.', "''"]
Sentiment: Negative



['He', 'said', 'the', 'survey', 'was', 'only', 'one:NUMBER', 'of', 'the', 'tools', 'that', 'was', 'used', 'in', 'formulating', 'the', 'plan', 'as', 'he', 'had', 'many', 'face-to-face', 'and', 'telephonic', 'conversations', 'with', 'key', 'players', 'in', 'the', 'community', 'over', 'the', 'last', 'while', 'that', 'have', 'also', 'shaped', 'our', 'plans', '.']
Sentiment: Negative



['The', 'survey', 'shows', 'that', '41:PERCENT', '%:PERCENT', 'of', 'respondents', 'want', 'the', 'Bitcoin:ORGANIZATION', 'Foundation:ORGANIZATION', 'to', 'structure', 'its', 'operations', 'globally', 'by', 'having', 'initiatives', 'centrally', 'initiated', 'but', 'locally', 'managed', '.']
Sentiment: Negative



['35:PERCENT', '%:PERCENT', 'want', 'them', 'to', 'be', 'either', 'locally', 'initiated', 'and', 'locally', 'managed', 'through', 'affiliated', 'chapters', ',', 'and', '15:PERCENT', '%:PERCENT', '-', 'centrally', 'initiated', 'and', 'centrally', 'managed', 'by', 'the', 'Foundation:ORGANIZATION', '.']
Sentiment: Negative



['``', 'Philosophically', ',', 'the', 'community', 'does', 'not', 'wish', 'for', 'the', 'foundation', 'to', 'attempt', 'to', 'centralize', 'control', 'of', 'any', 'functions', 'and', 'we', "'re", 'very', 'supportive', 'of', 'decentralized', 'decision-making', 'by', 'the', 'community', 'in', 'the', 'context', 'of', 'co-ordinated', 'activity', '.']
Sentiment: Negative

['There', 'is', 'much', 'work', 'to', 'be', 'done', 'in', 'this', 'area', 'because', 'it', "'s", 'so', 'new', 'to', 'everyone', 'in', 'the', 'community', '.', "''"]
Sentiment: Positive



['In', 'another', 'area', 'touched', 'in', 'the', 'survey', ',', '42:PERCENT', '%:PERCENT', 'of', 'the', 'respondents', 'say', 'their', 'companies', 'are', 'not', 'currently:DATE', 'using', 'Bitcoin:LOCATION', 'in', 'any', 'way', 'and', '28:PERCENT', '%:PERCENT', 'having', 'companies', 'that', 'use', 'Bitcoin:PERSON', 'as', 'a', 'primary', 'product', 'or', 'service', 'and', '19:PERCENT', '%:PERCENT', 'currently:DATE', 'researching', 'using', 'Bitcoin', 'in', 'a', 'product', 'or', 'service', '.']
Sentiment: Negative

['11:PERCENT', '%:PERCENT', 'have', 'a', 'company', "'s", 'products', 'use', 'Bitcoin', '.']
Sentiment: Negative



['62:PERCENT', '%:PERCENT', 'of', 'respondents', 'would', 'be', 'willing', 'to', 'contribute', 'to', 'funding', 'Bitcoin:ORGANIZATION', 'Foundation:ORGANIZATION', 'operations', 'in', 'their', 'personal', 'capacity', 'in', 'return', 'for', 'a', 'published', 'quarterly:SET', 'plan', 'provided', 'they', 'can', 'see', 'its', 'plan', '.']
Sentiment: Negative

['19:PERCENT', '%:PERCENT', 'each', 'both', 'agree', 'and', 'disagree', 'to', 'make', 'such', 'contributions', '.']
Sentiment: Negative



['On', 'how', 'the', 'Foundation', 'should', 'fund', 'its', 'activities', ',', 'some', 'respondents', 'say', 'it', 'should', 'be', 'primarily', 'through', 'individual/consumer', 'memberships', '-LRB-', '27:PERCENT', '%:PERCENT', '-RRB-', ',', 'corporate', 'sponsorships', '-LRB-', '20:PERCENT', '%:PERCENT', '-RRB-', ',', 'conferences', '-LRB-', '18:PERCENT', '%:PERCENT', '-RRB-', ',', 'business', 'memberships', '-LRB-', '14:PERCENT', '%:PERCENT', '-RRB-', ',', 'training', 'programs', '-LRB-', '12:PERCENT', '%:PERCENT', '-RRB-', 'and', 'other', 'means', '-LRB-', '10:PERCENT', '%:PERCENT', '-RRB-', '.']
Sentiment: Negative



['Claasen:PERSON', 'noted', 'that', 'from', 'an', 'organizational', 'structure', 'perspective', ',', 'an', 'army', 'of', 'volunteers', 'can', 'not', 'be', 'relied', 'upon', 'to', 'run', 'the', 'programs', 'of', 'the', 'Foundation', 'because', 'people', '``', 'do', "n't", 'have', 'large', 'blocks', 'of', 'time', 'to', 'commit', 'for', 'free', 'and', 'this', 'is', 'perfectly', 'reasonable', 'and', 'not', 'in', 'any', 'way', 'suggestive', 'of', 'a', 'lack', 'of', 'support', 'for', 'the', 'foundation', '.']
Sentiment: Negative

['We', 'must', 'pay', 'for', 'people', "'s", 'time', ',', 'which', 'means', 'that', 'we', 'must', 'have', 'a', 'compelling', 'and', 'sustainable', 'revenue', 'model', 'that', 'does', "n't", 'create', 'conflicts', 'of', 'interest', '.', "''"]
Sentiment: Negative



['He', 'added', 'that', 'they', 'are', 'working', 'with', 'the', 'Board:ORGANIZATION', 'and', 'key', 'stakeholders', 'on', 'a', 'comprehensive', 'operating', 'plan', '.']
Sentiment: Positive

['``', 'Everything', 'that', 'we', "'re", 'going', 'to', 'be', 'doing', 'going', 'forward', 'will', 'be', 'driven', 'by', 'where', 'our', 'members', 'wants', 'us', 'to', 'focus', 'our', 'resources', '.']
Sentiment: Negative

['We', 'will', 'be', 'completely', 'transparent', 'with', 'our', 'operating', 'and', 'financial', 'plan', '.']
Sentiment: Negative

['I', 'ca', "n't", 'say', 'much', 'more', 'than', 'that', 'just', 'yet', 'beyond', 'commentary', 'on', 'the', 'survey', 'results', ',', 'but', 'will', 'do', 'so', 'soon', ',', "''", 'he', 'said', '.']
Sentiment: Neutral

['For', 'updates', 'and', 'exclusive', 'offers', ',', 'enter', 'your', 'e-mail', 'below', '.']
Sentiment: Negative



In [81]:
# Scratch check of how \w and \b tokenize text containing digits and punctuation
# (relies on `re` being imported by an earlier cell).
p = re.compile(r'(?:\w+\b|\b\w+\b|\b\w+)')
sample = ' a1b2c3d4 sd. sd;abc'
for m in p.finditer(sample):
    print(m.start(), m.group())

# Each maximal run of word characters collapses to a single "w"; separators stay.
s = re.sub(r'\b\w+\b', 'w', '---aaa---word---word...word,wprd-w*2016/2017')
print('\"%s\"' % s)

1 a1b2c3d4
10 sd
14 sd
17 abc
"---w---w---w...w,w-w*w/w"


In [87]:
# Scratch check: splitting on runs of non-word characters leaves the bare tokens.
sample_text = 'aa bc de--aa ---aaa---word---word...word,wprd-w*2016/2017'
words = re.split(r'[^\w]+', sample_text)
print(words)

['aa', 'bc', 'de', 'aa', 'aaa', 'word', 'word', 'word', 'wprd', 'w', '2016', '2017']


In [89]:
# Scratch check: is "have" part of the English stopword list loaded earlier?
have_is_stopword = 'have' in english_stopwords
print(have_is_stopword)

True


In [141]:
# Scratch check of the decimal and abbreviation normalization on a tricky sample.
line = "u.s.a. ee.uu. write i.e. Timothy R. McTaggart and Matthew R. Silver of. my example .1 MegaUpload 2.0 and BitCache 1. Read more 2016..."
# Raw string: '\d' in a plain string literal is an invalid escape sequence.
line = re.sub(r'\d*[.,]\d+', 'DECIMAL', line)

# Drop the dots of dotted single-letter abbreviations (u.s.a. -> usa, i.e. -> ie,
# R. -> R) while leaving "ee.uu." and "of." alone. A replacement function rewrites
# only the matched spans; str.replace() on the matched text would also touch the
# same characters wherever else they occur in the line.
p = re.compile(r'\b((?:[a-zA-Z]\.)+)')
line = p.sub(lambda m: m.group(1).replace('.', ''), line)
print(line)

usa ee.uu. write ie Timothy R McTaggart and Matthew R Silver of. my example DECIMAL MegaUpload DECIMAL and BitCache 1. Read more 2016...


In [142]:
# Scratch check of the dot-based sentence splitting on an already normalized line.
line = 'it usa ee.uu. write ie Timothy R McTaggart and Matthew. Inc (NYSEMKT:MGT). my example MegaUpload 2.0 and BitCache. Read more... "hello worl."'
#FAILS WITH: MGT Capital intends to change its corporate name to “John McAfee Global Technologies, Inc.”

# Split on every single dot, keeping all non-empty pieces.
pieces_per_dot = [s for s in re.split(r'\.', line) if s]
print(pieces_per_dot)

# Split on runs of dots, keeping only pieces with more than two words.
pieces_filtered = [s for s in re.split(r'\.+', line) if len(s.split()) > 2]
print(pieces_filtered)

['it usa ee', 'uu', ' write ie Timothy R McTaggart and Matthew', ' Inc (NYSEMKT:MGT)', ' my example MegaUpload 2', '0 and BitCache', ' Read more', ' "hello worl', '"']
['it usa ee', ' write ie Timothy R McTaggart and Matthew', ' my example MegaUpload 2', '0 and BitCache']
