In [54]:
import nltk, os
import pandas as pd
from pymongo import MongoClient

In [55]:
files = os.listdir("articles1")
corpusdir = "articles1/"
corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(corpusdir, ".*")

In [56]:
from textblob import TextBlob

def blobbed(s):
    blob = TextBlob(s)
    return blob

def polarity(s):
    blob = blobbed(s)
    polarity = blob.sentiment.polarity
    return polarity

def subjectivity(s):
    blob = blobbed(s)
    subjectivity = blob.sentiment.subjectivity
    return subjectivity

def sentiment(s):
    pol = polarity(s)
    sub = subjectivity(s)
    return sentiment

In [57]:
from numpy import unique, log2, log


def word_entropy(s):
    res = 0
    val , counts = unique(s, return_counts=True)
    freqs = counts.astype('float')/len(s)
    freqs = [i*log2(i) for i in freqs if i > 0]
    entropy = -sum(freqs)
    return entropy
    
def binary_entropy(s):
    binary = ''.join(format(ord(x), 'b') for x in s)
    entropy = word_entropy(binary)
    return entropy

In [58]:
def word_count(s):
    counts = len(unique(s.split()))
    return counts

def word_len(s):
    length = len(s.split())
    return length

def sent_len(s):
    length = len(s.split("."))
    return length

In [61]:
def quantify(text):
    data = {}
    data['polarity'] = polarity(text)
    data['subjectivity'] = subjectivity(text)
    data['sentiment'] = data['subjectivity']-data['polarity']
    data['word_entropy'] = word_entropy(text)
    data['binary_entropy'] = binary_entropy(text)
    data['word_count'] = word_count(text)
    data['word_len'] = word_len(text)
    data['sent_len'] = sent_len(text)
    return data

In [62]:
quantify(news[0])

{'binary_entropy': 0.0010652564781223517,
 'polarity': 0.07904040404040402,
 'sent_len': 32,
 'sentiment': 0.39368686868686875,
 'subjectivity': 0.4727272727272728,
 'word_count': 194,
 'word_entropy': 0.0057509026390405206,
 'word_len': 300}

In [63]:
def process_text_corpus(data_directory,database=None):
    corpus_dir = data_directory+"/"
    
    files = [corpus_dir+i for i in os.listdir(corpus_dir)]
    
    news=[]; words_tokens = []; tags = []; sentence_tokens=[]
    
    for i in files:
        with open(i, "r") as ff:
            
            thisNews = ff.read().strip()
            
            if database:
                dd=thisNews.split("\n")
                sens = dd[2].split(".")
                words = dd[2].split()
                quant= quantify(dd[2])
                client = MongoClient(host="localhost", port=27017)
                if len(dd) > 1:
                    data={"headline":dd[0], "date":dd[1], "article":dd[2],"sentences":sens,"words":words,"quantify":quant}
                    client[database]['articles'].insert_one(data)
                
                
            news.append(thisNews)
            sentence_tokens.append(nltk.sent_tokenize(thisNews))
            
    for sentence in sentence_tokens:
        sents, tagger=[],[]
        for sent in  sentence:
            sents.append(nltk.word_tokenize(sent))
        tagger.append(nltk.pos_tag(sentence))
                
                
        words_tokens.append(sents)
        tags.append(tagger)
        ff.close()
    return {"news":news,"words":words_tokens,"tags":tags,"sentence_tokens":sentence_tokens}
    

In [64]:
articles_corpus=process_text_corpus("articles1", database="vastdb")

In [65]:

for sentences in sentence_tokens:
    sents = []; tagger=[]
    for sentence in sentences:
        sents.append(nltk.word_tokenize(sentence))
        tagger.append(nltk.pos_tag(sentence))
    

In [68]:
sentence_tokens

[["Boatmen's Share Price Jumps On News of NationsBank Deal\nMay 12, 2011\nBoatmen's Bancshares Inc.'s stock price surged Friday after it agreed to be acquired by NationsBank Corp.. Boatmen's shares jumped $10.313 to close at $53.25 on the Nasdaq Stock Market.",
  'NationsBank shares, meanwhile, slid $7.25 to $85.125 in composite trading on the Downtown Stock Exchange.',
  'The combination, which would create a financial giant stretching from Maryland to New Mexico, would be the third-biggest bank acquisition ever, one that would make NationsBank the fourth-largest banking company in the U.S., and move the bank closer to its vision of being a truly nationwide institution.',
  'Operating in nine states, primarily in the South plus Uptown D.C., the Charlotte, N.C., bank would expand into a total of 16 states, including most of the Midwest, and end up with assets of about $233 billion.',
  "NationsBank will pay about $60.27 for each Boatmen's share, or a stunning premium of about 41% above

In [67]:
tagger

[[('2', 'CD'),
  ('p', 'NN'),
  ('i', 'NN'),
  ('p', 'VBP'),
  ('e', 'NN'),
  ('\n', 'NN'),
  ('h', 'NN'),
  ('v', 'IN'),
  ('a', 'DT'),
  ('c', 'JJ'),
  ('\n', 'NN'),
  ('h', 'NN'),
  ('e', 'VBZ'),
  ('a', 'DT'),
  ('t', 'NN'),
  ('i', 'NN'),
  ('n', 'VBP'),
  ('g', 'NN'),
  ('\n', 'NNP'),
  ('v', 'NN'),
  ('e', 'NN'),
  ('n', 'IN'),
  ('t', 'NN'),
  ('i', 'NN'),
  ('l', 'VBP'),
  ('a', 'DT'),
  ('t', 'NN'),
  ('i', 'NN'),
  ('n', 'VBP'),
  ('g', 'NN'),
  ('\n', 'VBP'),
  ('a', 'DT'),
  ('i', 'NN'),
  ('r', 'NN'),
  ('\n', 'NNP'),
  ('c', 'VBZ'),
  ('o', 'JJ'),
  ('n', 'IN'),
  ('d', 'NN'),
  ('i', 'NN'),
  ('t', 'VBP'),
  ('i', 'NN'),
  ('o', 'VBP'),
  ('n', 'NN'),
  ('i', 'NN'),
  ('n', 'VBP'),
  ('g', 'NN'),
  ('\n', 'NNP'),
  ('s', 'NN'),
  ('y', 'NN'),
  ('s', 'NN'),
  ('t', 'NN'),
  ('e', 'NN'),
  ('m', 'NN'),
  ('\n', 'NNP'),
  ('t', 'NN'),
  ('w', 'NN'),
  ('o', 'IN'),
  ('p', 'NN'),
  ('i', 'NN'),
  ('p', 'VBP'),
  ('e', 'NN'),
  ('\n', 'NNP'),
  ('s', 'NN'),
  ('y', 'NN'),
 