In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
from spacy.en import English

ImportError: DLL load failed: A megadott eljárás nem található.

In [None]:
# Connect to the MongoDB server on localhost and select the earnings-transcript collection.
client = MongoClient(host='localhost', port=27017)
db = client['python_import']
collection = db['earnings_transcript']

In [None]:
# Materialise at most 50 documents from the collection into a DataFrame.
transcripts = pd.DataFrame(list(collection.find().limit(50)))

In [None]:
# Peek at the first loaded record.
transcripts.head(1)

In [None]:
# Instantiate the spaCy English pipeline imported above.
nlp = English()

In [None]:
# Raw text of the first transcript row.
text = transcripts.iloc[0]['rawText']

In [None]:
# Run the pipeline over that text; `doc` is the parsed result used by the cells below.
doc = nlp(text)

In [None]:
# Length of the parsed document.
len(doc)

In [None]:
# Print text, lemma, sentiment and cluster for the first 200 tokens.
# Iterating over a slice (rather than indexing 0..199) avoids an IndexError
# when the document has fewer than 200 tokens.
for token in doc[:200]:
    print('{} - {} - {} - {}'.format(token, token.lemma_, token.sentiment, token.cluster))

# Keyphrase extraction

In [None]:
import textacy

In [None]:
# Rank key phrases of 2 to 5 words with SGRank, normalising terms to their lemma;
# request 200 key terms.
ranked = textacy.keyterms.sgrank(doc, normalize='lemma', ngrams=(2, 3, 4, 5), n_keyterms=200)

In [None]:
# First 20 entries of the SGRank result.
ranked[:20]

In [None]:
%pylab inline
textacy.viz.draw_semantic_network(
    textacy.keyterms.terms_to_semantic_network(
        [tok for tok in doc[:200] if not tok.is_stop and tok.pos_ in ["NOUN", "PROPN", "ADJ", "VERB"] and tok.is_alpha],
        normalize='lemma',
        edge_weighting='cooc_freq',
        window_width=15
    )
)

In [None]:
from textacy.text_utils import keyword_in_context

# Show occurrences of "morgan" in the document text with surrounding context (window_width=50).
keyword_in_context(doc.text, "morgan", window_width=50)

In [None]:
import math
from collections import Counter
# Content words: alphabetic, non-stop-word nouns, verbs, adjectives and proper nouns.
words = [tok for tok in doc if tok.is_alpha and not tok.is_stop and tok.pos_ in ["NOUN", "VERB", "ADJ", "PROPN"]]
# Lower-cased token text -> that token's spaCy `prob` attribute.
word_probs = {tok.text.lower(): tok.prob for tok in words}

# Case-sensitive counts, kept unchanged because the word-cloud cell consumes `freqs`.
freqs = Counter(tok.text for tok in words)
# `word_probs` is keyed by lower-cased text, so the counts used for `word_scores` must be
# lower-cased as well; looking lower-cased keys up in the case-sensitive `freqs`
# under-counts (often to 0) every word that occurs capitalised.
lower_freqs = Counter(tok.text.lower() for tok in words)
word_scores = {word: lower_freqs[word] for word in word_probs}

In [None]:
from wordcloud import WordCloud

# Build a word cloud (white background, at most 50 words) from the counts in `freqs`.
wordcloud = WordCloud(background_color="white", max_words=50, scale=1.5).generate_from_frequencies(freqs)
# Save the rendering to disk, then display the saved PNG in the notebook.
image = wordcloud.to_image()
image.save("./wordcloud.png")
from IPython.display import Image 
Image(filename='./wordcloud.png')

In [None]:
def tokenize(doc):
    """Return, in order, the lemmas of the alphabetic, non-stop-word tokens in ``doc``."""
    lemmas = []
    for token in doc:
        # Skip anything that is not purely alphabetic or that is a stop word.
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Sample of the lemmas produced for `doc` (positions 300-319 of the filtered list).
tokenize(doc)[300:320]

# Load AFINN-111

In [None]:
# Read the lexicon file as tab-separated values with no header row.
afinn = pd.read_csv(filepath_or_buffer='AFINN-111.txt', sep='\t', header=None)

In [None]:
# Name the two columns and convert the row labels to strings.
afinn = afinn.rename(index=str, columns={0: "Word", 1: "Score"})

In [None]:
# First five lexicon rows.
afinn.head(5)

In [None]:
# Spot-check one lexicon entry: the first matching score, or 0 when the word is absent.
# The match is taken by position; `Series.get(0, 0)` would be a lookup of the *label* 0,
# which only succeeds through pandas' deprecated positional fallback on a string index.
next(iter(afinn.loc[afinn['Word'] == 'shit', 'Score']), 0)

In [None]:
# Parse every transcript's raw text with the spaCy pipeline.
transcripts['NLP'] = transcripts['rawText'].apply(nlp)

In [None]:
# Reduce every parsed transcript to its list of filtered lemmas.
transcripts['tokens'] = transcripts['NLP'].apply(tokenize)

In [None]:
def build_score_for_tokens(tokens, lexicon=None):
    """Sum the lexicon scores of ``tokens``, split into positive and negative totals.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to score. Tokens that are not in the lexicon contribute 0.
    lexicon : pandas.DataFrame, optional
        Frame with ``'Word'`` and ``'Score'`` columns. When omitted, the
        notebook-level ``afinn`` frame is used.

    Returns
    -------
    tuple
        ``(positive_total, negative_total, positive_total + negative_total)``,
        where ``negative_total`` is zero or negative.
    """
    if lexicon is None:
        lexicon = afinn

    # Build the word -> score table once per call instead of boolean-masking the whole
    # frame for every token. A plain dict lookup also avoids `Series.get(0, 0)`, which is
    # a *label* lookup and only returned the matched row through pandas' deprecated
    # positional fallback on a non-integer index.
    word_scores = {}
    for word, score in zip(lexicon['Word'].values, lexicon['Score'].values):
        word_scores.setdefault(word, score)  # first occurrence wins on duplicates

    score_pos, score_neg = 0, 0
    for token in tokens:
        temp_score = word_scores.get(token, 0)
        if temp_score > 0:
            score_pos += temp_score
        elif temp_score < 0:
            score_neg += temp_score
    return (score_pos, score_neg, score_pos + score_neg)

In [None]:
# Score every token list: (positive total, negative total, net total).
transcripts['Sentiment Score'] = transcripts['tokens'].apply(build_score_for_tokens)

In [None]:
# Positive total divided by the magnitude of the negative total. Dividing as a numpy float
# yields inf/NaN (with a RuntimeWarning) instead of raising ZeroDivisionError when a
# transcript has no negatively scored tokens.
transcripts['SS Ratio'] = transcripts['Sentiment Score'].apply(lambda row: np.float64(row[0]) / (-row[1]))

In [None]:
# Positive-to-negative count ratio from the stored `h_tone` field. Dividing as a numpy
# float yields inf/NaN (with a RuntimeWarning) instead of raising ZeroDivisionError when
# `negativeCount` is 0.
transcripts['HTone Ratio'] = transcripts['h_tone'].apply(lambda row: np.float64(row['positiveCount']) / row['negativeCount'])

In [None]:
# Side-by-side view of both sentiment measures and their ratios for each transcript.
transcripts[['url', 'Sentiment Score', 'h_tone', 'SS Ratio', 'HTone Ratio']]

### Transcripts whose AFINN sentiment ratio is smaller than the h_tone ratio

In [None]:
# URLs of transcripts whose 'SS Ratio' is below their 'HTone Ratio'.
transcripts.loc[transcripts['SS Ratio'] < transcripts['HTone Ratio'], 'url']

### Transcripts whose AFINN sentiment ratio is more than 2.6 times the h_tone ratio

In [None]:
# URLs of transcripts whose 'SS Ratio' exceeds 2.6 times their 'HTone Ratio'.
transcripts.loc[transcripts['SS Ratio'].div(transcripts['HTone Ratio']).gt(2.6), 'url']

In [None]:
# Summary statistics of the 'SS Ratio' column.
transcripts['SS Ratio'].describe()

# Textacy discovery

In [None]:
# Instantiate textacy's CapitolWords dataset object.
textacy.datasets.CapitolWords()