## Unigrams

### Scattertext

In [52]:
import pickle
import pandas as pd
import scattertext as st
import pytextrank, spacy

In [2]:
# Load the pre-cleaned Ballmer-era corpus. The context manager closes the
# file handle deterministically instead of leaking it until garbage collection.
# NOTE: pickle.load can execute arbitrary code — only load pickles you produced
# yourself or otherwise trust.
with open('cleaned_corpus_ball.pickle', 'rb') as pickle_file:
    corpus_ballmer = pickle.load(pickle_file)

In [3]:
# Load the pre-cleaned Nadella-era corpus. The context manager closes the
# file handle deterministically instead of leaking it until garbage collection.
# NOTE: pickle.load can execute arbitrary code — only load pickles you produced
# yourself or otherwise trust.
with open('cleaned_corpus_nad.pickle', 'rb') as pickle_file:
    corpus_nadella = pickle.load(pickle_file)

In [4]:
# Collapse every Ballmer-era document (a sequence of strings) into a single
# space-delimited string, one entry per document.
corpus_text_ballmer = [' '.join(tokens) for tokens in corpus_ballmer]

In [5]:
# Collapse every Nadella-era document (a sequence of strings) into a single
# space-delimited string, one entry per document.
corpus_text_nadella = [' '.join(tokens) for tokens in corpus_nadella]

In [6]:
# Quarter labels in the form 'msft_<YY>q<N>', one per document, in order.
# Each list is generated for whole years and then trimmed by two labels at
# both ends, leaving 28 labels per era:
#   Ballmer: msft_07q3 .. msft_14q2
#   Nadella: msft_14q3 .. msft_21q2
quarters_ballmer = [
    f'msft_{year:02d}q{quarter}'
    for year in range(7, 15)
    for quarter in range(1, 5)
][2:-2]
quarters_nadella = [
    f'msft_{year:02d}q{quarter}'
    for year in range(14, 22)
    for quarter in range(1, 5)
][2:-2]

In [7]:
# One row per Ballmer-era document: the joined text, a constant 'ceo' label,
# and the matching quarter label.
df_ballmer = pd.DataFrame(corpus_text_ballmer, columns=['text']).assign(
    ceo='Ballmer',
    quarter=quarters_ballmer,
)

In [8]:
# One row per Nadella-era document: the joined text, a constant 'ceo' label,
# and the matching quarter label.
df_nadella = pd.DataFrame(corpus_text_nadella, columns=['text']).assign(
    ceo='Nadella',
    quarter=quarters_nadella,
)

In [9]:
# Stack both eras row-wise (Ballmer first) under a fresh 0..n-1 index.
df_msft = pd.concat([df_ballmer, df_nadella], axis=0, ignore_index=True)

In [10]:
# Preview only the first rows rather than dumping the whole frame.
df_msft.head(10)

Unnamed: 0,text,ceo,quarter
0,ahead everyone senior financial corporate acco...,Ballmer,msft_07q3
1,ahead everyone senior financial corporate acco...,Ballmer,msft_07q4
2,meet madam begin everyone senior financial cor...,Ballmer,msft_08q1
3,time listen mode answer session press star tou...,Ballmer,msft_08q2
4,meet heally begin heally everyone senior finan...,Ballmer,msft_08q3
5,yearn meet madam begin everyone senior financi...,Ballmer,msft_08q4
6,time listenonly mode time hand financial mr ah...,Ballmer,msft_09q1
7,unknown speaker time mr begin everyone little ...,Ballmer,msft_09q2
8,time listen mode time meet mr begin barb every...,Ballmer,msft_09q3
9,mr begin barb everyone usual senior financial ...,Ballmer,msft_09q4


In [56]:
# Build the unigram Scattertext corpus from the combined frame: documents come
# from the 'text' column, categories from the 'ceo' column, and parsing is done
# with scattertext's `whitespace_nlp_with_sentences`.
corpus = st.CorpusFromPandas(
    df_msft,
    text_col='text',
    category_col='ceo',
    nlp=st.whitespace_nlp_with_sentences,
).build()

In [12]:
# Render the interactive unigram explorer as an HTML string, contrasting the
# 'Ballmer' category against everything else; each document is tagged with
# its quarter label as metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category='Ballmer',
    category_name='Steve Ballmer Era',
    not_category_name='Satya Nadella Era',
    metadata=df_msft['quarter'],
    minimum_term_frequency=10,
    pmi_threshold_coefficient=5,
    width_in_pixels=1000,
)

In [13]:
# Persist the explorer as UTF-8 bytes. The context manager guarantees the
# file is flushed and closed, rather than relying on garbage collection.
with open('scattertext_demo.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

## Bigrams

### Scattertext

In [41]:
# Load the pre-cleaned Ballmer-era bigram corpus. The context manager closes
# the file handle deterministically instead of leaking it.
# NOTE: pickle.load can execute arbitrary code — only load pickles you produced
# yourself or otherwise trust.
with open('cleaned_corpus_bi_ball.pickle', 'rb') as pickle_file:
    corpus_ballmer_bi = pickle.load(pickle_file)

In [42]:
# Load the pre-cleaned Nadella-era bigram corpus. The context manager closes
# the file handle deterministically instead of leaking it.
# NOTE: pickle.load can execute arbitrary code — only load pickles you produced
# yourself or otherwise trust.
with open('cleaned_corpus_bi_nad.pickle', 'rb') as pickle_file:
    corpus_nadella_bi = pickle.load(pickle_file)

In [43]:
# Collapse every Ballmer-era bigram document (a sequence of strings) into a
# single space-delimited string, one entry per document.
corpus_text_ballmer_bi = [' '.join(tokens) for tokens in corpus_ballmer_bi]

In [44]:
# Collapse every Nadella-era bigram document (a sequence of strings) into a
# single space-delimited string, one entry per document.
corpus_text_nadella_bi = [' '.join(tokens) for tokens in corpus_nadella_bi]

In [45]:
# One row per Ballmer-era bigram document: the joined text, a constant 'ceo'
# label, and the matching quarter label.
df_ballmer_bi = pd.DataFrame(corpus_text_ballmer_bi, columns=['text']).assign(
    ceo='Ballmer',
    quarter=quarters_ballmer,
)

In [46]:
# One row per Nadella-era bigram document: the joined text, a constant 'ceo'
# label, and the matching quarter label.
df_nadella_bi = pd.DataFrame(corpus_text_nadella_bi, columns=['text']).assign(
    ceo='Nadella',
    quarter=quarters_nadella,
)

In [47]:
# Stack both eras row-wise (Ballmer first) under a fresh 0..n-1 index.
df_msft_bi = pd.concat([df_ballmer_bi, df_nadella_bi], axis=0, ignore_index=True)

In [48]:
# Preview only the first rows rather than dumping the whole frame.
df_msft_bi.head(10)

Unnamed: 0,text,ceo,quarter
0,microsoft corporation corporation earnings ear...,Ballmer,msft_07q3
1,microsoft corporation corporation earnings ear...,Ballmer,msft_07q4
2,microsoft corporation corporation earnings ear...,Ballmer,msft_08q1
3,microsoft corporation corporation earnings ear...,Ballmer,msft_08q2
4,microsoft corporation corporation earnings ear...,Ballmer,msft_08q3
5,microsoft corporation corporation earnings ear...,Ballmer,msft_08q4
6,microsoft corporation corporation earnings ear...,Ballmer,msft_09q1
7,microsoft corporation corporation earnings ear...,Ballmer,msft_09q2
8,microsoft corporation corporation earnings ear...,Ballmer,msft_09q3
9,microsoft corporation corporation earnings ear...,Ballmer,msft_09q4


In [64]:
# Load the small English spaCy pipeline and append pytextrank's "textrank"
# component so that every parsed document carries ranked phrases for
# `st.PyTextRankPhrases` to read. The `import pytextrank` in the import cell
# is what makes the "textrank" factory available to spaCy.
# NOTE(review): the AttributeError recorded for this cell
# ("module 'pytextrank' has no attribute 'TextRank'") suggests a
# scattertext/pytextrank version mismatch — confirm the installed scattertext
# release supports the installed pytextrank before re-running.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", last=True)

# Parse every document and relabel the 'ceo' categories with display names.
# `replace` (rather than `apply(dict.get)`) leaves already-relabelled values
# untouched, so re-running this cell does not turn the 'ceo' column into None.
df_msft_bi = df_msft_bi.assign(
    parse=lambda df: df.text.apply(nlp),
    ceo=lambda df: df.ceo.replace({'Ballmer': 'Steve Ballmer Era', 'Nadella': 'Satya Nadella Era'})
)

# Build the phrase-based corpus and keep the 2000 most category-associated
# non-text (phrase) features.
corpus_bi = st.CorpusFromParsedDocuments(
    df_msft_bi,
    category_col='ceo',
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build().compact(
    # Qualified with `st.`: the bare name `AssociationCompactor` is not
    # imported by the notebook's import cell and would raise NameError.
    st.AssociationCompactor(2000, use_non_text_features=True)
)

AttributeError: module 'pytextrank' has no attribute 'TextRank'

In [58]:
# Render the interactive phrase explorer as an HTML string.
# `category` must name a value that actually exists in the corpus: the 'ceo'
# column of `df_msft_bi` is relabelled to 'Steve Ballmer Era' /
# 'Satya Nadella Era' before `corpus_bi` is built, so the raw key 'Ballmer'
# no longer matches any category.
html = st.produce_scattertext_explorer(
        corpus_bi,
        category='Steve Ballmer Era',
        category_name='Steve Ballmer Era',
        not_category_name='Satya Nadella Era',
        minimum_term_frequency=10,
        pmi_threshold_coefficient=5,
        width_in_pixels=1000,
        metadata=df_msft_bi['quarter'],
        # `corpus_bi` is compacted on its non-text (phrase) features, so plot
        # those rather than the plain token features.
        # NOTE(review): confirm against the scattertext PyTextRank example.
        use_non_text_features=True,
        )

In [59]:
# Persist the explorer as UTF-8 bytes. The context manager guarantees the
# file is flushed and closed, rather than relying on garbage collection.
with open('scattertext_bi_demo.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))