In [33]:
import pickle

import numpy as np
import pandas as pd
import scattertext as st
from IPython.display import IFrame
from scipy.sparse import csr_matrix
from scattertext import LogOddsRatioInformativeDirichletPrior
from scattertext.TermDocMatrix import TermDocMatrix
from scattertext.indexstore import IndexStore, IndexStoreFromList
from scattertext.termcompaction.CompactTerms import CompactTerms

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``.

    The file is opened in a ``with`` block so it is closed even if
    unpickling raises.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file. Only load .pkl files produced by a trusted pipeline.
    with open(path, 'rb') as fp:
        return pickle.load(fp)


left_collocations_article = _load_pickle('left_collocations_article.pkl')
right_collocations_article = _load_pickle('right_collocations_article.pkl')

# turn collocations into ngrams
NGRAMS = 2

# NOTE(review): collocations_to_ngrams is not defined in the visible part of
# this notebook; it must be defined or imported before this cell is run.
ll, left_ngrams_article, left_ngrams_by_article = collocations_to_ngrams(left_collocations_article, NGRAMS)
rl, right_ngrams_article, right_ngrams_by_article = collocations_to_ngrams(right_collocations_article, NGRAMS)


In [35]:
# Load the Rotten Tomatoes sample corpus exposed by scattertext.
rdf = st.SampleCorpora.RottenTomatoes.get_data()

print(type(rdf))

# Translate the raw category codes into display labels. A code missing from
# the mapping raises KeyError rather than being silently passed through.
category_labels = {'plot': 'Plot', 'rotten': 'Negative', 'fresh': 'Positive'}
rdf['category_name'] = rdf['category'].apply(lambda code: category_labels[code])
print(rdf['category_name'].value_counts())
rdf[['text', 'movie_name', 'category_name']].head()

<class 'pandas.core.frame.DataFrame'>
Positive    2455
Negative    2411
Plot         156
Name: category_name, dtype: int64


Unnamed: 0,text,movie_name,category_name
0,"A senior at an elite college (Katie Holmes), a...",abandon,Plot
1,Will Lightman is a hip Londoner who one day re...,about_a_boy,Plot
2,Warren Schmidt (Nicholson) is forced to deal w...,about_schmidt,Plot
3,An account of screenwriter Charlie Kaufman's (...,adaptation,Plot
4,Ali G unwittingly becomes a pawn in the evil C...,ali_g_indahouse,Plot


In [29]:
# Build a scattertext corpus from the review frame: documents come from the
# 'text' column, categories from 'category_name', parsed with scattertext's
# whitespace_nlp_with_sentences.
corpus_builder = st.CorpusFromPandas(
    rdf,
    category_col='category_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
)
corpus = corpus_builder.build()

# Write the corpus term-frequency table to disk.
term_freq_df = corpus.get_term_freq_df()
term_freq_df.to_csv('term_freqs.csv')

# Unigram-only corpus used by the cells that follow.
unigram_corpus = corpus.get_unigram_corpus()

In [34]:
# Build term priors for the Positive-vs-Negative comparison on the unigram
# corpus, asking the factory to use the neutral categories as well.
prior_factory = st.PriorFactory(
    unigram_corpus,
    category='Positive',
    not_categories=['Negative'],
    starting_count=0.01,
)
priors = prior_factory.use_neutral_categories().get_priors()
class LogOddsRatioSmoothedZScorePrior:
    """Term scorer: z-scored log-odds ratio of two prior-smoothed count vectors.

    Each count vector is smoothed by adding the prior, rescaled so that its
    total mass is ``prior_scale`` times the total of that count vector. The
    score is the difference of the smoothed log-odds divided by the square
    root of the summed reciprocal counts.
    """

    def __init__(self, prior, prior_scale):
        """Keep the prior and the multiplier applied to it.

        Args:
            prior: per-term prior weights. Must support elementwise
                arithmetic and expose ``.values`` (presumably a pandas
                Series; confirm against the caller).
            prior_scale: scalar weight of the prior relative to the size
                of each count vector.
        """
        self.prior_scale = prior_scale
        self.prior = prior

    def _add_scaled_prior(self, counts):
        """Return ``counts`` plus the prior rescaled to ``counts``' total."""
        prior_total = sum(self.prior.values)
        return counts + self.prior * self.prior_scale * sum(counts) / prior_total

    def get_scores(self, a, b):
        """Score every term by how strongly it is associated with ``a`` over ``b``.

        Args:
            a: per-term counts for the first group.
            b: per-term counts for the second group, aligned with ``a``.

        Returns:
            Elementwise z-scores; positive values lean towards ``a``.
        """
        smoothed_a = self._add_scaled_prior(a)
        smoothed_b = self._add_scaled_prior(b)

        # Counts of all other terms within each group.
        rest_a = np.sum(smoothed_a) - smoothed_a
        rest_b = np.sum(smoothed_b) - smoothed_b

        log_odds_ratio = np.log(smoothed_a / rest_a) - np.log(smoothed_b / rest_b)
        # Sum of reciprocal counts; its square root is the scaling denominator.
        variance = 1. / smoothed_a + 1. / rest_a + 1. / smoothed_b + 1. / rest_b
        return log_odds_ratio / np.sqrt(variance)

    def get_name(self):
        """Return the display name of this scorer."""
        return 'Log-Odds-Ratio w/ Informative Dirichlet Prior Z-Score'
# Render the scattertext "fightin' words" explorer comparing Positive against
# Negative reviews on the unigram corpus, with movie names as document metadata.
html = st.produce_fightin_words_explorer(
    unigram_corpus,
    category='Positive',
    not_category_name='Negative',
    not_categories=['Negative'],
    # Per the original author's note, the hand-written scorer is equivalent:
    #   term_scorer=LogOddsRatioSmoothedZScorePrior(priors, 10)
    term_scorer=st.LogOddsRatioInformativeDirichletPrior(priors, 10, 'class-size'),
    metadata=rdf['movie_name'],
)

file_name = 'rotten_fresh_loridp.html'
# Write inside a context manager so the handle is flushed and closed before
# the IFrame below tries to read the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

IFrame(src=file_name, width=1300, height=700)