In [37]:
import pandas as pd
import spacy
import scattertext as st
import numpy as np
from pprint import pprint
%matplotlib inline
from IPython.display import display, HTML
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject a CSS override so the notebook's `.container` element uses 98% of the page width.
_wide_layout_css = HTML("<style>.container { width:98% !important; }</style>")
display(_wide_layout_css)

In [3]:
# Read the tweet CSV into a DataFrame, decoding the file as Latin-1.
sea = pd.read_csv("seamayor.csv", encoding="latin1")

In [6]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV — confirm).
# errors="ignore" keeps this cell re-runnable: once the column is gone, a second run
# would otherwise raise KeyError.
sea = sea.drop("Unnamed: 0", axis=1, errors="ignore")

In [4]:
# English spaCy pipeline; it is handed to the Scattertext corpus builder as `nlp`.
# NOTE(review): `spacy.en.English()` looks like the spaCy 1.x entry point; later
# releases load models through `spacy.load(...)` — confirm the installed version.
nlp = spacy.en.English()

In [7]:
# Build the Scattertext corpus from the tweet frame: categories come from the `user`
# column, text from `cleantweets`, and parsing is done by the spaCy pipeline `nlp`.
# (The pasted "..." continuation prompts were removed so the cell is plain Python.)
corpus = st.CorpusFromPandas(
    sea,
    category_col='user',
    text_col='cleantweets',
    nlp=nlp,
).build()

In [8]:
# Show the first ten index entries of the scaled F-score-vs-background table
# (presumably ordered best-first — confirm against the Scattertext docs).
background_scores = corpus.get_scaled_f_scores_vs_background()
top_background_terms = list(background_scores.index[:10])
print(top_background_terms)

['icymi', 'wmbe', 'charleena', 'durkan', 'nikkita', 'lgbtq', 'nagomi', 'transitoriented', 'singlefamily', 'mayoral']


In [14]:
# Per-term frequency table, extended with a scaled F-score column for Cary Moon.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Cary Moon Score'] = corpus.get_scaled_f_scores('Cary Moon')

# Rank terms by that score, highest first, and print the top twenty.
moon_ranked = term_freq_df.sort_values(by='Cary Moon Score', ascending=False)
pprint(list(moon_ranked.index[:20]))

['town',
 'town hall',
 'virtual town',
 'virtual',
 'moon',
 'interview',
 'oliver',
 'cary',
 'cary moon',
 'chamber',
 'wants',
 'voted',
 'racial',
 'for everyone',
 '6 pm',
 'voting for',
 'rsvp',
 'says',
 'corporate',
 'speculation']


In [15]:
# Add the scaled F-score column for Jenny Durkan to the existing term table.
term_freq_df['Jenny Durkan Score'] = corpus.get_scaled_f_scores('Jenny Durkan')

# Rank terms by that score, highest first, and print the top twenty.
durkan_ranked = term_freq_df.sort_values(by='Jenny Durkan Score', ascending=False)
pprint(list(durkan_ranked.index[:20]))

['attorney',
 'wmbe',
 'have been',
 'we also',
 'fought',
 'trump',
 'vital',
 'for justice',
 'also need',
 'who can',
 'our campaign',
 'u s',
 's attorney',
 'day to',
 'getting',
 'gun',
 'schools',
 'lesbian',
 'discrimination',
 'office to']


In [53]:
# Render the interactive Scattertext explorer (Cary Moon vs. Jenny Durkan) to an HTML
# string. (The pasted "..." continuation prompts were removed and the keyword
# arguments placed one per line; the call itself is unchanged.)
html = st.produce_scattertext_explorer(
    corpus,
    category='Cary Moon',
    category_name='Cary Moon for Mayor',
    not_category_name='Jenny Durkan',
    transform=st.Scalers.log_scale_standardize,
    width_in_pixels=1000,
    term_significance=st.LogOddsRatioUninformativeDirichletPrior(),
)


In [54]:
# Save the explorer HTML to disk, then embed the saved page in the notebook.
file_name = "Mayorviz.html"
# The context manager guarantees the file is flushed and closed before the IFrame
# below loads it, instead of relying on garbage collection of the file object.
with open(file_name, 'wb') as viz_file:
    viz_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

In [55]:
def scale(ar):
    """Min-max rescale ``ar`` so its smallest value maps to 0 and its largest to 1.

    ``ar`` must provide ``.min()`` / ``.max()`` and element-wise arithmetic
    (e.g. a NumPy array). The result has the same shape as ``ar``.
    """
    lowest = ar.min()
    value_range = ar.max() - lowest
    return (ar - lowest) / value_range

def zero_centered_scale(ar):
    """Map signed values onto [0, 1] with zero pinned at 0.5.

    Positive entries are min-max scaled (via ``scale``) into the upper half,
    negative entries are scaled by magnitude into the lower half, and entries
    equal to zero land exactly on 0.5.

    Parameters
    ----------
    ar : array-like of numbers
        One-dimensional values to rescale; converted with ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``ar``.
    """
    ar = np.asarray(ar)
    scores = np.zeros(len(ar))
    positive = ar > 0
    negative = ar < 0
    # Only scale a side that actually has entries: calling `scale` on an empty
    # selection would raise ValueError from `.min()` on a zero-size array.
    if positive.any():
        scores[positive] = scale(ar[positive])
    if negative.any():
        scores[negative] = -scale(-ar[negative])
    # Shift from [-1, 1] to [0, 1].
    return (scores + 1) / 2.

# Log of each term's total frequency, min-max scaled to [0, 1] for the x axis.
# A fresh frequency table is summed here: earlier cells add 'Cary Moon Score' and
# 'Jenny Durkan Score' columns to `term_freq_df`, and summing that frame row-wise
# would add those scores into the frequency totals.
frequencies_scaled = scale(np.log(corpus.get_term_freq_df().sum(axis=1).values))

In [59]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression whose coefficients for the 'Cary Moon' category
# are used as term scores.
l2_classifier = LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1)
scores = corpus.get_logreg_coefs('Cary Moon', l2_classifier)

# Rescale the signed coefficients to [0, 1] (zero maps to 0.5) for the y axis.
scores_scaled = zero_centered_scale(scores)

# Explorer with custom axes: x = scaled log frequency, y = scaled logistic-regression
# coefficient; the raw coefficients are passed as the term scores.
html = st.produce_scattertext_explorer(corpus,
                                    category='Cary Moon',
                                    category_name='Cary Moon Mayor',
                                    not_category_name='Jenny Durkan',
                                    minimum_term_frequency=5,
                                    width_in_pixels=1000,
                                    x_coords=frequencies_scaled,
                                    y_coords=scores_scaled,
                                    scores=scores,
                                    sort_by_dist=False,
                                    x_label='Log frequency',
                                    y_label='L2-Penalized Log Reg Coef')
file_name = 'L2vsLog.html'
# The context manager guarantees the file is flushed and closed before the IFrame
# below loads it, instead of relying on garbage collection of the file object.
with open(file_name, 'wb') as viz_file:
    viz_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)