# Applying Scattertext to All Sides Headline Roundup

In [5]:
# import modules
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML("<style>.container { width:98% !important; }</style>"))
nlp = spacy.load('en')

### Grab the All Sides media set and preview it

In [6]:
convention_df = pd.read_csv("/Users/meldye/Documents/Insight/news-corpus-df.csv")
convention_df.head()

Unnamed: 0.1,Unnamed: 0,date,main_headline,description,source,bias,headline,link,text,text_len
0,0,2018-06-13,b'May Jobs Report Sparks Debate on Who Deserve...,With the recent release of the May jobs report...,Washington Post,Left,b'How trend-riding Trump is taking credit for ...,https://www.washingtonpost.com/news/posteveryt...,"Jared Bernstein, a former chief economist to V...",5870
1,1,2018-06-13,b'May Jobs Report Sparks Debate on Who Deserve...,With the recent release of the May jobs report...,Wall Street Journal- Editorial,Right,b'It\xe2\x80\x99s Trump\xe2\x80\x99s Economy Now',https://www.wsj.com/articles/its-trumps-econom...,Liberals have opposed virtually every move Pre...,536
2,2,2018-06-13,b'May Jobs Report Sparks Debate on Who Deserve...,With the recent release of the May jobs report...,USA TODAY,Center,b'The Bubble: By undoing Obama accomplishments...,https://www.usatoday.com/story/news/politics/o...,CLOSE President Trump’s once bitter political ...,7123
3,4,2018-06-13,b'Michael Cohen Expected to Cooperate With Fed...,"Attorneys for Michael Cohen, President Trump's...",Wall Street Journal- News,Center,b'Trump Lawyer Michael Cohen\xe2\x80\x99s Atto...,https://www.wsj.com/articles/trump-lawyer-mich...,"The attorneys for Michael Cohen, President Don...",538
4,5,2018-06-13,b'Michael Cohen Expected to Cooperate With Fed...,"Attorneys for Michael Cohen, President Trump's...",Vox,Left,b'Reports suggest Michael Cohen is thinking of...,https://www.vox.com/2018/6/13/17458594/michael...,Longtime Trump lawyer Michael Cohen is changin...,6986


In [8]:
print("Document Count")
print(convention_df.groupby('bias')['text'].count())
print("Word Count")
convention_df.groupby('bias').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum())
convention_df['text'] = convention_df.text.apply(nlp)

Document Count
bias
Center      807
Left       1316
Right      1039
Name: text, dtype: int64
Word Count


### Turn it into a Scattertext corpus and have spaCy parse it.

In [118]:
corpus = st.CorpusFromParsedDocuments(convention_df, category_col='bias', parsed_col='text').build()

In [120]:
# remove stop words
stop_word_list = ['ad', '\'s rise', 'photo wait', 'or blog', 'blog', 'skip', 'b.', 'read or', 'or share', 'device', 'unsupported', 'unsupported on', 'your device', 'enlarge', 'autoplay', 'embed', 'copy this', 'unsubscribe', 'toggle', 'playback', '76', 'of 76', 's.', '37', 'is', 'are', 'and the', 'is the', 'to get', 'also', 'however', 'n’t', 'also said', 'for a', '’ve', '10', '1', 'r', '#', '’ve', 'it ’s', 'n’t', 'that ’s', 'around', 'around the', 'that they', 'and his', 'of his', '’m', 'i ’m', 'something', 'caption', 'post', 'view', '/ the', '’s', '’re', 'videos', 'replay more', 'read more', 'watch', 'replay', 'must watch', 'just watched', 'more videos', 'hide caption', '―', 'photos', 'hide', 'watched', 'cnn', 'washington post', 'told cnn', 'washington times', 'times llc', '_', '© 2018', 'click here', '©', 'click', 'here for', 'reprint permission', 'for reprint', 'reprint', 'llc', 'press contributed', 'permission', 'told fox', 'the associated', 'associated press', 'associated', 'copyright', 'fox news', 'mrs.', 'ms.', 'mr.', 'this report', 'contributed to', 'fox', 'contributed', 'to this', 'copyright ©', 'said mr.', 'advertisement', '2018', 'the washington', 'times', '&', 'follow']
stop_word_list = list(set(stop_word_list))
corpus=corpus.remove_terms(stop_word_list)
# corpus.get_stoplisted_unigram_corpus()

In [121]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df.head()

Unnamed: 0_level_0,Left freq,Right freq,Center freq
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
jared,77,29,39
bernstein,5,3,9
a,25512,13923,12429
former,1117,660,685
chief,430,267,282


In [122]:
term_freq_df = corpus.get_term_freq_df()
list(term_freq_df.columns.values)

['Left  freq', 'Right  freq', 'Center  freq']

In [123]:
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))

['trump', 'obama', 'comey', 'obamacare', 'twitter', 'tweeted', 'hillary', 'clinton', 'republicans', 'gop']


In [124]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Left Score'] = \
corpus.get_scaled_f_scores('Left ')
pprint(list(term_freq_df.sort_values(by='Left Score', ascending=False).index[0:25]))

['the us',
 'convention',
 'rise',
 'clinton ’s',
 'email',
 'king',
 'sign',
 'the campaign',
 'climate',
 'is that',
 'of trump',
 'attacks',
 'he had',
 'he ’s',
 'emails',
 'us',
 'clinton',
 'the democratic',
 'muslim',
 'to have',
 'his own',
 'continued',
 '11',
 'national convention',
 'united states']


In [125]:
term_freq_df['Right Score'] = \
corpus.get_scaled_f_scores('Right ')
pprint(list(term_freq_df.sort_values(by='Right Score', 
                                      ascending=False).index[0:25]))

['mr. trump',
 'mrs. clinton',
 'illegal',
 'president trump',
 'the u.s.',
 'press',
 'israel',
 'the media',
 'here',
 'free',
 'obamacare',
 'democrat',
 'taxes',
 'pence',
 'we are',
 'plan',
 'we will',
 'reports',
 'the american',
 'the obama',
 'and we',
 'plans',
 'of our',
 'american people',
 'tweeted']


In [126]:
term_freq_df.head()

Unnamed: 0_level_0,Left freq,Right freq,Center freq,Left Score,Right Score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jared,77,29,39,0.720772,0.28146
bernstein,5,3,9,0.457507,0.463126
a,25512,13923,12429,0.91225,0.111604
former,1117,660,685,0.124182,0.111371
chief,430,267,282,0.116896,0.114302


In [115]:
html = st.produce_scattertext_explorer(corpus,
                                       category='Left ',
                                       category_name='Left Leaning Media',
                                       not_category_name='Right Leaning Media',
                                       minimum_term_frequency=25,
                                       width_in_pixels=1000,
                                       transform=st.Scalers.log_scale_standardize)
file_name = 'output/AllSidesScattertextLog.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

html = produce_scattertext_explorer(corpus,
                                    category='Left ',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    metadata=convention_df['speaker'],
                                    term_significance = st.LogOddsRatioUninformativeDirichletPrior())
file_name = 'output/AllSidesScattertextLog.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [23]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['left_precision'] = term_freq_df['Left  freq'] * 1./(term_freq_df['Left  freq'] + term_freq_df['Right  freq'])
term_freq_df['left_freq_pct'] = term_freq_df['Left  freq'] * 1./term_freq_df['Left  freq'].sum()
term_freq_df['left_hmean'] = term_freq_df.apply(lambda x: (hmean([x['left_precision'], x['left_freq_pct']])
                                                                   if x['left_precision'] > 0 and x['left_freq_pct'] > 0 
                                                                   else 0), axis=1)                                                        
term_freq_df.sort_values(by='left_hmean', ascending=False).iloc[:50]                                                   

Unnamed: 0_level_0,Left freq,Right freq,Center freq,left_precision,left_freq_pct,left_hmean
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
the,65156,39152,32007,0.62465,0.029066,0.055548
to,33333,19293,16035,0.633394,0.01487,0.029058
of,27423,15101,12483,0.644883,0.012233,0.024011
a,25512,13923,12429,0.646938,0.011381,0.022368
and,25182,15096,11743,0.625205,0.011234,0.022071
in,21704,11671,10671,0.650307,0.009682,0.01908
that,17284,9158,7315,0.653657,0.00771,0.015241
trump,12010,6186,6223,0.660035,0.005358,0.010629
on,11036,6049,6180,0.645947,0.004923,0.009772
for,10334,6350,5359,0.619396,0.00461,0.009152


In [22]:
def normcdf(x):
    return norm.cdf(x, x.mean(), x.std())
term_freq_df['left_precision_normcdf'] = normcdf(term_freq_df['left_precision'])
term_freq_df['left_freq_pct_normcdf'] = normcdf(term_freq_df['left_freq_pct'])



#term_freq_df['left_scaled_f_score'] = hmean([term_freq_df['left_precision_normcdf'], term_freq_df['left_freq_pct_normcdf']])
term_freq_df.sort_values(by='left_precision_normcdf', ascending=False).iloc[:10]

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = (x >= self.b) & cond0


Unnamed: 0_level_0,Left freq,Right freq,Center freq,left_precision,left_freq_pct,left_hmean,left_precision_normcdf,left_freq_pct_normcdf
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
david becker,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
different policies,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
presidents nicolas,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
sarkozy and,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
and françois,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
hollande very,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
different orientations,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
orientations very,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
still we,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209
two consecutive,1,0,0,1.0,4.461028e-07,8.922051e-07,0.805784,0.492209


In [25]:
term_freq_df['left_precision_normcdf']

KeyError: 'left_precision_normcdf'