In [91]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict 
from ast import literal_eval
from collections import Counter
import re
import unicodedata
from nlp_preprocessing import *
from topic_modeling import *
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS, CountVectorizer

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', None)

# pd.reset_option('display.max_colwidth')

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Read the round-1 NLP dataset (relative path, resolved against the working directory).
df_nlp_round1 = pd.read_csv(filepath_or_buffer="../Data/data_NLP_round1.csv")

# Preview the first five rows.
df_nlp_round1.head(n=5)

Unnamed: 0,number,global_bias,title,date,summary,link,news_title,news_source,news_link,bias,paras,authors,publish_date,text
0,5,From the Left,Trump Administration Drops Citizenship Question From Census,"July 3rd, 2019","['The Trump Administration dropped plans to add a citizenship question to the 2020 census, after...",https://www.allsides.com/story/trump-administration-drops-citizenship-question-census,Trump Responds After His Administration Drops Bid For Citizenship Question On 2020 Census,HuffPost,https://www.huffpost.com/entry/trump-citizenship-question-2020-census_n_5d1bd769e4b082e553718d6b,Left,President Donald Trump spoke out Tuesday on his administration’s decision not to add a citizensh...,"['Antonia Blumberg', 'Huffpost Us', 'Reporter']",2019-07-03 08:13:05+05:30,“A very sad time for America when the Supreme Court of the United States won’t allow a question ...
1,5,From the Right,Trump Administration Drops Citizenship Question From Census,"July 3rd, 2019","['The Trump Administration dropped plans to add a citizenship question to the 2020 census, after...",https://www.allsides.com/story/trump-administration-drops-citizenship-question-census,Trump administration drops push for citizenship question on 2020 census,Washington Times,https://www.washingtontimes.com/news/2019/jul/2/trump-drops-push-citizenship-question-2020-census/,Lean Right,"President Trump’s quest to add a citizenship question to the 2020 census ended Tuesday, with the...","['The Washington Times Http', 'Stephen Dinan']",2019-07-02 00:00:00,"President Trump‘s quest to add a citizenship question to the 2020 census ended Tuesday, with the..."
2,15,From the Left,Iran to Surpass Uranium Enrichment Breaching Nuclear Agreement,"July 7th, 2019","['On Sunday, Iranian officials said the country will exceed the limits set in the 2015 nuclear d...",https://www.allsides.com/story/iran-surpass-uranium-enrichment-breaching-nuclear-agreement,"Iran Announces New Breach of Nuclear Deal Limits, and Threatens Further Violations",New York Times (News),https://www.nytimes.com/2019/07/07/world/middleeast/iran-nuclear-limits-breach.html,Lean Left,Iran said on Sunday that within hours it would breach the limits on uranium enrichment set four ...,"['David D. Kirkpatrick', 'David E. Sanger']",2019-07-07 00:00:00,Iran said on Sunday that within hours it would breach the limits on uranium enrichment set four ...
3,15,From the Right,Iran to Surpass Uranium Enrichment Breaching Nuclear Agreement,"July 7th, 2019","['On Sunday, Iranian officials said the country will exceed the limits set in the 2015 nuclear d...",https://www.allsides.com/story/iran-surpass-uranium-enrichment-breaching-nuclear-agreement,Iran raises uranium enrichment as nuclear deal unravels,Washington Times,https://www.washingtontimes.com/news/2019/jul/7/iran-raises-uranium-enrichment-nuclear-deal-unrave/,Lean Right,"Iran announced Sunday it will raise its level of uranium enrichment, breaking another limit of i...","['The Washington Times Http', 'Jon Gambrell', 'Nasser Karimi']",2019-07-07 00:00:00,"TEHRAN, Iran — Iran announced Sunday it will raise its level of uranium enrichment, breaking ano..."
4,25,From the Left,Social Media Summit Draws Wide Range of Coverage,"July 12th, 2019","[""The 'Social Media Summit' hosted by President Donald Trump at the White House on Thursday made...",https://www.allsides.com/story/social-media-summit-draws-wide-range-coverage,Trump accuses social media companies of ‘terrible bias’ at White House summit decried by critics,Washington Post,https://www.washingtonpost.com/technology/2019/07/11/we-will-not-let-them-get-away-with-it-trump...,Lean Left,"President Trump assailed Facebook, Google and Twitter on Thursday — accusing them of exhibiting ...","['Tony Romm', 'Senior Tech Policy Reporter']",2019-07-11 00:00:00,"“Some of you are extraordinary. The crap you think of is unbelievable,” Trump said.\n\nAD\n\nTru..."


Since each news article can contain slightly different Unicode formatting, it's best to convert everything to ASCII to make the data easier to work with. All incompatible characters will be converted or dropped. Since we are working with English, the hope is that a majority of the data is retained.
**But we can come back to this later to see how much data is being dropped.**

In [5]:
# Normalise every article to plain ASCII: NFKD-decompose first, then drop any character
# that still has no ASCII representation.
# The vectorised .str accessor is used instead of a per-row lambda so that missing
# (NaN) articles propagate as NaN rather than raising a TypeError inside the lambda.
df_nlp_round1['text_ascii'] = (
    df_nlp_round1['text']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('ascii')
)

# Spot-check one random article, original vs. ASCII version.
df_nlp_round1[['text','text_ascii']].sample()

Unnamed: 0,text,text_ascii
830,OTTAWA — The security fences are coming down. And the world leaders have jetted off.\n\nBut for ...,OTTAWA The security fences are coming down. And the world leaders have jetted off.\n\nBut for P...


### Breaking Into Paras

Let's break out each news article into paragraphs and expand this into a new dataframe.  
These paragraphs will be treated as individual documents that will be vectorized and topic modeled. After that, for a given overall news headline, each paragraph from the left and right bias will be compared in order to pair up paragraphs.

In [37]:
# Start from an independent copy of the article-level metadata and attach, per article,
# the list of paragraphs obtained by splitting the ASCII text on blank lines.
df_expanded = (
    df_nlp_round1[['number','global_bias','title','news_source','text_ascii']]
    .copy(deep=True)
    .assign(text_paras_list=lambda articles: articles.text_ascii.str.split('\n\n'))
)

# One row per paragraph; the index still carries the source article's row label.
df_expanded_col = df_expanded.text_paras_list.explode().to_frame(name='text_paras')

# Index-join the paragraphs back onto the metadata (repeating each article's row once
# per paragraph), expose the article label as a column and discard the list column.
df_expanded = (
    df_expanded
    .join(df_expanded_col)
    .reset_index()
    .rename(columns={'index': 'article'})
    .drop(columns='text_paras_list')
)

# Zero-based position of each paragraph within its article.
df_expanded = df_expanded.assign(para_count=df_expanded.groupby('article').cumcount())

In [92]:
# Apply the project's `cleaning` helper to every paragraph.
df_expanded['text_paras_clean'] = df_expanded['text_paras'].map(cleaning)

# Spot-check two random paragraphs, raw vs. cleaned.
df_expanded[['text_paras','text_paras_clean']].sample(n=2)

# Article-level variant, kept for reference:
# df_nlp_round1['text_clean'] = df_nlp_round1.text_ascii.map(cleaning)
# df_nlp_round1[['text','text_clean']].sample()

Unnamed: 0,text_paras,text_paras_clean
9657,"But the president is expected to focus on economic fairness issues, such as the minimum wage, as part of his push to reverse increasing income disparity in America, which he calls the defining issue of our time.",but the president is expected to focus on economic fairness issues such as the minimum wage as part of his push to reverse increasing income disparity in america which he calls the defining issue of our time
15161,AD,ad


In [105]:
%%time
# df_expanded['text_paras_lemma'] = df_expanded.text_paras_clean.map(spacy_lemmatization)
df_expanded[['text_paras_clean','text_paras_lemma']].sample(2)

# df_nlp_round1['text_lemma'] = df_nlp_round1.text_clean.map(spacy_lemmatization)
# df_nlp_round1[['text','text_lemma']].sample()

Wall time: 17 ms


Unnamed: 0,text_paras_clean,text_paras_lemma
22327,if not him then mr mccabe or other f b i officials interviewing with mr trump for the job could perhaps wear a wire or otherwise record the president mr rosenstein offered white house officials never checked his phone when he arrived for meetings there mr rosenstein added implying it would be easy to secretly record mr trump,if not then mr mccabe or other f b i official interview with mr trump for the job could perhaps wear a wire or otherwise record the president mr rosenstein offer white house official never check phone when arrive for meeting there mr rosenstein add imply would be easy to secretly record mr trump
10107,to win big on family feud two members of a family have to combine to get 200 points so when tim sass scored an epic 182 points in the final round of his familys appearance on the long running game show he must have felt pretty confident that his team would walk away with $20 000 but he didnt remember the political adage you are never as far up as you think you are when his relative anna stepped up to the plate she got exactly zero points for example no one in the survey though that throw up was a good answer for something a stomach did the sass family did walk away with $5 per point and $910 aint too shabby,to win big on family feud two member of a family have to combine to get 200 point so when tim sass score an epic 182 point in the final round of familys appearance on the long running game show must have feel pretty confident that team would walk away with $ 20 000 but do not remember the political adage be never as far up as think be when relative anna step up to the plate get exactly zero point for example no one in the survey though that throw up be a good answer for something a stomach do the sass family do walk away with $ 5 per point and $ 910 be not too shabby


In [33]:
%%time
custom_words = ['ad','advertisment']

df_expanded['text_paras_stopwords'] = df_expanded.text_paras_clean.map(lambda x: remove_stopwords(x, custom_words=custom_words))
df_expanded[['text_paras_lemma','text_paras_stopwords']].sample(2)

# df_nlp_round1['text_stopwords'] = df_nlp_round1.text_clean.map(remove_stopwords)
# df_nlp_round1[['text','text_stopwords']].sample()

Wall time: 35.4 s


Unnamed: 0,text,text_stopwords
187,A federal appeals court on Wednesday ordered a lower court to allow the case against former Nati...,federal appeals court wednesday ordered lower court allow case former national security adviser ...


In [35]:
# Expose the stopword-stripped article text under the name 'text_final'.
# NOTE(review): no active code in the visible part of this notebook creates
# df_nlp_round1['text_stopwords'] (only a commented-out line does) - confirm this cell
# still runs on a fresh kernel.
df_nlp_round1['text_final'] = df_nlp_round1.loc[:, 'text_stopwords']

In [36]:
%%time

params = {'stop_words':'english','min_df': 10, 'max_df': 0.5, 'ngram_range':(1, 1),}

tfidf = TfidfVectorizer(**params)
review_word_matrix_tfidf = tfidf.fit_transform(df_nlp_round1['text_stopwords'])
review_vocab_tfidf = tfidf.get_feature_names()

lda_tfidf, score_tfidf, topic_matrix_tfidf, word_matrix_tfidf = lda_topic_modeling(review_word_matrix_tfidf, vocab = review_vocab_tfidf, n = 20)

iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100
iteration: 31 of max_iter: 100
iteration: 32 of max_iter: 100
iteration: 33 of 