In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys
sys.path.append('/content/drive/MyDrive/Unbiased_news/python_modules')

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import numpy as np
import random
from collections import defaultdict 
from ast import literal_eval
from collections import Counter
import re
import unicodedata
from nlp_preprocessing import *
from topic_modeling import *
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS, CountVectorizer
import spacy
import pickle
from sklearn.metrics.pairwise import cosine_similarity

sp_nlp = spacy.load('en')

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_colwidth', None)

pd.reset_option('display.max_colwidth')

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Classification Ideas

Vader for sentiment - built on social media 
textblob sentiment - 
compound output for 

word complexity - count words longer than 5 letters or avg length
no. of words in document

! and ? in doc for sensationalism

spacy tagging on proper nouns. NER. 

NER on doc & occurance %

sentiment on headlines
NER for headlines and checking sentiment on those NER in the article

passive & active voice 



# Data Import

In [5]:
# df = pd.read_csv("Data/data_NLP_round1.csv")
df = pd.read_csv("/content/drive/MyDrive/Unbiased_news/Data/data_NLP_round1.csv")
df.head()

Unnamed: 0,number,global_bias,title,date,summary,link,news_title,news_source,news_link,bias,paras,authors,publish_date,text
0,5,From the Left,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",['The Trump Administration dropped plans to ad...,https://www.allsides.com/story/trump-administr...,Trump Responds After His Administration Drops ...,HuffPost,https://www.huffpost.com/entry/trump-citizensh...,Left,President Donald Trump spoke out Tuesday on hi...,"['Antonia Blumberg', 'Huffpost Us', 'Reporter']",2019-07-03 08:13:05+05:30,“A very sad time for America when the Supreme ...
1,5,From the Right,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",['The Trump Administration dropped plans to ad...,https://www.allsides.com/story/trump-administr...,Trump administration drops push for citizenshi...,Washington Times,https://www.washingtontimes.com/news/2019/jul/...,Lean Right,President Trump’s quest to add a citizenship q...,"['The Washington Times Http', 'Stephen Dinan']",2019-07-02 00:00:00,President Trump‘s quest to add a citizenship q...
2,15,From the Left,Iran to Surpass Uranium Enrichment Breaching N...,"July 7th, 2019","['On Sunday, Iranian officials said the countr...",https://www.allsides.com/story/iran-surpass-ur...,Iran Announces New Breach of Nuclear Deal Limi...,New York Times (News),https://www.nytimes.com/2019/07/07/world/middl...,Lean Left,Iran said on Sunday that within hours it would...,"['David D. Kirkpatrick', 'David E. Sanger']",2019-07-07 00:00:00,Iran said on Sunday that within hours it would...
3,15,From the Right,Iran to Surpass Uranium Enrichment Breaching N...,"July 7th, 2019","['On Sunday, Iranian officials said the countr...",https://www.allsides.com/story/iran-surpass-ur...,Iran raises uranium enrichment as nuclear deal...,Washington Times,https://www.washingtontimes.com/news/2019/jul/...,Lean Right,Iran announced Sunday it will raise its level ...,"['The Washington Times Http', 'Jon Gambrell', ...",2019-07-07 00:00:00,"TEHRAN, Iran — Iran announced Sunday it will r..."
4,25,From the Left,Social Media Summit Draws Wide Range of Coverage,"July 12th, 2019","[""The 'Social Media Summit' hosted by Presiden...",https://www.allsides.com/story/social-media-su...,Trump accuses social media companies of ‘terri...,Washington Post,https://www.washingtonpost.com/technology/2019...,Lean Left,"President Trump assailed Facebook, Google and ...","['Tony Romm', 'Senior Tech Policy Reporter']",2019-07-11 00:00:00,“Some of you are extraordinary. The crap you t...


Since each news article can contain slightly different unicode formatting, its best to convert everything to ascii format, to make it easier to work the data. All incomptabile characters will be converted or dropped. Since we are working with English, the hope is that a majority of the data is retained.
**But we can come to this later to see how much data is being dropped.**

In [6]:
# Ensuring everything is in ascii format and removing any wierd formatings.
df['text_ascii'] = df.text.map(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('ascii'))
df[['text','text_ascii']].sample()

Unnamed: 0,text,text_ascii
240,Judge Amy Coney Barrett flatly refused on Tues...,Judge Amy Coney Barrett flatly refused on Tues...


In [6]:

df.news_source.unique()

array(['HuffPost', 'Washington Times', 'New York Times (News)',
       'Washington Post', 'Fox News (Online News)'], dtype=object)

# Pre-processing to work on

1. Better cleaning process - Post lemma and pre lemma? what else??
1. Compound term extraction - incl. punctuation separated & space separated
1. Named entity extraction & linkage (eg: hong_kong vs hong kong)

# Breaking Into Sentences

Let's split by sentences.

In [8]:
def sent_split(article):
    sent_list2 = []
    sent_list = nltk.sent_tokenize(article)
    for sent in sent_list:
        sent_list2.extend(sent.split('\n\n'))
    return sent_list2
    

def simple_cleaning(text_sent):
    text_sent = text_sent.lower()

    if ((text_sent == 'ad') or (text_sent.find('click here') >= 0 ) or (text_sent.find('sign up here') >= 0 ) or
        (text_sent.find('sign up for daily') >= 0 ) or (text_sent.find('sign up for the') >= 0 ) or
        (text_sent.find('contributed to this') >= 0 ) or (text_sent.find('all rights reserved') > 0 ) or
        (text_sent.find('reported from') >= 0 ) or (text_sent.find('contributed reporting') >= 0 ) or
        (text_sent.find('want fox news') >= 0) or (text_sent == '') or
        (text_sent.find('the washington times, llc') >= 0) or (text_sent.find('sign up for our') >= 0) or
        (text_sent.find('daily to your inbox') >= 0)
       ): 
        return False
    elif len((re.sub('[^a-z\s]', '', text_sent)).split()) <= 5:
        return False
    else:
        return True

In [None]:
df.columns

Index(['number', 'global_bias', 'title', 'date', 'summary', 'link',
       'news_title', 'news_source', 'news_link', 'bias', 'paras', 'authors',
       'publish_date', 'text', 'text_ascii'],
      dtype='object')

In [10]:
df_sentences = df[['number','global_bias','title','date','news_title','news_link','bias','news_source','text_ascii']].copy(deep=True)

# # Splitting each para into a list of paras
df_sentences['text_sent_list'] = df_sentences.text_ascii.map(sent_split)

# # Exploding the paragraphs into a dataframe, where each row has a paragraph
df_sentences_col = pd.DataFrame(df_sentences.text_sent_list.explode())
df_sentences_col.rename(columns={'text_sent_list':'text_sent'}, inplace=True)

# # # Cleaning up some portions of the expansion
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent == 'AD')]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('click here', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('sign up here', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('sign up for daily', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('sign up for the', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('contributed to this', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('All rights reserved', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('reported from', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.str.contains('contributed reporting', case=False))]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.isna())]
# # df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent == '')]

df_sentences_col = df_sentences_col[df_sentences_col.text_sent.map(simple_cleaning)]
df_sentences_col = df_sentences_col[~(df_sentences_col.text_sent.isna())]

# # Joining the exploded dataframe back, so that other metadata can be associated with it
df_sentences = df_sentences.join(df_sentences_col, how='left').reset_index()
df_sentences.rename(columns={'index':'article'}, inplace=True)
df_sentences.drop(columns='text_sent_list', inplace=True)

# Dropping entire articles from left & right for which nothing got joined in the above statement.
article_nums_todrop = df_sentences[df_sentences.text_sent.isna()].number.tolist()
df_sentences = df_sentences[~(df_sentences.number.isin(article_nums_todrop))].reset_index(drop = True)

# getting paragraph numbering
df_sentences['text_count'] = df_sentences.groupby('article').cumcount()

del df_sentences_col

In [None]:
df_sentences.loc[df_sentences.text_sent.map(lambda x: len(x.split()) == 6), 'text_sent']

In [11]:
df_sentences.shape

(42012, 12)

In [None]:
df_sentences.head()

Unnamed: 0,article,number,global_bias,title,date,news_title,news_link,bias,news_source,text_ascii,text_sent,text_count
0,0,5,From the Left,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",Trump Responds After His Administration Drops ...,https://www.huffpost.com/entry/trump-citizensh...,Left,HuffPost,A very sad time for America when the Supreme C...,A very sad time for America when the Supreme C...,0
1,0,5,From the Left,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",Trump Responds After His Administration Drops ...,https://www.huffpost.com/entry/trump-citizensh...,Left,HuffPost,A very sad time for America when the Supreme C...,to be asked on the #2020 Census!,1
2,0,5,From the Left,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",Trump Responds After His Administration Drops ...,https://www.huffpost.com/entry/trump-citizensh...,Left,HuffPost,A very sad time for America when the Supreme C...,He added that he had asked his officials to do...,2
3,0,5,From the Left,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",Trump Responds After His Administration Drops ...,https://www.huffpost.com/entry/trump-citizensh...,Left,HuffPost,A very sad time for America when the Supreme C...,President Donald Trump spoke out Tuesday on hi...,3
4,0,5,From the Left,Trump Administration Drops Citizenship Questio...,"July 3rd, 2019",Trump Responds After His Administration Drops ...,https://www.huffpost.com/entry/trump-citizensh...,Left,HuffPost,A very sad time for America when the Supreme C...,A very sad time for America when the Supreme C...,4


In [None]:
df_sentences.to_csv("/content/drive/MyDrive/Unbiased_news/Data/sent_expanded_ready_for_modeling_v2.csv", index=False)

# Breaking Into Paras

Let's breakout each news article into paragraphs and expand this into a new dataframe.  
These paragraphs will be treated as individual documents that will be used to vectorize & topic model. Post which, for a given overall news headline, each paragraph from the left & right bias will be compared to see pair up paragraphs.

In [None]:
df_expanded = df[['number','global_bias','title','news_source','text_ascii']].copy(deep=True)

# Splitting each para into a list of paras
df_expanded['text_paras_list'] = df_expanded.text_ascii.str.split('\n\n')

# Exploding the paragraphs into a dataframe, where each row has a paragraph
df_expanded_col = pd.DataFrame(df_expanded.text_paras_list.explode())
df_expanded_col.rename(columns={'text_paras_list':'text_paras'}, inplace=True)

# Cleaning up some portions of the expansion
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras == 'AD')]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('click here', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('sign up here', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('sign up for daily', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('sign up for the', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('contributed to this', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('All rights reserved', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('reported from', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.str.contains('contributed reporting', case=False))]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras.isna())]
df_expanded_col = df_expanded_col[~(df_expanded_col.text_paras == '')]

# Joining the exploded dataframe back, so that other metadata can be associated with it
df_expanded = df_expanded.join(df_expanded_col, how='left').reset_index()
df_expanded.rename(columns={'index':'article'}, inplace=True)
df_expanded.drop(columns='text_paras_list', inplace=True)

#Dropping entire articles from left & right for which nothing got joined in the above statement.
article_nums_todrop = df_expanded[df_expanded.text_paras.isna()].number.tolist()
df_expanded = df_expanded[~(df_expanded.number.isin(article_nums_todrop))].reset_index()

# # getting paragraph numbering
df_expanded['para_count'] = df_expanded.groupby('article').cumcount()

# Pre-processing

## Lemmatization

Lemmatizing first helps preserve as much meaning of the word as possible, while separating out punctuation as needed. It also preserves entity names.  
**Only need to link compound words somehow**

In [12]:
%%time

df_sentences['text_sent_lemma'] = df_sentences.text_sent.map(spacy_lemmatization)
df_sentences[['text_sent', 'text_sent_lemma']].sample(2)

# df_expanded['text_paras_lemma'] = df_expanded.text_paras.map(spacy_lemmatization)
# df_expanded[['text_paras', 'text_paras_lemma']].sample(2)

CPU times: user 8min, sys: 666 ms, total: 8min 1s
Wall time: 8min 1s


In [None]:
pd.set_option('display.max_colwidth', None)
print(df_expanded.sample()[['text_paras','text_paras_lemma']])
pd.reset_option('display.max_colwidth')

                                                                                                                                                                                                                                                                                                                                         text_paras  \
7124  Google didn't immediately respond to a request for comment, but the company has said its competitive edge comes from offering a product that billions of people choose to use each day. Alphabet's shares opened Tuesday up roughly 1%, ahead of the broader market, after The Wall Street Journal first reported news of the impending suit.   

                                                                                                                                                                                                                                                                                                                         

## Misc Cleaning

Misc. cleaning of the documents. Currently this involves just removing email addresses, website links & any non-alphanumeric characters.

In [13]:
df_sentences['text_sent_misc_clean'] = df_sentences.text_sent_lemma.map(cleaning)
df_sentences[['text_sent_lemma','text_sent_misc_clean']].sample(2)

# df_expanded['text_paras_misc_clean'] = df_expanded.text_paras_lemma.map(cleaning)
# df_expanded[['text_paras_lemma','text_paras_misc_clean']].sample(2)

Unnamed: 0,text_sent_lemma,text_sent_misc_clean
14053,"know how that turn out , Mr. Carnegie .",know how that turn out mr carnegie
9946,would certainly hope that all member of this ...,would certainly hope that all member of this ...


In [None]:
pd.set_option('display.max_colwidth', None)
print(df_expanded.loc[18300,['text_paras','text_paras_misc_clean']])
pd.reset_option('display.max_colwidth')

text_paras               All the components of the "H-bomb" were "homemade," so North Korea could produce "powerful nuclear weapons as many as it wants," the KCNA quoted Kim as saying.
text_paras_misc_clean              all the component of the  h  bomb  be  homemade   so North Korea could produce  powerful nuclear weapon as many as  want   the KCNA quote Kim as say 
Name: 18300, dtype: object


In [None]:
pd.set_option('display.max_colwidth', None)
print(df_expanded.sample()[['text_paras','text_paras_misc_clean']])
pd.reset_option('display.max_colwidth')

                                                                                                                                                                                                                                                                                                text_paras  \
1916   Ms. Noel and Mr. Thomas were charged with conspiracy to defraud the United States and with making false records. They both surrendered to the F.B.I. on Tuesday morning and pleaded not guilty at a hearing in Federal District Court in Manhattan in the afternoon. Bail was set at $100,000 each.   

                                                                                                                                                                                                                                                                 text_paras_misc_clean  
1916    Ms Noel and Mr Thomas be charge with conspiracy to defraud the United States and with make false record   

## Remove Stop-words

Apart from using SK Learn's stop words list, we add additional words that have to be removed from the corpus.  
The additional words are identified using an interative process of topic modeling and reviewing the top words showing up.


In [14]:
%%time

custom_stop_words = ['ad', 'advertisement', '000', 'mr', 'ms', 'said', 'going', 'dont', 'think', 'know', 'want', 'like', 'im', 'thats', 'told', \
                     'lot', 'hes', 'really', 'say', 'added', 'come', 'great','newsletter','daily','sign','app',\
                    'click','app','inbox', 'latest', 'jr','everybody','`']

df_sentences['text_sent_stopwords'] = df_sentences.text_sent_misc_clean.map(lambda x: remove_stopwords(x, custom_words=custom_stop_words))

# df_expanded['text_paras_stopwords'] = df_expanded.text_paras_misc_clean.map(lambda x: remove_stopwords(x, custom_words=custom_stop_words))

# df_expanded['text_paras_stopwords'] = df_expanded.text_paras_stopwords.map(lambda x: remove_stopwords(x, remove_words_list = [], \
#                                                                                                      custom_words = custom_stop_words))
# df_expanded[['text_paras_lemma','text_paras_stopwords']].sample(2)

df_sentences[['text_sent_misc_clean','text_sent_stopwords']].sample(2)

CPU times: user 1h 13min 28s, sys: 795 ms, total: 1h 13min 29s
Wall time: 1h 13min 33s


In [None]:
pd.set_option('display.max_colwidth', None)
print(df_expanded.sample()[['text_paras','text_paras_stopwords']])
pd.reset_option('display.max_colwidth')

                                                                                                                                                                                                                                                                                                                                     text_paras  \
8882  Alexis told the police, according to their report, that he was cleaning his gun when it just went off. "He said that he was trying to clean his gun while cooking and that his hands were slippery," the report states. He told the responding officer that he was taking the gun apart when his hands slipped and he pulled the trigger.   

                                                                                                                                        text_paras_stopwords  
8882  Alexis tell police accord report clean gun go try clean gun cook hand slippery report state tell respond officer take gun apart hand slip pull trigger  


## Remove Small Words

All words less than 3 characters seem to not add much value. Hence, they shall be removed; unless its a number.

In [15]:
from nltk.tokenize import word_tokenize

def remove_small_words(text, length = 2):
    """
    Removes words smaller than a certain length, unless its a digit.
    """
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    tokens = [token for token in tokens if (len(token) > length) or (token.isdigit())]
    return ' '.join(tokens)

In [19]:
df_sentences['text_sent_no_small_words'] = df_sentences.text_sent_stopwords.map(remove_small_words)
df_sentences[['text_sent_misc_clean','text_sent_no_small_words']].sample(2)

# df_expanded['text_paras_no_small_words'] = df_expanded.text_paras_stopwords.map(remove_small_words)
# df_expanded[['text_paras_misc_clean','text_paras_no_small_words']].sample(2)

Unnamed: 0,text_sent_misc_clean,text_sent_no_small_words
12567,come with law that tell people to stand grou...,law tell people stand ground law tell people s...
32534,the program can call welfare be actually very...,program call welfare actually small comparison...


In [None]:
pd.set_option('display.max_colwidth', None)
print(df_expanded.sample()[['text_paras','text_paras_no_small_words']])
pd.reset_option('display.max_colwidth')

In [20]:
df_sentences['text_final'] = df_sentences['text_sent_no_small_words'].fillna(value=' ')

# df_expanded['text_final'] = df_expanded['text_paras_no_small_words'].fillna(value=' ')

In [21]:
df_sentences.sample(5)

Unnamed: 0,article,number,global_bias,title,date,news_title,news_link,bias,news_source,text_ascii,text_sent,text_count,text_sent_lemma,text_sent_misc_clean,text_sent_stopwords,text_sent_no_small_words,text_final
21020,540,1897,From the Left,Supreme Court Vacancy Fight,"February 15th, 2016",Why blocking Obama’s pick to replace Scalia co...,https://www.washingtonpost.com/news/powerpost/...,Lean Left,Washington Post,"AD\n\nLast night in Las Vegas, for example, Hi...",There are several Senate Democrats who fit tha...,83,there be several Senate Democrats who fit that...,there be several senate democrats who fit that...,several senate democrats fit description inclu...,several senate democrats fit description inclu...,several senate democrats fit description inclu...
1886,51,192,From the Right,House Democrats To Subpoena White House For Do...,"October 2nd, 2019",House Democrats to subpoena White House in Ukr...,https://www.washingtontimes.com/news/2019/oct/...,Lean Right,Washington Times,House Democrats are threatening to subpoena th...,"Over the past several weeks, the committees tr...",2,"over the past several week , the committee try...",over the past several week the committee try ...,past several week committee try several time o...,past several week committee try several time o...,past several week committee try several time o...
39740,1020,3333,From the Left,Herman Cain Nominated to the Fed,"April 5th, 2019",Trump’s next possible Fed nominee can’t unders...,https://www.washingtonpost.com/opinions/herman...,Lean Left,Washington Post,Cain would be Trumps second proposed addition ...,I wonder: What might presidential candidate Ca...,38,wonder : what may presidential candidate Cain...,wonder what may presidential candidate cain ...,wonder may presidential candidate cain possibl...,wonder may presidential candidate cain possibl...,wonder may presidential candidate cain possibl...
40241,1036,3373,From the Left,Mueller and House Judiciary Committee Reach 'T...,"May 5th, 2019",House Democrat says Mueller and Judiciary Comm...,https://www.washingtonpost.com/politics/house-...,Lean Left,Washington Post,"In the report, Muellers team wrote that while ...","In late March, Mueller wrote a letter to Barr ...",19,"in late March , Mueller write a letter to Barr...",in late march mueller write a letter to barr ...,late march mueller write letter barr voice dis...,late march mueller write letter barr voice dis...,late march mueller write letter barr voice dis...
21556,548,1904,From the Left,Dem Primary in SC,"February 27th, 2016",South Carolina Primary Will Test Hillary Clint...,http://www.nytimes.com/2016/02/28/us/politics/...,Lean Left,New York Times (News),"COLUMBIA, S.C. Drawing overwhelming support f...",While exit polls showed Mrs. Clinton and Mr. S...,42,while exit poll show Mrs. Clinton and Mr. Sand...,while exit poll show mrs clinton and mr sander...,exit poll show mrs clinton sanders run evenly ...,exit poll show mrs clinton sanders run evenly ...,exit poll show mrs clinton sanders run evenly ...


In [23]:
df_sentences.to_csv("/content/drive/MyDrive/Unbiased_news/Data/sent_expanded_ready_for_modeling.csv", index=False)
# df_expanded.to_csv('Data/paras_expanded_ready_for_modeling.csv', index=False)

In [None]:
df_expanded.columns

Index(['article', 'number', 'global_bias', 'title', 'news_source',
       'text_ascii', 'text_paras', 'para_count', 'text_paras_lemma',
       'text_paras_misc_clean', 'text_paras_stopwords',
       'text_paras_no_small_words', 'text_final'],
      dtype='object')

**Below, we are assigning last pre-processed column to a 'text_final' column so that downstream functions dont have to be changed. Just the below code to indicate the final column as needed.**

In [None]:
df_expanded.shape

(24959, 14)

In [None]:
# df_expanded.to_csv("/content/drive/MyDrive/Unbiased_news/Data/paras_expanded_ready_for_modeling.csv", index=False)

df_expanded = pd.read_csv('Data/paras_expanded_ready_for_modeling.csv')

In [None]:
# with open("/content/drive/MyDrive/Unbiased_news/Data/tfidf_vectorizer.pickle", 'wb') as model_file:
#     pickle.dump(review_word_matrix_tfidf, model_file)

# Vectorizer & Topic Modeling

In [None]:
%%time

params = {'stop_words':'english','min_df': 10, 'max_df': 0.5, 'ngram_range':(1, 1),}

cv = CountVectorizer(**params)
review_word_matrix_cv = cv.fit_transform(df_expanded['text_final'])
review_vocab_cv = cv.get_feature_names()

lda_cv, score_cv, topic_matrix_cv, word_matrix_cv = lda_topic_modeling(review_word_matrix_cv, vocab = review_vocab_cv, n = 100)

iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100
iteration: 31 of max_iter: 100
iteration: 32 of max_iter: 100
iteration: 33 of 

In [None]:
%%time

params = {'stop_words':'english','min_df': 10, 'max_df': 0.5, 'ngram_range':(1, 1),}

tfidf = TfidfVectorizer(**params)
review_word_matrix_tfidf = tfidf.fit_transform(df_expanded['text_final'])
review_vocab_tfidf = tfidf.get_feature_names()

lda_tfidf, score_tfidf, topic_matrix_tfidf, word_matrix_tfidf = lda_topic_modeling(review_word_matrix_tfidf, vocab = review_vocab_tfidf, n = 200)

iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100
iteration: 25 of max_iter: 100
iteration: 26 of max_iter: 100
iteration: 27 of max_iter: 100
iteration: 28 of max_iter: 100
iteration: 29 of max_iter: 100
iteration: 30 of max_iter: 100
iteration: 31 of max_iter: 100
iteration: 32 of max_iter: 100
iteration: 33 of 

### Exploring The Topic Models

Let's take a look at the topic model to see what we've got.

Looking at the top words for each topics, there are a number of filler words which we could remove to make the topics a lot more senseful. Additionally, all numbers except for years can be removed too. Lastly, a way needs to be identified for detecting compound words, especially names of places, like Hong Kong, North America etc 

In [None]:
import pickle

with open("/content/drive/MyDrive/Unbiased_news/Data/lda_100_topics_model.pickle", 'wb') as model_file:
    pickle.dump(lda_tfidf, model_file)

# Pairing the Articles

## LDA with 100 Topics & Count Vectorizer

In [None]:
df_expanded_100_cv_topics = df_expanded.join(topic_matrix_cv)

In [None]:
pd.set_option('display.max_colwidth', None)

# article numbers - 3474, 5
one_topic = df_expanded_100_cv_topics[df_expanded_100_cv_topics.number == 3474].dropna(subset=['text_final'])
left_article = one_topic[one_topic.global_bias == 'From the Left']
right_article = one_topic[one_topic.global_bias == 'From the Right']
left_article_len = len(left_article)
right_article_len = len(right_article)

smaller_article,bigger_article = (left_article,right_article) if left_article_len < right_article_len else (right_article,left_article)
counter = 1

for index, row in smaller_article.iterrows():
        
    X = bigger_article.loc[:,'topic_0':'topic_19']
    y = row.loc['topic_0':'topic_19'].values.reshape(1,-1)  
    
    similarity_scores = cosine_similarity(X,y).flatten()
    indices = np.argsort(similarity_scores)
    
    print(f"*** Para {counter} *** ")
    print(row['text_paras'])
    print(similarity_scores[indices[-1:-4:-1]])
    print(bigger_article.iloc[indices[-1:-4:-1]].loc[:,'text_paras'])
    
    index_to_drop = bigger_article.index[indices[-1]]
    bigger_article.drop(index = index_to_drop, inplace = True)
    
    print('\n')
    
    counter += 1
pd.reset_option('display.max_colwidth')

*** Para 1 *** 
With wide grins and a historic handshake, President Trump became the first sitting U.S. leader to set foot in North Korea when he took 20 steps into the Hermit Kingdom on Sunday.
[1. 1. 1.]
24940                                                                                                                                                                              Ivanka Trump called the visit surreal.
24902    The meeting came four months after the second summit between the two leaders broke down in Hanoi. Trump has argued that the summit was a success because his relationship with the North Korean leader deepened.
24935                               At the beginning, there was a lot of anger between myself and Kim Jong Un, Trump said. Something happened. There was a point at which it happened, and all of a sudden you get along.
Name: text_paras, dtype: object


*** Para 2 *** 
Trump shook hands with North Korean leader Kim Jong Un as he crossed the low stone curb se

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

24937                                                                                                                            He began to realize his only chance is not solving the problem but managing the problem, he said. He began to move the goal posts.
24927    Moon greeted Kim at the border after Trump, but did not join the two men for their private talks. North Koreas Foreign Ministry said last week that it did not want Seoul mediating, and told South Korean authorities to mind their own business at home.
24929                                                                                                            We are in a much different place than we were two and a half years ago, he said. He said it was insulting that the news media could say otherwise.
Name: text_paras, dtype: object


*** Para 13 *** 
Trump says he told Kim that, at the right time, youre going to come over and that that could be any time he wants to do it. He added that he would certainly extend the i

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

## LDA with 200 Topics

In [None]:
df_expanded.shape

(27875, 13)

In [None]:
# df_expanded_20_topics = pd.read_csv("Data/paras_expanded_with_topics.csv")

df_expanded_200_topics = pd.read_csv("Data/paras_expanded_200_topics.csv")

In [None]:
pd.set_option('display.max_colwidth', None)

# article numbers - 3474, 5
one_topic = df_expanded_200_topics[df_expanded_200_topics.number == 5].dropna(subset=['text_final'])
left_article = one_topic[one_topic.global_bias == 'From the Left']
right_article = one_topic[one_topic.global_bias == 'From the Right']
left_article_len = len(left_article)
right_article_len = len(right_article)

smaller_article,bigger_article = (left_article,right_article) if left_article_len < right_article_len else (right_article,left_article)
counter = 1

for index, row in smaller_article.iterrows():
        
    X = bigger_article.loc[:,'topic_0':'topic_19']
    y = row.loc['topic_0':'topic_19'].values.reshape(1,-1)  
    
    similarity_scores = cosine_similarity(X,y).flatten()
    indices = np.argsort(similarity_scores)
    
    print(f"*** Para {counter} *** ")
    print(row['text_paras'])
    print(similarity_scores[indices[-1:-4:-1]])
    print(bigger_article.iloc[indices[-1:-4:-1]].loc[:,'text_paras'])
    
    index_to_drop = bigger_article.index[indices[-1]]
    bigger_article.drop(index = index_to_drop, inplace = True)
    
    print('\n')
    
    counter += 1
pd.reset_option('display.max_colwidth')

*** Para 1 *** 
A very sad time for America when the Supreme Court of the United States wont allow a question of Is this person a Citizen of the United States? to be asked on the #2020 Census! the president wrote on Twitter. He added that he had asked his officials to do whatever is necessary to bring the citizenship question to a successful conclusion in the future.
[1.         0.99999985 0.99999865]
30                                                                                                                        Three district court judges ruled that Mr. Ross ignored those findings and cut too many other corners in his zeal to get the question approved in time.
21                                                                                                       The citizenship question had become a microcosm of the broader debate over Mr. Trumps approach to racial and immigration matters, and both sides turned up the rhetoric.
25    The Trump administrations politically mot

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

## LDA with 100 Topics

In [None]:
df_expanded_100_topics = pd.read_csv("Data/paras_expanded_100_topics.csv")

In [None]:
pd.set_option('display.max_colwidth', None)

one_topic = df_expanded_100_topics[df_expanded_100_topics.number == 3474].dropna(subset=['text_final'])
left_article = one_topic[one_topic.global_bias == 'From the Left']
right_article = one_topic[one_topic.global_bias == 'From the Right']
left_article_len = len(left_article)
right_article_len = len(right_article)

smaller_article,bigger_article = (left_article,right_article) if left_article_len < right_article_len else (right_article,left_article)
counter = 1

for index, row in smaller_article.iterrows():
        
    X = bigger_article.loc[:,'topic_0':'topic_99']
    y = row.loc['topic_0':'topic_99'].values.reshape(1,-1)  
    
    similarity_scores = cosine_similarity(X,y).flatten()
    indices = np.argsort(similarity_scores)
    
    print(f"*** Para {counter} *** ")
    print(row['text_paras'])
    print(similarity_scores[indices[-1:-4:-1]])
    print(bigger_article.iloc[indices[-1:-4:-1]].loc[:,'text_paras'])
    
    index_to_drop = bigger_article.index[indices[-1]]
    bigger_article.drop(index = index_to_drop, inplace = True)
    
    print('\n')
    
    counter += 1
pd.reset_option('display.max_colwidth')

*** Para 1 *** 
With wide grins and a historic handshake, President Trump became the first sitting U.S. leader to set foot in North Korea when he took 20 steps into the Hermit Kingdom on Sunday.
[0.95824952 0.93834497 0.93456814]
27824                                                                                                                                             Moon also spoke of the meeting as a historic moment.
27827    Trump broadcast his offer to meet Kim at the border in a tweet Saturday from the Group of 20 summit in Osaka, Japan. A senior North Korean official responded that the offer was interesting.
27842                                                                     Trump said the DMZ had been very dangerous, but was now much less so since his first summit with Kim in Singapore last June.
Name: text_paras, dtype: object


*** Para 2 *** 
Trump shook hands with North Korean leader Kim Jong Un as he crossed the low stone curb separating the North and the South 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i


Talks between the U.S. and North Korea had mostly broken down since the Hanoi summit, which ended without a deal. North Korea has hesitated at Trump's insistence that it give up its nuclear ambitions before it sees relief from crushing international sanctions.
[0.81099935 0.78796818 0.77037507]
27803                                                                                                                                                                                                                                                                                                 The meeting came four months after the second summit between the two leaders broke down in Hanoi. Trump has argued that the summit was a success because his relationship with the North Korean leader deepened.
27812                                                                                                                                                                                                   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i



*** Para 15 *** 
Inside the "Freedom House" on the South Korean side of the zone, Trump and Kim were joined by the president's daughter and son-in-law, Ivanka Trump and Jared Kushner.
[0.97927998 0.65374744 0.58240298]
27853                                                                                                                                                                                                                                                                                                                                                                                    While Trump met with Kim, his daughter and son-in-law, Ivanka Trump and Jared Kushner, also crossed briefly into North Korea.
27854                                                                                                                                                                                                                                                                               

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

## Top2Vec Modeling

In [None]:
top2vec_model = Top2Vec.load("Data/top2vec_deep_learn_model")

In [None]:
top2vec_model.get_num_topics()

210

In [None]:
topic_sizes, topic_nums = top2vec_model.get_topic_sizes()

In [None]:
df_expanded['text_paras'] = df_expanded.text_paras.fillna(value=' ')
# df_expanded_top2vec = df_expanded.join(pd.DataFrame(columns=topic_nums, index=df_expanded.index).add_prefix("topic_"))
# df_expanded_top2vec.loc[:,'topic_0':'topic_209'] = 0

# del df_expanded_no_nulls

In [None]:
help(top2vec_model)

Help on Top2Vec in module top2vec.Top2Vec object:

class Top2Vec(builtins.object)
 |  Top2Vec(documents, min_count=50, embedding_model='doc2vec', embedding_model_path=None, speed='learn', use_corpus_file=False, document_ids=None, keep_documents=True, workers=None, tokenizer=None, verbose=True)
 |  
 |  Top2Vec
 |  
 |  Creates jointly embedded topic, document and word vectors.
 |  
 |  
 |  Parameters
 |  ----------
 |  embedding_model: string
 |      This will determine which model is used to generate the document and
 |      word embeddings. The valid string options are:
 |  
 |          * doc2vec
 |          * universal-sentence-encoder
 |          * universal-sentence-encoder-multilingual
 |          * distiluse-base-multilingual-cased
 |  
 |      For large data sets and data sets with very unique vocabulary doc2vec
 |      could produce better results. This will train a doc2vec model from
 |      scratch. This method is language agnostic. However multiple languages
 |      will n

In [None]:
top2vec_model_deeplearn = Top2Vec(documents = df_expanded.text_paras.tolist(), speed='deep-learn', document_ids = df_expanded.index.tolist(), workers = 4, )

2020-11-30 20:06:34,853 - top2vec - INFO - Pre-processing documents for training
2020-11-30 20:06:48,915 - top2vec - INFO - Creating joint document/word embedding
2020-11-30 21:09:31,573 - top2vec - INFO - Creating lower dimension embedding of documents
2020-11-30 21:11:21,254 - top2vec - INFO - Finding dense areas of documents
2020-11-30 21:11:37,745 - top2vec - INFO - Finding topics


In [None]:
# top2vec_model_deeplearn_sent_enc = Top2Vec(documents = df_expanded.text_paras.tolist(), embedding_model='universal-sentence-encoder', 
#                                   speed='deep-learn', document_ids = df_expanded.index.tolist(), workers = 4, )

In [None]:
topic_sizes, topic_nums = top2vec_model_deeplearn.get_topic_sizes()
top2vec_model_deeplearn.get_num_topics()

220

In [None]:
df_expanded_top2vec_deep_learn = df_expanded.join(pd.DataFrame(columns=topic_nums, index=df_expanded.index).add_prefix("topic_"))
df_expanded_top2vec_deep_learn.loc[:,'topic_0':'topic_219'] = 0

In [None]:
documents, document_scores, document_ids = top2vec_model.search_documents_by_topic(topic_num=0, num_docs=15)
# documents

In [None]:
df_expanded_top2vec_deep_learn.loc[document_ids,'topic_0']

6090     0.672063
6060     0.664864
5994     0.628002
16563    0.593664
25221    0.568913
6039     0.556387
11985    0.552070
4678     0.549856
14399    0.547515
2352     0.546929
14397    0.542140
7744     0.534553
572      0.534332
4763     0.531643
14364    0.527492
Name: topic_0, dtype: float64

In [None]:
for i in topic_nums:
    topic = 'topic_' + str(i)
    documents, document_scores, document_ids = top2vec_model_deeplearn.search_documents_by_topic(topic_num=i, num_docs=topic_sizes[i])
    df_expanded_top2vec_deep_learn.loc[document_ids,topic] = document_scores

In [None]:
pd.set_option('display.max_colwidth', None)

one_topic = df_expanded_top2vec_deep_learn[df_expanded_top2vec_deep_learn.number == 3474].dropna(subset=['text_final'])
left_article = one_topic[one_topic.global_bias == 'From the Left']
right_article = one_topic[one_topic.global_bias == 'From the Right']
left_article_len = len(left_article)
right_article_len = len(right_article)

smaller_article,bigger_article = (left_article,right_article) if left_article_len < right_article_len else (right_article,left_article)
counter = 1

for index, row in smaller_article.iterrows():
        
    X = bigger_article.loc[:,'topic_0':'topic_99']
    y = row.loc['topic_0':'topic_99'].values.reshape(1,-1)  
    
    similarity_scores = cosine_similarity(X,y).flatten()
    indices = np.argsort(similarity_scores)
    
    print(f"*** Para {counter} *** ")
    print(row['text_paras'])
    print(similarity_scores[indices[-1:-4:-1]])
    print(bigger_article.iloc[indices[-1:-4:-1]].loc[:,'text_paras'])
    
    index_to_drop = bigger_article.index[indices[-1]]
    bigger_article.drop(index = index_to_drop, inplace = True)
    
    print('\n')
    
    counter += 1
pd.reset_option('display.max_colwidth')

*** Para 1 *** 
With wide grins and a historic handshake, President Trump became the first sitting U.S. leader to set foot in North Korea when he took 20 steps into the Hermit Kingdom on Sunday.
[1. 1. 1.]
27827    Trump broadcast his offer to meet Kim at the border in a tweet Saturday from the Group of 20 summit in Osaka, Japan. A senior North Korean official responded that the offer was interesting.
27842                                                                     Trump said the DMZ had been very dangerous, but was now much less so since his first summit with Kim in Singapore last June.
27816                                                                                                                                                         Earlier, Trump also spoke warmly of Kim.
Name: text_paras, dtype: object


*** Para 2 *** 
Trump shook hands with North Korean leader Kim Jong Un as he crossed the low stone curb separating the North and the South at around 3:45 p.m. loca

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i


[1. 1. 1.]
27852    As journalists attempted to gain access to the previously unannounced sit-down between Trump and Kim. new White House press secretary Stephanie Grisham was bruised in a scuffle between North Korean security and members of the U.S. press pool, the Associated Press reported. The Secret Service intervened as North Korean guards pushed and shoved American reporters to block them from entering the Inter-Korean House of Freedom, according to the AP. The incident was partially captured on video.
27842                                                                                                                                                                                                                                                                                                                                                                                     Trump said the DMZ had been very dangerous, but was now much less so since his first summit with Kim in Sin

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i


Pope Francis praised the meeting on Sunday. According to Reuters, he said he prays "such a significant gesture will be a further step on the road to peace, not only on that peninsula, but for the good of the entire world."
[0. 0. 0.]
27847    Critics say Trump actually inflamed tensions dangerously in the first months of his presidency. Now, some warn, he has gone so far the other way that he is rapidly legitimizing North Korea as a nuclear weapons state and letting Kim off the hook for massive human rights violations in one of the most repressive regimes on the planet.
27846                                                                                                                                                                         In fact, North Korea has tested short-range ballistic missiles since the Hanoi summit. The Pentagon says contacts with Pyongyang over the return of remains have ceased.
27806                                                                          

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
