In [872]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from scraping import *
import random
from collections import defaultdict 
from ast import literal_eval
from collections import Counter
from newspaper import Article
import re

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", None)
pd.set_option('display.max_colwidth', 80)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Base Links

All Sides labels articles as stories when it addresses a news topic with 3 news sources linked (mostly from left, center, right).   
We pass in the sitemap urls and use custom defined functions to get a list of all the links containing stories.
Stories can be identified by looking at the url for a '/story/' component.

In [None]:
sitemap_url = ['https://www.allsides.com/sitemap.xml?page=1',\
                 'https://www.allsides.com/sitemap.xml?page=2']

story_links = spy.allsides_sitemap_story_parser(sitemap_url)

In [None]:
story_links[10:20]

### Scrape each story link

Now that we have a list containing links to all the stories (as on 11-20-2020), we can scrape every individual story page to get the story title, data, summary and the news sources under them.

In [306]:
stories_df = allsides_story_parser(story_links, filename = "../Data/news_while_scraping_save.csv", verbose = 25)

25 https://www.allsides.com/story/ross-perot-former-independent-presidential-candidate-dead-89 Ross Perot, Former Independent Presidential Candidate, Dead at 89 July 9th, 2019
50 https://www.allsides.com/story/bernie-sanders-defends-staff-wages-after-complaints Bernie Sanders Defends Staff Wages After Complaints July 21st, 2019
75 https://www.allsides.com/story/joe-biden-swings-back-challengers Joe Biden Swings Back At Challengers August 1st, 2019
100 https://www.allsides.com/story/hong-kong-protesters-clash-police-airport Hong Kong: Protesters Clash With Police at Airport August 13th, 2019
125 https://www.allsides.com/story/facebook-bans-ads-epoch-times Facebook Bans Ads From The Epoch Times August 24th, 2019
150 https://www.allsides.com/story/pro-eu-conservatives-vote-against-johnson-parliament Pro-EU Conservatives Vote Against Johnson in Parliament September 3rd, 2019
175 https://www.allsides.com/story/doj-rejects-mccabes-appeal-avoid-charges DOJ Rejects McCabe's Appeal to Avoid Cha

In [308]:
stories_df.to_csv("../Data/allsides_stories.csv", index=False)

### Exploring the News Sources

Now that we have scrapped all the 'story' articles, we can explore to answer a few questions:
1. Are there always 3 news sources?
1. How often has the category not included one from Left, Right & Center?
1. Best way to store this data and taking steps to do so.

In [309]:
stories_df.head()

Unnamed: 0,title,date,summary,news_sources,link
0,The Latest Polls,"September 19th, 2012",[],[{'news_title': '8 takeaways from the NBC-Wall...,https://www.allsides.com/story/latest-polls
1,Japan Restarts Commercial Whaling After Decades,"July 1st, 2019","[After three decades, Japan has resumed its co...",[{'news_title': 'Japan resumes commercial whal...,https://www.allsides.com/story/japan-restarts-...
2,Would a Wealth Tax Curb Income Inequality?,"July 1st, 2019",[Massachusetts Senator and 2020 presidential h...,[{'news_title': 'Would a significant increase ...,https://www.allsides.com/story/would-wealth-ta...
3,Tanks And Flyovers Are Planned For July 4th In...,"July 2nd, 2019",[Battle tanks will be on display and fighter j...,[{'news_title': 'Tanks for Trump's July 4th Ce...,https://www.allsides.com/story/tanks-and-flyov...
4,Series of Opinions on Biden vs. Harris Debate,"July 2nd, 2019","[Since the democratic debate on June 27th, Kam...",[{'news_title': 'Desegregating schools without...,https://www.allsides.com/story/series-opinions...


#### Part 1 - 3 Sources or not

In [313]:
stories_df.news_sources.map(len).describe()

count    5201.000000
mean        2.938089
std         0.413257
min         0.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         3.000000
Name: news_sources, dtype: float64

Looks like there are a few cases where the number of stories are less than 3. 
Let's view these.

In [330]:
stories_df[stories_df.news_sources.map(len) == 2].shape

(27, 5)

In [353]:
stories_df[stories_df.news_sources.map(len) == 2].sample(5)

Unnamed: 0,title,date,summary,news_sources,link
4159,A Debate on Trump’s Health Insurance Deregulation,"February 28th, 2018",[POINT/COUNTERPOINT: An Urban Institute (media...,[{'news_title': 'Study: Trump’s Obamacare Sabo...,https://www.allsides.com/story/debate-trump%E2...
2959,Analysis of Obama Speech on Terror,"December 7th, 2015",[],[{'news_title': 'Obama's not-so-peppy pep talk...,https://www.allsides.com/story/analysis-obama-...
1647,OP-ED: Boston & Islam in US,"April 25th, 2013",[],[{'news_title': 'Boston bombing just the begin...,https://www.allsides.com/story/boston-islam-us
3425,Campaign Finance,"August 1st, 2012",[],[{'news_title': 'Barack Obama 2012 Fundraising...,https://www.allsides.com/story/campaign-finance
2910,Romney's Tenure at Bain,"July 15th, 2012",[],[{'news_title': 'Obama to Romney: 'We won't be...,https://www.allsides.com/story/romneys-tenure-...


In [322]:
stories_df.loc[4061,'link']

'https://www.allsides.com/story/new-years-resolution-talk-other'

In [332]:
stories_df[stories_df.news_sources.map(len) == 1].shape

(2, 5)

In [333]:
stories_df[stories_df.news_sources.map(len) == 1]

Unnamed: 0,title,date,summary,news_sources,link
2791,SC House Approves Confederate Flag Removal,"July 9th, 2015",[],[{'news_title': 'South Carolina Approves Remov...,https://www.allsides.com/story/sc-house-approv...
3013,Putin Probably Approved Former Spy's Murder,"January 21st, 2016",[],[{'news_title': 'Putin implicated in fatal poi...,https://www.allsides.com/story/putin-probably-...


In [331]:
stories_df[stories_df.news_sources.map(len) == 0].shape

(97, 5)

In [328]:
stories_df[stories_df.news_sources.map(len) == 0].sample(5)

Unnamed: 0,title,date,summary,news_sources,link
2939,France Confirms Death of Attacks Ringleader,"November 19th, 2015",[],[],https://www.allsides.com/story/france-confirms...
3050,Clinton and Sanders Clash at Milwaukee Debate,"February 12th, 2016",[],[],https://www.allsides.com/story/clinton-and-san...
2519,Is Utah the next Silicon Valley,"February 3rd, 2015",[],[],https://www.allsides.com/story/utah-next-silic...
3261,Should Clinton Be Denied Classified Information?,"July 7th, 2016",[],[],https://www.allsides.com/story/should-clinton-...
2481,Romney Run in 2016?,"January 13th, 2015",[],[],https://www.allsides.com/story/will-romney-run...


Based on examining the above data, we can conclude the following:
1. None of the entries have more than 3 news sources
1. There are 27 entries with only 2 news sources, 2 entries with 1 news source and 97 entries with no news source at all.
1. Although majority of the entries with less than 3 news sources are older, there are some newer entries also. 

For ease of analysis downstream, only entries with 3 news sources will be kept. They can merit further investigation in the near future if necessary.

In [354]:
stories_filter1_df = stories_df[stories_df.news_sources.map(len) == 3].reset_index(drop=True).copy(deep=True)

In [358]:
stories_filter1_df.shape

(5075, 5)

In [355]:
stories_filter1_df.sample(2)

Unnamed: 0,title,date,summary,news_sources,link
57,Robert Mueller Testifies That He Did Not Clear...,"July 24th, 2019",[Former Special Counsel Robert Mueller is test...,[{'news_title': 'Read Robert Mueller’s opening...,https://www.allsides.com/story/robert-mueller-...
4920,Trump Visits Southern Border Deploying More Ag...,"April 7th, 2019",[After President Trump visited the U.S border ...,[{'news_title': 'Trump warns of 'traffic' and ...,https://www.allsides.com/story/trump-visits-so...


#### Part 2 - Center, Left, and Right Sources

The examine the news sources, we can expand them into dataframes, concatenate to the original df and then count the occurances for each of the 'global bias' available for each row.

In [356]:
news_source_df0 = pd.json_normalize(stories_filter1_df.news_sources.map(lambda x: x[0])).add_suffix('_0')
news_source_df1 = pd.json_normalize(stories_filter1_df.news_sources.map(lambda x: x[1])).add_suffix('_1')
news_source_df2 = pd.json_normalize(stories_filter1_df.news_sources.map(lambda x: x[2])).add_suffix('_2')

In [435]:
stories_filter2_df = pd.concat([stories_filter1_df,news_source_df0, news_source_df1, news_source_df2], axis = 1)

del news_source_df0
del news_source_df1
del news_source_df2

stories_filter2_df.sample(4)

Unnamed: 0,title,date,summary,news_sources,link,news_title_0,news_source_0,news_link_0,global_bias_0,bias_0,...,news_link_1,global_bias_1,bias_1,paras_1,news_title_2,news_source_2,news_link_2,global_bias_2,bias_2,paras_2
3629,Senate Healthcare Debacle,"June 28th, 2017",[The GOP Senate's attempt at healthcare reform...,[{'news_title': 'Why is the GOP so terrible at...,https://www.allsides.com/story/senate-healthca...,Why is the GOP so terrible at health care?,Salon,http://www.salon.com/2017/06/28/why-is-the-gop...,From the Left,Left,...,http://www.nationalreview.com/article/449019/s...,From the Right,Right,Republicans looked at a health-care law that w...,Why Republicans' health-care debacle will haun...,The Week - News,http://theweek.com/articles/708643/why-republi...,From the Center,Center,The July 4 recess is a time when members of Co...
767,How the US is Moving to Re-open the Economy,"April 13th, 2020",[Debate persists over re-opening the U.S. econ...,[{'news_title': 'Trump says he’ll reopen the e...,https://www.allsides.com/story/how-us-moving-r...,Trump says he’ll reopen the economy “based on ...,Vox,https://www.vox.com/2020/4/12/21218336/coronav...,From the Left,Left,...,https://thehill.com/homenews/administration/49...,From the Center,Center,One of Trump administration's top advisers on ...,We're Working Closely with Governors to Open t...,Townhall,https://townhall.com/tipsheet/katiepavlich/202...,From the Right,Right,President Trump reiterated Monday morning he i...
2583,Video of SC Shooting Reignites Debate,"April 9th, 2015",[],[{'news_title': 'South Carolina officer who sh...,https://www.allsides.com/story/video-sc-shooti...,South Carolina officer who shot black man was ...,Fox News (Online News),http://www.foxnews.com/us/2015/04/09/south-car...,From the Right,Lean Right,...,http://www.cnn.com/2015/04/09/us/north-charles...,From the Left,Lean Left,"In both cases, a white police officer kills an...",Video of Walter Scott Shooting Reignites Debat...,New York Times (News),http://www.nytimes.com/2015/04/09/us/video-of-...,From the Left,Lean Left,Nothing has done more to fuel the national deb...
892,Trump Deems Places of Worship Essential,"May 23rd, 2020",[President Donald Trump said at a press briefi...,[{'news_title': 'Trump stokes base with call t...,https://www.allsides.com/story/trump-deems-pla...,Trump stokes base with call to reopen churches...,CNN (Web News),https://www.cnn.com/2020/05/23/politics/trump-...,From the Left,Lean Left,...,https://www.bloomberg.com/news/articles/2020-0...,From the Left,Lean Left,President Donald Trump demanded on Friday that...,Trump announces that houses of worship are ‘es...,Fox News (Online News),https://www.foxnews.com/politics/trump-announc...,From the Right,Lean Right,President Trump on Friday announced that new C...


Let's see which rows have an article from left, right and center and how many have atleast one article from left and one from right.

In [448]:
all_3_bias = {'From the Center','From the Right', 'From the Left'}
left_right_bias = {'From the Left','From the Right'}
bias_columns = ['global_bias_0','global_bias_1','global_bias_2']

In [447]:
stories_filter2_df['all_3_bias'] = stories_filter2_df[bias_columns].\
                        apply(lambda x: set(x).issuperset(all_3_bias), axis=1)

stories_filter2_df['left_right_bias'] = stories_filter2_df[bias_columns].\
                        apply(lambda x: set(x).issuperset(left_right_bias), axis=1)

In [455]:
stories_filter2_df.all_3_bias.sum()

3477

In [456]:
stories_filter2_df.left_right_bias.sum()

4931

Since there are atleast 3477 entries with all three sources, we shall drop the rest of the rows for further analysis.  
We can always come back to add the entries that have both left & right entries atleast.

In [458]:
stories_filter3_df = stories_filter2_df[stories_filter2_df.all_3_bias].reset_index(drop=True)
stories_filter3_df.sample(2)

Unnamed: 0,title,date,summary,news_sources,link,news_title_0,news_source_0,news_link_0,global_bias_0,bias_0,...,bias_1,paras_1,news_title_2,news_source_2,news_link_2,global_bias_2,bias_2,paras_2,all_3_bias,left_right_bias
1351,Delay of Health Care Mandates,"July 18th, 2013",[],[{'news_title': 'House votes to delay two heal...,https://www.allsides.com/story/delay-health-ca...,House votes to delay two health care mandates:...,Christian Science Monitor,http://www.csmonitor.com/USA/Latest-News-Wires...,From the Center,Center,...,Lean Left,"Defying a veto threat from President Obama, th...",House votes for delays on Obamacare; some Demo...,Washington Times,http://www.washingtontimes.com/news/2013/jul/1...,From the Right,Lean Right,OPINION The House voted Wednesday to delay man...,True,True
157,New York Times Apologizes For A Tweet About Ne...,"September 16th, 2019",[The New York Times apologized late Saturday a...,[{'news_title': 'New York Times apologizes for...,https://www.allsides.com/story/new-york-times-...,New York Times apologizes for ‘inappropriate a...,Washington Post,https://beta.washingtonpost.com/nation/2019/09...,From the Left,Lean Left,...,Center,President Trump on Monday hammered the media a...,NYT updates Kavanaugh 'bombshell' to note accu...,Fox News (Online News),https://www.foxnews.com/politics/nyt-kavanaugh...,From the Right,Lean Right,The New York Times suddenly made a major revis...,True,True


We have now cleaned the dataframe to only include rows that an article from the left, right, and center.

#### Part 3 - Best way to store the Data

Since we have an article from each of the three biases now, we can capture the data in an alternate way - by flattening the dataframe to only have one news article per row.  
By having just one document (article) per row, it will help in better querying, scraping & cleaning of the dataframe. 
But to ensure that the group of 3 is identifiable together, the new dataframe will have the same index repeated three times. To allow for unique indexing, we will ultimately make a multi-level index, with the original index PLUS the 'global bias' value.  
By maintaining this form of indexing, we can easily pivot the dataframe later, based on needs.

In [616]:
stories_flat_df.index.names

FrozenList(['number', 'global_bias'])

In [617]:
columns_for_new_df = ['title', 'date', 'summary', 'news_sources', 'link', 'news_title', 'news_source', 'news_link', \
                       'global_bias', 'bias', 'paras']

temp_df_0 = stories_filter3_df[['title', 'date', 'summary', 'news_sources', 'link', 'news_title_0', 'news_source_0', 'news_link_0', \
           'global_bias_0', 'bias_0', 'paras_0']]
temp_df_0.columns = columns_for_new_df

temp_df_1 = stories_filter3_df[['title', 'date', 'summary', 'news_sources', 'link', 'news_title_1', 'news_source_1', 'news_link_1', \
           'global_bias_1', 'bias_1', 'paras_1']]
temp_df_1.columns = columns_for_new_df

temp_df_2 = stories_filter3_df[['title', 'date', 'summary', 'news_sources', 'link', 'news_title_2', 'news_source_2', 'news_link_2', \
           'global_bias_2', 'bias_2', 'paras_2']]
temp_df_2.columns = columns_for_new_df

stories_flat_df = pd.concat([temp_df_0,temp_df_1,temp_df_2]).sort_index()

#creating a multiindex
stories_flat_df = stories_flat_df.set_index(keys=[stories_flat_df.index,'global_bias']).sort_index()
stories_flat_df.index.names = ['number','global_bias']

del temp_df_0
del temp_df_1
del temp_df_2 

stories_flat_df.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,title,date,summary,news_sources,link,news_title,news_source,news_link,bias,paras
number,global_bias,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
16,From the Center,Justice Department Changing Legal Team On Cens...,"July 8th, 2019",[The Justice Department is changing lawyers wh...,[{'news_title': 'Justice Dept. to Replace Lawy...,https://www.allsides.com/story/justice-departm...,Department of Justice hands 2020 census-relate...,USA TODAY,https://www.usatoday.com/story/news/politics/2...,Center,The Department of Justice announced Sunday tha...
2567,From the Left,White House Fires Top Tillerson Aide,"March 13th, 2018",[The White House has terminated Steve Goldstei...,[{'news_title': 'Top Tillerson aide fired afte...,https://www.allsides.com/story/white-house-fir...,State Dept.: Tillerson Found Out He Was Fired ...,Daily Beast,https://www.thedailybeast.com/report-tillerson...,Left,Rex Tillerson did not talk to Donald Trump bef...


In [619]:
stories_flat_df.to_csv("../Data/filtered_flat_articles.csv")

### Querying the News Sources

There are multiple different news sources that have been used by allsides.com.  
It would be prudent to see which are the most common ones and perhaps only scrape the complete articles from those.

In [598]:
stories_flat_df.news_source.nunique()

249

In [600]:
stories_flat_df.news_source.value_counts().head(20).sum()

7936

In [601]:
stories_flat_df.news_source.value_counts().head(20)

Fox News (Online News)        1030
Wall Street Journal (News)     659
Washington Post                622
New York Times (News)          610
The Hill                       535
Washington Times               533
USA TODAY                      516
NPR (Online News)              474
HuffPost                       389
Reuters                        359
Washington Examiner            325
CNN (Web News)                 316
Townhall                       277
BBC News                       226
Newsmax (News)                 200
Vox                            190
Politico                       189
Associated Press               179
National Review                159
New York Post (News)           148
Name: news_source, dtype: int64

Looks there are 249 news outlets, and the top 20 make up almost 80% of all entries.
We can start by focusing on just scraping these outlets and come back to add the rest perhaps.   
   
But first, let's ensure these 20 are capturing enough sets of 3 in terms of bias across the corpus.

In [624]:
top_20_news_outlets = stories_flat_df.news_source.value_counts().head(20).index.tolist()

(stories_flat_df[stories_flat_df.news_source.isin(top_20_news_outlets)].groupby(level='number').size() == 3).sum()

1635

We have 1635 entries with all three global bias present. This should be good enough to get started with at present. In the future, this can be revisited to add more news outlet.

**Now, let's get to the querying finally!**

In [689]:
# creating columns to store the scraped articles
stories_flat_df['authors'] = None
stories_flat_df['publish_date'] = None
stories_flat_df['text'] = None

We will first use 'newspaper3k' to get as many news articles as possible. Post that, we can manually work on scraping the rest of the top 20 news sites identified above.

In [693]:
stories_flat_df = newspaper3k_articles(stories_flat_df)

Error retrieving article from https://www.forbes.com/sites/nataliewexler/2019/06/30/what-kamala-harris-and-joe-biden-should-be-talking-about-instead-of-busing/
Error retrieving article from https://www.wsj.com/articles/nike-nixes-betsy-ross-flag-sneaker-after-colin-kaepernick-intervenes-11562024126
Error retrieving article from https://www.wsj.com/articles/u-s-to-drop-citizenship-question-from-census-11562101086
Error retrieving article from https://www.wsj.com/articles/trump-and-critics-prepare-for-july-4-show-of-a-lifetime-11562232601
Error retrieving article from http://online.wsj.com/article/SB10000872396390444620104578006290719533504.html
Error retrieving article from http://www.cnn.com/2012/09/19/us/us-fast-furious-report/index.html
Error retrieving article from https://www.wsj.com/articles/u-s-job-creation-bounced-back-in-june-11562330095
Error retrieving article from https://www.foxbusiness.com/economy/june-jobs-report-2019
Error retrieving article from https://www.wsj.com/arti

In [None]:
stories_flat_df.sample(15)

In [698]:
# stories_flat_df.to_csv("../Data/stories_news_scraped_v1.csv")

In [1009]:
# stories_flat_df = pd.read_csv("../Data/stories_news_scraped_v1.csv", index_col=[0,1])
# stories_flat_df

### Exploring the Queried News 

First, let's see what did not get scrapped properly.

In [714]:
stories_flat_df[stories_flat_df.text.isna()].news_source.value_counts().sort_values(ascending=False)

Wall Street Journal (News)    657
Fox News (Online News)        617
TheBlaze.com                   81
ABC News (Online)              79
Newsweek                       44
                             ... 
Heather Mac Donald              1
Fox News Latino                 1
The Root                        1
National Review                 1
William McGurn                  1
Name: news_source, Length: 73, dtype: int64

Since there are quite a few outlets here, let's explore just the top 20 that we listed earlier.

In [729]:
top_20 = stories_flat_df.news_source.value_counts().head(20).index.tolist()

mask_top_20 = stories_flat_df[stories_flat_df.news_source.isin(top_20)]

(mask_top_20[mask_top_20.text.isna()].news_source.value_counts()/mask_top_20.news_source.value_counts()).sort_values(ascending=False)
# stories_flat_df[mask_top_20.text.isna()].news_source.value_counts().sort_values(ascending=False)

Wall Street Journal (News)    0.996965
Fox News (Online News)        0.599029
Townhall                      0.068592
Washington Times              0.039400
NPR (Online News)             0.027426
Washington Examiner           0.024615
Washington Post               0.022508
New York Times (News)         0.016393
Politico                      0.015873
Reuters                       0.013928
HuffPost                      0.010283
CNN (Web News)                0.009494
The Hill                      0.007477
National Review               0.006289
USA TODAY                     0.005814
Associated Press              0.005587
Vox                           0.005263
Newsmax (News)                0.005000
BBC News                           NaN
New York Post (News)               NaN
Name: news_source, dtype: float64

Looks like we definitely need to **custom scrape Wall Street Journal**.    
The others need to be looked at more specifically, as it wouldn't make sense why some articles came through for each and others did not. One possible theory is that the links do not work any longer.    
While exploring each news outlet, we can also see if the ones that did get scrapped/parsed, or not.

#### Wall Street Journal

In [891]:
news_outlet = 'Wall Street Journal (News)'

In [934]:
stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())]

Unnamed: 0_level_0,Unnamed: 1_level_0,title,date,summary,link,news_title,news_source,news_link,bias,paras,authors,publish_date,text,wsj_result
number,global_bias,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1611,From the Center,House Approves Keystone,"November 15th, 2014",[],https://www.allsides.com/story/house-approves-keystone,House Passes Bill to Approve Keystone XL Pipeline,Wall Street Journal (News),http://www.huffingtonpost.com/2014/11/15/obamacare-sign-ups_n_6163178.html,Center,The second sign-up season under President Barack Obama's health overhaul ope...,"['Julie R. Thomson', 'Kristen Aiken', 'Caroline Bologna', 'Olivia Christense...",2014-11-15 00:00:00,"""As much as I’m a strong Republican and I love my party, it’s the country th...",-1
1894,From the Center,Trump and Sanders Win in NH,"February 10th, 2016",[],https://www.allsides.com/story/trump-and-sanders-win-nh,How Trump and Sanders Broadened Their Bases in New Hampshire,Wall Street Journal (News),http://graphics.wsj.com/elections/2016/new-hampshire-demographics-base/,Center,Voters in the nation's first primary delivered resounding victories to Donal...,['Wsj.Com News Graphics'],,Voters in the nation's first primary delivered resounding victories to Donal...,16


The two articles that did get scrapped were not done properly. One link does not belong to WSJ, and the other looks like a one-off usage of a different sub-domain of WSJ.  
It is more time efficient to delete these entries entirely.

In [1010]:
# stories_flat_df = stories_flat_df.drop(index=([1611,1894]))
# stories_flat_df.reset_index().set_index(keys=['number','global_bias'])
# stories_flat_df.to_csv("../Data/stories_news_scraped_v2.csv")

Let's confirm if there are any other entries where the **'news source'** and actual link do not belong to each other

In [943]:
temp_df = stories_flat_df.copy(deep=True)

temp_df['wsj_result'] = temp_df.news_link.map(lambda x: x.find('wsj.com'))

temp_df[(temp_df.news_source == news_outlet) & (temp_df.wsj_result == -1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,title,date,summary,link,news_title,news_source,news_link,bias,paras,authors,publish_date,text,wsj_result
number,global_bias,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


Nope, looks like we are good. Must have been a one-off mistake.

In [944]:
del temp_df

In [945]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())].sample()
print(sample_row['news_link'].values[0])
print(sample_row['text'].values[0])

http://www.wsj.com/articles/poll-suggests-yes-has-edge-in-pivotal-greek-referendum-1435915236
nan


In [957]:
url = sample_row['news_link'].values[0]
# url += ".html"
# soup = general_scraper(url)

In [959]:
with open('../wsj_password.txt', 'r') as file:
    password = file.read()

session = requests.Session()

# Create the payload
login = {'username':'navish.16@gmail.com', 
          'password':password
         }
wsj_url = 'https://accounts.wsj.com/login?'

# Post the payload to the site to log in
s = session.post(wsj_url, data=login)
s
# Navigate to the next page and scrape the data
# s = session.get('https://www.chess.com/today')

<Response [200]>

In [None]:
requests.get(url)

WSJ is turning out to be a big pain to scrape. Will come back to it in the future if possible

#### Fox News

In [745]:
news_outlet = 'Fox News (Online News)'

Let's first explore the entries where the scraping worked!

In [883]:
stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())]['text'].sample().values

array(['President Donald Trump signed an executive order Wednesday paving the way for an overhaul of the way kidney disease, which affects 30 million Americans, is treated in the U.S.\n\nThe president\'s move would lead to sweeping changes in treatment as well as prevention, including improving access to dialysis treatment in the home and enabling people with failing kidneys to have opportunities sooner to get a transplant.\n\n"This is a first, second and third step, it’s more than just a first step,” said Trump just before signing the executive order. “We’re going to come up with solutions over a period of five years and 10 years, that I think most people won’t believe.”\n\nSome of the initiatives will require new government regulations.\n\nAnd because a severe organ shortage complicates the call for more transplants, the administration also aims to ease financial hardships for living donors, said the officials, who spoke on the condition of anonymity ahead of the announcement.\n\nSTE

Looks like Fox News has mostly gotten the text correctly (where it did work).
But it also failed to remove ad tags for other news as well the concluding line talking about the app & the contributions.
In particular, the following could be cleaned - **CLICK HERE TO GET THE FOX NEWS APP\n\n** and **\n\nFox News\' x contributed to this report.**

In [776]:
stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())]['news_link'].sample().values

array(['http://www.foxnews.com/politics/2018/08/28/trump-threatens-google-over-alleged-biased-search-results.html'],
      dtype=object)

In [779]:
stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())]['news_link'].sample().values

array(['https://www.foxnews.com/politics/trump-on-fox-friends-i-will-not-fire-kellyanne-conway-after-watchdog-rebuke'],
      dtype=object)

Looks like a lot of the articles that did not return results had their URL changed in the following ways:   
1. By removing the year/month/date portion and removing .html at the end. Let's modify these URLs and rerun the scraping code.
1. By removing the .html at the end of the link
1. By removing a / at the end.

Not all links will still work, but these are the only systemic changes.

In [1011]:
#  Using a custom defined function to make the url changes
stories_flat_df['news_link'] = stories_flat_df.news_link.map(fox_news_url_cleaner, na_action='ignore')

In [1012]:
stories_flat_df['news_link'].sample(5)

number  global_bias    
1735    From the Right     http://www.foxnews.com/politics/top-us-lawmakers-strike-deal-to-fast-track-t...
1285    From the Center    http://www.npr.org/blogs/itsallpolitics/2013/04/17/177640184/immigration-pro...
1821    From the Center    http://www.npr.org/sections/thetwo-way/2015/09/03/437206335/hungarys-premier...
183     From the Center    https://thehill.com/regulation/lobbying/463385-senate-democrats-find-top-nra...
2880    From the Center    http://thehill.com/homenews/administration/403773-top-student-loan-official-...
Name: news_link, dtype: object

In [1013]:
# stories_flat_df.to_csv("../Data/stories_news_scraped_v2.csv")

#### Washington Post

In [1016]:
news_outlet = 'Washington Post'

Lets explore the articles that were scrapped first.

In [989]:
stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()['text'].values[0]

'It also suggested that Johnson’s bumbling exterior conceals a ruthless tactician, ready to exploit any opening to bring about the split from the E.U. and consolidate power.\n\nAD\n\nAbsent a delay, Britain will leave the European Union on Oct. 31.\n\nAD\n\nIf it leaves without a transition deal, analysts say, the country could face food and fuel shortages. The economic turmoil could spread to the E.U. nations, which collectively are the United States’ biggest trading partner. Many observers fear that with a hardened border, fresh violence could flare in Northern Ireland. And because President Trump has embraced Brexit and Johnson, the break with Europe would become a major test of the White House’s skepticism of multilateral institutions and trade blocs.\n\nIn Brussels, diplomats said they are increasingly convinced that Johnson will pilot Britain off the cliff without the safety net of a deal. European lawmakers expressed astonishment that he would so brazenly tie the hands of Parlia

Looks like the washington post articles were quite well scraped. Just need to remove the "AD" references from the corpus. Which can be done while doing NLP cleaning.

In [1006]:
stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())].sample()['news_link'].values[0]

'http://www.washingtonpost.com/business/weekly-us-unemployment-aid-applications-rise-to-372000-data-distorted-by-winter-holidays/2013/01/03/d3bcb382-55aa-11e2-89de-76c1c54b1418_story.html'

Looks like there are certain links for which articles simply do not exist any longer.

#### New York Times (News)

In [1020]:
news_outlet = 'New York Times (News)'

Lets explore the articles that were scrapped first.

In [1041]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
print(sample_row['text'].values[0])
# sample_row

http://www.nytimes.com/2016/07/26/us/politics/dnc-speakers-protests-sanders.html
PHILADELPHIA — Democratic Party leaders scrambled on Monday night to rescue their convention from political bedlam as supporters of Senator Bernie Sanders erupted in boos, jeers and protests against Hillary Clinton after an email leak showed that party officials had sought to undermine Mr. Sanders in their race for the nomination.

Mr. Sanders, whose speech was shifted to a more prominent time in hopes of soothing delegates, struggled to unify the convention. His full-throated endorsement of Mrs. Clinton drew scattered boos, and his valedictory tone left some supporters in tears rather than rallying around the Democratic nominee. Another speaker, Michelle Obama, was far more electrifying, but while she drew affection from the crowd, her remarks did little to heal the lingering primary wounds.

The venting among Sanders supporters reflected months of pent-up frustration after he lost the nomination to Mrs. 

Looks like only a part of the article is being captured properly by the scraper and a custom code snippet has to be written to get the whole article.

In [1040]:
soup = general_scraper(url)

print("\n\n".join([para.text for para in soup.find_all('p', class_="css-158dogj evys1bk0")]))

WASHINGTON — Five and a half years into President Obama’s time in office, the jokes are getting a bit stale: Fox News is a “shadowy right-wing organization.” The 47 percent “called Mitt Romney to apologize.” The whole “Kenyan president” bit.

Thank goodness for HealthCare.gov.

Mr. Obama started his annual remarks at the White House Correspondents’ Association dinner Saturday night with the recognition that the rollout of his health care website could have gone better, admitting that “in 2008, my slogan was ‘Yes, we can!’ In 2013, my slogan was ‘Control-Alt-Delete.’ ”

“On the plus side,” he continued, “they did turn the launch of HealthCare.gov into one of the year’s biggest movies” (video screens on either side of the president showed the title of the hit Disney film “Frozen”).

The president finished his remarks by trying to show a farewell video. When it refused to load properly, Mr. Obama brought out his “fixer”: Kathleen Sebelius, the secretary of Health and Human Services, the p

After testing the above scraping code with a few articles, I can confirm it works. 
Along with Fox News, we can scrape all NYT articles again too.

#### The Hill

In [1045]:
news_outlet = 'The Hill'

In [1047]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
print(sample_row['text'].values[0])
# sample_row

http://thehill.com/homenews/administration/353162-health-secretary-tom-price-resigns
Health and Human Services (HHS) Secretary Tom Price Thomas (Tom) Edmunds PriceConspicuous by their absence from the Republican Convention Coronavirus Report: The Hill's Steve Clemons interviews Chris Christie Trump flails as audience dwindles and ratings plummet MORE resigned on Friday, after an uproar over his use of private jets for official business.

White House press secretary Sarah Huckabee Sanders said in a statement that Price offered his resignation to President Trump on Friday, and that Trump had accepted.

ADVERTISEMENT

He is the first official to resign from Trump's Cabinet.

"I have spent forty years both as a doctor and public servant putting people first. I regret that the recent events have created a distraction from these important objectives," Price said in his resignation letter.

"Success on these issues is more important than any one person. In order for you to move forward withou

Looks like the entire story is being captured for 'The Hill'.   

Let's now look at articles that did not get captured.

In [1054]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())]
# url = sample_row['news_link'].values[0]
# print(url)
# print(sample_row['text'].values[0])
len(sample_row)

4

Because only 4 articles were omitted from being captured, we are going to ignore this for now and write a custom scraper later for this.

#### Washington Times

In [1056]:
news_outlet = 'Washington Times'

In [1069]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
print(sample_row['text'].values[0])
# sample_row

http://www.washingtontimes.com/news/2014/aug/8/f-18-jets-dropped-500-pund-laser-guided-bombs-isil/
U.S. fighter jets have begun pounding artillery held by the al Qaeda-inspired militants in northern Iraq with laser guided bombs, a senior Defense Department official said Friday.



“At approximately 6:45 a.m. EDT, the U.S. military conducted a targeted airstrike against Islamic State of Iraq and the Levant (ISIL) terrorists,” Rear Adm. John Kirby, a top Pentagon spokesman said in a statement.



“Two F/A-18 aircraft dropped 500-pound laser-guided bombs on a mobile artillery piece near Erbil,” Rear Adm. Kirby said. “ISIL was using this artillery to shell Kurdish forces defending Erbil where U.S. personnel are located.”



A small clutch of U.S. advisers have been based in the city during recent weeks, assessing the capability of Kurdish forces to defend the region. No other details were immediately provided.



Friday’s action came roughly 12 hours after President Obama announced that he

In [1072]:
soup = general_scraper(url)
[para.text for para in soup.find('div', class_='storyareawrapper').find_all('p')[:-5]]

[]

Looks like the whole article is getting scrapped. only downside is the scraping at the end includes meta data info.

In [1075]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
print(sample_row['text'].values[0])
# sample_row

http://www.washingtontimes.com/news/2017/oct/3/abortion-after-20-weeks-banned-by-house-bill-but-f/
nan


Looks these links are working. We can perhaps try passing it through the old scraper again and see what happens.

#### USA TODAY

In [1077]:
news_outlet = 'USA TODAY'

In [1087]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
# print(text)
# sample_row
text.split('\n\n')

https://www.usatoday.com/story/news/politics/2018/05/21/trump-signs-resolution-killing-auto-lending-rule/628326002/


['WASHINGTON – Auto dealers will once again be allowed to mark up the interest rates on auto loans to sell their cars — a practice critics say is racially discriminatory — after President Trump signed a bill striking down a rule discouraging the practice.',
 'The Obama administration adopted the rule in 2013 in response to reports that black and Hispanic car buyers were paying higher interest rates on their loans than white car buyers. These "indirect" loans come from a third-party financial institution, which returns part of that higher interest rate back to the car dealership.',
 'Lenders and car dealers say that system gives salesmen the flexibility they need to offer discounted auto loans to their best customers.',
 "It's the 16th time Trump has signed a resolution to strike down a regulation under the Congressional Review Act, a formerly obscure law that allows Congress to have the final say over agency regulations.",
 "The auto lending rule broke new ground, however, because it h

Looks like the articles are scrapped properly, except the end statements. But there is no uniform format followed to remove them easily.

#### NPR (Online News)

In [1089]:
news_outlet = 'NPR (Online News)'

In [1095]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
print(text)
# sample_row
# text.split('\n\n')

http://www.npr.org/sections/thetwo-way/2017/04/07/522963252/congress-reacts-to-trumps-syria-missile-strike
Congressional Leaders Urge Trump Administration For Broader Syria Strategy

Enlarge this image toggle caption Win McNamee/Getty Images Win McNamee/Getty Images

Updated at 10:30 a.m. ET

Lawmakers on Capitol Hill want the Trump administration to outline a broader strategy in Syria following the president's decision to authorize U.S. missile strikes Thursday night in response to the apparent chemical weapons attack by Syrian President Bashar Assad.

"I think it was appropriate but I would like to say, despite all the enthusiasm we see this morning, if I might quote [Winston] Churchill, 'It's the end of the beginning, not the beginning of the end,' " Senate Armed Services Chairman John McCain, R-Ariz., told MSNBC on Friday morning.

He added: "We have a Middle East in chaos, the Europeans being destabilized, the spread of [the Islamic State] throughout the world. This is the challen

Looks like the articles are scrapped properly.

In [1096]:
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
print(text)
# sample_row
# text.split('\n\n')

https://www.npr.org/2018/08/08/636568530/missouri-blocks-right-to-work-law
nan


These links are also working - we can pass it through the scraper and try again

#### HuffPost

In [1100]:
news_outlet = 'HuffPost'
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
print(text)
# sample_row
# text.split('\n\n')

https://www.huffingtonpost.com/entry/nancy-pelosi-longest-speech-house-history_us_5a7b7620e4b044b38219006e
Handout . / Reuters Pelosi spoke for over seven hours on the House floor on Wednesday.

Minority Leader Nancy Pelosi (D-Calif.) just made House history.

The Democrat spoke for more than eight hours on the House floor Wednesday, adamant that she would not support any long-term budget deal without a commitment from House Speaker Paul Ryan to allow a chamber vote on protecting young undocumented immigrants. In the process, Pelosi set the record for the longest continuous speech in House history.

The House Historian’s Office confirmed the lawmaker’s record to HuffPost. The previous record dates back to at least 1909 when a Missouri representative named Champ Clark spoke for 5 hours and 15 minutes against a tariff overhaul, the historian’s office said.

In her marathon speech ― which wrapped up around 6:10 p.m., about eight hours and five minutes after she started ― Pelosi called on 

In [1102]:
news_outlet = 'HuffPost'
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
print(text)
# sample_row
# text.split('\n\n')

https://www.huffpost.com/entry/white-house-accuses-democrats-impeachment-overturn-election_n_5d9cfd31e4b02c9da0406582
nan


These links are also workin - we can pass it through the scraper and try again

#### Reuters

In [1104]:
news_outlet = 'Reuters'
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (~stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
print(text)
# sample_row
# text.split('\n\n')

https://www.reuters.com/article/us-usa-trump-impeachment/house-panel-moves-to-intensify-trump-impeachment-probe-idUSKCN1VX10A
WASHINGTON (Reuters) - The Democratic-led U.S. House of Representatives Judiciary Committee voted to intensify its investigation of Republican President Donald Trump on Thursday, as lawmakers edged closer to deciding whether to recommend his impeachment.

The 41-member panel adopted a resolution allowing it to designate hearings as impeachment proceedings, subject witnesses to more aggressive questioning and quicken the pace of an investigation that is expanding into areas that could prove politically explosive for both Trump and Congress.

“With these new procedures, we will begin next week an aggressive series of hearings investigating allegations of corruption, obstruction and abuse of power against the president,” House Judiciary Committee Chairman Jerrold Nadler told reporters after a 24-17 vote along party lines.

A more aggressive probe could also increas

In [1127]:
news_outlet = 'Reuters'
sample_row = stories_flat_df[(stories_flat_df.news_source == news_outlet) & (stories_flat_df.text.isna())].sample()
url = sample_row['news_link'].values[0]
print(url)
text = sample_row['text'].values[0]
print(str(text))
# sample_row
# text.split('\n\n')

http://news.yahoo.com/ryan-says-u-budget-deal-possible-readies-own-153233023--business.html
nan


### Re-scraping the above

In [1146]:
stories_flat_v2_df = stories_flat_df.copy(deep=True)

In [1147]:
for row_idx, row in stories_flat_v2_df.iterrows():
        url = row['news_link']
        news_source = row['news_source']
        text = row['text']
        authors = row['authors']
        publish_date = row['publish_date']

        if news_source == 'New York Times (News)':
            try:
                soup = general_scraper(url)
                text = "\n\n".join([para.text for para in soup.find_all('p', class_="css-158dogj evys1bk0")])
                time.sleep(2+0.5*random.random())
                
                stories_flat_v2_df.loc[row_idx, 'text'] = text
            except:
                print(f"Error retrieving article from {url}")
                
        elif pd.isna(text) and news_source != 'Wall Street Journal (News)':        
            try:
                article = Article(url)
                article.download()
                article.parse()

                authors = article.authors
                publish_date = article.publish_date
                text = article.text
                stories_flat_v2_df.loc[row_idx, 'text'] = text
            except:
                print(f"Error retrieving article from {url}") 

Error retrieving article from https://www.forbes.com/sites/nataliewexler/2019/06/30/what-kamala-harris-and-joe-biden-should-be-talking-about-instead-of-busing/
Error retrieving article from https://www.wsj.com/articles/how-to-put-citizenship-back-in-the-census-11562264430
Error retrieving article from https://www.forbes.com/sites/chasewithorn/2019/07/09/how-tom-steyer-made-his-16-billion-fortune/
Error retrieving article from https://abcnews.go.com/Politics/trump-labor-secretary-acosta-resigns-amid-controversy-epstein/story
Error retrieving article from https://abcnews.go.com/US/justice-department-file-charges-officers-involved-eric-garners/story
Error retrieving article from http://www.npr.org/2012/09/22/161598126/congress-heads-out-after-approving-funding-bill
Error retrieving article from http://www.foxnews.com/politics/obama-accused-hill-lawmakers-ducking-elected-duties-amid-criticism-doing
Error retrieving article from https://www.newsweek.com/mueller-report-volume-one-hearings-sp

In [None]:
stories_flat_v2_df.to_csv('../Data/')

In [1148]:
stories_flat_v2_df[stories_flat_v2_df.text.isna()].news_source.value_counts().sort_values(ascending=False)

Wall Street Journal (News)    657
Fox News (Online News)        404
TheBlaze.com                   80
ABC News (Online)              79
Newsweek                       44
                             ... 
Newsmax (News)                  1
Fox News Latino                 1
Heather Mac Donald              1
Virginia Mercury                1
The Daily Caller                1
Name: news_source, Length: 61, dtype: int64

In [1156]:
mask_top_20 = stories_flat_v2_df[stories_flat_v2_df.news_source.isin(top_20)]

(mask_top_20[mask_top_20.text.isna()].news_source.value_counts()/mask_top_20.news_source.value_counts()).sort_values(ascending=False)
# stories_flat_df[mask_top_20.text.isna()].news_source.value_counts().sort_values(ascending=False)

Wall Street Journal (News)    1.000000
Fox News (Online News)        0.392233
Townhall                      0.061594
NPR (Online News)             0.021097
Washington Post               0.020934
Politico                      0.015873
New York Times (News)         0.009852
Washington Examiner           0.009231
Reuters                       0.008357
National Review               0.006289
Associated Press              0.005587
Newsmax (News)                0.005000
HuffPost                      0.002571
USA TODAY                     0.001938
Washington Times              0.001876
BBC News                           NaN
CNN (Web News)                     NaN
New York Post (News)               NaN
The Hill                           NaN
Vox                                NaN
Name: news_source, dtype: float64

In [1202]:
top_10 = top_20[0:10]

mask = (stories_flat_v2_df.news_source.isin(top_10)) & (~stories_flat_v2_df.text.isna())

# stories_flat_v2_df[mask]
(stories_flat_v2_df[mask].groupby(level='number').size() == 2).sum()
# ((stories_flat_v2_df[mask].loc[(slice(None),['From the Right','From the Left']),:].groupby(level='number').size() == 2))

1177

Getting rid of the multi-index as its harder to work with!

In [1191]:
stories_flat_v3_df = stories_flat_v2_df.reset_index()

### NLP Round 1 Corpus

For the first round of NLP, let's make a corpus containing docs just from Left & Right Bias, using the top 10 news outlets (excluding WSJ naturally, as there are no articles from there).

In [1214]:
top_10 = top_20[0:10]
bias_list = ['From the Left','From the Right']
mask = (stories_flat_v3_df.news_source.isin(top_10)) & (~stories_flat_v3_df.text.isna() & (stories_flat_v3_df.global_bias.isin(bias_list)))

temp_df = stories_flat_v3_df[mask]
df_nlp_round1 = temp_df[temp_df.groupby('number').global_bias.transform('size') == 2].reset_index(drop=True)

In [1216]:
df_nlp_round1.to_csv("../Data/data_NLP_round1.csv", index=False)