# News Web Scraping

## Imports

In [1]:
import newspaper
import pandas as pd
import time
import random
from langdetect import detect

## Build newspaper + sanity check

In [2]:
t1 = time.time()
fox = newspaper.build('https://www.foxnews.com/', memoize_articles=False)
nyt = newspaper.build('https://www.nytimes.com/', memoize_articles=False)
bbc = newspaper.build('https://www.bbc.com/', memoize_articles=False)
cnn = newspaper.build('https://edition.cnn.com/', memoize_articles=False)
bre = newspaper.build('https://www.breitbart.com/', memoize_articles=False)


print('building took: %s seconds' % (time.time() - t1))


#fox = [article for article in fox.articles if (('video.' not in article.url) & ('video/' not in article.url))]

#-> problem: newspaper.source.Source object has no delete/remove function? -> converts fox into list
#current solution: do data cleaning afterwards & download the useless articles as well

building took: 21.842912912368774 seconds


In [3]:
papers = [fox, nyt, bbc, cnn, bre]
names = {fox: 'Fox News',
        nyt: 'New York Times',
        bbc: 'BBC News',
        cnn: 'CNN',
        bre: 'Breitbart News'}

In [4]:
for paper in papers:
    for article in paper.articles:
        print(article.url)

https://video.foxnews.com/v/5614615980001/#sp=watch-live
https://www.foxnews.com/politics/live-updates-trump-impeachment-1-29-2021
https://video.foxnews.com/v/1241186546001
https://video.foxnews.com/v/6227210611001/
https://video.foxnews.com/v/6227190092001/
https://video.foxnews.com/v/6227147211001/
https://video.foxnews.com/v/6227108340001/
https://video.foxnews.com/v/6227075830001/
https://video.foxnews.com/v/6227021937001/
https://video.foxnews.com/v/6227044250001/
https://video.foxnews.com/v/6226993978001/
https://video.foxnews.com/v/6226928115001/
https://video.foxnews.com/v/6226911233001/
https://video.foxnews.com/v/6226907817001/
https://www.foxnews.com/politics/white-house-staying-out-of-gamestop-controversy
https://www.foxnews.com/media/rush-limbaugh-gamestop-story-mirrors-politics-elites-attempt-regular-people-benefiting
https://www.foxbusiness.com/markets/why-gamestops-stock-surge-is-shaking-wall-street
https://www.foxnews.com/politics/yellen-robinhood-citadel-gamestop-spea

https://www.bbc.com/culture/article/20210128-ten-films-to-watch-in-february?
http://www.bbc.com/news/world-55819366
http://www.bbc.com/news/world
http://www.bbc.com/news/entertainment-arts-55851522
http://www.bbc.com/news/entertainment_and_arts
http://www.bbc.com/news/world-middle-east-55815820
http://www.bbc.com/news/world/middle_east
http://www.bbc.com/reel/video/p093vgj2/spectacular-tree-of-life-found-in-australian-lake
http://www.bbc.com/reel/video/p095b019/the-huge-nuclear-disaster-hidden-by-the-soviets
http://www.bbc.com/reel/video/p094qx83/is-ashwagandha-the-new-super-herb-
http://www.bbc.com/news/business
http://www.bbc.com/news/england/bristol
http://www.bbc.com/news/technology-55853565
http://www.bbc.com/news/business-55849898
http://www.bbc.com/news/business-55838413
http://www.bbc.com/news/business-55829879
http://www.bbc.com/news/business-55851791
http://www.bbc.com/future/article/20210126-the-richest-human-made-marine-habitats-in-the-world
http://www.bbc.com/future/articl

https://cnnespanol.cnn.com/2021/01/20/opinion-hasta-donde-pueden-llevarnos-las-mentiras/
https://cnnespanol.cnn.com/video/hay-gran-entusiasmo-entre-los-migrantes-con-biden-dice-juan-hernandez/
https://cnnespanol.cnn.com/2021/01/20/analisis-esto-es-lo-que-necesita-hoy-ee-uu-de-joe-biden/
https://cnnespanol.cnn.com/gallery/fotos-los-diferentes-tipos-de-mascarillas-y-su-efectividad-contra-el-covid-19/
https://cnnespanol.cnn.com/gallery/fotos-los-famosos-que-han-muerto-en-2021/
https://cnnespanol.cnn.com/gallery/famosos-muertos-2020/
https://cnnespanol.cnn.com/video/emilo-estafan-premio-produ-2020-yo-no-soy-diferente-sot/
https://cnnespanol.cnn.com/video/musica-premio-inclusion-emilio-estefan-serhumano-jennifer-montoya-cafe-cnn/
https://cnnespanol.cnn.com/video/yo-no-soy-diferente-proyecto-ser-humano-gian-marco-emilio-estefan-cnnee-showbiz-vo/
https://cnnespanol.cnn.com/2020/11/13/lewis-hamilton-siente-mas-orgullo-de-su-lucha-por-justicia-social-que-por-una-nueva-marca-en-la-formula-1/
htt

https://edition.cnn.com/videos/world/2021/01/22/who-china-criticized-covid-19-response-failure-coronavirus-vanier-pkg-intl-hnk-vpx.cnn/video/playlists/coronavirus-intl/
https://edition.cnn.com/videos/world/2021/01/19/south-africa-coronavirus-variant-second-wave-pandemic-mckenzie-pkg-intl-ldn-vpx.cnn/video/playlists/coronavirus-intl/
https://edition.cnn.com/videos/world/2021/01/19/sweden-backtracks-covid-19-approach-coronavirus-foster-pkg-intl-hnk-vpx.cnn/video/playlists/coronavirus-intl/
https://edition.cnn.com/videos/politics/2021/01/29/liz-cheney-backlash-trump-gaetz-wyoming-kafanov-pkg-ctn-vpx.cnn/video/playlists/this-week-in-politics/
https://edition.cnn.com/videos/politics/2021/01/29/marjorie-taylor-greene-town-hall-constituents-pelosi-savidge-ebof-sot-vpx.cnn/video/playlists/this-week-in-politics/
https://edition.cnn.com/videos/politics/2021/01/28/donald-trump-kevin-mccarthy-meeting-photo-tsr-vpx.cnn/video/playlists/this-week-in-politics/
https://edition.cnn.com/videos/politics/2

https://www.breitbart.com/asia/2021/01/26/afghanistan-says-600-freed-taliban-prisoners-already-rearrested/
https://www.breitbart.com/asia/2021/01/25/china-building-border-walls-boundaries-vietnam-myanmar/
https://www.breitbart.com/asia/2021/01/25/report-coronavirus-leading-pacific-island-nations-turn-chinese-investment/
https://www.breitbart.com/clips/2021/01/28/stefanik-cuomo-other-state-officials-should-be-immediately-subpoenaed-over-massive-corruption-scandal-with-nursing-homes/
https://www.breitbart.com/national-security/2021/01/28/experts-joe-bidens-foreign-policies-embolden-enemies-of-america/
https://www.breitbart.com/clips/2021/01/28/gop-sen-sullivan-john-kerry-gina-mccarthy-condescending-cavalier-in-green-energy-push/
https://www.breitbart.com/entertainment/2021/01/27/bette-midler-fantasizes-about-gop-lawmakers-getting-stranded-in-the-ocean/
https://www.breitbart.com/news/netflix-leads-glaad-media-awards-for-lgbtq-representation/
https://www.breitbart.com/entertainment/2021/01

https://www.breitbart.com/border/2021/01/27/construction-on-trumps-border-wall-ends-today-says-texas-rep/
https://www.breitbart.com/border/2021/01/26/feds-seize-500k-in-unreported-cash-from-boat-headed-to-bahamas/
https://www.breitbart.com/border/2021/01/26/u-s-mexico-top-10-cartel-most-wanted-list-gets-update/
https://www.breitbart.com/border/2021/01/25/migrant-toddler-smuggling-scheme-discovered-in-texas-border-city/
https://www.breitbart.com/border/2021/01/23/exclusive-biden-made-america-less-safe-with-single-pen-stroke-says-former-cbp-head/
https://www.breitbart.com/border/2021/01/24/mexican-cartel-gunmen-leave-19-torched-bodies-near-texas-border/
https://www.breitbart.com/border/2021/01/23/exclusive-border-state-labor-union-gunmen-tied-to-mexican-presidents-party/
https://www.breitbart.com/border/2021/01/23/texas-man-charged-with-directing-others-to-assassinate-aoc/
https://www.breitbart.com/border/2021/01/25/exclusiva-tiroteo-en-centro-de-rehabilitacion-en-nuevo-leon-vinculado-a-

In [6]:
size = 0
for paper in papers:
    size += paper.size()
size

1902

## Downloading & parsing data from sources

In [7]:
t1 = time.time()
newspaper.news_pool.set(papers, threads_per_source = 5)
newspaper.news_pool.join()
print('downloading took: %s seconds' % (time.time() - t1))

downloading took: 392.0524892807007 seconds


In [8]:
t1 = time.time()
for paper in papers:
    for article in paper.articles:
        article.parse()
print('parsing took: %s seconds' % (time.time() - t1))

Building prefix dict from C:\Users\mathi\anaconda3\envs\sc\lib\site-packages\jieba\dict.txt ...
Loading model from cache C:\Users\mathi\AppData\Local\Temp\jieba.cache
Loading model cost 1.09224534034729 seconds.
Prefix dict has been built succesfully.


parsing took: 310.78275847435 seconds


## Dataframe + Data Cleaning

In [20]:
t1 = time.time()
df = pd.DataFrame()
for paper in papers:
    df = pd.concat([df, pd.DataFrame({'author': [art.authors for art in paper.articles],
                  'date': [art.publish_date for art in paper.articles],
                  'title': [art.title for art in paper.articles],
                  'url': [art.url for art in paper.articles],
                  'language': [art.meta_lang for art in paper.articles],
                  'source_name': ([names[paper]] * paper.size()),
                  'text': [art.text for art in paper.articles]})])
print('building dataframe took: %s seconds' % (time.time() - t1))

building dataframe took: 0.058388471603393555 seconds


In [21]:
df

Unnamed: 0,author,date,title,url,language,source_name,text
0,[],NaT,Watch Fox News Channel and Fox Business Networ...,https://video.foxnews.com/v/5614615980001/#sp=...,en,Fox News,
1,[],NaT,Live Updates: Gaetz slams Cheney after the hig...,https://www.foxnews.com/politics/live-updates-...,en,Fox News,"Florida Rep. Matt Gaetz, one of former Preside..."
2,[],NaT,Watch Fox News Channel and Fox Business Networ...,https://video.foxnews.com/v/1241186546001,en,Fox News,
3,[],NaT,Hannity: Biden's executive orders causing 'lif...,https://video.foxnews.com/v/6227210611001/,en,Fox News,
4,[],NaT,Tucker: Our financial system is dangerously co...,https://video.foxnews.com/v/6227190092001/,en,Fox News,
...,...,...,...,...,...,...,...
590,[Warner Todd Huston],2021-01-25 00:00:00,Social Media Post Attemps to Show Blackballing...,https://www.breitbart.com/sports/2021/01/25/so...,en,Breitbart News,Tom Brady is once again headed to the Super Bo...
591,[Dylan Gwinn],2021-01-25 00:00:00,John Madden: Hiring Madden Players Could Help ...,https://www.breitbart.com/sports/2021/01/25/jo...,en,Breitbart News,He’s one of the oldest members of the Pro Foot...
592,[Penny Starr],2021-01-25 00:00:00,Montana Lawmakers Advance Bill Banning Biologi...,https://www.breitbart.com/sports/2021/01/25/mo...,en,Breitbart News,The Montana House Judiciary has passed the Sav...
593,[Warner Todd Huston],2021-01-25 00:00:00,Budweiser Shuns Super Bowl Ads to Focus on Vac...,https://www.breitbart.com/sports/2021/01/25/bu...,en,Breitbart News,Beer giant Budweiser is the latest big adverti...


In [27]:
df['word_count'] = [len(text.split()) for text in df['text']]
df['char_count'] = [len(text) for text in df['text']]
df

Unnamed: 0,author,date,title,url,language,source_name,text,word_count,char_count
0,[],NaT,Watch Fox News Channel and Fox Business Networ...,https://video.foxnews.com/v/5614615980001/#sp=...,en,Fox News,,0,0
1,[],NaT,Live Updates: Gaetz slams Cheney after the hig...,https://www.foxnews.com/politics/live-updates-...,en,Fox News,"Florida Rep. Matt Gaetz, one of former Preside...",173,1040
2,[],NaT,Watch Fox News Channel and Fox Business Networ...,https://video.foxnews.com/v/1241186546001,en,Fox News,,0,0
3,[],NaT,Hannity: Biden's executive orders causing 'lif...,https://video.foxnews.com/v/6227210611001/,en,Fox News,,0,0
4,[],NaT,Tucker: Our financial system is dangerously co...,https://video.foxnews.com/v/6227190092001/,en,Fox News,,0,0
...,...,...,...,...,...,...,...,...,...
590,[Warner Todd Huston],2021-01-25 00:00:00,Social Media Post Attemps to Show Blackballing...,https://www.breitbart.com/sports/2021/01/25/so...,en,Breitbart News,Tom Brady is once again headed to the Super Bo...,725,4423
591,[Dylan Gwinn],2021-01-25 00:00:00,John Madden: Hiring Madden Players Could Help ...,https://www.breitbart.com/sports/2021/01/25/jo...,en,Breitbart News,He’s one of the oldest members of the Pro Foot...,308,1728
592,[Penny Starr],2021-01-25 00:00:00,Montana Lawmakers Advance Bill Banning Biologi...,https://www.breitbart.com/sports/2021/01/25/mo...,en,Breitbart News,The Montana House Judiciary has passed the Sav...,537,3451
593,[Warner Todd Huston],2021-01-25 00:00:00,Budweiser Shuns Super Bowl Ads to Focus on Vac...,https://www.breitbart.com/sports/2021/01/25/bu...,en,Breitbart News,Beer giant Budweiser is the latest big adverti...,340,2055


In [28]:
word_threshold = 75

df = df[df['word_count'] > word_threshold]
df

Unnamed: 0,author,date,title,url,language,source_name,text,word_count,char_count
1,[],NaT,Live Updates: Gaetz slams Cheney after the hig...,https://www.foxnews.com/politics/live-updates-...,en,Fox News,"Florida Rep. Matt Gaetz, one of former Preside...",173,1040
14,[Morgan Phillips],NaT,White House refuses to address GameStop contro...,https://www.foxnews.com/politics/white-house-s...,en,Fox News,The White House is staying out of the Wall Str...,369,2363
15,"[Brian Flood, Brian Flood Covers The Media For...",NaT,Rush Limbaugh: GameStop saga mirrors politics ...,https://www.foxnews.com/media/rush-limbaugh-ga...,en,Fox News,Radio host Rush Limbaugh called the on-going G...,999,6075
16,[],NaT,Why GameStop's stock surge is shaking Wall Street,https://www.foxbusiness.com/markets/why-gamest...,en,Fox News,NEW YORK — It's not just you. What's going on ...,1329,7861
17,"[Sam Dorman, Sam Dorman Is A Reporter With Fox...",NaT,Yellen received $800G from hedge fund in Games...,https://www.foxnews.com/politics/yellen-robinh...,en,Fox News,Newly-confirmed Treasury Secretary Janet Yelle...,275,1723
...,...,...,...,...,...,...,...,...,...
590,[Warner Todd Huston],2021-01-25 00:00:00,Social Media Post Attemps to Show Blackballing...,https://www.breitbart.com/sports/2021/01/25/so...,en,Breitbart News,Tom Brady is once again headed to the Super Bo...,725,4423
591,[Dylan Gwinn],2021-01-25 00:00:00,John Madden: Hiring Madden Players Could Help ...,https://www.breitbart.com/sports/2021/01/25/jo...,en,Breitbart News,He’s one of the oldest members of the Pro Foot...,308,1728
592,[Penny Starr],2021-01-25 00:00:00,Montana Lawmakers Advance Bill Banning Biologi...,https://www.breitbart.com/sports/2021/01/25/mo...,en,Breitbart News,The Montana House Judiciary has passed the Sav...,537,3451
593,[Warner Todd Huston],2021-01-25 00:00:00,Budweiser Shuns Super Bowl Ads to Focus on Vac...,https://www.breitbart.com/sports/2021/01/25/bu...,en,Breitbart News,Beer giant Budweiser is the latest big adverti...,340,2055


In [29]:
df['language'].unique()

array(['en', 'ar', 'ru', 'sw', 'id', '', 'es'], dtype=object)

In [49]:
df.loc[df['language'] == '', 'language'] = [detect(text) for text in df[df['language'] == '']['text']]
df['language'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


array(['en', 'ar', 'ru', 'sw', 'id', 'es'], dtype=object)

In [51]:
df = df[df['language'] == 'en']
df.drop('language', axis = 1, inplace = True)
df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


(1218, 8)

In [52]:
def get_category(url):
    li = url.split('/')
    for i in range(3, len(li)):
        if li[i].isdigit() == False:
            return li[i]
    return None

In [53]:
df['category'] = [get_category(url) for url in df['url']]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['category'] = [get_category(url) for url in df['url']]


Unnamed: 0,author,date,title,url,source_name,text,word_count,char_count,category
1,[],NaT,Live Updates: Gaetz slams Cheney after the hig...,https://www.foxnews.com/politics/live-updates-...,Fox News,"Florida Rep. Matt Gaetz, one of former Preside...",173,1040,politics
14,[Morgan Phillips],NaT,White House refuses to address GameStop contro...,https://www.foxnews.com/politics/white-house-s...,Fox News,The White House is staying out of the Wall Str...,369,2363,politics
15,"[Brian Flood, Brian Flood Covers The Media For...",NaT,Rush Limbaugh: GameStop saga mirrors politics ...,https://www.foxnews.com/media/rush-limbaugh-ga...,Fox News,Radio host Rush Limbaugh called the on-going G...,999,6075,media
16,[],NaT,Why GameStop's stock surge is shaking Wall Street,https://www.foxbusiness.com/markets/why-gamest...,Fox News,NEW YORK — It's not just you. What's going on ...,1329,7861,markets
17,"[Sam Dorman, Sam Dorman Is A Reporter With Fox...",NaT,Yellen received $800G from hedge fund in Games...,https://www.foxnews.com/politics/yellen-robinh...,Fox News,Newly-confirmed Treasury Secretary Janet Yelle...,275,1723,politics
...,...,...,...,...,...,...,...,...,...
590,[Warner Todd Huston],2021-01-25 00:00:00,Social Media Post Attemps to Show Blackballing...,https://www.breitbart.com/sports/2021/01/25/so...,Breitbart News,Tom Brady is once again headed to the Super Bo...,725,4423,sports
591,[Dylan Gwinn],2021-01-25 00:00:00,John Madden: Hiring Madden Players Could Help ...,https://www.breitbart.com/sports/2021/01/25/jo...,Breitbart News,He’s one of the oldest members of the Pro Foot...,308,1728,sports
592,[Penny Starr],2021-01-25 00:00:00,Montana Lawmakers Advance Bill Banning Biologi...,https://www.breitbart.com/sports/2021/01/25/mo...,Breitbart News,The Montana House Judiciary has passed the Sav...,537,3451,sports
593,[Warner Todd Huston],2021-01-25 00:00:00,Budweiser Shuns Super Bowl Ads to Focus on Vac...,https://www.breitbart.com/sports/2021/01/25/bu...,Breitbart News,Beer giant Budweiser is the latest big adverti...,340,2055,sports


In [54]:
li = []
for auth in df['author']:
    if len(auth) == 0:
        li.append(None)
    else:
        li.append(str(auth[0]))
df['author'] = li
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['author'] = li


Unnamed: 0,author,date,title,url,source_name,text,word_count,char_count,category
1,,NaT,Live Updates: Gaetz slams Cheney after the hig...,https://www.foxnews.com/politics/live-updates-...,Fox News,"Florida Rep. Matt Gaetz, one of former Preside...",173,1040,politics
14,Morgan Phillips,NaT,White House refuses to address GameStop contro...,https://www.foxnews.com/politics/white-house-s...,Fox News,The White House is staying out of the Wall Str...,369,2363,politics
15,Brian Flood,NaT,Rush Limbaugh: GameStop saga mirrors politics ...,https://www.foxnews.com/media/rush-limbaugh-ga...,Fox News,Radio host Rush Limbaugh called the on-going G...,999,6075,media
16,,NaT,Why GameStop's stock surge is shaking Wall Street,https://www.foxbusiness.com/markets/why-gamest...,Fox News,NEW YORK — It's not just you. What's going on ...,1329,7861,markets
17,Sam Dorman,NaT,Yellen received $800G from hedge fund in Games...,https://www.foxnews.com/politics/yellen-robinh...,Fox News,Newly-confirmed Treasury Secretary Janet Yelle...,275,1723,politics
...,...,...,...,...,...,...,...,...,...
590,Warner Todd Huston,2021-01-25 00:00:00,Social Media Post Attemps to Show Blackballing...,https://www.breitbart.com/sports/2021/01/25/so...,Breitbart News,Tom Brady is once again headed to the Super Bo...,725,4423,sports
591,Dylan Gwinn,2021-01-25 00:00:00,John Madden: Hiring Madden Players Could Help ...,https://www.breitbart.com/sports/2021/01/25/jo...,Breitbart News,He’s one of the oldest members of the Pro Foot...,308,1728,sports
592,Penny Starr,2021-01-25 00:00:00,Montana Lawmakers Advance Bill Banning Biologi...,https://www.breitbart.com/sports/2021/01/25/mo...,Breitbart News,The Montana House Judiciary has passed the Sav...,537,3451,sports
593,Warner Todd Huston,2021-01-25 00:00:00,Budweiser Shuns Super Bowl Ads to Focus on Vac...,https://www.breitbart.com/sports/2021/01/25/bu...,Breitbart News,Beer giant Budweiser is the latest big adverti...,340,2055,sports


In [78]:
#sanity check of random articles text
df.iloc[random.randint(0, len(df.index))].text

'A star reporter from The New York Times came under fire for making racist and sexist remarks on a 2019 educational trip, The Daily Beast reported Thursday.\n\nDonald McNeil Jr., the Times\' science and health reporter who has spent the past year covering the coronavirus outbreak for the paper, was accused of using offensive language while leading a student trip in Peru, according to the report.\n\nThe trip, which was organized by the company Putney Student Travel, is part of a program called New York Times Student Journeys, which according to the Times, "offers educational travel programs for high school and middle school students". The paper provides a rotation of its journalists to accompany the children abroad.\n\nHowever, the Beast reports that following the 2019 trip to Peru, several participants filed complaints against McNeil, accusing the 66-year-old reporter of making "racist and sexist remarks throughout the trip including, according to two complaints, using the \'n-word.\'"

## Saving to csv

In [79]:
t1 = time.time()
df.to_csv('news.csv')
print('saving to csv took: %s seconds' % (time.time() - t1))

saving to csv took: 0.3507058620452881 seconds


# Twitter Api