In [41]:
import feedparser
import pandas as pd
import requests
import os
import tldextract

from bs4 import BeautifulSoup, element
from functools import reduce
from typing import List


In [26]:
newspapers_all = pd.read_csv('newspapers.csv')
newspapers_all.head()

Unnamed: 0,newspaper,newspaperLabel,idealogyLabel,link,countryLabel
0,http://www.wikidata.org/entity/Q461884,A Szabadság,communism,http://www.aszabadasag.hu,Hungary
1,http://www.wikidata.org/entity/Q1751826,La Segunda,right-wing,http://www.lasegunda.com/,Chile
2,http://www.wikidata.org/entity/Q1815813,Yeni Şafak,right-wing,http://www.yenisafak.com,Turkey
3,http://www.wikidata.org/entity/Q183644,Pravda,communism,http://www.gazeta-pravda.ru,Russia
4,http://www.wikidata.org/entity/Q183644,Pravda,communism,http://www.gazeta-pravda.ru,Soviet Union


In [27]:
idealogies = pd.read_csv('query.csv').head(n=10)['idealogyLabel']
idealogies

0                         conservatism
1                           liberalism
2                            left-wing
3                           right-wing
4                            communism
5                          centre-left
6                         centre-right
7                     social democracy
8    conservatism in the United States
9                             centrism
Name: idealogyLabel, dtype: object

In [28]:
result = [newspapers_all[newspapers_all['idealogyLabel'] == idealogy] for idealogy in idealogies]
newspapers = reduce(lambda acc, item: pd.concat([acc, item]), result[1:], result[0])
newspapers = newspapers[['newspaperLabel', 'idealogyLabel', 'link']]
newspapers

Unnamed: 0,newspaperLabel,idealogyLabel,link
15,Habertürk,conservatism,https://www.haberturk.com
16,Star Gazetesi,conservatism,https://www.star.com.tr/
20,O Globo,conservatism,http://www.oglobo.com.br/
21,El Diario,conservatism,https://www.eldiario.net/
22,La Segunda,conservatism,http://www.lasegunda.com/
...,...,...,...
144,The Washington Times,conservatism in the United States,http://www.washingtontimes.com/
145,The Washington Times,conservatism in the United States,http://www.washtimes.com/
23,Hindustan Times,centrism,http://www.hindustantimes.com
24,Gazeta Wyborcza,centrism,http://wyborcza.pl/


In [29]:
websites_to_scrape = newspapers['link']
websites_to_scrape

15           https://www.haberturk.com
16            https://www.star.com.tr/
20           http://www.oglobo.com.br/
21           https://www.eldiario.net/
22           http://www.lasegunda.com/
                    ...               
144    http://www.washingtontimes.com/
145          http://www.washtimes.com/
23       http://www.hindustantimes.com
24                 http://wyborcza.pl/
73                https://thehill.com/
Name: link, Length: 83, dtype: object

In [6]:
def is_website_reachable(website):
    try:
        res = requests.get(website)
    except:
        return False

    return res.status_code == 200

websites_to_scrape = websites_to_scrape[websites_to_scrape.map(is_website_reachable)]

In [7]:
websites_to_scrape

15                             https://www.haberturk.com
16                              https://www.star.com.tr/
20                             http://www.oglobo.com.br/
21                             https://www.eldiario.net/
22                             http://www.lasegunda.com/
                             ...                        
143    http://search.ebscohost.com/direct.asp?db=bwh&...
144                      http://www.washingtontimes.com/
145                            http://www.washtimes.com/
24                                   http://wyborcza.pl/
73                                  https://thehill.com/
Name: link, Length: 66, dtype: object

In [5]:
def save_article(article: List[str], source: str):
    dirname = os.path.join('data', source)
    os.makedirs(dirname, exist_ok=True)

    filename = str(len([name for name in os.listdir(dirname)]) + 1) + '.txt'

    with open(os.path.join(dirname, filename), 'w') as f:
        f.writelines(article)

In [4]:
# Scrapers
def article_p(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    if len(soup.find_all('article')) == 0:
        print(f'Skipping {URL}')
        return None

    article = soup.find_all('article')[0]
    ps = article.find_all('p')

    text = []
    for p in ps:
        current_paragraph = [content for content in p.contents if len(content) > 0 and type(content) == element.NavigableString]

        for item in current_paragraph:
            text.append(item)

    return text

In [3]:
# Scrapers
def div_p(URL, cls=None, id=None):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    if cls is None and id is None:
        content = soup.find_all('div')
    elif cls is not None:
        content = soup.find_all('div', class_=cls)
    else:
        content = soup.find_all('div', id=id)

    if len(content) == 0:
        print(f'Skipping {URL}')
        return None

    article = content[0]
    ps = article.find_all('p')

    text = []
    for p in ps:
        current_paragraph = [content for content in p.contents if len(content) > 0 and type(content) == element.NavigableString]

        for item in current_paragraph:
            text.append(item)

    return text

In [24]:
feed = feedparser.parse('https://jungle.world/rss.xml')
website_name = 'jungle-world'

if os.path.exists(os.path.join('data', website_name)):
    raise Exception("Directory already exists")

amount = 0
for link in [entry['link'] for entry in feed['entries']]:
    print(f'Parsing {link}')
    # parsed = article_p(link)
    parsed = div_p(link, cls='row_bak')
    if parsed is not None:
        save_article(parsed, website_name)
        amount += 1

print(f'Parsed {amount} of articles')

Parsing https://jungle.world/artikel/2022/25/soziale-plicht
Parsing https://jungle.world/artikel/2022/25/soziale-plicht
Parsing https://jungle.world/artikel/2022/25/vom-guerillero-zum-praesidenten
Parsing https://jungle.world/artikel/2022/25/wie-schlechtes-wetter
Parsing https://jungle.world/artikel/2022/25/zentrales-sprachrohr-der-ns-propaganda
Parsing https://jungle.world/artikel/2022/25/rechte-schlagseite
Parsing https://jungle.world/artikel/2022/25/und-jetzt-den-mussolini
Parsing https://jungle.world/artikel/2022/25/hassprediger-gottes
Parsing https://jungle.world/artikel/2022/25/der-ehrliche-makler-steckt-der-klemme
Parsing https://jungle.world/artikel/2022/25/hirntot-gegen-putin
Parsed 10 of articles


In [23]:
div_p('https://jungle.world/artikel/2022/25/soziale-plicht', cls='row_bak')

['Seit die Wehrpflicht für Männer in Deutschland 2011 ausgesetzt wurde, wird alle paar Jahre wieder über einen Ersatz diskutiert: ein verpflichtender Arbeitsdienst für alle, der dem Gemeinwesen dienen soll. Bundespräsident Frank-Walter Steinmeier hat die Idee kürzlich unter dem Namen »soziale Dienstpflicht« wieder zur Diskussion gestellt. Die Argumente sind nicht neu: Das Pflichtjahr soll den Gemeinsinn stärken, den Austausch mit Menschen anregen, die man sonst nicht treffen würde, und nicht zuletzt soll es solidarisches Verhalten fördern. Durch Zwang soll also Gutes entstehen.',
 'Ganz abwegig ist der Gedanke nicht. Es ist keinesfalls auszuschließen, dass eine Dienstpflicht, so paradox es wegen ihres Zwangscharakters zunächst klingen mag, emanzipatorische Effekte zeitigen würde. Sie könnte beispielswei\xadse Männer dazu bringen, sich auch einmal mit Care-Tätigkeiten zu beschäftigen, die bei freiwilligem Engagement vorwiegend Frauen erledigen, denen das in ihrer Sozialisation viel eher

In [35]:
scraped_websites = pd.read_csv('scraped_websites.csv')
scraped_websites = scraped_websites[scraped_websites['Parsed ?'] == 'Yes']
scraped_websites['link'] = scraped_websites['Website']
scraped_websites.head()


Unnamed: 0,Website,Has RSS?,RSS link,How to parse ?,Parsed ?,# Articles,link
1,http://dailywire.com,Yes,https://www.dailywire.com/feeds/rss.xml,article -> p,Yes,50.0,http://dailywire.com
3,http://jornada.com.mx,Yes,https://web.jornada.com.mx/rss/edicion.xml?v=1,article -> p,Yes,123.0,http://jornada.com.mx
4,http://nzherald.co.nz,Yes,https://www.nzherald.co.nz/arc/outboundfeeds/r...,article -> p,Yes,10.0,http://nzherald.co.nz
5,http://observer.theguardian.com/,Yes,https://www.theguardian.com/uk/rss,article -> p,Yes,114.0,http://observer.theguardian.com/
8,http://www.arbejderen.dk,Yes,https://arbejderen.dk/rss,div -> p,Yes,10.0,http://www.arbejderen.dk


In [44]:
dataset = newspapers[newspapers['link'].isin(scraped_websites['Website'])]

dataset.head()

Unnamed: 0,newspaperLabel,idealogyLabel,link
15,Habertürk,conservatism,https://www.haberturk.com
16,Star Gazetesi,conservatism,https://www.star.com.tr/
86,Le Figaro,conservatism,https://www.lefigaro.fr/
91,Magyar Demokrata,conservatism,http://www.demokrata.hu/
28,Haaretz,liberalism,https://www.haaretz.com/


In [45]:
dataset['articlesLocation'] = dataset['link'].apply(lambda link: tldextract.extract(link).domain)
dataset['amountOfArticles'] = dataset['articlesLocation'].apply(lambda folder: len([name for name in os.listdir(os.path.join('data', folder))]))

dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['articlesLocation'] = dataset['link'].apply(lambda link: tldextract.extract(link).domain)


Unnamed: 0,newspaperLabel,idealogyLabel,link,articlesLocation
15,Habertürk,conservatism,https://www.haberturk.com,haberturk
16,Star Gazetesi,conservatism,https://www.star.com.tr/,star
86,Le Figaro,conservatism,https://www.lefigaro.fr/,lefigaro
91,Magyar Demokrata,conservatism,http://www.demokrata.hu/,demokrata
28,Haaretz,liberalism,https://www.haaretz.com/,haaretz
29,168 Óra,liberalism,https://168.hu,168
30,Corriere della Sera,liberalism,https://www.corriere.it,corriere
33,Magyar Narancs,liberalism,https://magyarnarancs.hu/,magyarnarancs
39,The New Zealand Herald,liberalism,http://nzherald.co.nz,nzherald
79,Haaretz,left-wing,https://www.haaretz.com/,haaretz


In [48]:
dataset.to_csv(os.path.join('data', 'metadata'), index=False)