In [91]:
import requests
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import numpy as np
import string

class page:
    """A downloaded web page with helpers for extracting AllSides article data.

    Constructing an instance fetches ``url`` with ``requests`` and parses the
    response body with BeautifulSoup; the parsed tree is kept on ``self.soup``
    and used by ``links()``, ``bias()`` and ``text()``.
    """

    # Sentinel returned by bias() and text() when the expected element is missing.
    ERROR_CODE = 888

    # Maps the bias label shown on the page to its numeric code.
    _BIAS_CODES = {
        'Center': 0,
        'Lean Left': -1,
        'Left': -2,
        'Lean Right': 1,
        'Right': 2,
    }

    # Create a master list of stopwords, letters, punctuation, and numbers.

    excluded_list = []

    excluded_list.extend(stopwords.words('english'))
    excluded_list.extend(string.punctuation)
    excluded_list.extend(string.ascii_lowercase)
    # 'fourty' is kept so the misspelling stays filtered; 'forty' is the
    # correct spelling that the original list was missing.
    extras = ["–","‘", "’", "“", "”","'s", "’s",'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
                'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen',
                'eighteen', 'nineteen', 'twenty', 'thirty', 'fourty', 'forty', 'fifty', 'sixty', 'seventy',
                'eighty', 'ninety', 'hundred', 'thousand', "n't", 'wo', 'ca', 'sha',
                "'re", "'d", "'ll"]
    excluded_list.extend(extras)
    excluded_set = set(excluded_list)

    # Built once and shared by every instance instead of once per text() call.
    _stemmer = SnowballStemmer(language='english')

    def __init__(self, url, timeout=30):
        """Fetch ``url`` and parse its HTML.

        Parameters
        ----------
        url : str
            Address of the page to download.
        timeout : float, optional
            Seconds passed to ``requests.get`` as ``timeout`` so a stalled
            connection raises instead of blocking indefinitely.
        """
        self.url = url

        # requests.get() downloads the page; .content is the raw HTML body.
        response = requests.get(url, timeout=timeout)

        # BeautifulSoup parses the response body into a navigable tree.
        self.soup = BeautifulSoup(response.content, 'html.parser')

    def links(self, pattern='www.allsides.com/news/2024'):
        """Return the unique hrefs on this page that contain ``pattern``.

        Parameters
        ----------
        pattern : str, optional
            Substring an href must contain to be kept. The default keeps
            links containing "www.allsides.com/news/2024".

        Returns
        -------
        list of str
            Matching hrefs with duplicates removed, in first-seen order.
        """
        matches = []
        # find_all('a') returns every anchor tag <a> in the page.
        for anchor in self.soup.find_all('a'):
            # Anchors without an href attribute yield None and are skipped.
            href = anchor.get('href')
            if href is not None and pattern in str(href):
                matches.append(str(href))

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(matches))

    def bias(self):
        """Return the numeric bias code of this page.

        The label is read from the first ``<a>`` inside the
        ``div.article-media-bias-`` element and mapped as: Left -> -2,
        Lean Left -> -1, Center -> 0, Lean Right -> 1, Right -> 2.

        Returns
        -------
        int
            The bias code, or 888 when the element, its link, or a
            recognised label is missing.
        """
        # NOTE(review): the trailing hyphen in this class name looks unusual;
        # confirm it matches the live page markup.
        sp_bias = self.soup.find('div', class_='article-media-bias-')

        # Error return if the page has no bias block (e.g. a broken link).
        if sp_bias is None:
            return self.ERROR_CODE

        anchor = sp_bias.find('a')
        if anchor is None:
            return self.ERROR_CODE

        # Strip so surrounding whitespace in the markup cannot defeat the lookup.
        label = anchor.get_text().strip()
        return self._BIAS_CODES.get(label, self.ERROR_CODE)

    def text(self, type='article-description'):
        """Return the stemmed, filtered tokens of one text element of the page.

        Parameters
        ----------
        type : str, optional
            CSS class of the ``div`` to read. The default,
            "article-description", is the short-form article text;
            "article-name" is intended to select the title
            (presumably also a ``div`` on the page; verify against markup).

        Returns
        -------
        list of str or int
            Stemmed tokens, or 888 when no matching element exists.
        """
        sp_text = self.soup.find('div', class_=type)

        # Error return if the page has no such element (e.g. a broken link).
        if sp_text is None:
            return self.ERROR_CODE

        filtered_text = []
        for token in word_tokenize(sp_text.get_text()):
            cleaned = token.lower().replace('.', '')
            # Hyphenated tokens are split so each part is filtered separately.
            if '-' in cleaned:
                words = word_tokenize(cleaned.replace('-', ' '))
            else:
                words = [cleaned]
            for wrd in words:
                # Drop excluded words and anything that is not purely alphabetic.
                if wrd not in self.excluded_set and wrd.isalpha():
                    filtered_text.append(self._stemmer.stem(wrd))

        return filtered_text


In [92]:

class iterative_scraper:
    """Level-by-level crawler that collects [url, bias, text] rows from pages."""

    def __init__(self):
        # One [url, bias, text] row per page whose bias and text were both found.
        self.All = []

    def scrape(self, initial_url, n_steps, p, existing_url_list=None):
        """Crawl outward from ``initial_url`` and record article data in ``self.All``.

        The links of ``initial_url`` form the first level. At each step every
        URL of the current level is fetched with probability ``p[step]``; its
        bias and text are recorded when neither is the 888 error code, and
        its not-yet-seen links form the next level.

        Parameters
        ----------
        initial_url : str
            Page whose links seed the crawl.
        n_steps : int
            Maximum number of levels to process.
        p : sequence of float
            ``p[i]`` is the success probability given to
            ``np.random.binomial`` when deciding whether to fetch each URL
            at step ``i``.
        existing_url_list : list of str, optional
            URLs to treat as already seen. When supplied, it is extended in
            place with every newly discovered URL (each appended once).
            Defaults to a fresh empty list per call.
        """
        # A fresh list per call avoids sharing state through a mutable default.
        if existing_url_list is None:
            existing_url_list = []

        # Set mirror of the list for O(1) membership tests.
        seen = set(existing_url_list)

        def claim_new(candidates):
            """Return the unseen candidates, marking each as seen exactly once."""
            fresh = []
            for link in candidates:
                if link not in seen:
                    seen.add(link)
                    existing_url_list.append(link)
                    fresh.append(link)
            return fresh

        # Initial links are marked as seen too, so deeper pages that link back
        # to them do not cause the same article to be scraped twice.
        frontier = claim_new(page(initial_url).links())

        for step in range(n_steps):
            if not frontier:
                break

            next_frontier = []
            for url in frontier:
                # Bernoulli draw: fetch this URL with probability p[step].
                if np.random.binomial(1, p[step], 1)[0] == 1:
                    pg = page(url)
                    b = pg.bias()
                    t = pg.text()
                    found_links = pg.links()
                    if (b != 888) and (t != 888):
                        self.All.append([url, b, t])
                    next_frontier.extend(claim_new(found_links))
            frontier = next_frontier

    def to_df(self):
        """Return the collected rows as a DataFrame with columns URL, Label, Text."""
        df = pd.DataFrame(self.All, columns=['URL', 'Label', 'Text'])
        return df
    


In [95]:
import time

# File that holds previously scraped rows; it is read and then overwritten.
CSV_PATH = 'AllSides_All.csv'
# Page whose links seed the crawl.
START_URL = 'https://www.allsides.com/unbiased-balanced-news'

start_time = time.time()

compiled_df = pd.read_csv(CSV_PATH)

# URLs already present in the CSV are passed in so they are not scraped again.
exist_urls = compiled_df['URL'].to_list()

scpr = iterative_scraper()

scpr.scrape(initial_url=START_URL, n_steps=3, p=[1, 1, 1], existing_url_list=exist_urls)

df = scpr.to_df()


print(df.shape)
print(df.head())

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df_all = pd.concat([compiled_df, df], ignore_index=True)

df_all.to_csv(CSV_PATH, index=False)



print("--- %s seconds ---" % (time.time() - start_time))
         


(0, 3)
Empty DataFrame
Columns: [URL, Label, Text]
Index: []
--- 0.18048691749572754 seconds ---
