# News Collection

In [16]:
# load libraries
import csv
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# BBC

## BBC Immigration

In [17]:
# initialise index, this tracks the page number we are on
index = 5

# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

for page_number in range(5, 14):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    bbc = f"https://www.bbc.co.uk/news/topics/c302m85qe1vt?page={page_number}"
    
    # request webpage
    res = requests.get(bbc, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("div", class_="ssrcss-1ns4t85-PromoSwitchLayoutAtBreakpoints et5qctl0")

    for article in articles:
        # check if the article date contains the word "Video" or "Audio"
        article_date = article.find("span", class_="visually-hidden ssrcss-1f39n02-VisuallyHidden e16en2lz0").get_text().strip()
        if "Video" in article_date or "Audio" in article_date:
            continue  # Skip this article and move to the next one
        
        # append article link
        article_info = article.find("a", class_="ssrcss-1mrs5ns-PromoLink exn3ah91")
        link = "https://www.bbc.co.uk" + article_info.attrs["href"]
        all_news_links.append(link)

        # append headline
        headline = article.find("p", class_="ssrcss-15dlehh-PromoHeadline exn3ah96").get_text().strip()
        all_headlines.append(headline)

        # append date
        all_dates.append(article_date)

    time.sleep(random.randint(1, 3))
    

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"bbc-uk-immigration.csv", encoding="utf-8", header="true", index=False)


We are scraping page: 5
We are scraping page: 6
We are scraping page: 7
We are scraping page: 8
We are scraping page: 9
We are scraping page: 10
We are scraping page: 11
We are scraping page: 12
We are scraping page: 13


In [18]:
# reload csv
bbc_immigration = pd.read_csv("bbc-uk-immigration.csv")

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# function to get body of article
def get_body(df, retries=3):
    bodies = []
    for link in df['link']:
        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers)
                res.raise_for_status()  # raise an HTTPError for bad responses
                soup = bs(res.content, 'lxml')
                main_content = soup.find_all("div", {"class": "ssrcss-uf6wea-RichTextComponentWrapper ep2nwvo0"})

                # extract text from the text-block
                article_text = ' '.join(content.get_text() for content in main_content).strip()
                
                # append the body text
                bodies.append(article_text)
                break  # break the retry loop on success
            
            except (requests.exceptions.RequestException, requests.exceptions.SSLError) as e:
                print(f"Error fetching {link}: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"Failed to fetch {link} after {retries} attempts.")
                    bodies.append("")  # append an empty string if all retries fail

    # add the bodies list as a new column to the DataFrame
    df['body'] = bodies

    return df

df = get_body(bbc_immigration)

# rewrite to CSV
df.to_csv(r"bbc-uk-immigration.csv", encoding="utf-8", header=True, index=False)



## BBC Migration

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import random
import time

# initialise index, this tracks the page number we are on
index = 4

# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

for page_number in range(4, 16):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    bbc = f"https://www.bbc.co.uk/news/topics/cz4pr2gdg1et?page={page_number}"

    
    # request webpage
    res = requests.get(bbc, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("div", class_="ssrcss-1ns4t85-PromoSwitchLayoutAtBreakpoints et5qctl0")

    for article in articles:
        # check if the article date contains the word "Video" or "Audio"
        article_date = article.find("span", class_="visually-hidden ssrcss-1f39n02-VisuallyHidden e16en2lz0").get_text().strip()
        if "Video" in article_date or "Audio" in article_date:
            continue  # Skip this article and move to the next one
        
        # append article link
        article_info = article.find("a", class_="ssrcss-1mrs5ns-PromoLink exn3ah91")
        link = "https://www.bbc.co.uk" + article_info.attrs["href"]
        all_news_links.append(link)

        # append headline
        headline = article.find("p", class_="ssrcss-15dlehh-PromoHeadline exn3ah96").get_text().strip()
        all_headlines.append(headline)

        # append date
        all_dates.append(article_date)

    time.sleep(random.randint(1, 3))
    
# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"bbc-uk-migration.csv", encoding="utf-8", header="true", index=False)


In [None]:
# reload csv
bbc_migration = pd.read_csv("bbc-uk-migration.csv")

df = get_body(bbc_migration)

# rewrite to CSV
df.to_csv(r"bbc-uk-migration.csv", encoding="utf-8", header=True, index=False)


## BBC Visas

In [None]:
# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

# define our user headers
headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
bbc = f"https://www.bbc.co.uk/news/topics/c1m1wly10gzt?page=1"

# request webpage
res = requests.get(bbc, headers=headers)

# check status for debugging
res.raise_for_status()

soup = bs(res.text, "html.parser")

# this gets list of articles
articles = soup.find_all("div", class_="ssrcss-1ns4t85-PromoSwitchLayoutAtBreakpoints et5qctl0")

for article in articles:
    # check if the article date contains the word "Video" or "Audio"
    article_date = article.find("span", class_="visually-hidden ssrcss-1f39n02-VisuallyHidden e16en2lz0").get_text().strip()
    if "Video" in article_date or "Audio" in article_date:
        continue  # Skip this article and move to the next one
    
    # append article link
    article_info = article.find("a", class_="ssrcss-1mrs5ns-PromoLink exn3ah91")
    link = "https://www.bbc.co.uk" + article_info.attrs["href"]
    all_news_links.append(link)

    # append headline
    headline = article.find("p", class_="ssrcss-15dlehh-PromoHeadline exn3ah96").get_text().strip()
    all_headlines.append(headline)

    # append date
    all_dates.append(article_date)

time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"bbc-uk-visas.csv", encoding="utf-8", header="true", index=False)

In [None]:
# reload csv
bbc_visas = pd.read_csv("bbc-uk-visas.csv")

df = get_body(bbc_visas)

# rewrite to CSV
df.to_csv(r"bbc-uk-visas.csv", encoding="utf-8", header=True, index=False)


## BBC Refugees and asylum seekers

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import random
import time

# initialise index, this tracks the page number we are on
index = 9

# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

for page_number in range(9, 35):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    bbc = f" https://www.bbc.co.uk/news/topics/cg41ylwvxmdt?page={page_number}"
    
    # request webpage
    res = requests.get(bbc, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("div", class_="ssrcss-1ns4t85-PromoSwitchLayoutAtBreakpoints et5qctl0")

    for article in articles:
        # check if the article date contains the word "Video" or "Audio"
        article_date = article.find("span", class_="visually-hidden ssrcss-1f39n02-VisuallyHidden e16en2lz0").get_text().strip()
        if "Video" in article_date or "Audio" in article_date:
            continue  # Skip this article and move to the next one
        
        # append article link
        article_info = article.find("a", class_="ssrcss-1mrs5ns-PromoLink exn3ah91")
        link = "https://www.bbc.co.uk" + article_info.attrs["href"]
        all_news_links.append(link)

        # append headline
        headline = article.find("p", class_="ssrcss-15dlehh-PromoHeadline exn3ah96").get_text().strip()
        all_headlines.append(headline)

        # append date
        all_dates.append(article_date)

    time.sleep(random.randint(1, 3))
    

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"bbc-uk-refugees.csv", encoding="utf-8", header="true", index=False)


In [None]:
# reload csv
bbc_refugees = pd.read_csv("bbc-uk-refugees.csv")

df = get_body(bbc_refugees)

# rewrite to CSV
df.to_csv(r"bbc-uk-refugees.csv", encoding="utf-8", header=True, index=False)


## Merging all the dataframes

In [11]:
# load csvs
bbc_refugees = pd.read_csv("bbc-uk-refugees.csv")
bbc_visas = pd.read_csv("bbc-uk-visas.csv")
bbc_migration = pd.read_csv("bbc-uk-migration.csv")
bbc_immigration = pd.read_csv("bbc-uk-immigration.csv")

# combine all dataframes
bbc = pd.concat([bbc_refugees, bbc_visas, bbc_migration, bbc_immigration])

# write to csv
bbc.to_csv(r"all-bbc.csv", encoding="utf-8", header=True, index=False)


## Cleaning 

In [12]:
import pandas as pd
folder_path = f"/Volumes/Untitled/news/"
bbc = pd.read_csv(folder_path + "all-bbc.csv")
# remove rows in 'date' not from 2023
bbc = bbc[bbc['date'].str.contains("23")]
# convert date to datetime
bbc["date"] = pd.to_datetime(bbc["date"])
# remove duplicates in links
bbc = bbc.drop_duplicates(subset="link")
# remove rows with empty body
bbc = bbc[bbc["body"].notna()]
# remove column link
bbc = bbc.drop(columns="link")
# add column for source
bbc["news"] = "bbc"
# write to csv
bbc.to_csv(folder_path + "all-bbc.csv", encoding="utf-8", header=True, index=False)

  bbc["date"] = pd.to_datetime(bbc["date"])


# The Guardian

## Section: politics; Tag: uk/immigration

In [None]:
API_KEY = "XXX"
page = 1
articles_per_page = 50  # maximum number of articles per page
articles = []

while True:
    url = f"https://content.guardianapis.com/politics?tag=uk/immigration&api-key={API_KEY}&from-date=2023-01-01&to-date=2023-12-31&type=article&page={page}"
    
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        current_page_articles = data.get('response', {}).get('results', [])
        
        if not current_page_articles:  # no more articles available
            break
        
        articles.extend(current_page_articles)
        page += 1
    else:
        print(f"Error: {response.status_code} - {response.text}")
        break

with open('guardian-politics-uk-immigration.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['headline', 'link', 'date', 'body']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for article in articles:
        article_url = article['webUrl']
        article_response = requests.get(article_url)
        
        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            div_element = soup.find('div', class_='article-body-commercial-selector article-body-viewer-selector dcr-fp1ya')
            if div_element:
                paragraphs = div_element.find_all('p')
                article_text = ' '.join(p.get_text() for p in paragraphs)
            else:
                article_text = "No content found in the article."
            
            writer.writerow({
                'headline': article['webTitle'],
                'link': article_url,
                'date': article['webPublicationDate'],
                'body': article_text
            })
        else:
            print(f"Error retrieving article from {article_url}: {article_response.status_code}")



## Section: commentisfree; Tag: uk/immigration

In [None]:
API_KEY = "XXX"
page = 1
articles_per_page = 50  # maximum number of articles per page
articles = []

while True:
    url = f"https://content.guardianapis.com/commentisfree?tag=uk/immigration&api-key={API_KEY}&from-date=2023-01-01&to-date=2023-12-31&type=article&page={page}"
    
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        current_page_articles = data.get('response', {}).get('results', [])
        
        if not current_page_articles:  # no more articles available
            break
        
        articles.extend(current_page_articles)
        page += 1
    else:
        print(f"Error: {response.status_code} - {response.text}")
        break

with open('guardian-commentisfree-uk-immigration.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['headline', 'link', 'date', 'body']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for article in articles:
        article_url = article['webUrl']
        article_response = requests.get(article_url)
        
        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            div_element = soup.find('div', class_='article-body-commercial-selector article-body-viewer-selector dcr-fp1ya')
            if div_element:
                paragraphs = div_element.find_all('p')
                article_text = ' '.join(p.get_text() for p in paragraphs)
            else:
                article_text = "No content found in the article."
            
            writer.writerow({
                'headline': article['webTitle'],
                'link': article_url,
                'date': article['webPublicationDate'],
                'body': article_text
            })
        else:
            print(f"Error retrieving article from {article_url}: {article_response.status_code}")


## Section: uk-news; Tag: global-development/migration



In [None]:
API_KEY = "XXX"
page = 1
articles_per_page = 50  # maximum number of articles per page
articles = []

while True:
    url = f"https://content.guardianapis.com/uk-news?tag=global-development/migration&api-key={API_KEY}&from-date=2023-01-01&to-date=2023-12-31&type=article&page={page}"
    
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        current_page_articles = data.get('response', {}).get('results', [])
        
        if not current_page_articles:  # no more articles available
            break
        
        articles.extend(current_page_articles)
        page += 1
    else:
        print(f"Error: {response.status_code} - {response.text}")
        break

with open('guardian-uk_news-global_development-migration.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['headline', 'link', 'date', 'body']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for article in articles:
        article_url = article['webUrl']
        article_response = requests.get(article_url)
        
        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            div_element = soup.find('div', class_='article-body-commercial-selector article-body-viewer-selector dcr-fp1ya')
            if div_element:
                paragraphs = div_element.find_all('p')
                article_text = ' '.join(p.get_text() for p in paragraphs)
            else:
                article_text = "No content found in the article."
            
            writer.writerow({
                'headline': article['webTitle'],
                'link': article_url,
                'date': article['webPublicationDate'],
                'body': article_text
            })
        else:
            print(f"Error retrieving article from {article_url}: {article_response.status_code}")


Error: 400 - {"response":{"status":"error","message":"requested page is beyond the number of available pages"}}


## Section: uk-news; Tag: world/refugees

In [None]:
API_KEY = "XXX"
page = 1
articles_per_page = 50  # maximum number of articles per page
articles = []

while True:
    url = f"https://content.guardianapis.com/uk-news?tag=world/refugees&api-key={API_KEY}&from-date=2023-01-01&to-date=2023-12-31&type=article&page={page}"
    
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        current_page_articles = data.get('response', {}).get('results', [])
        
        if not current_page_articles:  # no more articles available
            break
        
        articles.extend(current_page_articles)
        page += 1
    else:
        print(f"Error: {response.status_code} - {response.text}")
        break

with open('guardian-uk_news-world-refugees.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['headline', 'link', 'date', 'body']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for article in articles:
        article_url = article['webUrl']
        article_response = requests.get(article_url)
        
        if article_response.status_code == 200:
            soup = BeautifulSoup(article_response.content, 'html.parser')
            div_element = soup.find('div', class_='article-body-commercial-selector article-body-viewer-selector dcr-fp1ya')
            if div_element:
                paragraphs = div_element.find_all('p')
                article_text = ' '.join(p.get_text() for p in paragraphs)
            else:
                article_text = "No content found in the article."
            
            writer.writerow({
                'headline': article['webTitle'],
                'link': article_url,
                'date': article['webPublicationDate'],
                'body': article_text
            })
        else:
            print(f"Error retrieving article from {article_url}: {article_response.status_code}")


Error: 400 - {"response":{"status":"error","message":"requested page is beyond the number of available pages"}}


## Merging all the dataframes

In [None]:
# load csvs
g1 = pd.read_csv("guardian-politics-uk-immigration.csv")
g2 = pd.read_csv("guardian-commentisfree-uk-immigration.csv")
g3 = pd.read_csv("guardian-uk_news-global_development-migration.csv")
g4 = pd.read_csv("guardian-uk_news-world-refugees.csv")

# combine all dataframes
guardian = pd.concat([g1, g2, g3, g4])

# write to csv
guardian.to_csv(r"all-guardian.csv", encoding="utf-8", header=True, index=False)


## Cleaning

In [17]:
import pandas as pd
guardian = pd.read_csv(folder_path + "all-guardian.csv")
# convert date to datetime
guardian["date"] = pd.to_datetime(guardian["date"])
# remove dates not from 2023
guardian = guardian[guardian['date'].dt.year == 2023]
# remove duplicates in links
guardian = guardian.drop_duplicates(subset="link")
# remove rows with empty body
guardian = guardian[guardian["body"].notna()]
# remove column link
guardian = guardian.drop(columns="link")
# add column for source
guardian["news"] = "guardian"
# write to csv
guardian.to_csv(folder_path + "all-guardian.csv", encoding="utf-8", header=True, index=False)


# Mirror

## Immigration

In [None]:
# initialise index, this tracks the page number we are on
index = 1

# create lists to store our data
all_news_links = []

for page_number in range(1, 5):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/immigration"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/immigration?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("article")

    # print out the href attribute of the <a> tag within each article that contains data-link-tracking
    for article in articles:
        # find the <a> tag within the article that contains data-link-tracking
        a_tag = article.find('a', {"data-link-tracking": True})
    
        # extract the href attribute if the <a> tag is found
        if a_tag:
            link = a_tag.get('href')

        all_news_links.append(link)

    time.sleep(random.randint(1, 3))
    

# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-immigration.csv", encoding="utf-8", header="true", index=False)


We are scraping page: 1
We are scraping page: 2
We are scraping page: 3
We are scraping page: 4


In [None]:
# reload csv
mirror_immigration = pd.read_csv("mirror-immigration.csv")

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# function to get body of article
# function to get body of article
def get_info(df, retries=3):
    all_headlines = []
    all_bodies = []
    all_dates = []
    for link in df['link']:
        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers)
                res.raise_for_status()  # raise an HTTPError for bad responses
                soup = bs(res.content, 'lxml')

                # extract the headline
                headline_tag = soup.find("h1", class_="lead-content__title")
                if headline_tag:
                    headline = headline_tag.get_text().strip()
                    all_headlines.append(headline)
                else:
                    all_headlines.append("None")

                # extract the date
                time_tag = soup.find("time", class_="date-published")
                if time_tag:
                    date = time_tag.get('datetime')
                    all_dates.append(date)
                else:
                    all_dates.append("None")

                # extract the article body
                div_element = soup.find('div', class_='article-body')
                if div_element:
                    paragraphs = div_element.find_all('p')
                    article_text = ' '.join(p.get_text() for p in paragraphs)
                    all_bodies.append(article_text)
                else:
                    all_bodies.append("None")

                break  # break the retry loop on success
            
            except (requests.exceptions.RequestException, requests.exceptions.SSLError) as e:
                print(f"Error fetching {link}: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"Failed to fetch {link} after {retries} attempts.")
                    all_headlines.append("None")
                    all_dates.append("None")
                    all_bodies.append("None")

    # add as new column to df
    df['headline'] = all_headlines
    df['date'] = all_dates
    df['body'] = all_bodies

    return df

df = get_info(mirror_immigration)

# rewrite to CSV
df.to_csv(r"mirror-immigration.csv", encoding="utf-8", header=True, index=False)




## Migration

In [None]:
# initialise index, this tracks the page number we are on
index = 1

# create lists to store our data
all_news_links = []

for page_number in range(1, 3):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/migration"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/migration?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("article")

    # print out the href attribute of the <a> tag within each article that contains data-link-tracking
    for article in articles:
        # find the <a> tag within the article that contains data-link-tracking
        a_tag = article.find('a', {"data-link-tracking": True})
    
        # extract the href attribute if the <a> tag is found
        if a_tag:
            link = a_tag.get('href')

        all_news_links.append(link)

    time.sleep(random.randint(1, 3))
    

# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-migration.csv", encoding="utf-8", header="true", index=False)

We are scraping page: 1
We are scraping page: 2


In [None]:
# reload csv
mirror_migration = pd.read_csv("mirror-migration.csv")

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

df = get_info(mirror_migration)

# rewrite to CSV
df.to_csv(r"mirror-migration.csv", encoding="utf-8", header=True, index=False)

## Illegal Immigrants

In [None]:
# initialise index, this tracks the page number we are on
index = 1

# create lists to store our data
all_news_links = []

for page_number in range(1, 4):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/illegal-immigrants"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/illegal-immigrants?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("article")

    # print out the href attribute of the <a> tag within each article that contains data-link-tracking
    for article in articles:
        # find the <a> tag within the article that contains data-link-tracking
        a_tag = article.find('a', {"data-link-tracking": True})
    
        # extract the href attribute if the <a> tag is found
        if a_tag:
            link = a_tag.get('href')

        all_news_links.append(link)

    time.sleep(random.randint(1, 3))
    

# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-illegal-immigrants.csv", encoding="utf-8", header="true", index=False)

We are scraping page: 1
We are scraping page: 2
We are scraping page: 3


In [None]:
# reload csv
mirror_illegals = pd.read_csv("mirror-illegal-immigrants.csv")

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

df = get_info(mirror_illegals)

# rewrite to CSV
df.to_csv(r"mirror-illegal-immigrants.csv", encoding="utf-8", header=True, index=False)

## Migrant Crisis

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import random
import time

# initialise index, this tracks the page number we are on
index = 1

# create lists to store our data
all_news_links = []

for page_number in range(1, 6):
    print(f"We are scraping page: {page_number}")
    # define our user headers
    headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/migrant-crisis"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/migrant-crisis?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)
    
    # check status for debugging
    res.raise_for_status()
    
    soup = bs(res.text, "html.parser")

    # this gets list of articles
    articles = soup.find_all("article")

    # print out the href attribute of the <a> tag within each article that contains data-link-tracking
    for article in articles:
        # find the <a> tag within the article that contains data-link-tracking
        a_tag = article.find('a', {"data-link-tracking": True})
    
        # extract the href attribute if the <a> tag is found
        if a_tag:
            link = a_tag.get('href')

        all_news_links.append(link)

    time.sleep(random.randint(1, 3))
    

# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-migrant-crisis.csv", encoding="utf-8", header="true", index=False)

We are scraping page: 1
We are scraping page: 2
We are scraping page: 3
We are scraping page: 4
We are scraping page: 5


In [None]:
# reload csv
mirror_crisis = pd.read_csv("mirror-migrant-crisis.csv")

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

df = get_info(mirror_crisis)

# rewrite to CSV
df.to_csv(r"mirror-migrant-crisis.csv", encoding="utf-8", header=True, index=False)

## Home Office

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import random
import time

# Collect article links from the Mirror "home-office" topic listing pages.

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(7, 21):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no pageNumber query parameter
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/home-office"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/home-office?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # each teaser on the listing page is an <article> element
    articles = soup.find_all("article")

    for article in articles:
        # the teaser link is the <a> tag that carries a data-link-tracking attribute
        a_tag = article.find('a', {"data-link-tracking": True})
        link = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has one. Appending
        # unconditionally would repeat the previous teaser's link (or raise
        # NameError if the very first teaser has no tracked anchor).
        if link:
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-home-office.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 7
We are scraping page: 8
We are scraping page: 9
We are scraping page: 10
We are scraping page: 11
We are scraping page: 12
We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20


In [None]:
# Second pass over the Mirror "home-office" links: hand the saved link list
# to get_info and overwrite the CSV with whatever it returns.

# request headers are (re)defined here before get_info is called
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# links written by the scraping cell
mirror_home_office = pd.read_csv("mirror-home-office.csv")

df = get_info(mirror_home_office)

# replace the link-only file with the enriched frame
df.to_csv(r"mirror-home-office.csv", index=False, header=True, encoding="utf-8")

## Border Force

In [None]:
# Collect article links from the Mirror "border-force" topic listing pages.

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 4):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no pageNumber query parameter
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/border-force"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/border-force?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # each teaser on the listing page is an <article> element
    articles = soup.find_all("article")

    for article in articles:
        # the teaser link is the <a> tag that carries a data-link-tracking attribute
        a_tag = article.find('a', {"data-link-tracking": True})
        link = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has one. Appending
        # unconditionally would repeat the previous teaser's link (or raise
        # NameError if the very first teaser has no tracked anchor).
        if link:
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-border-force.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 1
We are scraping page: 2
We are scraping page: 3


In [None]:
# Second pass over the Mirror "border-force" links: hand the saved link list
# to get_info and overwrite the CSV with whatever it returns.

# reload csv
mirror_border_force = pd.read_csv("mirror-border-force.csv")

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Enrich the border-force links loaded above. Passing the home-office frame
# here would overwrite mirror-border-force.csv with home-office articles.
df = get_info(mirror_border_force)

# rewrite to CSV
df.to_csv(r"mirror-border-force.csv", encoding="utf-8", header=True, index=False)

## Refugee Crisis

In [None]:
# Collect article links from the Mirror "refugee-crisis" topic listing pages.

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 3):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no pageNumber query parameter
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/refugee-crisis"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/refugee-crisis?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # each teaser on the listing page is an <article> element
    articles = soup.find_all("article")

    for article in articles:
        # the teaser link is the <a> tag that carries a data-link-tracking attribute
        a_tag = article.find('a', {"data-link-tracking": True})
        link = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has one. Appending
        # unconditionally would repeat the previous teaser's link (or raise
        # NameError if the very first teaser has no tracked anchor).
        if link:
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-refugee-crisis.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 1
We are scraping page: 2


In [None]:
# Second pass over the Mirror "refugee-crisis" links: hand the saved link list
# to get_info and overwrite the CSV with whatever it returns.

# request headers are (re)defined here before get_info is called
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# links written by the scraping cell
mirror_refugee_crisis = pd.read_csv("mirror-refugee-crisis.csv")

df = get_info(mirror_refugee_crisis)

# replace the link-only file with the enriched frame
df.to_csv(r"mirror-refugee-crisis.csv", index=False, header=True, encoding="utf-8")

## Asylum-seekers

In [None]:
# Collect article links from the Mirror "asylum-seekers" topic listing pages.

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 4):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no pageNumber query parameter
    if page_number == 1:
        mirror = "https://www.mirror.co.uk/all-about/asylum-seekers"
    else:
        mirror = f"https://www.mirror.co.uk/all-about/asylum-seekers?pageNumber={page_number}"

    # request webpage
    res = requests.get(mirror, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # each teaser on the listing page is an <article> element
    articles = soup.find_all("article")

    for article in articles:
        # the teaser link is the <a> tag that carries a data-link-tracking attribute
        a_tag = article.find('a', {"data-link-tracking": True})
        link = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has one. Appending
        # unconditionally would repeat the previous teaser's link (or raise
        # NameError if the very first teaser has no tracked anchor).
        if link:
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"mirror-asylum-seekers.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 1
We are scraping page: 2
We are scraping page: 3


In [None]:
# Second pass over the Mirror "asylum-seekers" links: hand the saved link list
# to get_info and overwrite the CSV with whatever it returns.

# request headers are (re)defined here before get_info is called
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# links written by the scraping cell
mirror_asylum = pd.read_csv("mirror-asylum-seekers.csv")

df = get_info(mirror_asylum)

# replace the link-only file with the enriched frame
df.to_csv(r"mirror-asylum-seekers.csv", index=False, header=True, encoding="utf-8")

## Merging all the dataframes

In [None]:
import pandas as pd
# Read the eight per-topic Mirror CSVs, in the same order as before.
m1, m2, m3, m4, m5, m6, m7, m8 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "mirror-asylum-seekers.csv",
        "mirror-refugee-crisis.csv",
        "mirror-border-force.csv",
        "mirror-home-office.csv",
        "mirror-migrant-crisis.csv",
        "mirror-illegal-immigrants.csv",
        "mirror-migration.csv",
        "mirror-immigration.csv",
    )
)

# Stack them row-wise into one frame; the row index is not written out below.
mirror = pd.concat([m1, m2, m3, m4, m5, m6, m7, m8])

# Save the combined Mirror data set.
mirror.to_csv(r"all-mirror.csv", index=False, header=True, encoding="utf-8")


## Cleaning

In [21]:
# Clean the combined Mirror data set and write it back to the same file.
# NOTE(review): folder_path is not defined in this cell - confirm it is set
# earlier in the notebook and points at the directory holding all-mirror.csv.
mirror = (
    pd.read_csv(folder_path + "all-mirror.csv")
    # parse the date column so the year can be inspected
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    # keep 2023 articles only
    .loc[lambda frame: frame["date"].dt.year == 2023]
    # one row per article link
    .drop_duplicates(subset="link")
    # drop articles whose link contains "us-news"
    .loc[lambda frame: ~frame["link"].str.contains("us-news")]
    # drop articles with no body text
    .loc[lambda frame: frame["body"].notna()]
    # the link is no longer needed once duplicates are removed
    .drop(columns="link")
    # tag every row with its source
    .assign(news="mirror")
)
# write to csv
mirror.to_csv(folder_path + "all-mirror.csv", index=False, header=True, encoding="utf-8")

# Express

## Immigration

In [None]:
# Collect article links from the Express "immigration" topic listing pages.
from urllib.parse import urljoin

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(13, 45):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no pageNumber query parameter
    if page_number == 1:
        express = "https://www.express.co.uk/latest/immigration"
    else:
        express = f"https://www.express.co.uk/latest/immigration?pageNumber={page_number}"

    # request webpage
    res = requests.get(express, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # each teaser on the listing page is an <li class="post"> element
    articles = soup.find_all("li", class_="post")

    for article in articles:
        # the first <a> tag inside the teaser holds the article link
        a_tag = article.find('a')
        href_tag = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has an href. Appending
        # unconditionally would repeat the previous teaser's link, and
        # concatenating a missing href raises TypeError.
        if href_tag:
            # urljoin gives the same result as prefixing the site root for
            # "/path" hrefs and leaves already-absolute hrefs untouched
            link = urljoin("https://www.express.co.uk", href_tag)
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"express-immigration.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20
We are scraping page: 21
We are scraping page: 22
We are scraping page: 23
We are scraping page: 24
We are scraping page: 25
We are scraping page: 26
We are scraping page: 27
We are scraping page: 28
We are scraping page: 29
We are scraping page: 30
We are scraping page: 31
We are scraping page: 32
We are scraping page: 33
We are scraping page: 34
We are scraping page: 35
We are scraping page: 36
We are scraping page: 37
We are scraping page: 38
We are scraping page: 39
We are scraping page: 40
We are scraping page: 41
We are scraping page: 42
We are scraping page: 43
We are scraping page: 44


In [None]:
# Request headers read by get_info (defined below) when it fetches each article.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Links written by the Express immigration scraping cell.
express_immigration = pd.read_csv("express-immigration.csv")

# Function to get headline, date and body of each article
def get_info(df, retries=5):
    """Fetch every article in ``df['link']`` and attach its headline, date and body.

    Each link is requested with the module-level ``headers`` dict. A request
    that raises ``requests.exceptions.RequestException`` is retried with
    exponential backoff (1, 2, 4, ... seconds) for up to ``retries`` attempts.
    Any field that cannot be found on the page, and every field of a link that
    could not be fetched, is stored as the string "None".

    Args:
        df: DataFrame with a ``link`` column of article URLs.
        retries: Maximum number of fetch attempts per link.

    Returns:
        The same DataFrame object with ``headline``, ``date`` and ``body``
        columns added (the input frame is modified in place).
    """
    all_headlines = []
    all_dates = []
    all_bodies = []
    for link in df['link']:
        # Start from placeholders so exactly one value per column is recorded
        # for every link, whatever happens below.
        headline = published_date = article_text = "None"

        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers)
                res.raise_for_status()  # raise an HTTPError for bad responses
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {link}: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"Failed to fetch {link} after {retries} attempts.")
                continue

            # The notebook imports BeautifulSoup under the alias `bs`; the
            # bare name `BeautifulSoup` is not bound by that import.
            soup = bs(res.content, "html.parser")

            # Headline: the <h1> inside <header class="clearfix">
            header_info = soup.find("header", class_="clearfix")
            h1_tag = header_info.find("h1") if header_info is not None else None
            if h1_tag is not None:
                headline = h1_tag.get_text().strip()

            # Date: the datetime attribute of the first <time> in <div class="dates">
            dates_info = soup.find("div", class_="dates")
            published_time = dates_info.find("time") if dates_info is not None else None
            if published_time is not None and published_time.get("datetime"):
                published_date = published_time.get("datetime")

            # Body: all <p> texts in <div class="text-description">, newline-joined
            article_body = soup.find("div", class_="text-description")
            if article_body is not None:
                paragraphs = article_body.find_all("p")
                article_text = "\n".join(p.get_text() for p in paragraphs)

            # Break the retry loop on success
            break

        all_headlines.append(headline)
        all_dates.append(published_date)
        all_bodies.append(article_text)

    # Add as new columns to df
    df['headline'] = all_headlines
    df['date'] = all_dates
    df['body'] = all_bodies

    return df

# Enrich the Express immigration links with what get_info extracts per article.
df = get_info(express_immigration)

# Overwrite the link-only CSV with the enriched frame.
df.to_csv("express-immigration.csv", index=False, header=True, encoding="utf-8")


## Migrant Crisis

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import random
import time

# Collect article links from the Express "migrant-crisis" topic listing pages.
from urllib.parse import urljoin

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(12, 59):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no pageNumber query parameter
    if page_number == 1:
        express = "https://www.express.co.uk/latest/migrant-crisis"
    else:
        express = f"https://www.express.co.uk/latest/migrant-crisis?pageNumber={page_number}"

    # request webpage
    res = requests.get(express, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # each teaser on the listing page is an <li class="post"> element
    articles = soup.find_all("li", class_="post")

    for article in articles:
        # the first <a> tag inside the teaser holds the article link
        a_tag = article.find('a')
        href_tag = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has an href. Appending
        # unconditionally would repeat the previous teaser's link, and
        # concatenating a missing href raises TypeError.
        if href_tag:
            # urljoin gives the same result as prefixing the site root for
            # "/path" hrefs and leaves already-absolute hrefs untouched
            link = urljoin("https://www.express.co.uk", href_tag)
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"express-migrant-crisis.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 12
We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20
We are scraping page: 21
We are scraping page: 22
We are scraping page: 23
We are scraping page: 24
We are scraping page: 25
We are scraping page: 26
We are scraping page: 27
We are scraping page: 28
We are scraping page: 29
We are scraping page: 30
We are scraping page: 31
We are scraping page: 32
We are scraping page: 33
We are scraping page: 34
We are scraping page: 35
We are scraping page: 36
We are scraping page: 37
We are scraping page: 38
We are scraping page: 39
We are scraping page: 40
We are scraping page: 41
We are scraping page: 42
We are scraping page: 43
We are scraping page: 44
We are scraping page: 45
We are scraping page: 46
We are scraping page: 47
We are scraping page: 48
We are scraping page: 49
We are scraping page: 50
We are scraping page: 51


In [None]:
# Second pass over the Express "migrant-crisis" links: hand the saved link list
# to get_info and overwrite the CSV with whatever it returns.

# request headers are (re)defined here before get_info is called
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# links written by the scraping cell
express_migrant_crisis = pd.read_csv("express-migrant-crisis.csv")

df = get_info(express_migrant_crisis)

# replace the link-only file with the enriched frame
df.to_csv("express-migrant-crisis.csv", index=False, header=True, encoding="utf-8")


## Merging all the dataframes

In [None]:
# Read the two per-topic Express CSVs, in the same order as before.
e1, e2 = (
    pd.read_csv(csv_name)
    for csv_name in ("express-immigration.csv", "express-migrant-crisis.csv")
)

# Stack them row-wise into one frame; the row index is not written out below.
express = pd.concat([e1, e2])

# Save the combined Express data set.
express.to_csv(r"all-express.csv", index=False, header=True, encoding="utf-8")


## Cleaning

In [29]:
# Clean the combined Express data set and write it back to the same file.
# NOTE(review): folder_path is not defined in this cell - confirm it is set
# earlier in the notebook and points at the directory holding all-express.csv.
express = (
    pd.read_csv(folder_path + "all-express.csv")
    # parse the date column so the year can be inspected
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    # keep 2023 articles only
    .loc[lambda frame: frame["date"].dt.year == 2023]
    # one row per article link
    .drop_duplicates(subset="link")
    # drop articles whose link contains "/us/"
    .loc[lambda frame: ~frame["link"].str.contains("/us/")]
    # drop articles with no body text
    .loc[lambda frame: frame["body"].notna()]
    # the link is no longer needed once duplicates are removed
    .drop(columns="link")
    # tag every row with its source
    .assign(news="express")
)
# write to csv
express.to_csv(folder_path + "all-express.csv", index=False, header=True, encoding="utf-8")

# Daily Mail

## Immigration

In [19]:
# Collect headline, link and date for every teaser on the Daily Mail
# "immigration" topic listing pages.

# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 4):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no page query parameter
    if page_number == 1:
        daily_mail = "https://www.dailymail.co.uk/news/immigration/index.html"
    else:
        daily_mail = f"https://www.dailymail.co.uk/news/immigration/index.html?page={page_number}"

    # request webpage
    res = requests.get(daily_mail, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # The lead story and the list teasers are read the same way; only the CSS
    # class of the date element differs, so pair each teaser with its class.
    main_article = soup.find("div", class_="mainArticle--1PqPA article")
    articles = soup.find_all("li", class_="article--n-F20")
    teasers = [(article, "firstPubDate--i6kS2") for article in articles]
    if main_article is not None:
        teasers.insert(0, (main_article, "firstPubDate--28C6S"))
    # A page without a lead story simply contributes no lead row: a "None"
    # placeholder row would only be fetched (and retried) as a bogus URL later.

    for article, date_class in teasers:
        headline_tag = article.find("h2", class_="linkro-darkred")
        link_tag = article.find("a")
        date_tag = article.find("div", class_=date_class)

        # skip teasers missing any field so the three lists stay aligned
        if headline_tag is None or link_tag is None or date_tag is None:
            continue
        link = link_tag.get("href")
        if not link:
            continue

        # append headline
        headline = headline_tag.get_text().strip()
        all_headlines.append(headline)

        # append article link
        all_news_links.append(link)

        # append date
        date = date_tag.get_text().strip()
        all_dates.append(date)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"daily-mail-immigration.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 1
We are scraping page: 2
We are scraping page: 3


In [20]:
# function to get body of article
def get_body(df, retries=3):
    """Download each article in ``df['link']`` and store its text in ``df['body']``.

    Requests use the module-level ``headers``. A failed request is retried
    with exponential backoff (1, 2, 4, ... seconds) for up to ``retries``
    attempts. The body is the newline-joined text of every <p> inside the
    <div itemprop="articleBody"> element; a page without that element yields
    the string "None", and a link that never loads yields "".

    Returns the same DataFrame object with the new ``body`` column.
    """
    all_bodies = []
    final_attempt = retries - 1

    for link in df['link']:
        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers)
                res.raise_for_status()  # HTTP error statuses count as failures
                soup = bs(res.content, 'lxml')
                article_body = soup.find("div", itemprop="articleBody")
            # SSLError is a subclass of RequestException, so one clause covers both
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {link}: {e}")
                if attempt == final_attempt:
                    print(f"Failed to fetch {link} after {retries} attempts.")
                    all_bodies.append("")  # empty body when every attempt failed
                else:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
            else:
                if article_body:
                    body_text = "\n".join(
                        paragraph.get_text() for paragraph in article_body.find_all("p")
                    )
                else:
                    body_text = "None"
                all_bodies.append(body_text)
                break  # fetched and parsed; no further attempts needed

    # attach the collected bodies as a new column
    df['body'] = all_bodies

    return df

In [21]:
# Load the Daily Mail immigration listing saved by the scraping cell.
daily_mail_immigration = pd.read_csv("daily-mail-immigration.csv")
# Fetch each article and add its text as a `body` column.
df = get_body(daily_mail_immigration)
# Overwrite the CSV with the enriched frame.
df.to_csv(r"daily-mail-immigration.csv", index=False, header=True, encoding="utf-8")

## English Channel

In [None]:
# Collect headline, link and date for every teaser on the Daily Mail
# "english-channel" topic listing pages.

# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(2, 5):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no page query parameter
    if page_number == 1:
        daily_mail = "https://www.dailymail.co.uk/news/english-channel/index.html"
    else:
        daily_mail = f"https://www.dailymail.co.uk/news/english-channel/index.html?page={page_number}"

    # request webpage
    res = requests.get(daily_mail, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # The lead story and the list teasers are read the same way; only the CSS
    # class of the date element differs, so pair each teaser with its class.
    main_article = soup.find("div", class_="mainArticle--1PqPA article")
    articles = soup.find_all("li", class_="article--n-F20")
    teasers = [(article, "firstPubDate--i6kS2") for article in articles]
    if main_article is not None:
        teasers.insert(0, (main_article, "firstPubDate--28C6S"))
    # A page without a lead story simply contributes no lead row: a "None"
    # placeholder row would only be fetched (and retried) as a bogus URL later.

    for article, date_class in teasers:
        headline_tag = article.find("h2", class_="linkro-darkred")
        link_tag = article.find("a")
        date_tag = article.find("div", class_=date_class)

        # skip teasers missing any field so the three lists stay aligned
        if headline_tag is None or link_tag is None or date_tag is None:
            continue
        link = link_tag.get("href")
        if not link:
            continue

        # append headline
        headline = headline_tag.get_text().strip()
        all_headlines.append(headline)

        # append article link
        all_news_links.append(link)

        # append date
        date = date_tag.get_text().strip()
        all_dates.append(date)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"daily-mail-channel.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 2
We are scraping page: 3
We are scraping page: 4


In [None]:
# Load the Daily Mail English Channel listing saved by the scraping cell.
daily_mail_channel = pd.read_csv("daily-mail-channel.csv")
# Fetch each article and add its text as a `body` column.
df = get_body(daily_mail_channel)
# Overwrite the CSV with the enriched frame.
df.to_csv(r"daily-mail-channel.csv", index=False, header=True, encoding="utf-8")

## Home Office

In [None]:
# Collect headline, link and date for every teaser on the Daily Mail
# "the-home-office" topic listing pages.

# create lists to store our data
all_news_links = []
all_headlines = []
all_dates = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(7, 16):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no page query parameter
    if page_number == 1:
        daily_mail = "https://www.dailymail.co.uk/news/the-home-office/index.html"
    else:
        daily_mail = f"https://www.dailymail.co.uk/news/the-home-office/index.html?page={page_number}"

    # request webpage
    res = requests.get(daily_mail, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # The lead story and the list teasers are read the same way; only the CSS
    # class of the date element differs, so pair each teaser with its class.
    main_article = soup.find("div", class_="mainArticle--1PqPA article")
    articles = soup.find_all("li", class_="article--n-F20")
    teasers = [(article, "firstPubDate--i6kS2") for article in articles]
    if main_article is not None:
        teasers.insert(0, (main_article, "firstPubDate--28C6S"))
    # A page without a lead story simply contributes no lead row: a "None"
    # placeholder row would only be fetched (and retried) as a bogus URL later.

    for article, date_class in teasers:
        headline_tag = article.find("h2", class_="linkro-darkred")
        link_tag = article.find("a")
        date_tag = article.find("div", class_=date_class)

        # skip teasers missing any field so the three lists stay aligned
        if headline_tag is None or link_tag is None or date_tag is None:
            continue
        link = link_tag.get("href")
        if not link:
            continue

        # append headline
        headline = headline_tag.get_text().strip()
        all_headlines.append(headline)

        # append article link
        all_news_links.append(link)

        # append date
        date = date_tag.get_text().strip()
        all_dates.append(date)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv(r"daily-mail-home-office.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 7
We are scraping page: 8
We are scraping page: 9
We are scraping page: 10
We are scraping page: 11
We are scraping page: 12
We are scraping page: 13
We are scraping page: 14
We are scraping page: 15


In [None]:
# Load the Daily Mail Home Office listing saved by the scraping cell.
daily_mail_home = pd.read_csv("daily-mail-home-office.csv")
# Fetch each article and add its text as a `body` column.
df = get_body(daily_mail_home)
# Overwrite the CSV with the enriched frame.
df.to_csv(r"daily-mail-home-office.csv", index=False, header=True, encoding="utf-8")

## Merging all the dataframes

In [39]:
# Read the three per-topic Daily Mail CSVs, in the same order as before.
d1, d2, d3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "daily-mail-immigration.csv",
        "daily-mail-channel.csv",
        "daily-mail-home-office.csv",
    )
)

# Stack them row-wise into one frame; the row index is not written out below.
daily_mail = pd.concat([d1, d2, d3])

# Save the combined Daily Mail data set.
daily_mail.to_csv(r"all-daily-mail.csv", index=False, header=True, encoding="utf-8")

## Cleaning

In [40]:
# Clean the combined Daily Mail data set and write it back to the same file.
# NOTE(review): folder_path is not defined in this cell - confirm it is set
# earlier in the notebook and points at the directory holding all-daily-mail.csv.

# load csv
daily_mail = pd.read_csv(folder_path + "all-daily-mail.csv")
# convert date to datetime
daily_mail["date"] = pd.to_datetime(daily_mail["date"])
# remove dates not from 2023
daily_mail = daily_mail[daily_mail['date'].dt.year == 2023]
# remove duplicates in links
daily_mail = daily_mail.drop_duplicates(subset="link")
# remove rows with empty body BEFORE the keyword filter: a missing body makes
# str.contains return NaN for that row, and negating a mask that holds NaN
# raises TypeError
daily_mail = daily_mail[daily_mail["body"].notna()]
# remove rows whose body mentions any of these non-UK keywords; na=False keeps
# the mask strictly boolean
daily_mail = daily_mail[~daily_mail["body"].str.contains("Biden|America|USA|Trump|Mexico|Australia", na=False)]
# remove column link
daily_mail = daily_mail.drop(columns="link")
# add column for source
daily_mail["news"] = "daily-mail"
# write to csv
daily_mail.to_csv(folder_path + "all-daily-mail.csv", encoding="utf-8", header=True, index=False)

  daily_mail["date"] = pd.to_datetime(daily_mail["date"])


# The Sun

## Immigration

In [1]:
# Collect article links from The Sun "uk-immigration-crisis" topic listing pages.
from urllib.parse import urljoin

# create lists to store our data
all_news_links = []

# The browser-like User-Agent is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(13, 45):
    print(f"We are scraping page: {page_number}")

    sun = f"https://www.thesun.co.uk/topic/uk-immigration-crisis/page/{page_number}/"

    # request webpage
    res = requests.get(sun, headers=headers)

    # stop on an HTTP error instead of parsing an error page
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # teasers come in two sizes; collect the small ones first, then the medium ones
    articles = soup.find_all("div", class_="teaser-item teaser__small teaser theme-news") + soup.find_all("div", class_="teaser-item teaser__medium teaser theme-news")

    for article in articles:
        # the first <a> tag inside the teaser holds the article link
        a_tag = article.find('a')
        href_tag = a_tag.get('href') if a_tag is not None else None

        # Record a link only when this teaser actually has an href. Appending
        # unconditionally would repeat the previous teaser's link, and
        # concatenating a missing href raises TypeError.
        if href_tag:
            # urljoin gives the same result as prefixing the site root for
            # "/path" hrefs and leaves already-absolute hrefs untouched
            link = urljoin("https://www.thesun.co.uk", href_tag)
            all_news_links.append(link)

    # pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"sun-immigration.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20
We are scraping page: 21
We are scraping page: 22
We are scraping page: 23
We are scraping page: 24
We are scraping page: 25
We are scraping page: 26
We are scraping page: 27
We are scraping page: 28
We are scraping page: 29
We are scraping page: 30
We are scraping page: 31
We are scraping page: 32
We are scraping page: 33
We are scraping page: 34
We are scraping page: 35
We are scraping page: 36
We are scraping page: 37
We are scraping page: 38
We are scraping page: 39
We are scraping page: 40
We are scraping page: 41
We are scraping page: 42
We are scraping page: 43
We are scraping page: 44


In [9]:
# Function to get headline, date and body of each Sun article
def get_info(df, retries=5, timeout=30):
    """Fetch every URL in ``df['link']`` and add headline, date and body columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``link`` column of article URLs. Modified in place.
    retries : int
        Maximum number of attempts per link. Failed attempts back off
        exponentially (1s, 2s, 4s, ...).
    timeout : float
        Seconds to wait on the server before an attempt counts as failed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with ``headline``, ``date`` and ``body`` columns added.
        A field that could not be retrieved holds the string "None".

    Notes
    -----
    Reads the notebook-level ``headers`` dict defined by the scraping cells.
    """
    def text_or_none(tag):
        # soup.find() returns None when the element is missing; calling
        # .get_text() on it would raise AttributeError and abort the whole run.
        text = tag.get_text().strip() if tag else ""
        return text if text else "None"

    all_headlines = []
    all_dates = []
    all_bodies = []
    for link in df['link']:
        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers, timeout=timeout)
                res.raise_for_status()  # raise an HTTPError for bad responses
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {link}: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                continue

            soup = bs(res.content, "html.parser")

            # get headline and date
            all_headlines.append(text_or_none(soup.find("h1", class_="article__headline")))
            all_dates.append(text_or_none(soup.find("span", class_="article__timestamp")))

            # get body
            article_body = soup.find("div", class_="article__content")
            if article_body:
                paragraphs = article_body.find_all("p")
                all_bodies.append("\n".join(p.get_text() for p in paragraphs))
            else:
                all_bodies.append("None")

            # break the retry loop on success
            break
        else:
            # every attempt failed (or retries == 0): keep the lists aligned with df
            print(f"Failed to fetch {link} after {retries} attempts.")
            all_headlines.append("None")
            all_dates.append("None")
            all_bodies.append("None")

    # add as new columns to df
    df['headline'] = all_headlines
    df['date'] = all_dates
    df['body'] = all_bodies

    return df

In [8]:
# Read back the Sun immigration links collected above
sun_immigration = pd.read_csv("sun-immigration.csv")
# Fetch headline, date and body for every link
df = get_info(sun_immigration)
# Overwrite the link file with the enriched table
df.to_csv(r"sun-immigration.csv", index=False, header=True, encoding="utf-8")

## Refugee crisis

In [11]:
# create list to store our data
all_news_links = []

# define our user headers once; get_info() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(13, 32):
    print(f"We are scraping page: {page_number}")

    sun = f"https://www.thesun.co.uk/topic/refugee-crisis/page/{page_number}/"

    # request webpage
    res = requests.get(sun, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # small and medium teaser cards both hold article links
    articles = soup.find_all("div", class_="teaser-item teaser__small teaser theme-news") + soup.find_all("div", class_="teaser-item teaser__medium teaser theme-news")

    for article in articles:
        # the first <a> inside a teaser carries the article URL
        a_tag = article.find('a')
        href_tag = a_tag.get('href') if a_tag else None

        # skip teasers without a usable href instead of re-appending the
        # previous article's link (or hitting an undefined `link`)
        if not href_tag:
            continue

        link = "https://www.thesun.co.uk" + href_tag
        all_news_links.append(link)

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"sun-refugee.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20
We are scraping page: 21
We are scraping page: 22
We are scraping page: 23
We are scraping page: 24
We are scraping page: 25
We are scraping page: 26
We are scraping page: 27
We are scraping page: 28
We are scraping page: 29
We are scraping page: 30
We are scraping page: 31


In [12]:
# Read back the Sun refugee-crisis links collected above
sun_refugee = pd.read_csv("sun-refugee.csv")
# Fetch headline, date and body for every link
df = get_info(sun_refugee)
# Overwrite the link file with the enriched table
df.to_csv(r"sun-refugee.csv", index=False, header=True, encoding="utf-8")

## Border force

In [13]:
# create list to store our data
all_news_links = []

# define our user headers once; get_info() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 3):
    print(f"We are scraping page: {page_number}")

    # the first listing page has no /page/N/ suffix
    if page_number == 1:
        sun = "https://www.thesun.co.uk/topic/uk-border-force/"
    else:
        sun = f"https://www.thesun.co.uk/topic/uk-border-force/page/{page_number}/"

    # request webpage
    res = requests.get(sun, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # small and medium teaser cards both hold article links
    articles = soup.find_all("div", class_="teaser-item teaser__small teaser theme-news") + soup.find_all("div", class_="teaser-item teaser__medium teaser theme-news")

    for article in articles:
        # the first <a> inside a teaser carries the article URL
        a_tag = article.find('a')
        href_tag = a_tag.get('href') if a_tag else None

        # skip teasers without a usable href instead of re-appending the
        # previous article's link (or hitting an undefined `link`)
        if not href_tag:
            continue

        link = "https://www.thesun.co.uk" + href_tag
        all_news_links.append(link)

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"sun-border-force.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 1
We are scraping page: 2


In [14]:
# Read back the Sun border-force links collected above
sun_b = pd.read_csv("sun-border-force.csv")
# Fetch headline, date and body for every link
df = get_info(sun_b)
# Overwrite the link file with the enriched table
df.to_csv(r"sun-border-force.csv", index=False, header=True, encoding="utf-8")

## Home Office

In [15]:
# create list to store our data
all_news_links = []

# define our user headers once; get_info() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(3, 7):
    print(f"We are scraping page: {page_number}")

    sun = f"https://www.thesun.co.uk/topic/home-office/page/{page_number}/"

    # request webpage
    res = requests.get(sun, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # small and medium teaser cards both hold article links
    articles = soup.find_all("div", class_="teaser-item teaser__small teaser theme-news") + soup.find_all("div", class_="teaser-item teaser__medium teaser theme-news")

    for article in articles:
        # the first <a> inside a teaser carries the article URL
        a_tag = article.find('a')
        href_tag = a_tag.get('href') if a_tag else None

        # skip teasers without a usable href instead of re-appending the
        # previous article's link (or hitting an undefined `link`)
        if not href_tag:
            continue

        link = "https://www.thesun.co.uk" + href_tag
        all_news_links.append(link)

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"sun-home-office.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 3
We are scraping page: 4
We are scraping page: 5
We are scraping page: 6


In [16]:
# Read back the Sun home-office links collected above
sun_h = pd.read_csv("sun-home-office.csv")
# Fetch headline, date and body for every link
df = get_info(sun_h)
# Overwrite the link file with the enriched table
df.to_csv(r"sun-home-office.csv", index=False, header=True, encoding="utf-8")

## Merging all the dataframes

In [18]:
# Read the four per-topic Sun files back, in the same order as before
s1, s2, s3, s4 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "sun-immigration.csv",
        "sun-refugee.csv",
        "sun-border-force.csv",
        "sun-home-office.csv",
    )
)

# Stack them into one table (original row indices are kept, then dropped on write)
sun = pd.concat([s1, s2, s3, s4])

# Persist the combined table
sun.to_csv(r"all-sun.csv", index=False, header=True, encoding="utf-8")

## Cleaning

In [43]:
# NOTE: this cell overwrites its own input and drops the "link" column, so it
# can only be run once per merged file; re-run the merge cell before re-running it.
# load csv
sun = pd.read_csv(folder_path + "all-sun.csv")
# convert date to datetime; unparseable values (e.g. the "None" placeholder
# written when an article could not be fetched) become NaT instead of raising
sun["date"] = pd.to_datetime(sun["date"], errors="coerce")
# remove dates not from 2023 (NaT rows are dropped here as well)
sun = sun[sun['date'].dt.year == 2023]
# remove duplicates in links
sun = sun.drop_duplicates(subset="link")
# remove rows with empty body: real NaN and the literal "None" placeholder
sun = sun[sun["body"].notna() & sun["body"].ne("None")]
# remove column link
sun = sun.drop(columns="link")
# add column for source
sun["news"] = "sun"
# write to csv
sun.to_csv(folder_path + "all-sun.csv", encoding="utf-8", header=True, index=False)

# The Telegraph

## Immigration

In [38]:
# create lists to store our data (kept index-aligned: one entry per card)
all_news_links = []
all_headlines = []
all_dates = []

# define our user headers once; get_body() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(11, 33):
    print(f"We are scraping page: {page_number}")
    telegraph = f"https://www.telegraph.co.uk/immigration/page-{page_number}/"

    # request webpage
    res = requests.get(telegraph, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # get list of articles
    articles = soup.find_all("div", class_="card__content")

    for article in articles:
        # a card without a link cannot be fetched later; skip it before
        # appending anything so the three lists stay the same length
        a_tag = article.find("a")
        href_tag = a_tag.get("href") if a_tag else None
        if not href_tag:
            continue

        # append headline
        headline_span = article.find('span', class_='u-heading-6 list-headline__text')
        if headline_span:
            inner_span = headline_span.find('span')
            # fall back to the outer span's text when the inner <span> is missing
            headline_source = headline_span if inner_span is None else inner_span
            headline = headline_source.text.strip()
            all_headlines.append(headline)
        else:
            all_headlines.append("No headline found")

        # append article link
        link = "https://www.telegraph.co.uk" + href_tag
        all_news_links.append(link)

        # append date
        date_tag = article.find("time", class_="card__date")
        if date_tag:
            date = date_tag.get_text().strip()
            all_dates.append(date)
        else:
            all_dates.append("No date found")

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv("telegraph-immigration.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 11
We are scraping page: 12
We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20
We are scraping page: 21
We are scraping page: 22
We are scraping page: 23
We are scraping page: 24
We are scraping page: 25
We are scraping page: 26
We are scraping page: 27
We are scraping page: 28
We are scraping page: 29
We are scraping page: 30
We are scraping page: 31
We are scraping page: 32


In [47]:
# function to get body of article
def get_body(df, retries=3, timeout=30):
    """Fetch every URL in ``df['link']`` and add the article text as ``body``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``link`` column of article URLs. Modified in place.
    retries : int
        Maximum number of attempts per link. Failed attempts back off
        exponentially (1s, 2s, 4s, ...).
    timeout : float
        Seconds to wait on the server before an attempt counts as failed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with a ``body`` column added. The value is the
        paragraphs joined by newlines, the string "None" when the page has no
        article-body container, or "" when every attempt to fetch it failed.

    Notes
    -----
    Reads the notebook-level ``headers`` dict defined by the scraping cells.
    """
    all_bodies = []
    for link in df['link']:
        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers, timeout=timeout)
                res.raise_for_status()  # raise an HTTPError for bad responses
            # SSLError is a RequestException subclass, so one clause covers both
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {link}: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                continue

            soup = bs(res.content, 'lxml')
            # Extract the article body
            article_body = soup.find("div", class_="articleBodyText section")
            if article_body:
                paragraphs = article_body.find_all("p")
                all_bodies.append("\n".join(p.get_text() for p in paragraphs))
            else:
                all_bodies.append("None")
            break  # break the retry loop on success
        else:
            # every attempt failed (or retries == 0): keep the list aligned with df
            print(f"Failed to fetch {link} after {retries} attempts.")
            all_bodies.append("")  # append an empty string if all retries fail

    # add the bodies list as a new column to the DataFrame
    df['body'] = all_bodies

    return df

In [48]:
# Read back the Telegraph immigration listing collected above
telegraph_immigration = pd.read_csv("telegraph-immigration.csv")
# Fetch the article text for every link
df = get_body(telegraph_immigration)
# Overwrite the listing file with the enriched table
df.to_csv(r"telegraph-immigration.csv", index=False, header=True, encoding="utf-8")

## Migrants

In [49]:


# create lists to store our data (kept index-aligned: one entry per card)
all_news_links = []
all_headlines = []
all_dates = []

# define our user headers once; get_body() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(7, 24):
    print(f"We are scraping page: {page_number}")
    telegraph = f"https://www.telegraph.co.uk/migrants/page-{page_number}/"

    # request webpage
    res = requests.get(telegraph, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # get list of articles
    articles = soup.find_all("div", class_="card__content")

    for article in articles:
        # a card without a link cannot be fetched later; skip it before
        # appending anything so the three lists stay the same length
        a_tag = article.find("a")
        href_tag = a_tag.get("href") if a_tag else None
        if not href_tag:
            continue

        # append headline
        headline_span = article.find('span', class_='u-heading-6 list-headline__text')
        if headline_span:
            inner_span = headline_span.find('span')
            # fall back to the outer span's text when the inner <span> is missing
            headline_source = headline_span if inner_span is None else inner_span
            headline = headline_source.text.strip()
            all_headlines.append(headline)
        else:
            all_headlines.append("No headline found")

        # append article link
        link = "https://www.telegraph.co.uk" + href_tag
        all_news_links.append(link)

        # append date
        date_tag = article.find("time", class_="card__date")
        if date_tag:
            date = date_tag.get_text().strip()
            all_dates.append(date)
        else:
            all_dates.append("No date found")

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv("telegraph-migrants.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 7
We are scraping page: 8
We are scraping page: 9
We are scraping page: 10
We are scraping page: 11
We are scraping page: 12
We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18
We are scraping page: 19
We are scraping page: 20
We are scraping page: 21
We are scraping page: 22
We are scraping page: 23


In [50]:
# Read back the Telegraph migrants listing collected above
telegraph_migrants = pd.read_csv("telegraph-migrants.csv")
# Fetch the article text for every link
df = get_body(telegraph_migrants)
# Overwrite the listing file with the enriched table
df.to_csv(r"telegraph-migrants.csv", index=False, header=True, encoding="utf-8")

## Migration

In [51]:


# create lists to store our data (kept index-aligned: one entry per card)
all_news_links = []
all_headlines = []
all_dates = []

# define our user headers once; get_body() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(4, 15):
    print(f"We are scraping page: {page_number}")
    telegraph = f"https://www.telegraph.co.uk/migration/page-{page_number}/"

    # request webpage
    res = requests.get(telegraph, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # get list of articles
    articles = soup.find_all("div", class_="card__content")

    for article in articles:
        # a card without a link cannot be fetched later; skip it before
        # appending anything so the three lists stay the same length
        a_tag = article.find("a")
        href_tag = a_tag.get("href") if a_tag else None
        if not href_tag:
            continue

        # append headline
        headline_span = article.find('span', class_='u-heading-6 list-headline__text')
        if headline_span:
            inner_span = headline_span.find('span')
            # fall back to the outer span's text when the inner <span> is missing
            headline_source = headline_span if inner_span is None else inner_span
            headline = headline_source.text.strip()
            all_headlines.append(headline)
        else:
            all_headlines.append("No headline found")

        # append article link
        link = "https://www.telegraph.co.uk" + href_tag
        all_news_links.append(link)

        # append date
        date_tag = article.find("time", class_="card__date")
        if date_tag:
            date = date_tag.get_text().strip()
            all_dates.append(date)
        else:
            all_dates.append("No date found")

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv("telegraph-migration.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 4
We are scraping page: 5
We are scraping page: 6
We are scraping page: 7
We are scraping page: 8
We are scraping page: 9
We are scraping page: 10
We are scraping page: 11
We are scraping page: 12
We are scraping page: 13
We are scraping page: 14


In [53]:
# Read back the Telegraph migration listing collected above
telegraph_migration = pd.read_csv("telegraph-migration.csv")
# Fetch the article text for every link
df = get_body(telegraph_migration)
# Overwrite the listing file with the enriched table
df.to_csv(r"telegraph-migration.csv", index=False, header=True, encoding="utf-8")

## Migrant crisis

In [54]:


# create lists to store our data (kept index-aligned: one entry per card)
all_news_links = []
all_headlines = []
all_dates = []

# define our user headers once; get_body() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(6, 19):
    print(f"We are scraping page: {page_number}")
    telegraph = f"https://www.telegraph.co.uk/migrant-crisis/page-{page_number}/"

    # request webpage
    res = requests.get(telegraph, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # get list of articles
    articles = soup.find_all("div", class_="card__content")

    for article in articles:
        # a card without a link cannot be fetched later; skip it before
        # appending anything so the three lists stay the same length
        a_tag = article.find("a")
        href_tag = a_tag.get("href") if a_tag else None
        if not href_tag:
            continue

        # append headline
        headline_span = article.find('span', class_='u-heading-6 list-headline__text')
        if headline_span:
            inner_span = headline_span.find('span')
            # fall back to the outer span's text when the inner <span> is missing
            headline_source = headline_span if inner_span is None else inner_span
            headline = headline_source.text.strip()
            all_headlines.append(headline)
        else:
            all_headlines.append("No headline found")

        # append article link
        link = "https://www.telegraph.co.uk" + href_tag
        all_news_links.append(link)

        # append date
        date_tag = article.find("time", class_="card__date")
        if date_tag:
            date = date_tag.get_text().strip()
            all_dates.append(date)
        else:
            all_dates.append("No date found")

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv("telegraph-migrant-crisis.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 6
We are scraping page: 7
We are scraping page: 8
We are scraping page: 9
We are scraping page: 10
We are scraping page: 11
We are scraping page: 12
We are scraping page: 13
We are scraping page: 14
We are scraping page: 15
We are scraping page: 16
We are scraping page: 17
We are scraping page: 18


In [55]:
# reload csv (own name, so the migration frame loaded earlier is not clobbered)
telegraph_migrant_crisis = pd.read_csv("telegraph-migrant-crisis.csv")
# get body of articles
df = get_body(telegraph_migrant_crisis)
# rewrite to CSV
df.to_csv(r"telegraph-migrant-crisis.csv", encoding="utf-8", header=True, index=False)

## Refugee crisis

In [56]:

# create lists to store our data (kept index-aligned: one entry per card)
all_news_links = []
all_headlines = []
all_dates = []

# define our user headers once; get_body() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 3):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no /page-N/ suffix
    if page_number == 1:
        telegraph = "https://www.telegraph.co.uk/refugee-crisis/"
    else:
        telegraph = f"https://www.telegraph.co.uk/refugee-crisis/page-{page_number}/"

    # request webpage
    res = requests.get(telegraph, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # get list of articles
    articles = soup.find_all("div", class_="card__content")

    for article in articles:
        # a card without a link cannot be fetched later; skip it before
        # appending anything so the three lists stay the same length
        a_tag = article.find("a")
        href_tag = a_tag.get("href") if a_tag else None
        if not href_tag:
            continue

        # append headline
        headline_span = article.find('span', class_='u-heading-6 list-headline__text')
        if headline_span:
            inner_span = headline_span.find('span')
            # fall back to the outer span's text when the inner <span> is missing
            headline_source = headline_span if inner_span is None else inner_span
            headline = headline_source.text.strip()
            all_headlines.append(headline)
        else:
            all_headlines.append("No headline found")

        # append article link
        link = "https://www.telegraph.co.uk" + href_tag
        all_news_links.append(link)

        # append date
        date_tag = article.find("time", class_="card__date")
        if date_tag:
            date = date_tag.get_text().strip()
            all_dates.append(date)
        else:
            all_dates.append("No date found")

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"headline": all_headlines,
        "link": all_news_links,
        "date": all_dates
}
df = pd.DataFrame.from_dict(data)
# write to csv
df.to_csv("telegraph-refugee-crisis.csv", encoding="utf-8", header=True, index=False)

We are scraping page: 1
We are scraping page: 2


In [57]:
# Read back the Telegraph refugee-crisis listing collected above
telegraph_refugee = pd.read_csv("telegraph-refugee-crisis.csv")
# Fetch the article text for every link
df = get_body(telegraph_refugee)
# Overwrite the listing file with the enriched table
df.to_csv(r"telegraph-refugee-crisis.csv", index=False, header=True, encoding="utf-8")

## Merging all the dataframes

In [58]:
# Read the five per-topic Telegraph files back, in the same order as before
t1, t2, t3, t4, t5 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "telegraph-immigration.csv",
        "telegraph-migrants.csv",
        "telegraph-migration.csv",
        "telegraph-migrant-crisis.csv",
        "telegraph-refugee-crisis.csv",
    )
)

# Stack them into one table (original row indices are kept, then dropped on write)
telegraph = pd.concat([t1, t2, t3, t4, t5])

# Persist the combined table
telegraph.to_csv(r"all-telegraph.csv", index=False, header=True, encoding="utf-8")

## Cleaning

In [46]:
# NOTE: this cell overwrites its own input and drops the "link" column, so it
# can only be run once per merged file; re-run the merge cell before re-running it.
# load csv
telegraph = pd.read_csv(folder_path + "all-telegraph.csv")
# convert date to datetime; unparseable values (e.g. the "No date found"
# placeholder written by the listing cells) become NaT instead of raising
telegraph["date"] = pd.to_datetime(telegraph["date"], errors="coerce")
# remove dates not from 2023 (NaT rows are dropped here as well)
telegraph = telegraph[telegraph['date'].dt.year == 2023]
# remove duplicates in links
telegraph = telegraph.drop_duplicates(subset="link")
# remove rows with "/books/", "/theatre/" or "/dance/" in link
telegraph = telegraph[~telegraph["link"].str.contains("/books/|/theatre/|/dance/")]
# remove rows with empty body: real NaN and the literal "None" placeholder
telegraph = telegraph[telegraph["body"].notna() & telegraph["body"].ne("None")]
# remove rows with mentions of america in body
telegraph = telegraph[~telegraph["body"].str.contains("Biden|America|USA|Trump|Mexico|New York")]
# remove column link
telegraph = telegraph.drop(columns="link")
# add column for source
telegraph["news"] = "telegraph"
# write to csv
telegraph.to_csv(folder_path + "all-telegraph.csv", encoding="utf-8", header=True, index=False)


  telegraph["date"] = pd.to_datetime(telegraph["date"])


# Metro

## Immigration

In [59]:

# create list to store our data
all_news_links = []

# define our user headers once; get_info() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(3, 7):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no /page/N/ suffix (kept so the range can be widened)
    if page_number == 1:
        metro = "https://metro.co.uk/tag/immigration/"
    else:
        metro = f"https://metro.co.uk/tag/immigration/page/{page_number}/"
    # request webpage
    res = requests.get(metro, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # header articles: <li> elements whose class list includes "metro__post"
    li_elements = soup.find_all("li", class_=lambda x: x and 'metro__post' in x.split())

    # Extract the href attribute of the <a> tag within each <li>
    for li in li_elements:
        a_tag = li.find('a')
        if a_tag:
            link = a_tag.get('href')
            if link:
                all_news_links.append(link)

    # this gets list of articles
    articles = soup.find_all("article")

    # Extract the href of the first <a> tag within each <article>
    for article in articles:
        a_tag = article.find('a')
        link = a_tag.get('href') if a_tag else None

        # only record real links; never re-append the previous article's link or a None
        if link:
            all_news_links.append(link)

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"metro-immigration.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 3
We are scraping page: 4
We are scraping page: 5
We are scraping page: 6


In [64]:
# Function to get headline, date and body of each Metro article
# NOTE: this redefines get_info and replaces the earlier Sun version from here on.
def get_info(df, retries=5, timeout=30):
    """Fetch every URL in ``df['link']`` and add headline, date and body columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``link`` column of article URLs. Modified in place.
    retries : int
        Maximum number of attempts per link. Failed attempts back off
        exponentially (1s, 2s, 4s, ...).
    timeout : float
        Seconds to wait on the server before an attempt counts as failed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with ``headline``, ``date`` and ``body`` columns added.
        A field that could not be retrieved holds the string "None". The body
        stops at the first paragraph containing "Get in touch".

    Notes
    -----
    Reads the notebook-level ``headers`` dict defined by the scraping cells.
    """
    def stripped_text(tag):
        # soup.find() returns None when the element is missing; calling
        # .get_text() on it would raise AttributeError and abort the whole run.
        return tag.get_text().strip() if tag else ""

    all_headlines = []
    all_dates = []
    all_bodies = []
    for link in df['link']:
        for attempt in range(retries):
            try:
                res = requests.get(link, headers=headers, timeout=timeout)
                res.raise_for_status()  # raise an HTTPError for bad responses
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {link}: {e}")
                if attempt < retries - 1:
                    wait_time = 2 ** attempt  # exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                continue

            soup = bs(res.content, "html.parser")

            # get headline
            headline = stripped_text(soup.find("h1", class_="post-title clear"))
            all_headlines.append(headline if headline else "None")

            # get date: drop the "Published" label and keep the rest
            published_info = stripped_text(soup.find("span", class_="post-published"))
            if published_info:
                all_dates.append(published_info.replace("Published", "").strip())
            else:
                all_dates.append("None")

            # get body
            article_body = soup.find("div", class_="article-body")
            if article_body:
                kept_paragraphs = []
                for paragraph in article_body.find_all("p"):
                    text = paragraph.get_text().strip()
                    # everything from the "Get in touch" paragraph on is dropped
                    if "Get in touch" in text:
                        break
                    kept_paragraphs.append(text)
                all_bodies.append("\n".join(kept_paragraphs).strip())
            else:
                all_bodies.append("None")

            # break the retry loop on success
            break
        else:
            # every attempt failed (or retries == 0): keep the lists aligned with df
            print(f"Failed to fetch {link} after {retries} attempts.")
            all_headlines.append("None")
            all_dates.append("None")
            all_bodies.append("None")

    # add as new columns to df
    df['headline'] = all_headlines
    df['date'] = all_dates
    df['body'] = all_bodies

    return df

In [65]:
# Read back the Metro immigration links collected above
metro_immigration = pd.read_csv("metro-immigration.csv")
# Fetch headline, date and body for every link
df = get_info(metro_immigration)
# Overwrite the link file with the enriched table
df.to_csv(r"metro-immigration.csv", index=False, header=True, encoding="utf-8")

## Immigration Nation

In [66]:


# create list to store our data
all_news_links = []

# define our user headers once; get_info() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 3):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no /page/N/ suffix
    if page_number == 1:
        metro = "https://metro.co.uk/tag/immigration-nation/"
    else:
        metro = f"https://metro.co.uk/tag/immigration-nation/page/{page_number}/"
    # request webpage
    res = requests.get(metro, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # header articles: <li> elements whose class list includes "metro__post"
    li_elements = soup.find_all("li", class_=lambda x: x and 'metro__post' in x.split())

    # Extract the href attribute of the <a> tag within each <li>
    for li in li_elements:
        a_tag = li.find('a')
        if a_tag:
            link = a_tag.get('href')
            if link:
                all_news_links.append(link)

    # this gets list of articles
    articles = soup.find_all("article")

    # Extract the href of the first <a> tag within each <article>
    for article in articles:
        a_tag = article.find('a')
        link = a_tag.get('href') if a_tag else None

        # only record real links; never re-append the previous article's link or a None
        if link:
            all_news_links.append(link)

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"metro-immigration-nation.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 1
We are scraping page: 2


In [67]:
# Read back the Metro immigration-nation links collected above
metro_immigration_nation = pd.read_csv("metro-immigration-nation.csv")
# Fetch headline, date and body for every link
df = get_info(metro_immigration_nation)
# Overwrite the link file with the enriched table
df.to_csv(r"metro-immigration-nation.csv", index=False, header=True, encoding="utf-8")

## Migrants

In [69]:

# create list to store our data
all_news_links = []

# define our user headers once; get_info() reads this notebook-level dict too
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

for page_number in range(1, 4):
    print(f"We are scraping page: {page_number}")
    # the first listing page has no /page/N/ suffix
    if page_number == 1:
        metro = "https://metro.co.uk/tag/migrants/"
    else:
        metro = f"https://metro.co.uk/tag/migrants/page/{page_number}/"
    # request webpage
    res = requests.get(metro, headers=headers)

    # check status for debugging
    res.raise_for_status()

    soup = bs(res.text, "html.parser")

    # header articles: <li> elements whose class list includes "metro__post"
    li_elements = soup.find_all("li", class_=lambda x: x and 'metro__post' in x.split())

    # Extract the href attribute of the <a> tag within each <li>
    for li in li_elements:
        a_tag = li.find('a')
        if a_tag:
            link = a_tag.get('href')
            if link:
                all_news_links.append(link)

    # this gets list of articles
    articles = soup.find_all("article")

    # Extract the href of the first <a> tag within each <article>
    for article in articles:
        a_tag = article.find('a')
        link = a_tag.get('href') if a_tag else None

        # only record real links; never re-append the previous article's link or a None
        if link:
            all_news_links.append(link)

    # be polite: pause 1-3 seconds between listing pages
    time.sleep(random.randint(1, 3))


# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv
df.to_csv(r"metro-migrants.csv", encoding="utf-8", header=True, index=False)


We are scraping page: 1
We are scraping page: 2
We are scraping page: 3


In [70]:
# Round-trip the scraped link list through get_info (defined elsewhere in this
# notebook, not shown in this section) and overwrite the same CSV with the result.
migrants_csv = "metro-migrants.csv"
metro_migrants = pd.read_csv(migrants_csv)
df = get_info(metro_migrants)
df.to_csv(migrants_csv, encoding="utf-8", header=True, index=False)

## Asylum and Immigration Tribunal

In [71]:

# Collect article links from the Metro "asylum-and-immigration-tribunal" tag
# (only the first page is requested) and save them to metro-asylum.csv.
all_news_links = []
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
metro = "https://metro.co.uk/tag/asylum-and-immigration-tribunal/"
# request webpage
res = requests.get(metro, headers=headers)

# Fail loudly on an HTTP error instead of parsing an error page into an empty
# link list; this is the same check the paginated scraping cells perform.
res.raise_for_status()

soup = bs(res.text, "html.parser")

# <li> elements whose class list contains "metro__post"
li_elements = soup.find_all("li", class_=lambda x: x and 'metro__post' in x.split())

# Extract the href attribute of the first <a> tag within each <li>,
# skipping entries that have no anchor or no href
for li in li_elements:
    a_tag = li.find('a')
    if a_tag:
        link = a_tag.get('href')
        if link:
            all_news_links.append(link)

# random pause after the request
time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv (header is a boolean flag, matching the other to_csv calls)
df.to_csv(r"metro-asylum.csv", encoding="utf-8", header=True, index=False)


In [72]:
# Round-trip the scraped link list through get_info (defined elsewhere in this
# notebook, not shown in this section) and overwrite the same CSV with the result.
asylum_csv = "metro-asylum.csv"
metro_asylum = pd.read_csv(asylum_csv)
df = get_info(metro_asylum)
df.to_csv(asylum_csv, encoding="utf-8", header=True, index=False)

## Border agency

In [73]:


# Collect article links from the Metro "border-agency" tag (only the first
# page is requested) and save them to metro-border.csv.
all_news_links = []
headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
metro = "https://metro.co.uk/tag/border-agency/"
# request webpage
res = requests.get(metro, headers=headers)

# Fail loudly on an HTTP error instead of parsing an error page into an empty
# link list; this is the same check the paginated scraping cells perform.
res.raise_for_status()

soup = bs(res.text, "html.parser")

# <li> elements whose class list contains "metro__post"
li_elements = soup.find_all("li", class_=lambda x: x and 'metro__post' in x.split())

# Extract the href attribute of the first <a> tag within each <li>,
# skipping entries that have no anchor or no href
for li in li_elements:
    a_tag = li.find('a')
    if a_tag:
        link = a_tag.get('href')
        if link:
            all_news_links.append(link)

# random pause after the request
time.sleep(random.randint(1, 3))

# Convert data to dataframe
data = {"link": all_news_links}
df = pd.DataFrame.from_dict(data)

# write to csv (header is a boolean flag, matching the other to_csv calls)
df.to_csv(r"metro-border.csv", encoding="utf-8", header=True, index=False)


In [74]:
# Round-trip the scraped link list through get_info (defined elsewhere in this
# notebook, not shown in this section) and overwrite the same CSV with the result.
border_csv = "metro-border.csv"
metro_border = pd.read_csv(border_csv)
df = get_info(metro_border)
df.to_csv(border_csv, encoding="utf-8", header=True, index=False)

## Merging all the dataframes

In [75]:
# The five per-topic Metro CSVs, listed in the order they are stacked.
metro_csvs = [
    "metro-immigration.csv",
    "metro-migrants.csv",
    "metro-asylum.csv",
    "metro-border.csv",
    "metro-immigration-nation.csv",
]
m1, m2, m3, m4, m5 = (pd.read_csv(csv_name) for csv_name in metro_csvs)

# stack them row-wise into a single frame
metro = pd.concat([m1, m2, m3, m4, m5])

# persist the combined table (the index is not written)
metro.to_csv(r"all-metro.csv", encoding="utf-8", header=True, index=False)

## Cleaning

In [50]:
# Clean the merged Metro table and overwrite it in place.
# NOTE(review): folder_path is not assigned anywhere in the Metro section shown
# here, and it is joined by plain string concatenation, so it must already end
# with a path separator -- confirm where it is defined.
# NOTE(review): the file written at the end no longer has a "link" column, so
# running this cell a second time on its own output fails at drop_duplicates.
metro_path = folder_path + "all-metro.csv"
metro = pd.read_csv(metro_path)

# parse timestamps such as "Jan 05, 2023, 10:30am"
metro['date'] = pd.to_datetime(metro['date'], format='%b %d, %Y, %I:%M%p')

metro = (
    metro[metro['date'].dt.year == 2023]   # keep 2023 articles only
    .drop_duplicates(subset="link")        # one row per article URL
    .dropna(subset=["body"])               # drop rows with an empty body
    .drop(columns="link")                  # the URL is not needed downstream
    .assign(news="metro")                  # tag every row with its source
)

metro.to_csv(metro_path, encoding="utf-8", header=True, index=False)

# News Cleaning From Lexis Nexis

<b> Search Terms: </b>

("UK" OR "United Kingdom" OR "Britain" OR "British") AND ("immigration" OR "immigrants" OR "migrants" OR "asylum seekers" OR "refugees" OR "Rwanda plan" OR "Rwanda bill" OR "stop the boats" OR "visa" OR "graduate visa" OR "skilled worker visa" OR "border control" OR "immigration policy" OR "deportation" OR "immigration law" OR "immigration reform" OR "immigration crisis" OR "immigration debate" OR "migration" OR "immigration system" OR "immigration rules" OR "immigration enforcement" OR "refugee status" OR "asylum policy")

<b> Newspapers used: </b>

1. times.co.uk
2. The Independent

<b> Timeline: </b>
1 Jan 2023 - 31 Dec 2023

In [None]:
# Load libraries
#!pip install striprtf
from striprtf.striprtf import rtf_to_text
import pandas as pd
import re
import os

# Current working directory of the kernel; the per-newspaper RTF folders used
# in the cells below are resolved relative to it via os.path.join.
current_dir = os.getcwd()

In [None]:
def process_file(file_path):
    """
    Read one RTF export and return its cleaned article body and load date.

    The file is converted to plain text with rtf_to_text, then:
      * body -- everything between the first 'Body' marker and the last
        'Load-Date:' marker (greedy match across lines), stripped, with line
        breaks turned into spaces and every character other than ASCII
        letters, digits and whitespace removed;
      * date -- the text following the first 'Load-Date:' marker, up to the
        next line break, stripped.

    Either value is an empty string when its marker is not found.

    Returns:
        tuple[str, str]: (body, date)
    """
    with open(file_path, 'r') as infile:
        text = rtf_to_text(infile.read())

    body_match = re.search(r'Body(.*)Load-Date:', text, re.DOTALL)
    load_date_match = re.search(r'Load-Date:\s*(.*)', text)

    body = body_match.group(1).strip() if body_match else ""
    date = load_date_match.group(1).strip() if load_date_match else ""

    # flatten to a single line, then drop punctuation and other symbols
    body = re.sub(r'[^a-zA-Z0-9\s]', '', body.replace('\n', ' '))

    return body, date

## Times

In [None]:
# path to the folder containing times files
folder_path = os.path.join(current_dir, 'times')

# one record per .RTF export found in that folder
times = []
for rtf_name in os.listdir(folder_path):
    # anything that is not an upper-case .RTF file is ignored
    if not rtf_name.endswith('.RTF'):
        continue
    article_body, article_date = process_file(os.path.join(folder_path, rtf_name))
    times.append({
        # the file name without its extension serves as the headline
        'headline': os.path.splitext(rtf_name)[0],
        'body': article_body,
        'date': article_date,
    })

# create pandas dataframe and save it
times_df = pd.DataFrame(times)
times_df.to_csv('times.csv', index=False)

In [None]:
# Parse the second batch of Times RTF exports (folder "times1") into times1.csv.
times1 = []

# path to the folder containing times1 files
folder_path = os.path.join(current_dir, 'times1')

for filename in os.listdir(folder_path):
    # only upper-case .RTF exports are processed
    if filename.endswith('.RTF'):
        file_path = os.path.join(folder_path, filename)
        cleaned_text, date = process_file(file_path)
        # the file name without its extension serves as the headline
        file_name = os.path.splitext(filename)[0]
        # Collect into this cell's own list. Appending to `times` (as before)
        # mixed the first batch into times1.csv, so the later concat of
        # times.csv and times1.csv duplicated every first-batch article.
        times1.append({'headline': file_name, 'body': cleaned_text, 'date': date})

# create pandas dataframe from this batch only
times1_df = pd.DataFrame(times1)
times1_df.to_csv('times1.csv', index=False)

### Merging the dataframes

In [66]:
# reload the csv
times = pd.read_csv('times.csv')
times1 = pd.read_csv('times1.csv')

# merge the two dataframes
times = pd.concat([times, times1], ignore_index=True)
times.to_csv('all-times.csv', index=False)

## Cleaning

In [68]:
# load csv
times = pd.read_csv(folder_path + "all-times.csv")
# convert date to datetime
times["date"] = pd.to_datetime(times["date"])
# remove rows with empty body, time or headline
times = times[times["body"].notna()]
times = times[times["date"].notna()]
times = times[times["headline"].notna()]
# add column for source
times["news"] = "times"
# write to csv
times.to_csv(folder_path + "all-times.csv", encoding="utf-8", header=True, index=False)

## The Independent

In [None]:
# path to the folder containing independent files
folder_path = os.path.join(current_dir, 'independent')

# one record per .RTF export found in that folder
independent = []
for rtf_name in os.listdir(folder_path):
    # anything that is not an upper-case .RTF file is ignored
    if not rtf_name.endswith('.RTF'):
        continue
    article_body, article_date = process_file(os.path.join(folder_path, rtf_name))
    independent.append({
        # the file name without its extension serves as the headline
        'headline': os.path.splitext(rtf_name)[0],
        'body': article_body,
        'date': article_date,
    })

# create pandas dataframe and save it
independent_df = pd.DataFrame(independent)
independent_df.to_csv('independent.csv', index=False)

In [None]:
# path to the folder containing independent1 files
folder_path = os.path.join(current_dir, 'independent1')

# one record per .RTF export found in that folder
independent1 = []
for rtf_name in os.listdir(folder_path):
    # anything that is not an upper-case .RTF file is ignored
    if not rtf_name.endswith('.RTF'):
        continue
    article_body, article_date = process_file(os.path.join(folder_path, rtf_name))
    independent1.append({
        # the file name without its extension serves as the headline
        'headline': os.path.splitext(rtf_name)[0],
        'body': article_body,
        'date': article_date,
    })

# create pandas dataframe and save it
independent1_df = pd.DataFrame(independent1)
independent1_df.to_csv('independent1.csv', index=False)

### Merging all the dataframes

In [67]:
# reload the csvs
independent = pd.read_csv('independent.csv')
independent1 = pd.read_csv('independent1.csv')

# merge the two dataframes
independent = pd.concat([independent, independent1], ignore_index=True)
independent.to_csv('all-independent.csv', index=False)

## Cleaning 

In [69]:
# load csv
independent = pd.read_csv(folder_path + "all-independent.csv")
# convert date to datetime
independent["date"] = pd.to_datetime(independent["date"])
# remove rows with empty body, time or headline
independent = independent[independent["body"].notna()]
independent = independent[independent["date"].notna()]
independent = independent[independent["headline"].notna()]
# add column for source
independent["news"] = "independent"
# write to csv
independent.to_csv(folder_path + "all-independent.csv", encoding="utf-8", header=True, index=False)

# Combine all news

In [72]:
# Per-outlet cleaned tables, listed in the order they are stacked.
# Each file name is appended to folder_path by plain string concatenation.
news_files = [
    "all-times.csv",
    "all-independent.csv",
    "all-guardian.csv",
    "all-sun.csv",
    "all-telegraph.csv",
    "all-metro.csv",
    "all-bbc.csv",
    "all-daily-mail.csv",
    "all-express.csv",
    "all-mirror.csv",
]
news_frames = [pd.read_csv(folder_path + file_name) for file_name in news_files]

# keep the individual frames available under their outlet names
(times, independent, guardian, sun, telegraph,
 metro, bbc, daily_mail, express, mirror) = news_frames

# merge all dataframes row-wise
all_news = pd.concat(news_frames)
# write to csv
all_news.to_csv(folder_path + "all-news.csv", encoding="utf-8", header=True, index=False)


# T5 

In [3]:
import os

import pandas as pd
import torch
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

In [4]:
# Load the combined news table (presumably the all-news.csv written by the
# "Combine all news" step -- confirm).
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this volume is mounted.
all_news = pd.read_csv("/Volumes/Untitled/news/all-news.csv")
# extract text from body column (a Series, one article body per row)
texts = all_news["body"]

In [None]:
# Load the T5-base tokenizer and model used for summarisation.
tokenizer = AutoTokenizer.from_pretrained('T5-base')
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is
# the replacement for encoder-decoder checkpoints such as T5.
model = AutoModelForSeq2SeqLM.from_pretrained('T5-base', return_dict=True)

In [6]:
# Bounds for the generated summaries, derived from a hard-coded average length.
# NOTE(review): 618.9125 is not computed in this section -- presumably the mean
# article length of the corpus (units not shown); confirm and compute it from
# the data rather than hard-coding.
average_length = 618.9125
# summaries are generated with between 10% and 20% of that length
# (int() truncates, giving 61 and 123)
min_length = int(0.1*average_length)
max_length = int(0.2*average_length)

In [7]:
# Generate one summary per article body with the tokenizer/model loaded above.
summaries = []
for text in texts:
    # prepend the "summarize: " prefix; inputs longer than 512 tokens are truncated
    inputs = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=512, truncation=True)
    output = model.generate(inputs, min_length=min_length,max_length=max_length)
    # the raw decode still contains <pad> / </s> markers; they are stripped in
    # the following cell
    summary = tokenizer.decode(output[0])

    summaries.append(summary)

# The summaries are deliberately NOT written into `texts` here: `texts` is the
# `body` Series, so `texts['summary'] = summaries` cannot add a column -- it
# only corrupts the Series, which breaks a re-run of this loop. The cleaned
# summaries are attached to `all_news` as a proper column further down.

In [19]:
# Drop the "<pad>" and "</s>" markers left in the decoded summaries, then trim
# surrounding whitespace -- done in a single pass over the list.
summaries = [
    summary.replace("<pad>", "").replace("</s>", "").strip()
    for summary in summaries
]

In [21]:
# Attach the cleaned summaries to the news table as a new "summary" column
# (the list must have exactly one entry per row of all_news).

all_news['summary'] = summaries
# Persist the table with summaries, without the index.
# NOTE(review): absolute, machine-specific output path.
all_news.to_csv("/Volumes/Untitled/news/all-news-summarised.csv", index=False)