In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
from fake_useragent import UserAgent
import numpy as np
import time
from tqdm import tqdm

# We split the data into 3 parts so that each of our group members can run the scraper on one part, creating "manual parallelism." If we used actual parallelism, we would run into issues with the rate limit.

In [None]:
######################################################
# commented out to prevent accidental execution #
######################################################


# split data into 3 roughly equal parts (integer division, so sizes may differ by one row)

# df = pd.read_csv("url_only_data.csv")
# df1 = df.iloc[:len(df)//3]
# df2 = df.iloc[len(df)//3:2*len(df)//3]
# df3 = df.iloc[2*len(df)//3:]
# # Save each part to a separate CSV file
# df1.to_csv("url_only_data_part1.csv", index=False)
# df2.to_csv("url_only_data_part2.csv", index=False)
# df3.to_csv("url_only_data_part3.csv", index=False)

In [None]:
# Read this run's share of the URL list (part 3 of the three-way split)
df = pd.read_csv("url_only_data_part3.csv")
# Rows with a missing URL are dropped before building the work list
url_column = df['url']
urls = url_column.dropna().to_list()

# Scrape headlines
def extract_headline(url, timeout=10):
    """Fetch ``url`` and return its best-guess headline.

    The page is requested with a randomly chosen User-Agent plus
    browser-like headers. The headline is the first non-empty text found in
    ``<h1>``, ``<title>``, ``<h2>``, or a title ``<meta>`` tag
    (``property="og:title"``, then ``name="title"``), in that order.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can wait indefinitely, so one stalled
            server would freeze the whole scraping loop.

    Returns:
        A ``(headline, error)`` tuple:
        * ``(text, None)`` when a headline was found;
        * ``('', 'Headline not found')`` when the page loaded but none of
          the candidate tags carried any text;
        * ``(None, message)`` when the request or parsing raised; the
          message is also printed.
    """
    ua = UserAgent()
    headers = {
        "User-Agent": ua.random,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "Connection": "keep-alive",
    }
    try:
        # The context manager closes the session even when the request
        # fails; otherwise every URL leaves an open session behind.
        with requests.Session() as session:
            session.headers.update(headers)
            resp = session.get(url, timeout=timeout)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')

        # Visible-text candidates, in priority order
        for tag in ['h1', 'title', 'h2']:
            tag_result = soup.find(tag)
            if tag_result:
                text = tag_result.get_text(strip=True)
                if text:
                    return text, None

        # Last resort: title metadata. Strip it like the tag text above so a
        # whitespace-only value is not reported as a headline.
        meta = soup.find('meta', property='og:title') or soup.find('meta', attrs={'name': 'title'})
        content = meta.get('content') if meta else None
        if content and content.strip():
            return content.strip(), None

        return '', 'Headline not found'
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller record it
        print(str(e))
        return None, str(e)

# Scrape the URLs one at a time, keeping one record per URL with either a
# headline or the error that prevented getting one.
results = []
for url in tqdm(urls):
    headline, error = extract_headline(url)
    results.append({'url': url, 'headline': headline, 'error': error})
    # Random pause of up to one second between requests
    delay = random.random()
    time.sleep(delay)

# Save the results. The file name must match what the combine step reads
# ("headline_results3.csv"); the previous "headline_results_3.csv" did not.
result_df = pd.DataFrame(results)
result_df.to_csv("headline_results3.csv", index=False)

# Here we combine the results and validate the headlines

In [None]:
# Load the three per-member result files in order
part_files = ["headline_results1.csv", "headline_results2.csv", "headline_results3.csv"]
df1, df2, df3 = (pd.read_csv(path) for path in part_files)
# Stack them into one table with a fresh 0..n-1 index
combined_df = pd.concat([df1, df2, df3], ignore_index=True)
# Write the merged table out for the validation step
combined_df.to_csv("headlines.csv", index=False)

In [None]:
final_df = pd.read_csv("headlines.csv")
# Boolean mask of the rows whose headline is missing
headline_missing = final_df['headline'].isna()
# Report how many headlines are missing
print(f"there are {headline_missing.sum()} nans in the headline column")
# Show the recorded error for each of those rows
nan_error = final_df.loc[headline_missing, "error"]
nan_error

there are 3 nans in the headline column


2323    500 Server Error: Internal Server Error for ur...
2354    404 Client Error: Not Found for url: https://w...
3092    500 Server Error: Internal Server Error for ur...
Name: error, dtype: object

Two of these are server-side errors (HTTP 500) and one is a page that could not be found (HTTP 404). We can't do anything about them.

# Here we extract the news outlets

In [None]:
# Capture the name directly before ".com", ignoring an optional "www." prefix
outlet_pattern = r'https?://(?:www\.)?([\w\-]+)\.com'
final_df['outlet'] = final_df['url'].str.extract(outlet_pattern)
# Count the URLs the pattern failed to match
missing_outlets = final_df['outlet'].isna().sum()
print(f"there are {missing_outlets} nans in the outlet column")

there are 0 nans in the outlet column


In [None]:
# Write final_df to CSV; index=False keeps the row index out of the file
final_df.to_csv("headlines_and_outlet.csv", index=False)