# **USA Today News Scraper**

# In [5]:
import requests
import bs4
import re

import os
import csv

# Path of the input CSV file that lists the article links to scrape.
# The reading code further down skips the first row as a header and takes
# the serial number from column 0 and the link from column 1.
csv_file = '/content/UTO_news_links.csv'

# Folder that receives one text file per scraped link; it is created on
# demand if it does not exist yet.
output_folder = '/content/UTO_scraped_texts'  # You can modify this path

# Function to scrape data from a given link and save it in a text file
def scrape_and_save(link, serial_number, output_folder, timeout=30):
    """Scrape one article page and save its date, headline and body text.

    The page at ``link`` is downloaded and parsed. The publication date,
    the headline and every paragraph of the article body are then written,
    one per line, to a UTF-8 text file inside ``output_folder``. The file
    is named ``<serial_number>_<link with '/' -> '_' and ':' removed>.txt``.
    Missing page elements are written as empty lines rather than raising.

    Args:
        link: URL of the article to download.
        serial_number: Identifier used as the prefix of the output file name.
        output_folder: Directory for the output file; created if missing.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the timeout expires.
        OSError: If the output file cannot be created or written.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would stall an entire batch run.
    response = requests.get(link, timeout=timeout)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    # Date: search the date element for text such as "May 24, 2022".
    # str() turns a missing element into "None", which the pattern never
    # matches, so the date falls back to an empty string.
    date_div = str(soup.find('div', class_='gnt_ar_dt'))
    date_match = re.search(r"\w+\s\d{1,2},\s\d{4}", date_div)
    date = date_match.group(0) if date_match else ""

    # Headline: try the two known headline classes in turn.
    headline_element = (
        soup.find('h1', class_='gnt_ar_hl')
        or soup.find('h1', class_='gnt_sv_hl')
    )
    headline = headline_element.get_text().strip() if headline_element else ""
    if headline:
        # Terminate the headline like a sentence; a missing headline stays
        # empty instead of becoming a lone ".".
        headline += "."

    # Paragraphs: use the first article-body container that is present.
    content_element = (
        soup.find('div', class_='gnt_ar_b')
        or soup.find('div', class_='articleBody')
        or soup.find('div', class_='gnt_sv_vb')
    )
    p_tags = content_element.find_all('p') if content_element else []

    # Build the file name from the serial number and the link. '/' would be
    # interpreted as a directory separator, and ':' is dropped.
    stem = f"{serial_number}_{link.replace('/', '_').replace(':', '')}"
    # File systems commonly cap a single name at 255 bytes; trim very long
    # URLs so open() does not fail. Names that already fit are unchanged.
    extension = ".txt"
    max_stem_bytes = 255 - len(extension)
    stem = stem.encode('utf-8')[:max_stem_bytes].decode('utf-8', 'ignore')
    filename = stem + extension

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    file_path = os.path.join(output_folder, filename)

    # Layout: date, headline, then one line per paragraph.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(date + '\n')
        file.write(headline + '\n')
        file.writelines(p.get_text() + '\n' for p in p_tags)

# Read the links from the CSV file: column 0 holds the serial number and
# column 1 holds the link.
# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are handled by the reader itself.
with open(csv_file, 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row (no error on an empty file)
    for row in reader:
        # Blank or truncated rows carry no link to scrape.
        if len(row) < 2 or not row[1].strip():
            continue
        serial_number = row[0]
        link = row[1].strip()
        try:
            scrape_and_save(link, serial_number, output_folder)
        except (requests.RequestException, OSError) as error:
            # One unreachable page or unwritable file should not abort the
            # remaining links; report it and move on.
            print(f"Skipped {serial_number} ({link}): {error}")
