In [1]:
import datetime
import os
import pandas as pd
from pygooglenews import GoogleNews
from bs4 import BeautifulSoup


def generate_combinations(terms, window_size=5):
    """Return every run of ``window_size`` consecutive items from ``terms``.

    The window slides forward one position at a time, so the result is the
    list of overlapping slices ``terms[0:w]``, ``terms[1:w+1]``, ... in their
    original order. This is a sliding window, not the set of all mathematical
    combinations of the terms.

    Args:
        terms: Sequence of search terms (must support ``len`` and slicing).
        window_size: Number of consecutive terms in each window. Defaults
            to 5.

    Returns:
        A list of slices of ``terms``. The list is empty when ``terms`` holds
        fewer than ``window_size`` items.

    Raises:
        ValueError: If ``window_size`` is less than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # A negative last_start yields an empty range, i.e. no windows at all.
    last_start = len(terms) - window_size
    return [terms[start:start + window_size] for start in range(last_start + 1)]

def google_news_search(keywords, date):
    """Search Google News for ``keywords`` over the day leading up to ``date``.

    The search window runs from the day before ``date`` to ``date`` itself.
    Each entry's description is passed through BeautifulSoup so that only its
    text content is kept.

    Args:
        keywords: Iterable of strings; joined with single spaces to form the
            search query.
        date: End of the search window, as an ``MM/DD/YYYY`` string.

    Returns:
        A list with one dict per returned entry, each holding the keys
        ``title``, ``link``, ``published_date`` and ``description``.

    Raises:
        ValueError: If ``date`` does not match the ``MM/DD/YYYY`` format.
    """
    # Parse the date and derive the one-day window that ends on it
    start_date = datetime.datetime.strptime(date, '%m/%d/%Y')
    prev_date = start_date - datetime.timedelta(days=1)

    # Both bounds are handed to the search as 'YYYY-MM-DD' strings
    prev_date_str = prev_date.strftime('%Y-%m-%d')
    start_date_str = start_date.strftime('%Y-%m-%d')

    query = ' '.join(keywords)
    print(f"Searching for: {query} within the date range {prev_date_str}-{start_date_str}")

    # Create GoogleNews instance with language 'en' and country 'US'
    gn = GoogleNews(lang='en', country='US')

    # Date-bounded search: `when` is left unset so the from_/to_ bounds apply
    search = gn.search(query, when=None, from_=prev_date_str, to_=start_date_str)

    news_data = []
    for entry in search['entries']:
        # Strip the markup from the description, keeping only its text
        description_html = entry.get('description', '')
        description_text = BeautifulSoup(description_html, 'html.parser').get_text()

        news_data.append({
            "title": entry['title'],
            "link": entry['link'],
            "published_date": entry['published'],
            "description": description_text,
        })

    print("News data:", news_data)

    return news_data


# Load the CSV file into a DataFrame; the loop below reads its
# 'Spike Date' and 'Terms' columns.
input_file = r'D:\Research\Python\Data\WIP\Google Search Results\Search_Terms_Hashtags.csv'
df = pd.read_csv(input_file)

# Results are written next to the input file, so derive the directory from it
# instead of repeating the path literal.
output_dir = os.path.dirname(input_file)

# Column layout of every output CSV. google_news_search() only supplies
# title/link/published_date/description, so 'source', 'author' and 'region'
# come out as empty columns.
result_columns = ['Spike Date', 'combination', 'title', 'link', 'published_date', 'source', 'description', 'author', 'region']

# Process each row independently: one set of searches and one CSV per row
for row_index, row in df.iterrows():
    spike_date = row['Spike Date']
    raw_terms = row['Terms']

    # Blank cells are read as NaN, which has no .split()/.replace(); skip
    # such rows rather than aborting the whole run.
    if pd.isna(spike_date) or pd.isna(raw_terms):
        print(f"Skipping row {row_index}: missing 'Spike Date' or 'Terms'")
        continue

    terms = raw_terms.split()  # Assuming the terms are separated by spaces
    term_combinations = generate_combinations(terms)

    # Collect the search results of every term combination for this row only
    results_data = []
    for combination in term_combinations:
        news_data = google_news_search(combination, spike_date)

        # Tag each hit with the row's Spike Date and the combination that found it
        combination_text = ' '.join(combination)
        for item in news_data:
            item['Spike Date'] = spike_date
            item['combination'] = combination_text
        results_data.extend(news_data)

    # Create a DataFrame from the results data
    results_df = pd.DataFrame(results_data, columns=result_columns)

    # NOTE(review): the file name depends only on the Spike Date, so rows that
    # share a date overwrite each other's output - confirm dates are unique.
    safe_date = spike_date.replace("/", "-")
    output_file = os.path.join(output_dir, f'Spike_Date_{safe_date}.csv')
    results_df.to_csv(output_file, index=False)

Searching for: healthcare Medicare Medicaid COVID19 Fraud within the date range 2021-01-06-2021-01-07
News data: [{'title': 'Northwell Health rescinds more than 2500 patient medical bill lawsuits - Healthcare Finance News', 'link': 'https://news.google.com/rss/articles/CBMibGh0dHBzOi8vd3d3LmhlYWx0aGNhcmVmaW5hbmNlbmV3cy5jb20vbmV3cy9ub3J0aHdlbGwtaGVhbHRoLXJlc2NpbmRzLW1vcmUtMjUwMC1wYXRpZW50LW1lZGljYWwtYmlsbC1sYXdzdWl0c9IBAA?oc=5', 'published_date': 'Wed, 06 Jan 2021 08:00:00 GMT', 'description': 'Northwell Health rescinds more than 2500 patient medical bill lawsuits\xa0\xa0Healthcare Finance News'}, {'title': 'New Direct-Contracting Options Raise Concerns with Some Value ... - Home Health Care News', 'link': 'https://news.google.com/rss/articles/CBMieWh0dHBzOi8vaG9tZWhlYWx0aGNhcmVuZXdzLmNvbS8yMDIxLzAxL25ldy1kaXJlY3QtY29udHJhY3Rpbmctb3B0aW9ucy1yYWlzZS1jb25jZXJucy13aXRoLXNvbWUtdmFsdWUtYmFzZWQtY2FyZS12ZXRlcmFucy_SAQA?oc=5', 'published_date': 'Wed, 06 Jan 2021 08:00:00 GMT', 'description': 'N