In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import savgol_filter, find_peaks
import time
import os
import json
import re

from pytrends.request import TrendReq
from serpapi import GoogleSearch
from requests.exceptions import RequestException
from fuzzywuzzy import fuzz



In [3]:
# Create an instance of the pytrends request object
# This single client is shared by get_trends_data() further down.
# NOTE(review): per the pytrends README, `tz` is a timezone offset in minutes
# (360 = US Central); the queries below use geo="PH" — confirm this is intended.
pytrends = TrendReq(hl='en-US', tz=360)

ReadTimeout: HTTPSConnectionPool(host='trends.google.com', port=443): Read timed out. (read timeout=2)

In [None]:
# Function to fetch Google Trends interest-over-time data for a keyword, country, and timeframe
def get_trends_data(keyword, country_code, timeframe):
    """Fetch and print the interest-over-time series for a single keyword.

    Uses the module-level ``pytrends`` client.

    Args:
        keyword: Search term passed to Google Trends as a one-element keyword list.
        country_code: Value forwarded as ``geo`` (e.g. ``"PH"``).
        timeframe: Value forwarded as ``timeframe`` (e.g. ``"2020-12-05 2021-08-31"``).

    Returns:
        The DataFrame returned by ``pytrends.interest_over_time()`` (possibly
        empty), or ``None`` when the request failed. Failures are reported on
        stdout instead of being raised.
    """
    try:
        # build_payload is inside the try as well, so a failure while preparing
        # the request is reported the same way as a failure while fetching.
        pytrends.build_payload([keyword], cat=0, timeframe=timeframe, geo=country_code, gprop='')
        interest_over_time_df = pytrends.interest_over_time()
    except Exception as e:
        print(f"Error fetching data: {e}")
        return None

    if not interest_over_time_df.empty:
        print(f"Interest over time data for '{keyword}' in {country_code} during {timeframe}:")
        print(interest_over_time_df)
    else:
        print(f"No interest over time data found for '{keyword}' in {country_code} during {timeframe}.")

    return interest_over_time_df

In [None]:
# Smoke-test get_trends_data with one query.
#   keyword      -> case-sensitive search term
#   country_code -> Philippines country code
#   timeframe    -> time period for the data
keyword, country_code, timeframe = "COVID-19", "PH", "2020-12-05 2021-08-31"

get_trends_data(keyword=keyword, country_code=country_code, timeframe=timeframe)

Error fetching data: The request failed: Google returned a response with code 429


In [None]:
def get_related_queries(term, start_date, end_date, api_key, tz=0, geo='US', category=0, gprop='', delay=2):
    """Fetch Google Trends related queries for ``term`` through SerpApi and save the raw response.

    Args:
        term: Search term sent as ``q``; also used to build the output filename.
        start_date: Start of the date range; joined with ``end_date`` by a space.
        end_date: End of the date range.
        api_key: SerpApi key sent with the request.
        tz: Forwarded as ``tz``.
        geo: Forwarded as ``geo``.
        category: Forwarded as ``cat``.
        gprop: Forwarded as ``gprop``.
        delay: Seconds to sleep after the request, to avoid API rate limits.

    Returns:
        ``(filename, related_queries)`` where ``filename`` is the JSON file the
        full response was written to and ``related_queries`` is the list under
        ``related_queries -> top`` (``[]`` when absent). On any failure,
        including a response body that carries an ``error`` field, returns
        ``(None, [])`` and prints the reason.
    """
    # Search parameters
    search_params = {
        "engine": "google_trends",
        "data_type": "RELATED_QUERIES",
        "q": term,
        "date": f"{start_date} {end_date}",
        "tz": tz,
        "geo": geo,
        "cat": category,
        "gprop": gprop,
        "api_key": api_key
    }

    # Make the API request
    try:
        search = GoogleSearch(search_params)
        result = search.get_dict()
        time.sleep(delay)  # To avoid API rate limits

        # A response that carries an "error" field is a failed request; do not
        # save it or report it as a success.
        error_message = result.get("error")
        if error_message:
            print(f"Failed to fetch related queries for '{term}': {error_message}")
            return None, []

        # Save the full API response to a JSON file
        filename = f"related_queries_response_{term}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)

        print(f"Success: Full API response for '{term}' saved to {filename}")
        related_queries = result.get("related_queries", {}).get("top", [])

        return filename, related_queries
    except Exception as e:
        print(f"Failed to fetch related queries for '{term}': {e}")
        return None, []

def merge_json_files(file_list, output_filename="merged_related_queries.json"):
    """Merge per-term response files into one ``{term: top_related_queries}`` JSON file.

    The term is recovered from each file's base name. Files named
    ``related_queries_response_<term>.json`` yield ``<term>`` in full, including
    any underscores inside the term. Any other name falls back to the text
    after the last underscore of the base name.

    Args:
        file_list: Paths of JSON response files to merge. Missing files are skipped.
        output_filename: Path the merged mapping is written to.

    Returns:
        The merged dict. It is returned even when writing the output file fails.
    """
    response_prefix = "related_queries_response_"
    merged_data = {}

    for file_name in file_list:
        if not os.path.exists(file_name):
            print(f"File '{file_name}' not found. Skipping...")
            continue

        try:
            with open(file_name, "r") as f:
                data = json.load(f)

            # Extract term from filename. Only the base name is inspected, so
            # underscores in directory names cannot leak into the term.
            stem = os.path.basename(file_name)
            if stem.endswith(".json"):
                stem = stem[:-len(".json")]
            if stem.startswith(response_prefix):
                term = stem[len(response_prefix):]
            else:
                term = stem.split("_")[-1]

            # Extract related queries
            related_queries_data = data.get("related_queries", {}).get("top", [])
            merged_data[term] = related_queries_data

            print(f"Success: Extracted data from '{file_name}'")
        except Exception as e:
            print(f"Failed to process file '{file_name}': {e}")

    # Save the merged JSON to a new file
    try:
        with open(output_filename, "w") as f:
            json.dump(merged_data, f, indent=2)

        print(f"Success: Merged JSON saved to {output_filename}")
    except Exception as e:
        print(f"Failed to save merged JSON: {e}")

    return merged_data

# Application
if __name__ == "__main__":
    # The SerpApi key is read from the environment so that no credential is
    # stored in the notebook.
    # NOTE(review): a literal key was previously committed in this cell; it
    # should be treated as leaked and rotated.
    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        print("SERPAPI_API_KEY is not set. Export it before running this cell.")
        raise SystemExit(1)

    # 1. Ask the user how many words they want to search
    try:
        count = int(input("How many search terms do you want to query? "))
    except ValueError:
        print("Invalid input. Please enter an integer.")
        # Raise SystemExit directly instead of calling the interactive `exit` helper.
        raise SystemExit(1) from None

    # 2. Ask the user for those words (blank entries are ignored)
    terms = []
    for i in range(count):
        term_input = input(f"Enter term {i+1}: ").strip()
        if term_input:
            terms.append(term_input)
        else:
            print(f"Term {i+1} is empty. Skipping...")

    # Stop here when there is nothing to query, so that an existing merged file
    # is not overwritten with an empty mapping.
    if not terms:
        print("No search terms provided. Nothing to do.")
        raise SystemExit(1)

    # Define your default parameters
    start_date = "2021-09-01"
    end_date = "2022-05-28"

    # Optional parameters
    time_zone = 0  # UTC
    location = 'PH'  # Philippines location code
    category = 0  # Default to all categories
    search_property = ''  # Default to web search

    # Step 1: Fetch related queries for each user-provided term
    json_files = []
    for term in terms:
        print(f"\nFetching related queries for '{term}'...")
        filename, queries = get_related_queries(
            term,
            start_date,
            end_date,
            api_key,
            tz=time_zone,
            geo=location,
            category=category,
            gprop=search_property
        )
        if filename:
            json_files.append(filename)
        if queries:
            print(f"Success: Found related queries for '{term}'.")
        else:
            print(f"No related queries found for '{term}'.")

    # Step 2: Merge JSON files
    print("\nMerging all JSON files...")
    merge_json_files(json_files)



Fetching related queries for 'vaccine'...


KeyboardInterrupt: 

In [None]:
def extract_queries_from_merged_file(filename):
    """Return ``{term: [query strings]}`` read from the merged JSON file.

    The file is expected to map each term to a list of objects that each have
    a ``"query"`` key; only that field is kept.

    NOTE: a later cell in this notebook defines a function with the same name
    that returns one flat list instead; whichever cell ran last wins.
    """
    with open(filename, "r") as merged_file:
        merged = json.load(merged_file)

    return {
        term: [entry["query"] for entry in entries]
        for term, entries in merged.items()
    }

# Application
if __name__ == "__main__":
    # Merged JSON file produced by merge_json_files
    filename = "merged_related_queries.json"

    # term -> list of query strings
    queries_by_term = extract_queries_from_merged_file(filename)

    # One heading per term followed by one query per line
    for term, queries in queries_by_term.items():
        report_lines = [f"\nQueries for '{term}':", *map(str, queries)]
        print("\n".join(report_lines))



Queries for 'vaccine':
covid vaccine
vaccine registration
philippines vaccine
covid 19
covid 19 vaccine
vaccine pfizer
pfizer
covid vaccine near me
manila vaccine
vaccine for covid
what is vaccine
covid-19
vaccine side effects
covid-19 vaccine
booster vaccine
moderna
moderna vaccine
vaccination
vaccine certificate
vaccine meaning
covid vaccine registration
covid vaccine philippines
sinovac
astrazeneca
covid vaccine manila

Queries for 'bakuna':
sa
bakuna baguio
bakuna sa covid
covid 19
covid-19
vaccine
ano ang bakuna
ano ang
bakuna registration
bakuna baguio gov ph
mga bakuna sa covid 19
bakuna kontra covid-19
bakuna sa pilipinas
bakuna registration baguio
bakuna sa bata
balita
bakuna meaning
bakuna kahulugan
kahalagahan ng bakuna
vaccination
bakuna laban sa covid 19
bakuna makati
side effects ng bakuna sa covid-19
kahalagahan ng bakuna kontra covid 19 paggamit ng alinman sa uri ng pangungusap
bakuna baguio gov ph master list

Queries for 'COVID':
covid 19
vaccine
covid vaccine
covid 

In [None]:
def extract_queries_from_merged_file(filename):
    """Return every query string in the merged JSON file as one flat list.

    Queries are taken term by term in file order; duplicates across terms are kept.

    NOTE: this redefines the earlier function of the same name, which returned
    a dict keyed by term.
    """
    with open(filename, "r") as merged_file:
        merged = json.load(merged_file)

    return [entry["query"] for entries in merged.values() for entry in entries]

def load_health_terms(filename):
    """Read one health-seeking term per line, stripped, skipping blank lines."""
    with open(filename, "r") as handle:
        stripped_lines = (line.strip() for line in handle)
        return [term for term in stripped_lines if term]

def load_stopwords(filename):
    """Read one stopword per line, stripped and lower-cased, skipping blank lines."""
    with open(filename, "r") as handle:
        return [word.lower() for word in map(str.strip, handle) if word]

def clean_query(query, stopwords):
    """Remove whole-word occurrences of every stopword from ``query``.

    Matching is case-insensitive. Stopwords are treated as literal text, so
    characters such as ``.`` or ``+`` have no regex meaning. They are applied
    longest first, so a short stopword cannot break up a longer one that
    contains it (e.g. "covid" inside "covid-19"); this also makes the result
    independent of the iteration order of a set.

    Args:
        query: The raw query string.
        stopwords: Iterable of stopword strings; empty strings are ignored.

    Returns:
        The query with stopwords removed and whitespace collapsed to single spaces.
    """
    ordered_stopwords = sorted(stopwords, key=lambda word: (-len(word), word))
    for stopword in ordered_stopwords:
        if not stopword:
            continue
        # Lookarounds behave like \b for stopwords that start and end with a
        # word character, and also work when a stopword ends in punctuation.
        pattern = rf'(?<!\w){re.escape(stopword)}(?!\w)'
        query = re.sub(pattern, '', query, flags=re.IGNORECASE)
    # Remove any extra whitespace and return the cleaned query
    return ' '.join(query.split())

def filter_relevant_queries(queries, health_terms, stopwords, threshold=70):  # THRESHOLD
    """Keep the cleaned form of each query that fuzzily matches a health term.

    Each query is cleaned with ``clean_query``; queries that become empty are
    dropped. A cleaned query is kept (once) as soon as ``fuzz.ratio`` between
    it and any health term, both lower-cased, reaches ``threshold``.

    Returns:
        List of cleaned queries in input order; duplicates are not removed.
    """
    matches = []
    for raw_query in queries:
        candidate = clean_query(raw_query, stopwords)
        if not candidate:
            # Nothing left after stopword removal
            continue

        lowered = candidate.lower()
        # any() stops at the first health term that reaches the threshold
        if any(fuzz.ratio(lowered, term.lower()) >= threshold for term in health_terms):
            matches.append(candidate)
    return matches


In [None]:
# Application
if __name__ == "__main__":
    # Input files
    json_filename = "merged_related_queries.json"  # merged related-queries JSON
    health_terms_filename = "covid_health_seeking_terms.txt"  # one term per line
    stopwords_filename = "stopwords.txt"  # extra stopwords, one per line

    # Stopwords that are always stripped from the queries
    default_stopwords = ["covid", "covid19", "coronavirus", "covid-19", "sars-cov-2"]

    # Load queries, health terms and the additional stopwords
    all_queries = extract_queries_from_merged_file(json_filename)
    health_terms = load_health_terms(health_terms_filename)
    additional_stopwords = load_stopwords(stopwords_filename)

    # Union of default and file-provided stopwords
    stopwords = {*default_stopwords, *additional_stopwords}

    # Queries that match a health term, then the same list de-duplicated and sorted
    relevant_queries = filter_relevant_queries(all_queries, health_terms, stopwords)
    unique_relevant_queries = sorted(set(relevant_queries))

    # 'words' is the name later cells use for the final term list
    words = unique_relevant_queries

    # Report: every match first, then the unique matches
    report_sections = (
        ("Relevant Health-Seeking Queries:", relevant_queries),
        ("\nUnique Relevant Health-Seeking Queries:", words),
    )
    for heading, section_queries in report_sections:
        print(heading)
        for query in section_queries:
            print(query)

    # Raw list form of 'words'
    print("\nWords List:")
    print(words)

Relevant Health-Seeking Queries:
vaccine
vaccine
vaccine near
vaccine
vaccine
vaccine side effects
vaccine
bakuna sa
vaccine
kahalagahan ng bakuna
side effects ng bakuna sa
vaccine
vaccine
cases
pandemic
pandemic
symptoms
vaccine
vaccine near
vaccine
test
cases
pandemic
pandemic
pandemic
pandemic
pandemic
pandemic essay
time pandemic
pandemic
pandemic
bureau quarantine
quarantine
home quarantine
quarantine meaning
pandemic
quarantine status
quarantine hotel
quarantine period
quarantine tagalog
ofw quarantine
quarantine certificate

Unique Relevant Health-Seeking Queries:
bakuna sa
bureau quarantine
cases
home quarantine
kahalagahan ng bakuna
ofw quarantine
pandemic
pandemic essay
quarantine
quarantine certificate
quarantine hotel
quarantine meaning
quarantine period
quarantine status
quarantine tagalog
side effects ng bakuna sa
symptoms
test
time pandemic
vaccine
vaccine near
vaccine side effects

Words List:
['bakuna sa', 'bureau quarantine', 'cases', 'home quarantine', 'kahalagahan n

In [None]:
# Show the term list computed by the previous cell
print(words)

# Hard-coded replacement for `words`: from here on the notebook uses this list,
# not the one computed above.
# NOTE(review): provenance of these terms is not visible in this notebook — confirm.
words = [
    'cases',
    'doh',
    'enhance quarantine',
    'general quarantine',
    'home quarantine',
    'manila quarantine',
    'mask',
    'observe social distancing',
    'pandemic',
    'quarantine',
    'quarantine meaning',
    'quarantine pass',
    'quarantine tagalog',
    'social distance',
    'social distancing',
    'social distancing distance',
    'social distancing guidelines',
    'social distancing meaning',
    'social distancing meter',
    'social distancing pandemic',
    'social distancing philippines',
    'social distancing poster',
    'social distancing sign',
    'social distancing signs',
    'social distancing tagalog',
    'symptoms',
    'test',
    'vaccine',
]

['bakuna sa', 'bureau quarantine', 'effect pandemic', 'essay pandemic', 'home quarantine', 'manila quarantine', 'ofw quarantine', 'pandemic essay', 'quarantine certificate', 'quarantine hotel', 'quarantine meaning', 'quarantine pass', 'quarantine status', 'quarantine tagalog', 'time pandemic']


In [None]:
# Lines of the health-seeking terms file, as a set for fast membership tests
with open("covid_health_seeking_terms.txt", "r") as f:
    words_to_remove = {line for line in f.read().splitlines()}


# Keep only the words that do not appear verbatim in that file
filtered_words = list(filter(lambda word: word not in words_to_remove, words))

print(filtered_words)

['enhance quarantine', 'general quarantine', 'home quarantine', 'manila quarantine', 'observe social distancing', 'quarantine meaning', 'quarantine pass', 'quarantine tagalog', 'social distance', 'social distancing distance', 'social distancing guidelines', 'social distancing meaning', 'social distancing meter', 'social distancing pandemic', 'social distancing philippines', 'social distancing poster', 'social distancing sign', 'social distancing signs', 'social distancing tagalog']


In [None]:
# Rebind `words` to the first 15 filtered terms (fewer if the list is shorter).
# NOTE(review): the limit of 15 is not explained in the notebook — confirm.
words = filtered_words[:15]

In [None]:
# Display the truncated term list
words

['enhance quarantine',
 'general quarantine',
 'home quarantine',
 'manila quarantine',
 'observe social distancing',
 'quarantine meaning',
 'quarantine pass',
 'quarantine tagalog',
 'social distance',
 'social distancing distance',
 'social distancing guidelines',
 'social distancing meaning',
 'social distancing meter',
 'social distancing pandemic',
 'social distancing philippines']

In [None]:
# Split `words` into batches that overlap by one element: the last term of each
# batch (index 4, 8, 12) is also the first term of the next one.
# NOTE(review): presumably the shared term is the `common_word` used to rescale
# batches in scale_and_merge — confirm.
# The names skip C and E, which the later merge cell uses for merged frames (df_C, df_E).
set_A = words[:5]
set_B = words[4:9]
set_D = words[8:13]
# The slice asks for indices 12-15; with the 15-item list above it holds 3 terms.
set_F = words[12:16]

In [None]:
# One batch per line, in order A, B, D, F
print(set_A, set_B, set_D, set_F, sep="\n")

['enhance quarantine', 'general quarantine', 'home quarantine', 'manila quarantine', 'observe social distancing']
['observe social distancing', 'quarantine meaning', 'quarantine pass', 'quarantine tagalog', 'social distance']
['social distance', 'social distancing distance', 'social distancing guidelines', 'social distancing meaning', 'social distancing meter']
['social distancing meter', 'social distancing pandemic', 'social distancing philippines']


In [70]:
# Load the three trend exports; the first line of each file is skipped.
df_A, df_B, df_D = (
    pd.read_csv(csv_path, skiprows=1)
    for csv_path in ('T3_5_A.csv', 'T3_5_B.csv', 'T3_5_D.csv')
)
# df_F = pd.read_csv('3_set_F.csv', skiprows=1)


In [71]:
def clean_df(df):
    """Normalise a trend export in place and return the same DataFrame.

    Steps:
      * strip the ``': (Philippines)'`` suffix from every column after the first;
      * rename the ``Day`` column to ``date`` and parse it as ``%Y-%m-%d`` datetimes;
      * replace the ``"<1"`` marker with ``1`` and convert every non-date column
        to a numeric dtype, so later ``.mean()`` and scaling work on numbers.

    Args:
        df: Frame with a ``Day`` (or already renamed ``date``) column.

    Returns:
        ``df`` itself, mutated in place; existing callers ignore the return value.

    Raises:
        KeyError: if there is no ``date`` column after renaming.
        ValueError: if a date or a value cannot be parsed.
    """
    df.columns = [df.columns[0]] + [col.replace(': (Philippines)', '') for col in df.columns[1:]]
    df.rename(columns={"Day": "date"}, inplace=True)

    # The parsed dates are stored back on the frame.
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

    for col in df.columns:
        if col == 'date':
            continue
        # A column containing "<1" is read as strings, so replacing the marker
        # alone would leave a mix of str and int; convert the whole column.
        df[col] = pd.to_numeric(df[col].replace("<1", 1))
    return df

In [72]:
# Inspect the first export as loaded, before clean_df is applied in a later cell
df_A

Unnamed: 0,Day,time pandemic: (Philippines),pandemic essay: (Philippines),bakuna: (Philippines),bureau quarantine: (Philippines),home quarantine: (Philippines)
0,2022-01-13,54,38,63,33,17
1,2022-01-14,42,31,63,25,11
2,2022-01-15,44,37,40,25,12
3,2022-01-16,45,31,48,17,14
4,2022-01-17,44,28,63,25,11
...,...,...,...,...,...,...
131,2022-05-24,50,30,19,24,0
132,2022-05-25,43,28,13,23,5
133,2022-05-26,52,33,19,20,0
134,2022-05-27,42,27,15,12,0


In [73]:
# clean_df mutates each frame in place, so the return values are not reassigned.
# The last call is the cell's final expression, so the cleaned df_D is displayed.
clean_df(df_A)
clean_df(df_B)
clean_df(df_D)
# clean_df(df_F)

Unnamed: 0,date,ofw quarantine,quarantine tagalog
0,2022-01-13,0,0
1,2022-01-14,0,0
2,2022-01-15,78,0
3,2022-01-16,0,0
4,2022-01-17,67,0
...,...,...,...
131,2022-05-24,0,0
132,2022-05-25,0,0
133,2022-05-26,0,0
134,2022-05-27,0,0


In [75]:
def scale_and_merge(df_base, df_to_scale, common_word):
    """Rescale ``df_to_scale`` to the level of ``df_base`` and outer-merge them on ``date``.

    The multiplier is ``mean(df_base[common_word]) / mean(df_to_scale[common_word])``,
    capped at 1. It is printed, applied to every non-date column of a copy of
    ``df_to_scale`` (rounded to int, clipped to at most 100), and the duplicate
    ``common_word`` column is dropped before merging.

    Args:
        df_base: Reference frame; must contain ``date`` and ``common_word``.
        df_to_scale: Frame to rescale; must contain ``date`` and ``common_word``.
            It is not modified.
        common_word: Name of the column present in both frames.

    Returns:
        The outer merge of ``df_base`` and the rescaled frame on ``date``.
    """
    avg_base = df_base[common_word].mean()
    avg_scale = df_to_scale[common_word].mean()

    if pd.isna(avg_base) or pd.isna(avg_scale) or avg_scale == 0:
        # No usable ratio (empty column, or the shared term is all zeros in the
        # frame being scaled): leave the values unscaled.
        multiplier = 1
    else:
        multiplier = min(avg_base / avg_scale, 1)
    print(multiplier)

    # Work on a copy so re-running the cell does not rescale the caller's frame again.
    scaled = df_to_scale.copy()
    # Select value columns by name so the position of 'date' does not matter.
    value_cols = [col for col in scaled.columns if col != 'date']
    scaled[value_cols] = (scaled[value_cols] * multiplier).round().astype(int).clip(upper=100)
    scaled = scaled.drop(columns=[common_word])

    return pd.merge(df_base, scaled, how='outer', on='date')

# Chain the batches: A + B -> C, then C + D -> E, each step rescaled through
# the term that both inputs contain.
df_C = scale_and_merge(df_A, df_B, common_word='home quarantine')
df_E = scale_and_merge(df_C, df_D, common_word='ofw quarantine')
# df_G = scale_and_merge(df_E, df_F, common_word='time pandemic')

# Make sure the merge key is datetime on the trends side
df_E = df_E.assign(date=pd.to_datetime(df_E['date']))

# Daily case counts, with the merge key parsed the same way
df_cases = pd.read_csv('DOH_cases_T3_5.csv')
df_cases = df_cases.assign(date=pd.to_datetime(df_cases['date'], format='%Y-%m-%d'))

# Final merge
# NOTE(review): the displayed result contains an 'Unnamed: 0' column next to
# new_cases; it looks like a saved index from the cases CSV — confirm whether
# downstream consumers expect it.
df_FINAL = df_E.merge(df_cases, how='outer', on='date')
df_FINAL

1
0.9894736842105264


Unnamed: 0.1,date,time pandemic,pandemic essay,bakuna,bureau quarantine,home quarantine,quarantine meaning,quarantine hotel,quarantine certificate,ofw quarantine,quarantine tagalog,Unnamed: 0,new_cases
0,2022-01-13,54,38,63,33,17,10,16,0,0,0,136,28966.0
1,2022-01-14,42,31,63,25,11,10,18,0,0,0,137,36573.0
2,2022-01-15,44,37,40,25,12,0,16,0,10,0,138,38370.0
3,2022-01-16,45,31,48,17,14,13,14,0,0,0,139,36560.0
4,2022-01-17,44,28,63,25,11,9,11,0,8,0,140,36505.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,2022-05-24,50,30,19,24,0,6,0,0,0,0,267,145.0
132,2022-05-25,43,28,13,23,5,0,0,0,0,0,268,170.0
133,2022-05-26,52,33,19,20,0,6,0,0,0,0,269,194.0
134,2022-05-27,42,27,15,12,0,0,0,0,0,0,270,202.0


In [76]:
# Rows present on only one side of the outer merge get 0 instead of NaN
df_FINAL = df_FINAL.fillna(0)

In [77]:
# Write the merged table, without the index, to a path relative to the
# notebook's working directory.
df_FINAL.to_csv("../rnn/google_T3_5.csv", index=False)