In [None]:
!pip install pytrends

Collecting pytrends
  Downloading pytrends-4.9.2-py3-none-any.whl.metadata (13 kB)
Downloading pytrends-4.9.2-py3-none-any.whl (15 kB)
Installing collected packages: pytrends
Successfully installed pytrends-4.9.2


In [None]:
from pytrends.request import TrendReq
from google.colab import drive
import pandas as pd
import datetime
import time
import random

In [None]:
# Mount Google Drive into the Colab filesystem so the project folder is reachable.
drive.mount('/content/drive')
# Base folder for every file this notebook reads or writes. Keep the trailing '/':
# file names are appended to it with plain string concatenation.
path = '/content/drive/MyDrive/Volatility/New_data_2000_2024/'

Mounted at /content/drive


In [None]:
# Read daily_garch_data.csv from the project folder.
# NOTE(review): the name `daily_data` is rebound further down to the interpolated
# Google Trends series, so this frame is only used for the index inspection that
# follows; `filename` is likewise reused for the output file at the end.
filename = 'daily_garch_data.csv'
daily_data = pd.read_csv(path + filename)

In [None]:
# Promote the 'Date' column to the row index. set_index moves the column out of
# the frame, so no separate drop is required, and the result is a new frame
# rather than an in-place mutation.
daily_data = daily_data.set_index('Date')

In [None]:
daily_data.index

Index(['2000-02-02', '2000-02-03', '2000-02-04', '2000-02-07', '2000-02-08',
       '2000-02-09', '2000-02-10', '2000-02-11', '2000-02-14', '2000-02-15',
       ...
       '2024-08-19', '2024-08-20', '2024-08-21', '2024-08-22', '2024-08-23',
       '2024-08-26', '2024-08-27', '2024-08-28', '2024-08-29', '2024-08-30'],
      dtype='object', name='Date', length=6184)

In [None]:
# Initialize pytrends
# Shared TrendReq session; fetch_gt_data reads this module-level name when it
# issues requests. hl and tz are forwarded to TrendReq unchanged.
# NOTE(review): pytrends documents tz as an offset in minutes - confirm 360 is intended.
pytrends = TrendReq(hl='en-US', tz=360)

# Define the keyword and timeframe
# Search term and YYYY-MM-DD bounds handed to fetch_gt_data.
keyword = 'S&P 500'
start_date = '2009-12-01'
end_date = '2024-09-19'

In [None]:
# Function to fetch Google Trends data
def _gt_chunk_boundaries(start_date, end_date, align_to_year_end=True):
    """Return the ordered timestamps that delimit the 5-year fetch windows."""
    end_ts = pd.Timestamp(end_date)
    if align_to_year_end:
        # Same grid the legacy '5Y' alias produced (every fifth Dec-31 on or after
        # start_date), spelled as an offset object because the 'Y' alias is
        # deprecated in recent pandas releases.
        step = pd.offsets.YearEnd(5)
    else:
        step = pd.DateOffset(years=5)
    boundaries = pd.date_range(start=start_date, end=end_ts, freq=step).tolist()
    # Close the last window at end_date, but never add a zero-length window when
    # end_date already coincides with the final boundary.
    if boundaries and boundaries[-1] < end_ts:
        boundaries.append(end_ts)
    return boundaries


def fetch_gt_data(keyword, start_date, end_date, *, client=None, max_retries=10,
                  retry_delay=(30, 60), align_to_year_end=True):
    """Download Google Trends interest-over-time for one keyword in 5-year chunks.

    Parameters
    ----------
    keyword : str
        Search term to query.
    start_date, end_date : str
        Bounds of the requested period (``YYYY-MM-DD``).
    client : object, optional
        Object exposing ``build_payload(kw_list, timeframe=...)`` and
        ``interest_over_time()``. Defaults to the notebook-level ``pytrends``
        session.
    max_retries : int, default 10
        Attempts per window before giving up.
    retry_delay : tuple of int, default (30, 60)
        Inclusive bounds, in seconds, of the random pause between attempts.
    align_to_year_end : bool, default True
        If True (the historical behaviour), window boundaries fall on every fifth
        31 December on or after ``start_date``, so data between ``start_date`` and
        the first such boundary is not requested. If False, windows start exactly
        at ``start_date`` and advance in 5-year steps.

    Returns
    -------
    pandas.DataFrame
        The per-window frames concatenated in chronological window order, or an
        empty frame when no window produced data. Adjacent windows share their
        boundary date, so the caller is expected to de-duplicate the index.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1.
    RuntimeError
        If a window still fails after ``max_retries`` attempts; the last
        underlying exception is chained as the cause.

    Notes
    -----
    NOTE(review): Google Trends is understood to scale each request relative to
    the peak inside that request's own timeframe, so values from different
    windows may not share a common scale - confirm whether the downstream
    analysis needs cross-window rescaling.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")
    if client is None:
        # Fall back to the TrendReq session created at notebook level.
        client = pytrends

    boundaries = _gt_chunk_boundaries(start_date, end_date, align_to_year_end)

    # Collect the chunks and concatenate once instead of growing a frame in the loop.
    frames = []
    for window_start, window_end in zip(boundaries, boundaries[1:]):
        # Format the timeframe
        timeframe = f"{window_start.strftime('%Y-%m-%d')} {window_end.strftime('%Y-%m-%d')}"
        print(f"Fetching data for: {timeframe}")

        # Bounded retry: a persistent failure surfaces as an error instead of
        # looping forever.
        for attempt in range(1, max_retries + 1):
            try:
                client.build_payload([keyword], timeframe=timeframe)
                data = client.interest_over_time()
            except Exception as e:
                if attempt == max_retries:
                    raise RuntimeError(
                        f"Giving up on {timeframe} after {max_retries} failed attempts"
                    ) from e
                print(f"Error: {e}. Retrying in a few seconds...")
                time.sleep(random.randint(*retry_delay))  # Random pause between attempts
            else:
                if not data.empty:
                    frames.append(data)
                break

    return pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Fetch the data
# Network-bound: on a request error the helper sleeps 30-60 s before retrying,
# so this cell can run for a long time when requests keep failing.
gt_data = fetch_gt_data(keyword, start_date, end_date)

Fetching data for: 2009-12-31 2014-12-31
Fetching data for: 2014-12-31 2019-12-31
Fetching data for: 2019-12-31 2024-09-19


In [None]:
# Keep only the interest series under a fixed column name and discard the
# 'isPartial' flag column. The source column is looked up through `keyword`, so
# this stays in sync with the query configured above, and rebinding (instead of
# mutating in place) lets the cell be re-run without raising.
gt_data = (
    gt_data
    .rename(columns={keyword: 'gt_data'})
    .drop(columns=['isPartial'], errors='ignore')
)

In [None]:
gt_data

Unnamed: 0_level_0,gt_data
date,Unnamed: 1_level_1
2009-12-27,32
2010-01-03,43
2010-01-10,43
2010-01-17,40
2010-01-24,45
...,...
2024-08-18,45
2024-08-25,41
2024-09-01,41
2024-09-08,42


In [None]:
gt_data.head(10)

Unnamed: 0_level_0,gt_data
date,Unnamed: 1_level_1
2009-12-27,32
2010-01-03,43
2010-01-10,43
2010-01-17,40
2010-01-24,45
2010-01-31,47
2010-02-07,39
2010-02-14,34
2010-02-21,34
2010-02-28,39


In [None]:
# Start the weekly series on 2010-01-31, which is the row the former "drop the
# first five rows" left at the top (see the preview above). Filtering on the date
# makes the cell idempotent: re-running it cannot remove further rows.
gt_data = gt_data.loc[gt_data.index >= pd.Timestamp('2010-01-31')].copy()

In [None]:
# End the weekly series on 2024-08-25, i.e. without the 2024-09-01 and 2024-09-08
# rows that the former "drop the last two rows" removed. A date cut-off is
# idempotent and unaffected if a later fetch returns additional weeks.
gt_data = gt_data.loc[gt_data.index <= pd.Timestamp('2024-08-25')].copy()

In [None]:
gt_data

Unnamed: 0_level_0,gt_data
date,Unnamed: 1_level_1
2010-01-31,47
2010-02-07,39
2010-02-14,34
2010-02-21,34
2010-02-28,39
...,...
2024-08-04,80
2024-08-11,46
2024-08-18,45
2024-08-25,41


In [None]:
# Adjacent fetch windows share their boundary date, so the concatenated frame can
# contain the same date twice; keep only the first occurrence of each date.
is_repeat = gt_data.index.duplicated(keep='first')
gt_data = gt_data.loc[~is_repeat]

# Upsample the weekly observations to calendar days and fill the new rows by
# interpolating linearly in time between neighbouring weekly values.
# NOTE(review): this rebinds `daily_data`, replacing the frame that was loaded
# from daily_garch_data.csv earlier in the notebook.
daily_data = gt_data.resample('D').interpolate(method='time')

# Daily data

In [None]:
daily_data

Unnamed: 0_level_0,gt_data
date,Unnamed: 1_level_1
2010-01-31,47.000000
2010-02-01,45.857143
2010-02-02,44.714286
2010-02-03,43.571429
2010-02-04,42.428571
...,...
2024-08-28,41.000000
2024-08-29,41.000000
2024-08-30,41.000000
2024-08-31,41.000000


In [None]:
# Discard the final two calendar days of the interpolated series (positional trim,
# so every re-run of this cell removes two more rows).
# NOTE(review): the preview above ends on 2024-08-31 although the trimmed weekly
# series ends on 2024-08-25, so the stored outputs appear to come from a different
# kernel state - confirm the intended end date and replace this with a date cut-off.
daily_data = daily_data.iloc[:-2].copy()

In [None]:
# Start the daily series on 2010-02-03, the first row the former "drop the first
# three rows" left behind (2010-01-31 .. 2010-02-02 are removed). Slicing by date
# is idempotent, so re-running the cell does not shorten the series further.
daily_data = daily_data.loc['2010-02-03':].copy()

In [None]:
daily_data

Unnamed: 0_level_0,gt_data
date,Unnamed: 1_level_1
2010-02-03,43.571429
2010-02-04,42.428571
2010-02-05,41.285714
2010-02-06,40.142857
2010-02-07,39.000000
...,...
2024-08-26,41.000000
2024-08-27,41.000000
2024-08-28,41.000000
2024-08-29,41.000000


In [None]:
# # Create a date range
# daily_range = pd.date_range(start='2000-02-02', end='2022-12-30', freq='D')

In [None]:
# gt_daily_data = pd.DataFrame(columns=['gt_data'], index=daily_range)

In [None]:
# gt_daily_data

In [None]:
# gt_daily_data.update(daily_data)

In [None]:
# gt_daily_data

In [None]:
# gt_daily_data['gt_data'].fillna(method='bfill', inplace=True)

In [None]:
# gt_daily_data

In [None]:
# Persist the daily Google Trends series to the project folder on Drive.
# `path` ends with '/', so plain string concatenation yields the full file path;
# to_csv writes the date index as the first column by default.
filename = 'gt_daily_data.csv'
daily_data.to_csv(path + filename)

# Weekly data

In [None]:
# gt_weekly_data = gt_daily_data.drop(gt_daily_data.index[:4])

In [None]:
# gt_weekly_data

In [None]:
# weekly_data

In [None]:
# filename = 'gt_weekly_data.csv'
# gt_weekly_data.to_csv(path + filename)

# Monthly data

In [None]:
# gt_monthly_data = gt_data_daily.drop(gt_daily_data.index[:19])

In [None]:
# gt_monthly_data

In [None]:
# monthly_data

In [None]:
# filename = 'gt_monthly_data.csv'
# gt_monthly_data.to_csv(path + filename)