<h1>Preliminary setup</h1>

In [None]:
import yfinance as yf
import ast 
import pandas as pd
from datetime import datetime
import os
import numpy as np
import matplotlib.pyplot as plt

# Tickers grouped by the option style they are scraped under
european = ['^SPX', '^NDX', '^RUT']
american = ['NVDA', 'JNJ', 'XOM']

# Path templates, to be filled in with str.format()
opt_filename = './data/options_daily/raw/{date_dir}/{date_file}_{title}_{type}.csv'
opt_filename_proc = './data/options_daily/proc/{date_dir}/{date_file}_{title}_{type}.csv'
title_filename = './data/title/{title}.csv'

# Every calendar day from 2024-11-11 to 2024-11-29 (both included), formatted as yyyy_mm_dd
dates = [day.strftime('%Y_%m_%d') for day in pd.date_range(start='2024-11-11', end='2024-11-29')]

<h1>Scrape title data</h1>

In [None]:
def scrape_title_data(title, start_date, end_date):
    """Download the price history of ``title`` and save it, with log returns, as CSV.

    Parameters
    ----------
    title : str
        Ticker symbol passed to ``yf.Ticker``.
    start_date, end_date : str
        Bounds forwarded to ``Ticker.history(start=..., end=...)``.

    Returns
    -------
    None
        The data is written to ``title_filename.format(title=title)``; the
        destination folder is created when it does not exist yet.
    """
    history = yf.Ticker(title).history(start=start_date, end=end_date)

    # log_ret(t) = ln(Close(t)) - ln(Close(t-1))
    log_close = np.log(history['Close'])
    history['log_ret'] = log_close - log_close.shift(1)

    # The first row has no previous close, so its log return is NaN: drop it
    history = history.iloc[1:]

    out_path = title_filename.format(title=title)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # to_csv does not create missing folders
        os.makedirs(out_dir, exist_ok=True)
    history.to_csv(out_path)

In [None]:
# Time window of the price history requested for every title
start_date = "2004-11-29"
end_date = "2024-12-02"

# American titles first, then the European ones
for title in [*american, *european]:
    print("Scraping " + title)
    scrape_title_data(title, start_date, end_date)
print("Done")

<h1>Scrape Options Data</h1>

In [None]:
def scrape_options_data(options, today, cutoff_date=None):
    """Download the option chains of several tickers and save them as CSV files.

    For every ticker in ``options``, the chains whose expiration is strictly
    earlier than ``cutoff_date`` are downloaded, tagged with an
    ``expiration_date`` column and written to
    ``./data/options_daily/raw/<today>/<today>_<ticker>_calls.csv`` and
    ``..._puts.csv``.

    Parameters
    ----------
    options : iterable of str
        Ticker symbols passed to ``yf.Ticker``.
    today : str
        Label of the sampling day; used as folder name and as file prefix.
    cutoff_date : datetime, optional
        Only expirations strictly earlier than this date are kept.
        Defaults to 2024-12-31.

    Returns
    -------
    None
        The results are only written to disk.
    """
    if cutoff_date is None:
        cutoff_date = datetime(2024, 12, 31)

    # Make sure the destination folder exists (a pre-existing folder is fine)
    out_dir = os.path.join('./data/options_daily/raw', today)
    os.makedirs(out_dir, exist_ok=True)

    for idx in options:
        ticker = yf.Ticker(idx)

        # Ticker.options lists the available expirations as 'YYYY-MM-DD' strings,
        # so there is no need to provoke an error and parse its message.
        calls_frames = []
        puts_frames = []
        for date in ticker.options:
            if datetime.strptime(date, '%Y-%m-%d') >= cutoff_date:
                continue

            chain = ticker.option_chain(date)
            # assign() returns a copy with the expiration date added as a column
            calls_frames.append(chain.calls.assign(expiration_date=date))
            puts_frames.append(chain.puts.assign(expiration_date=date))

        # Concatenate once per ticker; fall back to an empty frame when nothing
        # expires before the cutoff (pd.concat rejects an empty list)
        all_calls = pd.concat(calls_frames, ignore_index=True) if calls_frames else pd.DataFrame()
        all_puts = pd.concat(puts_frames, ignore_index=True) if puts_frames else pd.DataFrame()

        all_calls.to_csv(os.path.join(out_dir, today + '_' + idx + '_calls.csv'), index=False)
        all_puts.to_csv(os.path.join(out_dir, today + '_' + idx + '_puts.csv'), index=False)

In [None]:
# Today's date formatted as yyyy_mm_dd
today = pd.Timestamp.today().strftime('%Y_%m_%d')

try:
    os.makedirs('./data/options_daily/raw/' + today)
except FileExistsError:
    # The folder of today already exists: treat the scrape as already done and skip it.
    # Only this specific error is handled, so any other failure (e.g. missing
    # permissions) is reported as such, and the session is not terminated with exit().
    print('Data already written for today')
else:
    print('Scraping European options data')
    scrape_options_data(european, today)

    print('Scraping American options data')
    scrape_options_data(american, today)

    print('Scraping completed')

<h1>Take only data until 29/11/2024 </h1>

In [None]:
# For every raw dataset keep only the rows whose expiration_date is on or before
# 2024-11-29 and save the result under the matching path of the proc tree.
for date in dates:
    for idx in european + american:
        for option_type in ['calls', 'puts']:
            path_args = dict(date_dir=date, date_file=date, title=idx, type=option_type)

            df = pd.read_csv(opt_filename.format(**path_args))
            df['expiration_date'] = pd.to_datetime(df['expiration_date'])
            df = df[df['expiration_date'] <= '2024-11-29']

            proc_path = opt_filename_proc.format(**path_args)
            # to_csv does not create missing folders, so make sure proc/<date> exists
            os.makedirs(os.path.dirname(proc_path), exist_ok=True)
            df.to_csv(proc_path, index=False)

print('Done')


<h1>Take for every day only the expiration dates contained in all files (Intersection)</h1>

In [None]:
from functools import reduce

# dates_lists holds one list per sampled day; each of those lists holds, for
# every title, the set of expiration dates found in its processed files.
dates_lists = []
for date in dates:
    day_dates_lists = []
    for idx in european + american:
        # Gather the expirations of the calls and puts files in a single set,
        # so that the dates appearing in both are counted once
        expirations = set()
        for option_type in ['calls', 'puts']:
            proc_path = opt_filename_proc.format(date_dir=date, date_file=date, title=idx, type=option_type)
            df = pd.read_csv(proc_path)
            expirations.update(df['expiration_date'].unique())
        day_dates_lists.append(expirations)

    dates_lists.append(day_dates_lists)

# Note that there is no 2024-11-28 because it's Thanksgiving.
# Expected sizes: len(dates_lists) == 19 (sampled days), len(dates_lists[0]) == 6 (titles)

# For every day keep only the expiration dates shared by all the titles
intersection_lists = []
for day_lists in dates_lists:
    intersection = list(reduce(lambda common, current: common & current, day_lists))
    print(len(intersection))
    intersection_lists.append(intersection)
print(len(intersection_lists))  # Expected 19 as we have sampled 19 days
print(intersection_lists)
print('Done')

<h1>Take only data in the intersection list of expirations</h1>

In [None]:
#For all datasets take only the dates in the instersection list
for i in range(0, len(dates)):
    for idx in european + american:
        for option_type in ['calls', 'puts']:
            df = pd.read_csv(opt_filename_proc.format(date_dir=dates[i], date_file=dates[i], title=idx, type=option_type))
            df = df[df['expiration_date'].isin(intersection_lists[i])]
            df.to_csv(opt_filename_proc.format(date_dir=dates[i], date_file=dates[i], title=idx, type=option_type), index=False)

print('Done')

<h1>Take for each day only the puts and calls with the same last trade</h1>

<h1>Calcolo del tasso privo di rischio</h1>

In [None]:
# Split the treasury-rates dataset into one file per year

dataset_2002_2023 = './data/bond/daily-treasury-rates_2002-2023.csv'
dataset_2024 = './data/bond/daily-treasury-rates_2024.csv'

# Load the 2002-2023 dataset and tag every row with the year of its date
df = pd.read_csv(dataset_2002_2023)
df['Date'] = pd.to_datetime(df['Date'])
df['year'] = df['Date'].dt.year

# Write one dataset per year in the same folder; empty values are written as NaN
for year in range(2002, 2024):
    rows_of_year = df.loc[df['year'].eq(year)]
    # The helper column is not part of the output
    df_year = rows_of_year.drop(columns=['year'])
    df_year.to_csv(f'./data/bond/daily-treasury-rates_{year}.csv', index=False, na_rep='NaN')
    

In [None]:
# Compute the mean of the treasury rates over all the yearly files

# Years 2002..2024 as strings, matching the suffix of the yearly file names
tb_dates = [str(year) for year in range(2002, 2025)]

# Read every yearly dataset, then stack them with a single concat
# (concatenating inside the loop would copy the accumulated frame at every step)
yearly_frames = []
for date in tb_dates:
    df = pd.read_csv(f'./data/bond/daily-treasury-rates_{date}.csv')
    yearly_frames.append(df)
df_all = pd.concat(yearly_frames, ignore_index=True)

# The date is not a rate: drop it before averaging
df_all = df_all.drop(columns=['Date'])

# Column-wise mean, skipping NaN values
means = df_all.mean(skipna=True)

# One-row frame with one column per rate
df_means = pd.DataFrame(means).T

# Output to csv
df_means.to_csv('./data/bond/daily-treasury-rates_means.csv', index=False)

<h1>Calcolo della volatilità di lungo periodo</h1>

In [14]:
# Fit a GARCH(1,1) model in R (fGarch, through rpy2) on the log returns of each title
# and print the estimated coefficients.
# Author's rationale: since market securities are autocorrelated and heteroskedastic
# (i.e. their variance changes over time), a GARCH model can be used to estimate the
# long-run volatility.
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

# Import the fGarch library in R (installing it, and the packages it needs, when missing)
ro.r("""
if (!require(fGarch)) install.packages("fGarch", repos="http://cran.r-project.org")
library(fGarch)

# Carica anche i pacchetti richiesti per ridurre l'avviso
if (!require(fBasics)) install.packages("fBasics", repos="http://cran.r-project.org")
if (!require(timeDate)) install.packages("timeDate", repos="http://cran.r-project.org")
if (!require(timeSeries)) install.packages("timeSeries", repos="http://cran.r-project.org")

library(fBasics)
library(timeDate)
library(timeSeries)
""")

pandas2ri.activate()

# NOTE(review): the two assignments below rebind the module-level `european` and
# `american` lists defined in the setup cell, so any cell run after this one sees
# only '^SPX' - confirm this is intended.
european = ['^SPX']

american = []

for title in european + american:
    # Price history (with the log_ret column) saved by scrape_title_data
    df = pd.read_csv(title_filename.format(title=title))

    # Use all the data up to 31 October 2024 as the training set
    r_data_training = df[df['Date'] <= '2024-10-31']['log_ret']

    # Convert the pandas Series into a DataFrame to make the conversion to R easier
    r_data_training = pd.DataFrame(r_data_training, columns=["log_ret"])

    # Convert the pandas DataFrame into an R object
    r_data_training = pandas2ri.py2rpy(r_data_training)

    # Pass the data to R, under the name `returns`
    ro.globalenv['returns'] = r_data_training

    # Write the script that fits the GARCH model on the real series
    r_script = """
    library(fGarch)

    # Fit GARCH(1,1) con i dati reali
    garch_model <- garchFit(~garch(1, 1), data = returns)

    # Estrai i coefficienti
    coefficients <- coef(garch_model)

    # Restituisci i coefficienti
    coefficients
    """

    # Run the code in R
    ro.r(r_script)

    # Retrieve the results from R
    coefficients = ro.r('coefficients')
    #Returns mu, omega, alpha1, beta1
    print("MU -> " + str(coefficients[0]))
    print("OMEGA -> " + str(coefficients[1]))
    print("ALPHA1 -> " + str(coefficients[2]))
    print("BETA1 -> " + str(coefficients[3]))
    


Series Initialization:
 ARMA Model:                arma
 Formula Mean:              ~ arma(0, 0)
 GARCH Model:               garch
 Formula Variance:          ~ garch(1, 1)
 ARMA Order:                0 0
 Max ARMA Order:            0
 GARCH Order:               1 1
 Max GARCH Order:           1
 Maximum Order:             1
 Conditional Dist:          norm
 h.start:                   2
 llh.start:                 1
 Length of Series:          5014
 Recursion Init:            mci
 Series Scale:             

 0.01210523

Parameter Initialization:
 Initial Parameters:          $params
 Limits of Transformations:   $U, $V
 Which Parameters are Fixed?  $includes
 Parameter Matrix:
                     U           V     params includes
    mu     -0.26293673   0.2629367 0.02629367     TRUE
    omega   0.00000100 100.0000000 0.10000000     TRUE
    alpha1  0.00000001   1.0000000 0.10000000     TRUE
    gamma1 -0.99999999   1.0000000 0.10000000    FALSE
    beta1   0.00000001   1.0000000 0.80000000     TRUE
    delta   0.00000000   2.0000000 2.00000000    FALSE
    skew    0.10000000  10.0000000 1.00000000    FALSE
    shape   1.00000000  10.0000000 4.00000000    FALSE
 Index List of Parameters to be Optimized:
    mu  omega alpha1  beta1 
     1      2      3      5 
 Persistence:                  0.9 


--- START OF TRACE ---
Selected Algorithm: nlminb 

R coded nlminb Solver: 

  0:     6019.3330: 0.0262937 0.100000 0.100000 0.800000
  1:     5869.8823: 0.0262943 0.0730638 0.0985277 0.785939
