<h1>Scrape Options Data</h1>

In [3]:
import yfinance as yf
import ast 
import pandas as pd
from datetime import datetime
import os


def scrape_options_data(options, today):
    
    for idx in options:
        spx = yf.Ticker(idx)

        # get option chain for specific expiration
        try:
            opt = spx.option_chain('0000-00-00')
        except Exception as e:
            list_string = "[" + str(e).split('[')[1]
            list_string = list_string.replace(" ", "")
            list_string = list_string.replace(",", "','")
            list_string = list_string.replace("[", "['")
            list_string = list_string.replace("]", "']")
            option_dates = ast.literal_eval(list_string)
        
        all_calls = pd.DataFrame()
        all_puts = pd.DataFrame()
        
        # Define the cutoff date
        cutoff_date = datetime(2024, 12, 31)
        
        for date in option_dates:
            # Convert date to a datetime object if it's not already one
            if isinstance(date, str):
                date_obj = datetime.strptime(date, '%Y-%m-%d')
            
            if date_obj < cutoff_date:
                opt = spx.option_chain(date)
                
                #Process calls
                call = opt.calls
                call['expiration_date'] = date #add expiration date to the dataframe
                all_calls = pd.concat([all_calls, call], ignore_index=True)
                #all_calls = all_calls[all_calls.isna().sum(axis=1) <= 1]
                #all_calls = all_calls.dropna()
                
                #Process puts
                put = opt.puts
                put['expiration_date'] = date #add expiration_date to the dataframe
                all_puts = pd.concat([all_puts, put], ignore_index=True)
                #all_puts = all_puts[all_puts.isna().sum(axis=1) <= 1]
                #all_puts = all_puts.dropna()
        
        #If doesn't exist, create a data folder
        all_calls.to_csv('./data/raw/' + today + '/' + today + '_' + idx + '_calls.csv', index=False)
        all_puts.to_csv('./data/raw/' + today + '/' + today + '_' + idx + '_puts.csv', index=False)

In [4]:
european = ['^SPX', '^NDX', '^RUT']
#european = ['^NDX']

american = ['NVDA', 'JNJ', 'XOM']

#Get today date in format yyyy_mm_dd
today = pd.Timestamp.today().strftime('%Y_%m_%d')

try:
    os.makedirs('./data/raw/' + today)
except Exception as e:
    print('Data already written for today')
    exit()

print('Scraping European options data')
scrape_options_data(european, today)

print('Scraping American options data')
scrape_options_data(american, today)
    
print('Scraping completed')

Scraping European options data
Scraping American options data
Scraping completed


<h1>Take only data until 29/11/2024 </h1>

In [24]:
#List of dates day by day from 2024_11_12 to 2024_11_29 
dates = pd.date_range(start='2024-11-11', end='2024-11-29').strftime('%Y_%m_%d').tolist()

#Parametric string
filename = './data/raw/{date_dir}/{date_file}_{title}_{type}.csv'

filename_proc = './data/proc/{date_dir}/{date_file}_{title}_{type}.csv'


#For all datasets take only the rows with expiration_date until 2024-11-29
for date in dates:
    for idx in european + american:
        for option_type in ['calls', 'puts']:
            df = pd.read_csv(filename.format(date_dir=date, date_file=date, title=idx, type=option_type))
            df['expiration_date'] = pd.to_datetime(df['expiration_date'])
            df = df[df['expiration_date'] <= '2024-11-29']
            df.to_csv(filename_proc.format(date_dir=date, date_file=date, title=idx, type=option_type), index=False)

print('Done')


OSError: Cannot save file into a non-existent directory: 'data\proc\2024_11_11'

<h1>Take for every day only the expiration dates contained in all files (Intersection)</h1>

In [None]:
from functools import reduce

dates_lists = []

#Build a list made of a list for every day sampled, for each day build a list, for each title, containing all the expiration dates for that title
for date in dates:
    day_dates_lists = []
    for idx in european + american:
        option_dates_list = []
        for option_type in ['calls', 'puts']:
            df = pd.read_csv(filename.format(date_dir=date, date_file=date, title=idx, type=option_type))
            option_dates_list.extend(df['expiration_date'].unique())
        day_dates_lists.append(set(option_dates_list)) #Set to remove duplicates from the add of the same dates in put and call dataset
            
    dates_lists.append(day_dates_lists)

#Note that there is no 2024-11-28 cause it's Thanksgiving
#print(len(dates_lists)) #Expected 19 as we have sampled 19 days
#print(len(dates_lists[0])) #Expected 6 as we have 6 titles
#print(len(dates_lists[0][0])) #Expected X

#Now take the intersection of the expiration dates for each title in each day
intersection_lists = []
for day_lists in dates_lists:
    intersection = list(reduce(lambda x, y: set(x) & set(y), day_lists))
    print(len(intersection))
    intersection_lists.append(intersection)
#print(len(intersection_lists)) #Expected 19 as we have sampled 19 days

print('Done')

3
3
3
3
3
2
2
2
2
2
2
2
1
1
1
1
1
1
1
19
Done


<h1>Take only data in intersection list of erxpirations</h1>

In [None]:
#For all datasets take only the rows with expiration_date until 2024-11-29
for date in dates:
    for idx in european + american:
        for option_type in ['calls', 'puts']:
            df = pd.read_csv(filename_proc.format(date_dir=date, date_file=date, title=idx, type=option_type))
            df['expiration_date'] = pd.to_datetime(df['expiration_date'])
            df = df[df['expiration_date'] <= '2024-11-29']
            df.to_csv(filename.format(date_dir=date, date_file=date, title=idx, type=option_type), index=False)

print('Done')