In [1]:
import numpy as np
import pandas as pd
import pathlib

In [2]:
# Directory holding the parquet input that is read in the next cell.
# NOTE(review): this is a machine-specific absolute path (a user's OneDrive folder), so the
# notebook only runs where that directory exists — consider a configurable data directory.
path = pathlib.Path(r"C:\Users\Mathiass\OneDrive - Universität Zürich UZH\Documents\mt_literature\data")

In [3]:
# Load the dataset from the data directory; later cells rely on its "date" column.
data = pd.read_parquet(path.joinpath("final_df_filledmean_small.parquet"))

In [276]:
class CVSplitter:
    """Expanding-window train/validation/test splitter over calendar years.

    The splitter locates the row positions at which the calendar year of
    ``dates`` increases and uses them as year boundaries. Split ``i`` is:

    * train rows:      ``[0, train_eoy[i])``
    * validation rows: ``[train_eoy[i], val_eoy[i])``
    * test rows:       ``[val_eoy[i], test_eoy[i])``

    The first split trains on ``init_train_length`` years; every following
    split extends the training window by one more year.

    Args:
        dates: pandas.Series of datetime dtype (``.dt`` accessor is used).
            Rows are expected to be in chronological order, since boundaries
            are detected from positive year-to-year differences.
        init_train_length: int, number of years in the first training window
            (must be > 0).
        val_length: int, number of years in each validation window (>= 0).
        test_length: int, number of years in each test window (>= 0).

    Attributes:
        eoy_idx: positions of the first row of each new year, plus
            ``len(dates)`` as the end of the last year.
        train_eoy, val_eoy, test_eoy: equally long arrays with the exclusive
            end position of the train / validation / test window per split.

    Raises:
        AssertionError: if a length is invalid or the three lengths together
            exceed the number of years found in ``dates``.
    """

    def __init__(self, dates, init_train_length=10, val_length=2, test_length=1):
        assert init_train_length > 0, "init_train_length must be strictly greater than 0"
        assert val_length >= 0 and test_length >= 0, "val_length and test_length must be non-negative"

        self.init_train_length = init_train_length
        self.val_length = val_length
        self.test_length = test_length

        # Positions where the year increases. `>= 1` (instead of `== 1`) also
        # catches a boundary when one or more calendar years are missing.
        # The first diff is NaN and compares False, so row 0 is never a boundary.
        year_step = dates.dt.year.diff()
        self.eoy_idx = np.flatnonzero((year_step >= 1).to_numpy())
        # The end of the last year has no following year change: append it explicitly.
        self.eoy_idx = np.append(self.eoy_idx, len(dates))

        # After the append, len(eoy_idx) equals the number of years in the data,
        # so the requested windows must fit into exactly that many years.
        assert init_train_length + val_length + test_length <= len(self.eoy_idx), "defined train and val are larger "\
            "than number of years in dataset"

        # Position (within eoy_idx) of the year end that closes the first training window.
        self.train_start_idx = init_train_length - 1

        # Explicit end positions instead of negative slice ends: `[:-0]` would
        # silently produce an empty array when val_length/test_length is 0.
        n_splits = len(self.eoy_idx) - (init_train_length + val_length + test_length) + 1
        val_start_idx = self.train_start_idx + val_length
        test_start_idx = val_start_idx + test_length

        self.train_eoy = self.eoy_idx[self.train_start_idx:self.train_start_idx + n_splits]
        self.val_eoy = self.eoy_idx[val_start_idx:val_start_idx + n_splits]
        # For generate_idx():
        self.test_eoy = self.eoy_idx[test_start_idx:test_start_idx + n_splits]

    def generate_cv(self):
        """Yield ``(train_indices, val_indices)`` lists of row positions per split.

        The tuples have the shape expected from an iterable passed as ``cv``
        to scikit-learn's search estimators. Test years are never included.
        """
        # zip() bounds the loop by the arrays themselves, so it cannot run past
        # their end (the former hand-computed range ignored test_length).
        for train_end, val_end in zip(self.train_eoy, self.val_eoy):
            yield (list(range(train_end)),
                   list(range(train_end, val_end)))

    def generate_idx(self):
        """Yield one dict per split with the exclusive end positions
        of the train, validation and test windows (keys: train, val, test)."""
        for train_end, val_end, test_end in zip(self.train_eoy, self.val_eoy, self.test_eoy):
            yield ({"train": train_end, "val": val_end, "test": test_end})

In [277]:
# 10 initial training years and 2 validation years; test_length keeps its default.
splitter = CVSplitter(dates=data["date"], init_train_length=10, val_length=2)


In [278]:
splitter.train_eoy

array([ 725915,  842623,  984164, 1139047, 1281046, 1386145, 1521399,
       1640341, 1767855, 1947057, 2138317, 2319506, 2528654, 2780818],
      dtype=int64)

In [279]:
splitter.val_eoy

array([ 984164, 1139047, 1281046, 1386145, 1521399, 1640341, 1767855,
       1947057, 2138317, 2319506, 2528654, 2780818, 3027743, 3428810],
      dtype=int64)

In [280]:
splitter.test_eoy

array([1139047, 1281046, 1386145, 1521399, 1640341, 1767855, 1947057,
       2138317, 2319506, 2528654, 2780818, 3027743, 3428810, 3823386],
      dtype=int64)

In [281]:
# Materialize every split's {"train", "val", "test"} end positions so they can be indexed.
eoy_indeces = [split for split in splitter.generate_idx()]


In [282]:
len(eoy_indeces)

14

In [290]:
# Position of the split (within eoy_indeces) that the following cells inspect.
year_idx = 0

In [292]:
 # Exclusive end positions of the train / validation / test windows of the chosen split.
 eoy_train, eoy_val, eoy_test = (eoy_indeces[year_idx][key] for key in ("train", "val", "test"))

In [296]:
# Keep only the rows up to the end of the test window of the chosen split.
# NOTE(review): this rebinds `data`, so the cell is not idempotent — re-running it after
# choosing a later split silently works on the already truncated frame. Consider assigning
# the slice to a new name and keeping the loaded frame untouched.
data = data.iloc[:eoy_test]

In [308]:
data.iloc[eoy_train]

date               2006-01-31 00:00:00
strike_price                      22.5
best_bid                           4.0
best_offer                         4.3
volume                               2
open_interest                      759
impl_volatility               0.429084
delta                         0.755031
gamma                         0.053092
vega                          4.336227
theta                        -3.868462
cfadj                                1
days_no_trading                      0
days_to_exp                        109
forwardprice                 25.725454
spotprice                        25.36
adj_spot                         25.36
ir_rate                       0.048364
mid_price                         4.15
cp_flag_C                          1.0
cp_flag_P                          0.0
option_ret                   -0.006684
Name: 725915, dtype: object

In [302]:
data[:eoy_train]

Unnamed: 0,date,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,...,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,cp_flag_C,cp_flag_P,option_ret
0,1996-01-31,20.0,4.6250,5.0000,15,25,0.380062,0.793831,0.044693,4.456725,...,0,171,24.142672,24.00,24.0000,0.052253,4.81250,1.0,0.0,-0.025367
1,1996-01-31,25.0,1.4375,1.6250,16,403,0.294709,0.462826,0.081279,6.401439,...,0,171,24.142672,24.00,24.0000,0.052253,1.53125,1.0,0.0,-0.003343
2,1996-01-31,27.5,0.9375,1.1875,1,55,0.343211,0.324547,0.063405,5.831637,...,0,171,24.142672,24.00,24.0000,0.052253,1.06250,1.0,0.0,-0.049677
3,1996-01-31,25.0,1.8125,2.0625,10,49,0.307167,-0.591168,0.120232,4.230949,...,0,80,24.058134,24.00,24.0000,0.053710,1.93750,0.0,1.0,-0.012452
4,1996-01-31,22.5,0.5000,0.6875,1,11,0.281269,-0.285548,0.109995,3.762847,...,0,80,24.058134,24.00,24.0000,0.053710,0.59375,0.0,1.0,-0.032655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725910,2005-12-31,25.0,0.9500,1.1000,142,197,0.773296,-0.107375,0.011299,3.918592,...,0,113,38.865859,38.31,9.5775,0.046958,1.02500,0.0,1.0,-0.235028
725911,2005-12-31,30.0,2.2000,2.4500,80,277,0.761198,-0.206342,0.017761,6.044483,...,0,113,38.865859,38.31,9.5775,0.046958,2.32500,0.0,1.0,-0.196221
725912,2005-12-31,25.0,1.5000,1.7500,55,236,0.691490,-0.129318,0.010748,6.015813,...,0,204,39.340514,38.31,9.5775,0.047731,1.62500,0.0,1.0,-0.060922
725913,2005-12-31,22.5,1.0000,1.2000,2,155,0.703129,-0.093119,0.008339,4.752594,...,0,204,39.340514,38.31,9.5775,0.047731,1.10000,0.0,1.0,-0.082089


In [299]:
data[eoy_train:eoy_val]

Unnamed: 0,date,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,...,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,cp_flag_C,cp_flag_P,option_ret
725915,2006-01-31,22.5,4.00,4.30,2,759,0.429084,0.755031,0.053092,4.336227,...,0,109,25.725454,25.36,25.36,0.048364,4.150,1.0,0.0,-0.006684
725916,2006-01-31,25.0,1.75,1.90,20,539,0.443815,0.582005,0.098778,3.477253,...,0,46,25.508360,25.36,25.36,0.047332,1.825,1.0,0.0,-0.018157
725917,2006-01-31,25.0,2.50,2.65,20,1312,0.406117,0.594785,0.069167,5.347711,...,0,109,25.725454,25.36,25.36,0.048364,2.575,1.0,0.0,-0.006787
725918,2006-01-31,30.0,0.75,0.90,3,467,0.396880,0.273002,0.060721,4.582913,...,0,109,25.725454,25.36,25.36,0.048364,0.825,1.0,0.0,-0.013303
725919,2006-01-31,30.0,1.45,1.65,1,59,0.387934,0.363197,0.051650,7.025910,...,0,200,26.046278,25.36,25.36,0.048980,1.550,1.0,0.0,-0.000063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984159,2007-12-31,12.5,0.75,0.85,395,770,0.620217,-0.226900,0.059068,2.477106,...,0,110,15.222083,15.08,15.08,0.044727,0.800,0.0,1.0,0.039513
984160,2007-12-31,12.5,1.25,1.40,6,2503,0.617623,-0.251122,0.046471,3.539909,...,0,201,15.312980,15.08,15.08,0.042511,1.325,0.0,1.0,0.053625
984161,2007-12-31,30.0,1.55,1.85,8,440,0.428068,0.416171,0.060823,5.842341,...,0,110,27.778781,27.41,27.41,0.044727,1.700,1.0,0.0,-0.001141
984162,2007-12-31,30.0,2.55,3.10,1,40,0.443603,0.484154,0.044278,8.088678,...,0,201,28.056289,27.41,27.41,0.042511,2.825,1.0,0.0,-0.020562


In [297]:
data[eoy_val:eoy_test]

Unnamed: 0,date,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,...,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,cp_flag_C,cp_flag_P,option_ret
984164,2008-01-31,2.5,0.50,0.60,120,4739,0.738505,0.760397,0.390382,0.334411,...,0,51,2.922437,2.91,2.91,0.031109,0.550,1.0,0.0,-0.057965
984165,2008-01-31,2.5,0.75,0.80,119,1339,0.800324,0.717773,0.233348,0.611027,...,0,142,2.942771,2.91,2.91,0.028966,0.775,1.0,0.0,-0.065141
984166,2008-01-31,12.5,4.10,4.40,2,111,0.529119,0.736593,0.042532,4.734251,...,0,352,15.175080,14.79,14.79,0.026726,4.250,1.0,0.0,-0.018021
984167,2008-01-31,20.0,1.10,1.25,10,1668,0.446056,0.340097,0.056641,5.314732,...,0,352,15.175080,14.79,14.79,0.026726,1.175,1.0,0.0,-0.017128
984168,2008-01-31,17.5,1.80,2.00,25,1430,0.469257,0.468302,0.058415,5.766483,...,0,352,15.175080,14.79,14.79,0.026726,1.900,1.0,0.0,-0.019201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139042,2008-12-31,25.0,0.20,0.35,3,18,0.370191,0.157918,0.067692,2.328794,...,0,80,20.724662,20.88,20.88,0.009975,0.275,1.0,0.0,-0.048762
1139043,2008-12-31,17.5,0.65,0.75,1002,290,0.484620,0.377779,0.129076,2.308742,...,0,52,16.270622,16.25,16.25,0.009110,0.700,1.0,0.0,-0.105822
1139044,2008-12-31,15.0,1.50,1.80,12,210,0.595673,-0.340737,0.062425,3.623688,...,0,136,16.312013,16.25,16.25,0.010303,1.650,0.0,1.0,-0.067045
1139045,2008-12-31,15.0,2.00,2.35,10,20,0.567908,-0.339148,0.049845,4.749967,...,0,234,16.361360,16.25,16.25,0.010702,2.175,0.0,1.0,-0.069238


In [293]:
eoy_test

1139047

In [275]:
eoy_indeces[0]["train"]

725915

In [243]:
eoy_train

725915

In [244]:
eoy_val

984164

In [150]:
data.iloc[:eoy_train]

Unnamed: 0,date,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,...,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,cp_flag_C,cp_flag_P,option_ret
0,1996-01-31,20.0,4.6250,5.0000,15,25,0.380062,0.793831,0.044693,4.456725,...,0,171,24.142672,24.00,24.00,0.052253,4.81250,1.0,0.0,-0.025367
1,1996-01-31,25.0,1.4375,1.6250,16,403,0.294709,0.462826,0.081279,6.401439,...,0,171,24.142672,24.00,24.00,0.052253,1.53125,1.0,0.0,-0.003343
2,1996-01-31,27.5,0.9375,1.1875,1,55,0.343211,0.324547,0.063405,5.831637,...,0,171,24.142672,24.00,24.00,0.052253,1.06250,1.0,0.0,-0.049677
3,1996-01-31,25.0,1.8125,2.0625,10,49,0.307167,-0.591168,0.120232,4.230949,...,0,80,24.058134,24.00,24.00,0.053710,1.93750,0.0,1.0,-0.012452
4,1996-01-31,22.5,0.5000,0.6875,1,11,0.281269,-0.285548,0.109995,3.762847,...,0,80,24.058134,24.00,24.00,0.053710,0.59375,0.0,1.0,-0.032655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3027738,2019-12-31,24.0,0.6500,0.8000,3,46,0.360287,-0.166259,0.034158,5.029415,...,0,171,29.543403,29.56,29.56,0.017617,0.72500,0.0,1.0,-0.037194
3027739,2019-12-31,25.0,0.8500,1.0000,3,118,0.349684,-0.205924,0.040182,5.746246,...,0,171,29.543403,29.56,29.56,0.017617,0.92500,0.0,1.0,-0.030085
3027740,2019-12-31,26.0,1.0500,1.2500,2,28,0.336636,-0.250409,0.046582,6.402798,...,0,171,29.543403,29.56,29.56,0.017617,1.15000,0.0,1.0,-0.025181
3027741,2019-12-31,30.0,2.6500,2.8500,3,3,0.311346,-0.484026,0.063009,7.992966,...,0,171,29.543403,29.56,29.56,0.017617,2.75000,0.0,1.0,-0.014358


In [151]:
data.iloc[eoy_train:eoy_val]

Unnamed: 0,date,strike_price,best_bid,best_offer,volume,open_interest,impl_volatility,delta,gamma,vega,...,days_no_trading,days_to_exp,forwardprice,spotprice,adj_spot,ir_rate,mid_price,cp_flag_C,cp_flag_P,option_ret
3027743,2020-01-31,40.0,11.55,11.85,2,10,0.437823,0.963243,0.012055,1.281852,...,0,49,51.416092,51.57,51.57,0.016838,11.700,1.0,0.0,0.012955
3027744,2020-01-31,47.5,4.65,4.95,1,63,0.211775,0.778440,0.051251,8.093108,...,0,105,51.545701,51.57,51.57,0.016609,4.800,1.0,0.0,0.074107
3027745,2020-01-31,50.0,3.10,3.25,12,4538,0.191125,-0.406989,0.039811,19.266680,...,0,350,51.238400,51.57,51.57,0.015170,3.175,0.0,1.0,0.102011
3027746,2020-01-31,55.0,5.65,5.95,256,273,0.175302,-0.621201,0.042014,18.626100,...,0,350,51.238400,51.57,51.57,0.015170,5.800,0.0,1.0,0.064509
3027747,2020-01-31,53.0,0.47,0.52,2,28,0.192894,0.299942,0.132450,4.880461,...,0,28,51.365429,51.57,51.57,0.016615,0.495,1.0,0.0,0.163453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3823381,2021-11-30,13.0,1.15,1.50,134,665,0.437066,0.487561,0.095777,3.813131,...,0,227,12.112223,12.31,12.31,0.002822,1.325,1.0,0.0,-0.068384
3823382,2021-11-30,15.0,0.10,0.75,5,14,0.335686,0.249430,0.098387,3.066143,...,0,227,12.112223,12.31,12.31,0.002822,0.425,1.0,0.0,0.055073
3823383,2021-11-30,11.0,0.20,1.10,3,111,0.308935,-0.297605,0.114422,3.328838,...,0,227,12.112223,12.31,12.31,0.002822,0.650,0.0,1.0,0.020419
3823384,2021-11-30,12.0,1.10,1.60,53,35,0.372075,-0.422035,0.106850,3.744469,...,0,227,12.112223,12.31,12.31,0.002822,1.350,0.0,1.0,-0.016824


In [84]:
len(splitter.eoy_idx)

26

In [85]:
# Scratch check of a fold count from hard-coded numbers.
# NOTE(review): presumably 26 = len(splitter.eoy_idx), 10 = init_train_length and
# 2 = val_length — confirm, and derive them from `splitter` instead of literals.
for i in range(26 - 10 - 2 + 1):
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [86]:
splitter.train_eoy[14]

3027743

In [87]:
# NOTE(review): `generator` is not defined in any visible cell, so this fails on a fresh
# kernel (Restart & Run All). The recorded output shows 2-tuples, which presumably came
# from an earlier version of the splitter — confirm and recreate the generator explicitly.
for i in generator:
    print(i)

(725915, 984164)
(842623, 1139047)
(984164, 1281046)
(1139047, 1386145)
(1281046, 1521399)
(1386145, 1640341)
(1521399, 1767855)
(1640341, 1947057)
(1767855, 2138317)
(1947057, 2319506)
(2138317, 2528654)
(2319506, 2780818)
(2528654, 3027743)
(2780818, 3428810)
(3027743, 3823386)


In [None]:
# NOTE(review): `generator` is not defined in any visible cell. If it is the same
# generator object that an earlier cell already iterated, it is exhausted and this loop
# prints nothing — confirm intent or remove this cell.
for i in generator:
    print(i)

In [304]:
15+10+5

30

In [306]:
np.sum([15, 10, 5])

30