In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
import random
import pickle

### Read Data

In [2]:
# Load the preprocessed dataset from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# NOTE(review): the saved output of this cell (company/row/column counts) is not
# produced by any statement here — confirm which code printed it.
df = pd.read_pickle('../data/preprocessed_data_4.pkl')

Number of companies: 1859
Number of rows: 78997
Number of columns: 662


In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,datacqtr,gvkey,nq_eps_actual_direction,nq_eps_actual_change,nq_eps_predicted_mean_direction,nq_eps_predicted_mean_change,nq_eps_predicted_median_direction,nq_eps_predicted_median_change,cusip,tic,...,txdbq_percChange_4Q,txpq_percChange_4Q,txtq_percChange_4Q,wcapq_percChange_4Q,xaccq_percChange_4Q,xintq_percChange_4Q,xoprq_percChange_4Q,xrdq_percChange_4Q,xsgaq_percChange_4Q,prccq_percChange_4Q
0,1986Q2,1078,0.0,-0.070126,0.0,-0.018233,0.0,-0.018233,2824100,ABT,...,,,0.21546,0.003846,,-0.228619,0.122464,,0.151169,-0.063181
1,1986Q2,1209,0.0,-0.245902,1.0,0.04918,1.0,0.04918,9158106,APD,...,,-0.163255,-1.223304,0.230693,,0.386834,0.062878,,-0.010275,-0.370787
2,1986Q2,1230,1.0,5.217391,1.0,2.652174,1.0,3.173913,11659109,ALK,...,,0.329233,-0.633446,-0.188379,,0.241703,0.01539,,,-0.295699
3,1986Q2,1618,0.0,-0.55,1.0,0.5,1.0,0.5,32159105,AXR,...,,,-0.058403,,,0.320497,0.008626,,0.096117,0.193182
4,1986Q2,1632,0.0,-0.214592,1.0,0.287554,1.0,0.287554,32654105,ADI,...,,0.123781,1.093974,0.061268,,-0.014607,0.067954,,0.168251,-0.222222


### Prepare Data

In [4]:
def add_one_quarter(quarter):
    """Return the label of the quarter that follows `quarter`.

    `quarter` is a string whose first four characters are the year and whose
    last character is the quarter digit (e.g. "1986Q2"). The result uses the
    "<year>Q<n>" layout; a quarter past the fourth rolls over to Q1 of the
    next year.
    """
    year = int(quarter[:4])
    following = int(quarter[-1]) + 1
    if following <= 4:
        return f"{year}Q{following}"
    # Past Q4: wrap around into the first quarter of the next year.
    return f"{year + 1}Q1"

# Relabel every row with the following quarter (e.g. 1986Q2 -> 1986Q3).
# NOTE: this overwrites 'datacqtr' in place, so re-running the cell shifts the
# labels by one more quarter each time; restart from the load cell before re-running.
# NOTE(review): presumably this aligns each row with the quarter being predicted
# (the nq_* target columns) — confirm.
df['datacqtr'] = df['datacqtr'].apply(add_one_quarter)

In [5]:
# Quarter labels inside the 1993Q1-2019Q4 window (both ends inclusive), in
# ascending order. The labels are compared as plain strings.
quarters_list = sorted(
    label for label in df['datacqtr'].unique()
    if '1993Q1' <= label <= '2019Q4'
)
print(len(quarters_list))
print(min(quarters_list), max(quarters_list))
# Cell result: True when the window holds four quarters for each year 1993..2019.
len(quarters_list) == 4 * (2019 - 1993 + 1)

108
1993Q1 2019Q4


True

In [6]:
# Time-series splitter over the quarter list: 7 folds, each testing on
# 4 quarters and training on at most the 80 quarters before them.
tscv = TimeSeriesSplit(n_splits=7, max_train_size=80, test_size=4)

# Count the folds by walking the splitter once.
n_subsets = sum(1 for _ in tscv.split(quarters_list))
print('Number of subsets:', n_subsets)

Number of subsets: 7


In [7]:
# Print the size and first/last quarter of the train and test window of each fold.
for i, (train_index, test_index) in enumerate(tscv.split(quarters_list)):
    # Map the positional indices from the splitter back to quarter labels.
    train_quarters = [quarters_list[pos] for pos in train_index]
    test_quarter = [quarters_list[pos] for pos in test_index]
    print(
        'Number of train quarters:', len(train_quarters),
        'Number of test quarters:', len(test_quarter),
        'Start-end train:', train_quarters[0], train_quarters[-1],
        'Start-end test:', test_quarter[0], test_quarter[-1],
    )

Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1993Q1 2012Q4 Start-end test: 2013Q1 2013Q4
Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1994Q1 2013Q4 Start-end test: 2014Q1 2014Q4
Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1995Q1 2014Q4 Start-end test: 2015Q1 2015Q4
Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1996Q1 2015Q4 Start-end test: 2016Q1 2016Q4
Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1997Q1 2016Q4 Start-end test: 2017Q1 2017Q4
Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1998Q1 2017Q4 Start-end test: 2018Q1 2018Q4
Number of train quarters: 80 Number of test quarters: 4 Start-end train: 1999Q1 2018Q4 Start-end test: 2019Q1 2019Q4


In [8]:
def random_undersample_majority(df_train):
    """Balance the classes of 'nq_eps_actual_direction' by random undersampling.

    Every class that has more rows than the smallest class is randomly reduced
    to the size of the smallest class; classes already at that size are kept
    in full. Rows whose label is missing belong to no class and are dropped.

    Sampling uses a private generator seeded with 123, so the result is
    reproducible and the global `random` state is left untouched.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the column 'nq_eps_actual_direction'. Rows are selected
        by index label, so the index is assumed to be unique (the caller in
        this notebook resets it before calling).

    Returns
    -------
    pandas.DataFrame
        The selected rows, sorted by index. An input with no labelled rows
        yields an empty frame with the same columns.
    """
    labels = df_train['nq_eps_actual_direction']
    class_counts = labels.value_counts()  # NaN labels are excluded here

    # Nothing to balance (empty frame or no labelled rows).
    if class_counts.empty:
        return df_train.iloc[0:0].copy()

    min_size = int(class_counts.min())

    # Local generator: same draws as seeding the module-level RNG with 123,
    # without resetting randomness for the rest of the notebook.
    rng = random.Random(123)

    indices_to_sample = []
    for class_label, class_size in class_counts.items():
        members = df_train.index[(labels == class_label).to_numpy()].tolist()
        # Compare sizes rather than relying on idxmax/idxmin: with tied counts
        # those return the same label, which would duplicate one class and
        # drop the other.
        if class_size > min_size:
            members = rng.sample(members, k=min_size)
        indices_to_sample.extend(members)

    return df_train.loc[indices_to_sample].sort_index()

In [9]:
# Independent (deep) copy of the loaded frame to build the subsets from.
df_model = df.copy()

### Create Training, Validation, and Testing sets

In [10]:
def _rows_for_quarters(frame, quarters):
    """Rows of `frame` whose 'datacqtr' is in `quarters`, ordered by quarter with a fresh 0..n-1 index."""
    picked = frame[frame['datacqtr'].isin(quarters)]
    return picked.sort_values(by='datacqtr').reset_index(drop=True)


# One entry per fold, keyed by the first four characters (the year) of the
# fold's first test quarter.
subsets = {}
for i, (train_index, test_index) in enumerate(tscv.split(quarters_list)):
    # Map the positional indices from the splitter back to quarter labels.
    train_quarters = [quarters_list[pos] for pos in train_index]
    test_quarters = [quarters_list[pos] for pos in test_index]

    # Hold out the last four training quarters as the validation window.
    valid_quarters = train_quarters[-4:]
    train_quarters = train_quarters[:-4]

    # Only the training rows are class-balanced (majority class undersampled);
    # validation and test rows are kept as they are.
    df_train = random_undersample_majority(_rows_for_quarters(df_model, train_quarters))
    df_val = _rows_for_quarters(df_model, valid_quarters)
    df_test = _rows_for_quarters(df_model, test_quarters)

    subsets[test_quarters[0][:4]] = {'train': df_train, 'val': df_val, 'test': df_test}

In [11]:
# Number of entries in the subsets dictionary.
len(subsets)

7

In [12]:
# Keys of the subsets dictionary.
subsets.keys()

dict_keys(['2013', '2014', '2015', '2016', '2017', '2018', '2019'])

In [13]:
# Persist all subsets to disk. The context manager closes the file even if
# pickling raises, so no open handle is leaked.
with open('../data/subsets.pkl', 'wb') as file:
    pickle.dump(subsets, file=file)