In [1]:
import pandas as pd
import numpy as np

### Read Data

In [2]:
# Load the input frame for this notebook.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by a trusted step of this project.
df = pd.read_pickle('../data/preprocessed_data_3.pkl')

# Display (rows, columns) so the shape recorded as this cell's output is
# reproduced on a fresh Restart & Run All.
df.shape

(102964, 82)

### Add rows for missing quarters

In [3]:
def generate_quarters(start, end):
    """Return every quarter label from ``start`` to ``end``, both inclusive.

    Labels use the ``'YYYYQn'`` layout (for example ``'1985Q4'``): the first
    four characters are read as the year and the sixth character as the
    quarter digit.

    Parameters
    ----------
    start, end : str
        First and last quarter of the range.

    Returns
    -------
    list of str
        Consecutive quarter labels in chronological order. The list is empty
        when ``start`` is later than ``end``.

    Raises
    ------
    ValueError
        If the year or quarter part is not numeric, or a quarter digit is
        outside 1-4.
    """
    start_year, start_quarter = int(start[:4]), int(start[5])
    end_year, end_quarter = int(end[:4]), int(end[5])

    # Reject quarters outside 1-4 up front: a start quarter above 4 would never
    # reach the year roll-over below, so the loop would never terminate, and a
    # quarter of 0 would emit labels that are not real quarters.
    for label, quarter in ((start, start_quarter), (end, end_quarter)):
        if not 1 <= quarter <= 4:
            raise ValueError(
                f'invalid quarter in {label!r}: expected a digit 1-4, got {quarter}'
            )

    quarters = []
    year, quarter = start_year, start_quarter

    # Tuple comparison orders by year first, then by quarter within the year.
    while (year, quarter) <= (end_year, end_quarter):
        quarters.append(f'{year}Q{quarter}')

        # Step to the next quarter, rolling over into the next year after Q4.
        if quarter == 4:
            year, quarter = year + 1, 1
        else:
            quarter += 1

    return quarters

In [4]:
# Put every company's history on a gap-free quarterly grid, so that the
# per-company shift() calls later in the notebook move by exactly one quarter
# per row.
df2 = []
for code, code_df in df.groupby('gvkey'):
    # With fewer than two rows there is no span between the first and last
    # quarter to fill, so the group is kept as it is.
    if len(code_df) < 2:
        df2.append(code_df)
        continue

    all_quarters = generate_quarters(code_df['datacqtr'].min(), code_df['datacqtr'].max())
    all_quarters_df = pd.DataFrame({'datacqtr': all_quarters})

    # Left-merge onto the full quarter list: quarters without a record become
    # rows whose data columns are NaN. merge() returns a new frame, so no
    # defensive copy of code_df is needed.
    merged_df = all_quarters_df.merge(code_df, on='datacqtr', how='left')

    # Only gvkey is restored on the inserted rows so they stay in this
    # company's group; every other column of an inserted row remains NaN.
    merged_df['gvkey'] = code

    df2.append(merged_df)

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# every merged frame contributes its own 0..k-1 labels, and the resulting
# duplicate labels make later index-aligned assignments fragile.
df = pd.concat(df2, ignore_index=True)

### Sort rows

In [5]:
# Order rows by company, then by quarter label within each company.
sort_keys = ['gvkey', 'datacqtr']
df = df.sort_values(by=sort_keys)

### Add target variables

In [6]:
# Temporary column: the following row's actual EPS within the same company
df['nq_eps_actual'] = df.groupby('gvkey')['eps_actual'].shift(-1)

# Targets are undefined when either the current or the next EPS is missing
condition = df['nq_eps_actual'].isnull() | df['eps_actual'].isnull()

# Direction target: 1 when next-quarter EPS is higher than the current EPS,
# 0 otherwise, NaN when undefined
eps_rises = (df['nq_eps_actual'] > df['eps_actual']).astype(int)
df['nq_eps_actual_direction'] = np.where(condition, np.nan, eps_rises)

# Change target: EPS difference relative to the magnitude of the current EPS
eps_delta = df['nq_eps_actual'] - df['eps_actual']
df['nq_eps_actual_change'] = np.where(condition, np.nan, eps_delta / df['eps_actual'].abs())

# The helper column is no longer needed once both targets exist
df = df.drop(columns=['nq_eps_actual'])

In [7]:
# Temporary column: the following row's mean EPS forecast within the same company
df['nq_eps_predicted_mean'] = df.groupby('gvkey')['eps_predicted_mean'].shift(-1)

# Undefined when the next forecast or the current actual EPS is missing
condition = df['nq_eps_predicted_mean'].isnull() | df['eps_actual'].isnull()

# Direction implied by the mean forecast: 1 when the forecast for the next
# quarter is above the current actual EPS, 0 otherwise, NaN when undefined
forecast_above = (df['nq_eps_predicted_mean'] > df['eps_actual']).astype(int)
df['nq_eps_predicted_mean_direction'] = np.where(condition, np.nan, forecast_above)

# Change implied by the mean forecast, relative to the magnitude of current EPS
forecast_delta = df['nq_eps_predicted_mean'] - df['eps_actual']
df['nq_eps_predicted_mean_change'] = np.where(
    condition, np.nan, forecast_delta / df['eps_actual'].abs()
)

# Remove the helper column and the raw forecast column
df = df.drop(columns=['nq_eps_predicted_mean', 'eps_predicted_mean'])

In [8]:
# Temporary column: the following row's median EPS forecast within the same company
df['nq_eps_predicted_median'] = df.groupby('gvkey')['eps_predicted_median'].shift(-1)

# Undefined when the next forecast or the current actual EPS is missing
condition = df['nq_eps_predicted_median'].isnull() | df['eps_actual'].isnull()

# Direction implied by the median forecast: 1 when the forecast for the next
# quarter is above the current actual EPS, 0 otherwise, NaN when undefined
forecast_above = (df['nq_eps_predicted_median'] > df['eps_actual']).astype(int)
df['nq_eps_predicted_median_direction'] = np.where(condition, np.nan, forecast_above)

# Change implied by the median forecast, relative to the magnitude of current EPS
forecast_delta = df['nq_eps_predicted_median'] - df['eps_actual']
df['nq_eps_predicted_median_change'] = np.where(
    condition, np.nan, forecast_delta / df['eps_actual'].abs()
)

# Remove the helper column and the raw forecast column
df = df.drop(columns=['nq_eps_predicted_median', 'eps_predicted_median'])

### Apply Feature Transformation

In [9]:
# Columns left out of the feature list: identifiers, dates, the six target
# columns, plus 'mkvaltq' and 'gsector'
ignore_cols = [
    'datacqtr', 'gvkey', 'cusip', 'tic', 'announcement_date', 'analyst_date',
    'nq_eps_actual_direction', 'nq_eps_actual_change',
    'nq_eps_predicted_mean_direction', 'nq_eps_predicted_mean_change',
    'nq_eps_predicted_median_direction', 'nq_eps_predicted_median_change',
    'mkvaltq', 'gsector',
]

# Use the shorter name 'eps' for the actual-EPS column from here on
df = df.rename(columns={'eps_actual': 'eps'})

# Every remaining column is a feature; the frame's column order is preserved
excluded = set(ignore_cols)
features = [col for col in df.columns if col not in excluded]

In [10]:
# Print how many feature columns were selected, then display the full list
print(len(features))
features

72


['eps',
 'acomincq',
 'acoq',
 'actq',
 'ancq',
 'aocipenq',
 'aoq',
 'apq',
 'atq',
 'capxy',
 'chechy',
 'cheq',
 'ciotherq',
 'cogsq',
 'cshopq',
 'dcomq',
 'diladq',
 'dlcq',
 'dlttq',
 'doq',
 'dpactq',
 'dpq',
 'drcq',
 'drltq',
 'dvpq',
 'dvy',
 'esopctq',
 'fincfy',
 'gdwlq',
 'intanq',
 'invchy',
 'invtq',
 'ivchy',
 'ivltq',
 'ivncfy',
 'lcoq',
 'lctq',
 'lltq',
 'loq',
 'ltq',
 'mibq',
 'mibtq',
 'miiq',
 'niq',
 'nopiq',
 'oancfy',
 'oibdpq',
 'ppentq',
 'pstkq',
 'rcpq',
 'rdipq',
 'recchy',
 'recdq',
 'rectq',
 'req',
 'revtq',
 'seqq',
 'sivy',
 'spiq',
 'sppivy',
 'stkcoq',
 'tstkq',
 'txdbq',
 'txpq',
 'txtq',
 'wcapq',
 'xaccq',
 'xintq',
 'xoprq',
 'xrdq',
 'xsgaq',
 'prccq']

In [11]:
# Number of past quarters for which lag and percent-change features are built
n = 4

for i in range(1, n + 1):
    # Value of every feature i rows (quarters) earlier within the same company
    shifted_values = df.groupby('gvkey')[features].shift(i)

    # Lagged levels, stored as '<feature>_lag_<i>Q'
    lag_cols = [(x + f'_lag_{i}Q') for x in features]
    df[lag_cols] = shifted_values

    # Express each lagged value as a ratio to that quarter's total assets
    # (atq); the lagged atq column itself stays in raw units.
    # NOTE(review): eps and prccq lags are divided by total assets as well; an
    # exclusion for them was once commented out here — confirm this is intended.
    total_assets_col = f'atq_lag_{i}Q'
    non_asset_cols = [col for col in lag_cols if col != total_assets_col]
    df[non_asset_cols] = (
        df[non_asset_cols]
        .div(df[total_assets_col], axis=0)
        .replace([np.inf, -np.inf], np.nan)
    )

    # Percent change versus i quarters ago, stored as '<feature>_percChange_<i>Q'.
    # Both operands are raw values here: the current-quarter columns are only
    # scaled after the loop. Dividing by the absolute lagged value keeps the
    # sign of the change when the lagged value is negative.
    perc_cols = [(x + f'_percChange_{i}Q') for x in features]
    custom_pct_change = (df[features] - shifted_values) / shifted_values.abs()
    df[perc_cols] = custom_pct_change

# Current-quarter features. This was the `else` clause of the loop above; with
# no `break` in the loop it always ran, so it is ordinary post-loop code.
features_rename = {x: x + '_current' for x in features}
features = [x + '_current' for x in features]
df = df.rename(columns=features_rename)

# Scale current values by current total assets, leaving atq_current itself raw
total_assets_col = 'atq_current'
non_asset_cols = [col for col in features if col != total_assets_col]
df[non_asset_cols] = (
    df[non_asset_cols]
    .div(df[total_assets_col], axis=0)
    .replace([np.inf, -np.inf], np.nan)
)

### Clean data

In [14]:
# Preview the first rows of the frame after the feature transformation
df.head()

Unnamed: 0,datacqtr,gvkey,cusip,tic,gsector,announcement_date,analyst_date,eps_current,mkvaltq,acomincq_current,...,txdbq_percChange_4Q,txpq_percChange_4Q,txtq_percChange_4Q,wcapq_percChange_4Q,xaccq_percChange_4Q,xintq_percChange_4Q,xoprq_percChange_4Q,xrdq_percChange_4Q,xsgaq_percChange_4Q,prccq_percChange_4Q
0,1985Q4,1004,361105,AIR,20,1985-12-19,1985-11-14,0.001035,,,...,,,,,,,,,,
1,1986Q1,1004,361105,AIR,20,1986-03-21,1985-12-19,0.000812,,,...,,,,,,,,,,
2,1986Q2,1004,361105,AIR,20,1986-07-08,1986-03-20,0.000806,,,...,,,,,,,,,,
3,1986Q3,1004,361105,AIR,20,1986-09-24,1986-07-17,0.00071,,,...,,,,,,,,,,
4,1986Q4,1004,361105,AIR,20,1986-12-16,1986-10-16,0.000802,,,...,,,0.44206,0.573277,,-0.031812,0.235266,,0.138542,-0.020833


In [15]:
# Treat positive and negative infinity anywhere in the frame as missing values
infinite_values = [np.inf, -np.inf]
df = df.replace(infinite_values, np.nan)

In [16]:
# Keep only rows where total assets are present for the current quarter and
# for each of the four lagged quarters, then renumber the rows
required_asset_cols = ['atq_current'] + [f'atq_lag_{lag}Q' for lag in range(1, 5)]
df = df.dropna(subset=required_asset_cols)
df = df.reset_index(drop=True)
print('Number of rows:', len(df))
df.head()

Number of rows: 84412


Unnamed: 0,datacqtr,gvkey,cusip,tic,gsector,announcement_date,analyst_date,eps_current,mkvaltq,acomincq_current,...,txdbq_percChange_4Q,txpq_percChange_4Q,txtq_percChange_4Q,wcapq_percChange_4Q,xaccq_percChange_4Q,xintq_percChange_4Q,xoprq_percChange_4Q,xrdq_percChange_4Q,xsgaq_percChange_4Q,prccq_percChange_4Q
0,1986Q4,1004,361105,AIR,20,1986-12-16,1986-10-16,0.000802,,,...,,,0.44206,0.573277,,-0.031812,0.235266,,0.138542,-0.020833
1,1987Q1,1004,361105,AIR,20,1987-03-23,1987-03-19,0.00073,,,...,,,0.310638,0.542275,,-0.067909,0.101843,,0.080037,0.484663
2,1987Q2,1004,361105,AIR,20,1987-07-14,1987-04-16,0.000793,,,...,,0.0,0.313008,0.240443,,0.06383,0.223686,,0.113433,0.285714
3,1987Q3,1004,361105,AIR,20,1987-09-22,1987-07-16,0.000814,,,...,,,-0.071146,0.060516,,-0.036364,0.1247,,0.139701,0.590426
4,1987Q4,1004,361105,AIR,20,1987-12-16,1987-11-19,0.000774,,,...,,,-0.214286,0.067197,,0.66,0.065054,,0.088119,-0.292553


In [17]:
# Remove rows where any of the six target columns is missing
target_cols = [
    f'nq_eps_{source}_{kind}'
    for source in ('actual', 'predicted_mean', 'predicted_median')
    for kind in ('direction', 'change')
]
df = df.dropna(subset=target_cols)

In [18]:
# Move the key columns and the six targets to the front; all other columns
# follow in their existing relative order
first_columns = [
    'datacqtr', 'gvkey',
    'nq_eps_actual_direction', 'nq_eps_actual_change',
    'nq_eps_predicted_mean_direction', 'nq_eps_predicted_mean_change',
    'nq_eps_predicted_median_direction', 'nq_eps_predicted_median_change',
]
leading = set(first_columns)
remaining_columns = [col for col in df.columns if col not in leading]
df = df[first_columns + remaining_columns]

In [19]:
# Final row order: by quarter, then by company, with a fresh 0..n-1 index
df = df.sort_values(by=['datacqtr', 'gvkey'])
df = df.reset_index(drop=True)

### Save Data

In [21]:
# Write the final frame to disk as a pickle file
output_path = '../data/preprocessed_data_4.pkl'
df.to_pickle(output_path)