In [1]:
# Import packages
import os
import sys
import warnings
import inspect
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.patches import Patch
from scipy.interpolate import make_interp_spline

# Project folders, all resolved relative to the notebook's working directory
notebook_dir = os.getcwd()
base_dir = os.path.join(notebook_dir, os.pardir)
src_dir = os.path.join(base_dir, 'src')
raw_data_dir = os.path.join(base_dir, 'data', 'raw')
data_dir = os.path.join(base_dir, 'data', 'processed')
graph_dir = os.path.join(base_dir, 'results', 'graphs')

# Make modules stored in the src folder importable from this notebook
sys.path.append(src_dir)

# Suppress FutureWarning messages
warnings.simplefilter('ignore', FutureWarning)

In [150]:
# function for creating factors
def create_factors(feature_path = 'credit_risk_', feature_name = "credit_factor", pct = 0.2,
                   n_cycles = 4, source_dir = None):
    '''
    Build a daily high-minus-low factor return series from per-cycle csv files.

    For every cycle file and every calendar year found in it, stocks are ranked
    by their average feature value for that year. The factor value for a day is
    the mean daily return of the highest-ranked stocks minus the mean daily
    return of the lowest-ranked stocks. Stocks whose yearly feature average is
    missing are left out of the ranking for that year.

    feature_path: first part of the csv file name (cycle number and '.csv' are appended)
    feature_name: name of the feature column
    pct: percent quantile for high and low partition
    n_cycles: number of cycle files to read, numbered 1..n_cycles
    source_dir: folder holding the feature files and the matching 'price_<cycle>.csv'
        files; when None the notebook-level data_dir is used

    function returns data frame of features for all cycles with the columns
    [feature_name, 'Date']; row labels are carried over from the price files,
    so they may repeat across cycles
    '''

    if source_dir is None:
        # fall back to the processed-data folder configured at the top of the notebook
        source_dir = data_dir

    # one frame per (cycle, year); concatenated once at the end
    factor_frames = []

    for cycle in range(1, n_cycles + 1):
        # read processed data files
        feature_df = pd.read_csv(os.path.join(source_dir, feature_path + str(cycle) + '.csv'))
        # drop empty columns
        feature_df = feature_df.loc[:, ~feature_df.columns.str.startswith('Unnamed')]
        price_df = pd.read_csv(os.path.join(source_dir, 'price_' + str(cycle) + '.csv'))

        # calendar year of every feature row / price row
        feature_years = pd.DatetimeIndex(feature_df['Date']).year
        price_years = pd.DatetimeIndex(price_df['Date']).year
        if len(feature_years) == 0:
            # nothing to rank in this cycle
            continue

        # average feature value by year (rows: years, columns: stocks); the Date
        # strings are removed first because averaging them fails on recent pandas
        yearly_feature = (
            feature_df.drop(columns = ['Date'])
            .assign(Year = feature_years)
            .groupby('Year')
            .mean(numeric_only = True)
        )
        last_year = feature_years.max()

        # years are visited in order of first appearance in the feature file and
        # looked up by label, so the ranking always belongs to the year at hand
        for year in feature_years.unique():
            # missing averages are dropped so they cannot be ranked as "highest"
            ranked = yearly_feature.loc[year].dropna().sort_values()
            n = int(len(ranked) * pct)
            # stocks with lowest and highest feature values for the year
            low_stocks = list(ranked.index[:n])
            high_stocks = list(ranked.index[len(ranked) - n:])

            # prices of the year, plus the "first day" of the next year so the
            # return over the year boundary is attributed to this year's ranking
            window = price_df[price_years == year]
            if year != last_year:
                window = pd.concat([window, price_df[price_years == year + 1].head(1)])

            # calculate returns and take average across stocks (rows); the first
            # row has no previous price and is discarded
            low_returns = window[low_stocks].pct_change().iloc[1:, :].mean(axis = 1)
            high_returns = window[high_stocks].pct_change().iloc[1:, :].mean(axis = 1)

            # return of high feature stocks - return of low feature stocks
            year_factor = high_returns.sub(low_returns).to_frame(name = feature_name)
            year_factor['Date'] = window['Date'].iloc[1:]
            factor_frames.append(year_factor)

    if not factor_frames:
        # no usable rows in any cycle: return an empty frame with the usual columns
        return pd.DataFrame(columns = [feature_name, 'Date'])
    return pd.concat(factor_frames)

In [155]:
# Credit-risk factor: compute the daily factor series and store it as csv
credit_factors = create_factors('credit_risk_', "credit_factor", 0.2)
credit_factors.to_csv(
    os.path.join(data_dir, "credit_factor_daily.csv")
)

# Turnover factor: same procedure, applied to the turnover feature files
turnover_factors = create_factors('turnover_', "turnover_factor", 0.2)
turnover_factors.to_csv(
    os.path.join(data_dir, "turnover_factor_daily.csv")
)