# Features And Label Generation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read both raw price tables first, then keep only the 'Adj Close' part of each
# (S&P 500 and CSI 300 respectively).
raw_sp = pd.read_parquet('../data/raw/sp500.parquet')
raw_csi = pd.read_parquet('../data/raw/csi300.parquet')

sp500, csi300 = raw_sp['Adj Close'], raw_csi['Adj Close']

In [11]:
# Sanity check of the two price panels: their shapes, then the first and last
# index value of the S&P 500 panel and of the CSI 300 panel.
((sp500.shape, csi300.shape), 
(sp500.index.min(), sp500.index.max()), 
(csi300.index.min(), csi300.index.max()))

(((3018, 503), (2913, 300)),
 (Timestamp('2012-01-03 00:00:00'), Timestamp('2023-12-29 00:00:00')),
 (Timestamp('2012-01-04 00:00:00'), Timestamp('2023-12-29 00:00:00')))

In [12]:
class FeatureAndLabelGenerator:
    """
    Turn a panel of adjusted closing prices into lagged return features and a
    binary cross-sectional label.

    For every stock and date the result holds one ``return_<p>`` column per
    look-back period ``p`` (the p-row simple return, lagged by one row so it
    only uses prices up to the previous row) and a ``GB`` flag that is 1 when
    the stock's one-period return on that date is above the mean one-period
    return of all stocks on that date, else 0.
    """

    # Default look-back periods: every horizon from 1 to 20, then 40..240 in steps of 20.
    DEFAULT_PERIODS = tuple(range(1, 21)) + tuple(range(40, 241, 20))

    def __init__(self, data, periods=None):
        """
        data: pandas.DataFrame
               The adjusted closing prices of stocks (one column per stock,
               indexed by date)
        periods: iterable of int, optional
               The look-back periods (in rows) of the return features.
               Defaults to DEFAULT_PERIODS. Must contain 1 because
               ``return_1`` is used to filter rows in ``calculate_label``.
        """
        self.data = data
        self.periods = list(self.DEFAULT_PERIODS) if periods is None else list(periods)
        if 1 not in self.periods:
            raise ValueError("periods must contain 1: 'return_1' is required to build the label")

    def calculate_returns(self, sticker):
        """
        sticker: column label
                 The column of ``self.data`` (one stock) to process
        @ return: pandas.DataFrame
                One ``return_<p>`` column per period in ``self.periods`` plus
                a ``Label`` column, indexed like ``self.data``
        """
        # Forward-fill gaps explicitly. This is what the implicit (and now
        # deprecated) pct_change default fill_method='pad' used to do, so the
        # result stays the same on every pandas version and no FutureWarning
        # is emitted.
        prices = self.data[sticker].ffill()

        # Features are shifted by one row so that the value stored at a date
        # only depends on prices up to the previous row.
        returns = {
            f'return_{period}': prices.pct_change(periods=period, fill_method=None).shift(1)
            for period in self.periods
        }
        # The label is the unshifted one-period return of the same row. It is
        # taken directly from the price series: deriving it as
        # return_1.shift(-1) would turn the last row into NaN and silently
        # drop the final date of every stock.
        returns['Label'] = prices.pct_change(periods=1, fill_method=None)
        return pd.concat(returns, axis=1)

    def trigger_return_cal(self):
        """
        Run the feature generation process for every column of ``self.data``

        @ return: pandas.DataFrame
                The per-stock frames stacked on top of each other, indexed by
                (Sticker, Date)
        """
        returns_data = {stock: self.calculate_returns(stock) for stock in self.data.columns}
        # Name both index levels so the later reset_index() yields 'Sticker'
        # and 'Date' columns even when the price index itself is unnamed.
        return pd.concat(returns_data, names=['Sticker', 'Date'])

    def calculate_label(self, return_frame):
        """
        return_frame: pandas.DataFrame
                 The stacked returns as produced by ``trigger_return_cal``;
                 needs the columns ``return_1`` and ``Label`` and a date
                 index level named ``Date``

        @ return: pandas.DataFrame
                One row per stock and date with the columns Sticker, Date,
                the ``return_<p>`` features and ``GB`` (1 if the stock's
                Label is above the mean Label of that date, else 0). Rows
                without ``return_1`` or ``Label`` are removed; NaNs in the
                other feature columns are kept.
        """
        frame = (
            return_frame.reset_index()
            # Frames whose first index level is unnamed surface it as 'level_0'.
            .rename(columns={'level_0': 'Sticker'})
            .dropna(subset=['return_1', 'Label'])
            .reset_index(drop=True)
        )
        # Mean label of each date, broadcast back to every row of that date.
        cross_mean = frame.groupby('Date')['Label'].transform('mean')
        frame['GB'] = np.where(frame['Label'] > cross_mean, 1, 0)
        return frame.drop(columns=['Label'])

    def run(self):
        """
        Compute the return features for all stocks and attach the GB label

        @ return: pandas.DataFrame
                The frame returned by ``calculate_label``
        """
        returns_df = self.trigger_return_cal()
        return self.calculate_label(returns_df)


In [13]:
# Build the return features and the GB label for every column of the S&P 500 price panel.
sp500_instance = FeatureAndLabelGenerator(sp500)
data_sp500 = sp500_instance.run()

  returns[f'return_{period}'] = self.data[sticker].pct_change(periods=period).shift(1)


In [16]:
# Spot check: display the five rows starting at position 1000.
data_sp500.iloc[1000:1005]

Unnamed: 0,Sticker,Date,return_1,return_2,return_3,return_4,return_5,return_6,return_7,return_8,...,return_80,return_100,return_120,return_140,return_160,return_180,return_200,return_220,return_240,GB
1000,A,2015-12-28,0.008858,0.022319,0.032084,0.050873,0.03335,0.017137,0.03259,0.046957,...,0.216182,0.040431,0.062134,0.060528,-0.003103,-0.027139,0.033394,0.0799,0.073893,0
1001,A,2015-12-29,-0.008543,0.000239,0.013585,0.023267,0.041895,0.024522,0.008448,0.023769,...,0.179321,0.029009,0.081323,0.047013,-0.014402,-0.02402,0.030331,0.052215,0.078075,1
1002,A,2015-12-30,0.013883,0.005221,0.014125,0.027656,0.037472,0.056359,0.038745,0.022448,...,0.188335,0.058898,0.091546,0.051069,0.016209,-0.011161,0.021148,0.063368,0.123236,1
1003,A,2015-12-31,-0.004486,0.009335,0.000712,0.009576,0.023047,0.032819,0.051621,0.034085,...,0.206286,0.057574,0.073411,0.046096,0.01407,-0.022631,0.022191,0.048931,0.111182,1
1004,A,2016-01-04,-0.005826,-0.010285,0.003455,-0.005118,0.003695,0.017087,0.026802,0.045495,...,0.161172,0.038943,0.052203,0.058014,0.002408,-0.022693,0.003208,0.017215,0.114029,0


In [17]:
# Persist the S&P 500 feature/label table; the train/test split section reads this file back.
data_sp500.to_parquet('../data/golden/sp500_golden.parquet')

In [18]:
# Same feature/label generation, applied to the CSI 300 price panel.
csi300_instance = FeatureAndLabelGenerator(csi300)
data_csi300 = csi300_instance.run()

In [19]:
# Spot check: display the five rows starting at position 1000.
data_csi300.iloc[1000:1005]

Unnamed: 0,Sticker,Date,return_1,return_2,return_3,return_4,return_5,return_6,return_7,return_8,...,return_80,return_100,return_120,return_140,return_160,return_180,return_200,return_220,return_240,GB
1000,000001.SZ,2016-02-24,-0.016521,0.007968,0.002973,-0.002956,0.010989,0.033708,0.020161,0.017085,...,-0.121528,-0.064695,-0.160166,-0.259693,-0.319435,-0.385922,-0.394012,-0.217646,-0.126752,0
1001,000001.SZ,2016-02-25,0.002964,-0.013605,0.010956,0.005946,0.0,0.013986,0.036772,0.023185,...,-0.115854,-0.073905,-0.117391,-0.241405,-0.26289,-0.345161,-0.39039,-0.21981,-0.096472,1
1002,000001.SZ,2016-02-26,-0.047291,-0.044466,-0.060253,-0.036853,-0.041625,-0.047291,-0.033966,-0.012257,...,-0.14197,-0.096262,-0.0657,-0.222669,-0.286873,-0.368799,-0.420958,-0.266486,-0.137297,1
1003,000001.SZ,2016-02-29,0.01241,-0.035468,-0.032609,-0.048591,-0.0249,-0.029732,-0.035468,-0.021978,...,-0.131322,-0.085901,0.034884,-0.224247,-0.326685,-0.384277,-0.407385,-0.249864,-0.114189,1
1004,000001.SZ,2016-03-01,-0.023493,-0.011375,-0.058128,-0.055336,-0.070943,-0.047809,-0.052527,-0.058128,...,-0.158451,-0.093839,-0.035318,-0.24127,-0.313218,-0.393786,-0.397226,-0.269798,-0.138861,0


In [20]:
# Persist the CSI 300 feature/label table; the train/test split section reads this file back.
data_csi300.to_parquet('../data/golden/csi300_golden.parquet')

## Train Test Split

In [22]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [23]:
# Reload the feature/label tables written to ../data/golden by the generation section.
sp = pd.read_parquet('../data/golden/sp500_golden.parquet')
csi = pd.read_parquet('../data/golden/csi300_golden.parquet')

In [25]:
# First rows of the S&P 500 table; the longer-horizon return columns are still NaN here.
sp.head()

Unnamed: 0,Sticker,Date,return_1,return_2,return_3,return_4,return_5,return_6,return_7,return_8,...,return_80,return_100,return_120,return_140,return_160,return_180,return_200,return_220,return_240,GB
0,A,2012-01-05,-0.00795,,,,,,,,...,,,,,,,,,,1
1,A,2012-01-06,0.022382,0.014254,,,,,,,...,,,,,,,,,,1
2,A,2012-01-09,0.010811,0.033435,0.025219,,,,,,...,,,,,,,,,,1
3,A,2012-01-10,0.026203,0.037297,0.060515,0.052084,,,,,...,,,,,,,,,,1
4,A,2012-01-11,0.018759,0.045455,0.056757,0.080409,0.07182,,,,...,,,,,,,,,,0


In [26]:
# Chronological split on the calendar year of `Date`:
# train on years in [s, m), test on years in [m, e).
s, m, e = 2017, 2020, 2021

sp_train = sp.loc[(s <= sp.Date.dt.year) & (sp.Date.dt.year < m)]
sp_test = sp.loc[(m <= sp.Date.dt.year) & (sp.Date.dt.year < e)]

csi_train = csi.loc[(s <= csi.Date.dt.year) & (csi.Date.dt.year < m)]
csi_test = csi.loc[(m <= csi.Date.dt.year) & (csi.Date.dt.year < e)]

In [27]:
# (rows, columns) of each split, in the order: S&P train, S&P test, CSI train, CSI test.
sp_train.shape, sp_test.shape, csi_train.shape, csi_test.shape

((366310, 34), (124885, 34), (179129, 34), (66968, 34))

In [28]:
# First rows of the S&P 500 training split.
sp_train.head()

Unnamed: 0,Sticker,Date,return_1,return_2,return_3,return_4,return_5,return_6,return_7,return_8,...,return_80,return_100,return_120,return_140,return_160,return_180,return_200,return_220,return_240,GB
1256,A,2017-01-03,-0.001753,-0.000614,-0.017589,-0.010998,-0.006049,-0.00756,-0.011211,0.005765,...,-0.027692,-0.034659,-0.015572,0.022398,0.077572,0.122613,0.172284,0.238479,0.236153,1
1257,A,2017-01-04,0.020413,0.018624,0.019786,0.002464,0.00919,0.01424,0.012698,0.008973,...,-0.005946,-0.016198,0.00712,0.0435,0.091374,0.134437,0.169426,0.255321,0.262402,1
1258,A,2017-01-05,0.013121,0.033801,0.031989,0.033166,0.015617,0.022432,0.027548,0.025985,...,0.055118,-0.016129,0.011831,0.056486,0.069094,0.146823,0.200905,0.252062,0.253709,0
1259,A,2017-01-06,-0.011889,0.001076,0.02151,0.01972,0.020883,0.003543,0.010276,0.015331,...,0.022748,-0.028028,0.002587,0.034032,0.058529,0.129914,0.179784,0.2658,0.250991,1
1260,A,2017-01-09,0.031156,0.018896,0.032265,0.053336,0.05149,0.052689,0.034809,0.041752,...,0.074577,-0.003953,0.030951,0.066482,0.085629,0.153724,0.229178,0.294441,0.293406,1


In [29]:
# Write the S&P 500 train and test splits next to the golden tables.
sp_train.to_parquet('../data/golden/sp500_train.parquet')
sp_test.to_parquet('../data/golden/sp500_test.parquet')

# Write the CSI 300 train and test splits.
csi_train.to_parquet('../data/golden/csi300_train.parquet')
csi_test.to_parquet('../data/golden/csi300_test.parquet')