In [1]:
from pandas import Series, MultiIndex
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import os
from pipeline import Dataset

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
__num_of_stocks = 54
__point_per_day = 50

read in data and parse the identifiers


In [4]:
def data_loader(data_name, sample_size=None, train=True):
    """
    :param data_name: [string] name of data to load
        Available option: "fundamental", "market", "return"
    :param sample_size: [int] number of rows to load
        default = None: load everything
    :return: [pd.dataframe]
    """
    if train:
        DATA_DIR = '../../' + 'data/raw/'
    else:
        DATA_DIR = '../../' + 'data/test/'
    FILE_LIST = os.listdir(DATA_DIR)
    DATA_TYPE = data_name
    DATA_NAME = [name for name in FILE_LIST if DATA_TYPE in name]
    # DATA_NAME = '/first_round_train_market_data.csv'

    if not DATA_NAME:
        raise ValueError('DATA_TYPE does not exist')

    DATA_PATH = DATA_DIR + DATA_NAME[0]

    if os.path.isfile(DATA_PATH):
        print('DATA_PATH is a valid path')
    else:
        raise ValueError('DATA_PATH is not valid path')

    return pd.read_csv(DATA_PATH, nrows=sample_size)

In [5]:
sample_size = 1000
f_df = data_loader('fundamental', sample_size=sample_size)
m_df = data_loader('market', sample_size=sample_size)
r_df = data_loader('return', sample_size=sample_size)

DATA_PATH is a valid path
DATA_PATH is a valid path
DATA_PATH is a valid path


In [6]:
f_df = data_loader('fundamental')
m_df = data_loader('market')
r_df = data_loader('return')

DATA_PATH is a valid path
DATA_PATH is a valid path
DATA_PATH is a valid path


In [7]:
def parse_date_time_column(s: Series):
    return s.str.split(pat='s|d|p', expand=True).iloc[:, 1:].astype(int).rename(columns={1: 'asset', 2: 'day', 3: 'timeslot'})

In [8]:
def parse_date_column(s: Series):
    return s.str.split(pat='s|d', expand=True).iloc[:, 1:].astype(int).rename(columns={1: 'asset', 2: 'day'})

In [9]:
def pre_process_df_with_date_time(df):
    date_time_series = df['date_time']
    df.drop(columns='date_time', inplace=True)
    df.index = MultiIndex.from_frame(parse_date_time_column(date_time_series))

In [10]:
def pre_process_df_with_date(df):
    date_series = df['date_time']
    df.drop(columns='date_time', inplace=True)
    df.index = MultiIndex.from_frame(parse_date_column(date_series))

In [11]:
pre_process_df_with_date_time(m_df)

In [12]:
pre_process_df_with_date(r_df)
pre_process_df_with_date(f_df)

now we can build some features

In [23]:
date_time_series = f_df['date_time']
data_test = parse_date_time_column(date_time_series)
MultiIndex.from_frame(data_test)

MultiIndex([( 0,    1),
            ( 1,    1),
            ( 2,    1),
            ( 3,    1),
            ( 4,    1),
            ( 5,    1),
            ( 6,    1),
            ( 7,    1),
            ( 8,    1),
            ( 9,    1),
            ...
            (44, 1000),
            (45, 1000),
            (46, 1000),
            (47, 1000),
            (48, 1000),
            (49, 1000),
            (50, 1000),
            (51, 1000),
            (52, 1000),
            (53, 1000)],
           names=['asset', 'day'], length=54000)

In [15]:
m_df = m_df.swaplevel(1,0)
m_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,open,close,high,low,volume,money
day,asset,timeslot,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,1,25.0164,24.9557,25.1378,24.9436,642243.0,16100600.0
1,1,1,16.3256,16.3499,16.4106,16.3256,15552.0,254030.1
1,2,1,9.2006,9.1763,9.2006,9.1521,396647.0,3638304.0
1,3,1,9.1521,9.2006,9.2006,9.1521,317044.0,2907092.0
1,4,1,5.1829,5.1222,5.1829,5.1101,294784.0,1514496.0


In [16]:
r_df = r_df.swaplevel(1,0)
r_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,return
day,asset,Unnamed: 2_level_1
1,0,-0.026877
1,1,-0.052674
1,2,-0.002691
1,3,-0.018515
1,4,-0.019184


In [17]:
f_df = f_df.swaplevel(1,0)
f_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,turnoverRatio,transactionAmount,pe_ttm,pe,pb,ps,pcf
day,asset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3.6794,17229.0,34.4425,32.3029,4.9425,3.818,-578.77
1,1,2.515,3706.0,28.9934,27.2726,5.0552,3.0484,23.826
1,2,1.2858,5136.0,42.9352,41.9279,4.8083,4.1392,-58.2185
1,3,2.2007,3280.0,15.2245,13.8032,2.1904,0.6691,61.0491
1,4,0.8627,5291.0,-369.985,-433.1736,3.0714,2.937,-25.2279


In [14]:
m_agg_df = m_df.groupby(level=[0, 1]).mean().sort_index()
m_agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,open,close,high,low,volume,money
asset,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,24.550554,24.53672,24.578718,24.508076,454075.84,11149250.0
0,2,23.9558,23.938318,23.988574,23.90749,401517.02,9601600.0
0,3,23.62613,23.628554,23.66109,23.591664,307969.24,7275962.0
0,4,23.837576,23.84728,23.87982,23.803594,563240.5,13447010.0
0,5,25.965612,26.002992,26.075094,25.890594,2249971.94,58798070.0


In [20]:
full_df = pd.concat([m_agg_df, f_df, r_df], axis=1).dropna()

In [18]:
parse_data_dir = '../../data/parsed/'
dataset=Dataset(market=m_df.reset_index(), fundamental=f_df.reset_index(), ref_return=r_df.reset_index())
dataset.dump(parse_data_dir)


In [None]:
pairplot = sns.pairplot(data=full_df[['close', 'volume', 'money', 'turnoverRatio', 'transactionAmount', 'pe_ttm', 'pcf', 'return']].iloc[:100], kind='kde')

In [35]:
fig = pairplot.fig
fig.savefig("./image/pairplot.png")

In [36]:
X = full_df[['close', 'volume', 'money', 'turnoverRatio', 'transactionAmount', 'pe_ttm', 'pcf']]
y = full_df['return']

kf = KFold(n_splits=5, shuffle=True, random_state=10)
for train, test in kf.split(full_df.index):
    reg = LinearRegression().fit(X.iloc[train], y.iloc[train])
    train_score = reg.score(X.iloc[train], y.iloc[train])
    test_score = reg.score(X.iloc[test], y.iloc[test])
    print(f'train score: {train_score:.4f}, test score: {test_score:.4f}')

train score: 0.0037, test score: 0.0043
train score: 0.0043, test score: 0.0018
train score: 0.0036, test score: 0.0050
train score: 0.0044, test score: 0.0016
train score: 0.0038, test score: 0.0041



## Overall setting:
- 54 assets/investments
- 50 timeslots a day for Market data
- 1-min frequency for Market data?
- 1000 days
## Features in Fundamental data:
- "date_time": sAdB format
- "turnoveRatio": Turnover Ratio
- "transactionAmount": total amount of transactions (why bold?)
- "pe_ttm": Price-to-Earnings Ratio (Trailing Twelve Months)
- "pe": Price-to-Earnings Ratio
- "pb": Price-to-Book Ratio
- "ps": Price-to-Sales Ratio
- "pcf": Price-to-Cash-Flow Ratio

## Features in Market data:
- 'date_time': sAdBpC format
- 'open': price at the beginning of this timeslot
- 'close': price at the end of this timeslot
- 'high':  highest price in this timeslot
- 'low':  lowest price in this timeslot
- 'volume':  total amount of units traded
- 'money:  total amount of money traded

## Features in Return data:
- 'date_time': sAdB format
- 'return': label data, return of investment
    - Two-day fixed period holding;
    - Trade at the end of the day;
    - Can't use data after, e.g. can't obtain return of sAd(B-1) as it needs sAd(B+1)
    - Computed percentage: $$ sAdB = \frac{sAd(B+2)p50 - sAdBp50}{sAdBp50}$$
