In [15]:
from pandas import Series, MultiIndex
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as pl
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import os
import pickle
from datatools import data_quantization
from pipeline import Dataset

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [3]:
# NOTE(review): neither constant is referenced in the visible cells. The names
# suggest the number of assets and the number of intraday points per day in the
# data set -- confirm against the raw files before relying on them.
__num_of_stocks = 54
__point_per_day = 50

Read in the data and parse the identifiers.


In [4]:
def data_loader(data_name, sample_size=None, train=True):
    """
    Load the CSV file whose name contains ``data_name`` into a DataFrame.

    The directory is resolved relative to the current working directory.
    When several file names contain ``data_name``, the alphabetically first
    one is used so the choice does not depend on directory listing order.

    :param data_name: [string] name of data to load
        Available option: "fundamental", "market", "return"
    :param sample_size: [int] number of rows to load
        default = None: load everything
    :param train: [bool] read from '../../data/raw/' when True (default),
        otherwise from '../../data/test/'
    :return: [pd.dataframe]
    :raises ValueError: if no directory entry contains ``data_name``, or if
        the selected entry is not a regular file
    """
    data_dir = '../../data/raw/' if train else '../../data/test/'

    # os.listdir returns entries in arbitrary order; sort the matches so the
    # same file is picked on every run and on every machine.
    matches = sorted(name for name in os.listdir(data_dir) if data_name in name)
    if not matches:
        raise ValueError('DATA_TYPE does not exist')

    data_path = data_dir + matches[0]
    # A matching entry could be a directory; only regular files are accepted.
    if not os.path.isfile(data_path):
        raise ValueError('DATA_PATH is not valid path')
    print('DATA_PATH is a valid path')

    return pd.read_csv(data_path, nrows=sample_size)

In [5]:
# Load only the first `sample_size` rows of each table for quick iteration.
sample_size = 1000
f_df, m_df, r_df = (
    data_loader(kind, sample_size=sample_size)
    for kind in ('fundamental', 'market', 'return')
)

DATA_PATH is a valid path
DATA_PATH is a valid path
DATA_PATH is a valid path


In [32]:
# Load every row of each table.
# NOTE(review): this rebinds f_df / m_df / r_df, replacing the sampled frames
# loaded earlier; the pre_process_* cells must be run again on these new frames
# before any cell that relies on the parsed index.
f_df, m_df, r_df = (
    data_loader(kind) for kind in ('fundamental', 'market', 'return')
)

DATA_PATH is a valid path
DATA_PATH is a valid path
DATA_PATH is a valid path


In [6]:
def parse_date_time_column(s: Series):
    """
    Split identifier strings into integer ``asset``, ``day`` and ``timeslot`` columns.

    Each string is split on the letters ``s``, ``d`` and ``p``; the piece before
    the first separator is discarded and the remaining pieces are cast to int.
    Identifiers are presumably of the form ``s<asset>d<day>p<timeslot>`` -- confirm
    against the raw data.

    :param s: [pd.Series] string identifiers
    :return: [pd.DataFrame] columns 'asset', 'day', 'timeslot', indexed like ``s``
    """
    pieces = s.str.split(pat='s|d|p', expand=True)
    # Column 0 holds whatever precedes the first separator; drop it.
    numeric = pieces.iloc[:, 1:].astype(int)
    column_names = {1: 'asset', 2: 'day', 3: 'timeslot'}
    return numeric.rename(columns=column_names)

In [7]:
def parse_date_column(s: Series):
    """
    Split identifier strings into integer ``asset`` and ``day`` columns.

    Each string is split on the letters ``s`` and ``d``; the piece before the
    first separator is discarded and the remaining pieces are cast to int.
    Identifiers are presumably of the form ``s<asset>d<day>`` -- confirm against
    the raw data.

    :param s: [pd.Series] string identifiers
    :return: [pd.DataFrame] columns 'asset', 'day', indexed like ``s``
    """
    pieces = s.str.split(pat='s|d', expand=True)
    # Column 0 holds whatever precedes the first separator; drop it.
    numeric = pieces.iloc[:, 1:].astype(int)
    column_names = {1: 'asset', 2: 'day'}
    return numeric.rename(columns=column_names)

In [8]:
def pre_process_df_with_date_time(df):
    """
    Replace the ``date_time`` column of ``df`` with a parsed MultiIndex, in place.

    The column is parsed with ``parse_date_time_column`` and the resulting
    frame becomes the index of ``df``; the ``date_time`` column is removed.
    The new index is built before ``df`` is touched, so if parsing fails the
    frame is left unchanged.

    Not idempotent: a second call on the same frame raises ``KeyError``
    because ``date_time`` no longer exists.

    :param df: [pd.DataFrame] frame with a 'date_time' column; mutated in place
    :return: None
    """
    # Parse first: any failure here must not leave df half-processed.
    parsed_keys = parse_date_time_column(df['date_time'])
    new_index = MultiIndex.from_frame(parsed_keys)

    df.drop(columns='date_time', inplace=True)
    df.index = new_index

In [9]:
def pre_process_df_with_date(df):
    """
    Replace the ``date_time`` column of ``df`` with a parsed MultiIndex, in place.

    The column is parsed with ``parse_date_column`` and the resulting frame
    becomes the index of ``df``; the ``date_time`` column is removed. The new
    index is built before ``df`` is touched, so if parsing fails the frame is
    left unchanged.

    Not idempotent: a second call on the same frame raises ``KeyError``
    because ``date_time`` no longer exists.

    :param df: [pd.DataFrame] frame with a 'date_time' column; mutated in place
    :return: None
    """
    # Parse first: any failure here must not leave df half-processed.
    parsed_keys = parse_date_column(df['date_time'])
    new_index = MultiIndex.from_frame(parsed_keys)

    df.drop(columns='date_time', inplace=True)
    df.index = new_index

In [10]:
# Index the market frame by the keys parsed from its 'date_time' column.
# This mutates m_df in place and removes the column, so running the cell a
# second time on the same frame raises KeyError.
pre_process_df_with_date_time(m_df)

In [11]:
# Index the return and fundamental frames by the keys parsed from their
# 'date_time' column. Both frames are mutated in place; re-running the cell on
# already-processed frames raises KeyError.
pre_process_df_with_date(r_df)
pre_process_df_with_date(f_df)

Now we can build some features.

In [12]:
# Average every market column over the first two index levels (asset and day
# once m_df has been indexed by pre_process_df_with_date_time), collapsing the
# remaining level.
m_agg_df = (
    m_df
    .groupby(level=[0, 1])
    .mean()
    .sort_index()
)
m_agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,open,close,high,low,volume,money
asset,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,24.550554,24.53672,24.578718,24.508076,454075.84,11149250.0
0,2,23.9558,23.938318,23.988574,23.90749,401517.02,9601600.0
0,3,23.62613,23.628554,23.66109,23.591664,307969.24,7275962.0
0,4,23.837576,23.84728,23.87982,23.803594,563240.5,13447010.0
0,5,25.965612,26.002992,26.075094,25.890594,2249971.94,58798070.0


In [20]:
# Place the aggregated market, fundamental and return frames side by side,
# aligned on their index, then keep only rows with no missing value.
full_df = pd.concat(
    [m_agg_df, f_df, r_df],
    axis=1,
).dropna()

In [33]:
# Bundle the three frames in the project's Dataset container and dump it to
# parse_data_dir (presumably persisting the parsed frames -- see pipeline.Dataset).
parse_data_dir = '../../data/parsed/'
dataset = Dataset(
    market=m_df,
    fundamental=f_df,
    ref_return=r_df,
)
dataset.dump(parse_data_dir)


In [None]:
# Pairwise KDE plots of the selected features and the target, restricted to
# the first 100 rows of full_df.
pairplot = sns.pairplot(
    data=full_df[
        [
            'close', 'volume', 'money', 'turnoverRatio',
            'transactionAmount', 'pe_ttm', 'pcf', 'return',
        ]
    ].iloc[:100],
    kind='kde',
)

In [35]:
# Save the figure behind the pair plot as a PNG.
fig = pairplot.fig
# savefig does not create missing directories, so make sure the target folder
# exists; otherwise this cell fails on a fresh checkout.
os.makedirs("./image", exist_ok=True)
fig.savefig("./image/pairplot.png")

In [36]:
# Baseline: linear regression of 'return' on the selected features, evaluated
# with 5-fold cross-validation. `score` reports R^2 on the given rows.
X = full_df[
    [
        'close', 'volume', 'money', 'turnoverRatio',
        'transactionAmount', 'pe_ttm', 'pcf',
    ]
]
y = full_df['return']

# A fixed random_state keeps the shuffled folds reproducible.
# NOTE(review): shuffling mixes rows from all days across folds; if the rows are
# time-ordered, consider a time-based split to avoid look-ahead -- confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=10)
for train, test in kf.split(X):
    reg = LinearRegression()
    reg.fit(X.iloc[train], y.iloc[train])
    train_score = reg.score(X.iloc[train], y.iloc[train])
    test_score = reg.score(X.iloc[test], y.iloc[test])
    print(f'train score: {train_score:.4f}, test score: {test_score:.4f}')

train score: 0.0037, test score: 0.0043
train score: 0.0043, test score: 0.0018
train score: 0.0036, test score: 0.0050
train score: 0.0044, test score: 0.0016
train score: 0.0038, test score: 0.0041
