In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Directory that holds the input CSV files read below.
# Set the CRYPTO_DATA_DIR environment variable to point somewhere else; the
# fallback is the original hard-coded local path, so existing runs are unchanged.
data_dir = os.environ.get('CRYPTO_DATA_DIR', '/home/lzhao/data/tmp/crypto')

In [3]:
# Read the three input tables from data_dir (main training set, supplemental
# training set, and per-asset metadata), in that order.
origin_train_df, supp_train_df, asset_details_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'supplemental_train.csv', 'asset_details.csv')
)

In [4]:
# Preview the first five rows of the main training set.
origin_train_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [5]:
# Preview the first five rows of the supplemental training set.
supp_train_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1623542400,3,1201.0,1.478556,1.48603,1.478,1.483681,654799.561103,1.481439,-0.002594
1,1623542400,2,1020.0,580.306667,583.89,579.91,582.276667,1227.988328,581.697038,-0.009143
2,1623542400,0,626.0,343.7895,345.108,343.64,344.598,1718.832569,344.441729,-0.004525
3,1623542400,1,2888.0,35554.289632,35652.46465,35502.67,35602.004286,163.811537,35583.469303,0.003096
4,1623542400,4,433.0,0.312167,0.3126,0.31192,0.312208,585577.410442,0.312154,0.001426


In [6]:
# Stack the main and supplemental training sets and remove fully identical rows.
# Both frames were read with their own default 0..n-1 index, so a plain concat
# leaves repeated index labels; reset_index(drop=True) restores a unique index so
# later label-based selection and alignment are unambiguous.
# NOTE(review): duplicates are detected on all columns; rows that share
# (timestamp, Asset_ID) but differ in any other column are both kept. Confirm intended.
train_df = (
    pd.concat([origin_train_df, supp_train_df])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
# Preview the first two rows of the asset metadata table.
asset_details_df.head(n=2)

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin


In [5]:
# List the distinct asset identifiers present in the metadata table.
asset_details_df['Asset_ID'].unique()

array([ 2,  0,  1,  5,  7,  6,  9, 11, 13, 12,  3,  8, 10,  4])

# 2. Feature Engineering

In [14]:
# Two features from the competition tutorial
def upper_shadow(df):
    """Return High minus the larger of Close and Open, element-wise.

    `df` must provide 'High', 'Close' and 'Open' entries that support
    element-wise arithmetic (e.g. DataFrame columns).
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top
    
def lower_shadow(df):
    """Return the smaller of Close and Open minus Low, element-wise.

    `df` must provide 'Low', 'Close' and 'Open' entries that support
    element-wise arithmetic (e.g. DataFrame columns).
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

In [8]:
# Interpret the integer `timestamp` column as seconds since the Unix epoch and
# store the result as a datetime column, then order the rows chronologically.
train_df['date'] = pd.to_datetime(train_df['timestamp'], unit='s')
train_df = train_df.sort_values('date')

In [9]:
# Give every row an integer id for its calendar day, numbered in order of first
# appearance (rows are already sorted by date).
# Factorizing the day-truncated datetimes yields the same codes as factorizing
# "year_month_day" strings, but skips three astype(str) conversions and the
# string concatenation for every row, which is slow on a frame this large.
# The only difference would be for missing dates (code -1 instead of a group of
# their own); `date` is derived from an integer timestamp column, so none are expected.
groups, _ = pd.factorize(train_df['date'].dt.normalize())

train_df['groups'] = groups


In [12]:
# Preview the first five rows of the combined training frame.
train_df.head()

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,groups
0,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218,0
1,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,0
2,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643,0
3,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922,0
4,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264,0


In [11]:
# Remove the raw time columns now that `groups` has been derived from them.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
train_df = train_df.drop(columns=['timestamp', 'date'], errors='ignore')

In [15]:
# Derived per-row features built from the Open/High/Low/Close columns.
# The two shadow features come from the helper functions defined earlier.
train_df['upper_Shadow'] = upper_shadow(train_df)
train_df['lower_Shadow'] = lower_shadow(train_df)
# Ratio of High to Low; a zero Low produces inf here.
train_df['high_div_low'] = train_df['High'].div(train_df['Low'])
# Signed difference between Open and Close.
train_df['open_sub_close'] = train_df['Open'].sub(train_df['Close'])

In [16]:
# Fill missing values and infinite values with 0.
# NOTE(review): this applies to every column, so missing `Target` values are
# zero-filled as well. Confirm that is intended for training labels.
train_df = train_df.fillna(value=0)
train_df = train_df.replace(to_replace=[np.inf, -np.inf], value=0)

In [18]:
# Print a summary of the engineered frame: index range, column dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24236806 entries, 0 to 24236805
Data columns (total 14 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Asset_ID        int64  
 1   Count           float64
 2   Open            float64
 3   High            float64
 4   Low             float64
 5   Close           float64
 6   Volume          float64
 7   VWAP            float64
 8   Target          float64
 9   groups          int64  
 10  upper_Shadow    float64
 11  lower_Shadow    float64
 12  high_div_low    float64
 13  open_sub_close  float64
dtypes: float64(12), int64(2)
memory usage: 2.7 GB


# 3. Train