In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

In [40]:
data = pd.read_csv('data_analysis.csv')

initial_time = float(list(data.columns)[0])

# Renaming columns
previous_columns = list(data.columns)
new_columns = ['time', 'bid_price','ask_price', 'bid_volume_0', 'bid_volume_1', 'bid_volume_2', 'bid_volume_3', 'bid_volume_4', 'ask_volume_0', 'ask_volume_1', 'ask_volume_2', 'ask_volume_3', 'ask_volume_4']
columns_dict = dict(zip(previous_columns, new_columns))
data = data.rename(columns=columns_dict)

# Adjusting initial time
data['time'] = data['time'] - initial_time

# Generating sprad and mid_price
data['mid_price'] = (data['ask_price'] + data['bid_price']) / 2
data['spread'] = data['ask_price'] - data['bid_price']

# Getting rid of bid and ask price columns
data = data.drop(columns=['bid_price', 'ask_price'])

In [45]:
SHIFTS = [1,3,7,14,28,56]

In [44]:
data

Unnamed: 0,time,bid_volume_0,bid_volume_1,bid_volume_2,bid_volume_3,bid_volume_4,ask_volume_0,ask_volume_1,ask_volume_2,ask_volume_3,ask_volume_4,mid_price,spread
0,0.281,55,30,45,25,10000,34,30,10035,0,0,126300.0,200
1,0.469,45,55,30,45,25,10035,0,0,0,0,126800.0,400
2,0.750,25,55,105,70,25,65,10,10000,0,0,126650.0,100
3,0.969,25,140,75,10045,25,85,35,10,10000,0,126650.0,100
4,1.234,25,185,75,10045,25,85,65,10,10000,0,126650.0,100
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3595,898.969,74,32,55,10000,113,109,80,101,10079,2,125150.0,300
3596,899.265,104,32,55,10000,113,109,80,101,10079,2,125150.0,300
3597,899.484,104,92,35,20,113,99,10101,79,2,88,125200.0,400
3598,899.765,48,32,66,55,113,109,10,20,10101,79,125100.0,200


# Creating features

## Momentum features

In [None]:
def create_return_features(data : pd.DataFrame, shifts : list) -> pd.DataFrame:
    '''
    Takes the raw dataframe (data) with the day shifts we want to compute returns (shifts).
    Return the dataframe with the new features.
    '''
    new_data = data.copy()

    for day in shifts:
        new_data[f'returns_{day}'] = data['mid_price'].div(data['mid_price'].shift(day)) - 1
        
        if day != 1:
            new_data[f'return_average_price_{day}'] = data['mid_price'].div(data['mid_price'].rolling(day).mean()) - 1

    return new_data.dropna(axis = 0)

In [None]:
def create_variable_next_day_price(data : pd.DataFrame) -> pd.DataFrame:
    '''
    Takes raw dataframe and adds a new column with the price on the next day
    '''
    new_data = data.copy()
    new_data['target'] = data['mid_price'].shift(-1) - data['mid_price']
    new_data.drop(columns='mid_price')

    return new_data.dropna(axis = 0)

In [None]:
def create_complete_data(data : pd.DataFrame, shifts : list) -> pd.DataFrame:
    '''
    Takes raw dataframe and generate the dataframe with new features and next day price
    '''
    new_data = data.copy()
    new_data = create_return_features(data, shifts)
    new_data = create_variable_next_day_price(new_data)

    return new_data

In [None]:
data_new = create_complete_data(data, SHIFTS)

## Generate 10k position bid and ask features

In [2]:
# to do

# Correlation study

In [None]:
def correlation_study_df(data : pd.DataFrame, shifts : list) -> pd.DataFrame:
    '''
    Takes the raw dataframe and time shifts and generate the dataset with all features up to 2009
    for the correlation study
    '''
    
    data_corr = create_complete_data(data, shifts)

    return data_corr.drop(columns=['mid_price'])

In [None]:
data_corr = correlation_study_df(data, SHIFTS)

plt.figure(figsize=(12,12))
sns.heatmap(data = data_corr.corr(), annot=True, vmin = -1, vmax = 1)

In [None]:
COLUMNS_TO_DROP = ['returns_1', 'returns_3', 'returns_7','returns_14','return_average_price_14','return_average_price_56','return_average_price_28']

plt.figure(figsize=(12,12))
sns.heatmap(data = data_corr.drop(columns = COLUMNS_TO_DROP).corr(), annot=True, vmin = -1, vmax = 1)

In [None]:
def create_complete_data_without_corr_columns(data : pd.DataFrame, shifts : list, columns_to_drop) -> pd.DataFrame:
    '''
    Takes raw dataframe and generate the dataframe with new features and next day price without the highly correlated columns
    '''
    new_data = data.copy()
    new_data = create_return_features(data, shifts)
    new_data = create_variable_next_day_price(new_data)
    new_data = new_data.drop(columns = ['next_day_price'])

    return new_data.drop(columns = columns_to_drop)

In [None]:
data_complete = create_complete_data_without_corr_columns(data, SHIFTS, COLUMNS_TO_DROP)