In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import pandas as pd
import statsmodels.stats.api as sm

%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [18]:
# loading dataset
def getData(strt_date = "2019-01-01", end_date = "2019-12-31"):
    FF = pd.read_csv("C:\\Users\\murty\\Desktop\\Fulldata.csv")
    # Date format change
    FF["Date"] = pd.to_datetime(FF["Date"])
    # Data selection
    df = FF[(FF["Date"] >= strt_date) & (FF["Date"] <= end_date)]
    # Reset Index
    df = df.reset_index()
    df = df.drop(["index"], axis =1)
    # Average Column and no. of days calculations
    df["Average"] = (FF["Open"] + 2*FF["High"] + FF["Close"])/4
    df["Days"] = range(1,df.shape[0]+1)
    df["delta_Open"] = (df.Open - df.Open.shift(periods=1))
    df["delta_Close"] = (df.Close - df.Close.shift(periods=1))
    df["delta_RMW"] = (df.RMW - df.RMW.shift(periods=1))
    df["delta_SMB"] = (df.SMB - df.SMB.shift(periods=1))
    df["delta_MktRF"] = (df["Mkt-RF"] - df["Mkt-RF"].shift(periods=1))
    df["delta_HML"] = (df.HML - df.HML.shift(periods=1))
    df["delta_CMA"] = (df.CMA - df.CMA.shift(periods=1))
    df["delta_Average"] = (df.Average - df.Average.shift(periods=1))
    df = df.dropna().reset_index(drop=True)
    
    df["label"] = df.delta_Open
    df = df.drop("delta_Open", axis=1)
    
    return df

In [87]:

## One year data
df = getData()

## Removing correlated columns
df.columns
df = df.drop(["High","Low","Adj Close"," HC- Open"," HC- High"," HC- Low"," HC- Close"], axis=1)
df

Unnamed: 0,Date,Open,Close,Volume,Mkt-RF,SMB,HML,RMW,CMA,RF,Average,Days,delta_Close,delta_RMW,delta_SMB,delta_MktRF,delta_HML,delta_CMA,delta_Average,label
0,2019-01-03,90.940002,90.639999,9820200,-2.45,0.52,1.23,-0.25,0.91,0.010,53.531250,2,-0.639999,-0.12,-0.21,-2.68,0.08,0.65,-0.328125,-0.089996
1,2019-01-04,90.839996,92.489998,10565700,3.55,0.39,-0.74,-0.09,-0.58,0.010,52.343750,3,1.849998,0.16,-0.13,6.00,-1.97,-1.49,-1.187500,-0.100006
2,2019-01-07,91.910004,92.120003,9012500,0.94,0.82,-0.67,-0.70,-0.41,0.010,53.984375,4,-0.369995,-0.61,0.43,-2.61,0.07,0.17,1.640625,1.070007
3,2019-01-08,92.699997,92.459999,9551300,1.01,0.44,-0.53,0.35,-0.09,0.010,57.507812,5,0.339996,1.05,-0.38,0.07,0.14,0.32,3.523438,0.789993
4,2019-01-09,92.690002,90.949997,15015600,0.56,0.51,-0.04,0.09,-0.18,0.010,58.296875,6,-1.510002,-0.26,0.07,-0.45,0.49,-0.09,0.789062,-0.009995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,2019-12-24,124.790001,125.220001,2022100,0.01,0.36,-0.07,-0.28,0.02,0.007,37.156250,248,0.320000,-0.16,0.20,-0.09,0.21,-0.29,0.054688,-0.729996
247,2019-12-26,125.099999,125.220001,4760400,0.49,-0.56,0.00,0.23,-0.19,0.007,37.203125,249,0.000000,0.51,-0.92,0.48,0.07,-0.21,0.046875,0.309998
248,2019-12-27,125.559998,126.089996,5192000,-0.09,-0.54,-0.07,0.24,0.16,0.007,38.687500,250,0.869995,0.01,0.02,-0.58,-0.07,0.35,1.484375,0.459999
249,2019-12-30,125.760002,124.470001,3884300,-0.57,0.27,0.58,0.15,0.45,0.007,38.687500,251,-1.619995,-0.09,0.81,-0.48,0.65,0.29,0.000000,0.200004


In [88]:
df_x = df.drop("label", axis =1).shift(periods=1).dropna().reset_index(drop=True)
df_y = df[["label"]].shift(periods=-1).dropna().reset_index(drop=True)
modified_df = pd.concat([df_x,df_y], axis =1)

modified_df


Unnamed: 0,Date,Open,Close,Volume,Mkt-RF,SMB,HML,RMW,CMA,RF,Average,Days,delta_Close,delta_RMW,delta_SMB,delta_MktRF,delta_HML,delta_CMA,delta_Average,label
0,2019-01-03,90.940002,90.639999,9820200.0,-2.45,0.52,1.23,-0.25,0.91,0.010,53.531250,2.0,-0.639999,-0.12,-0.21,-2.68,0.08,0.65,-0.328125,-0.100006
1,2019-01-04,90.839996,92.489998,10565700.0,3.55,0.39,-0.74,-0.09,-0.58,0.010,52.343750,3.0,1.849998,0.16,-0.13,6.00,-1.97,-1.49,-1.187500,1.070007
2,2019-01-07,91.910004,92.120003,9012500.0,0.94,0.82,-0.67,-0.70,-0.41,0.010,53.984375,4.0,-0.369995,-0.61,0.43,-2.61,0.07,0.17,1.640625,0.789993
3,2019-01-08,92.699997,92.459999,9551300.0,1.01,0.44,-0.53,0.35,-0.09,0.010,57.507812,5.0,0.339996,1.05,-0.38,0.07,0.14,0.32,3.523438,-0.009995
4,2019-01-09,92.690002,90.949997,15015600.0,0.56,0.51,-0.04,0.09,-0.18,0.010,58.296875,6.0,-1.510002,-0.26,0.07,-0.45,0.49,-0.09,0.789062,-1.800003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,2019-12-23,125.519997,124.900001,6619500.0,0.10,0.16,-0.28,-0.12,0.31,0.007,37.101562,247.0,-0.459999,-0.15,0.43,-0.38,0.01,0.37,0.234375,-0.729996
246,2019-12-24,124.790001,125.220001,2022100.0,0.01,0.36,-0.07,-0.28,0.02,0.007,37.156250,248.0,0.320000,-0.16,0.20,-0.09,0.21,-0.29,0.054688,0.309998
247,2019-12-26,125.099999,125.220001,4760400.0,0.49,-0.56,0.00,0.23,-0.19,0.007,37.203125,249.0,0.000000,0.51,-0.92,0.48,0.07,-0.21,0.046875,0.459999
248,2019-12-27,125.559998,126.089996,5192000.0,-0.09,-0.54,-0.07,0.24,0.16,0.007,38.687500,250.0,0.869995,0.01,0.02,-0.58,-0.07,0.35,1.484375,0.200004


In [89]:
## Train and Test Data
train_start_date = "2019-01-01"
train_end_date =  "2019-10-31"
test_start_date = "2019-11-01"
test_end_date = "2019-12-31"
df_train = df[(df["Date"] >= train_start_date) & (df["Date"] <= train_end_date)]
df_test = df[(df["Date"] >= test_start_date) & (df["Date"] <= test_end_date)]
df_train = df_train.drop(["Date","Days","Close","Mkt-RF","SMB","HML","RMW","CMA","Average"], axis=1)
df_test = df_test.drop(["Date","Days","Close","Mkt-RF","SMB","HML","RMW","CMA","Average"], axis=1)

In [90]:
df_train

Unnamed: 0,Open,Volume,RF,delta_Close,delta_RMW,delta_SMB,delta_MktRF,delta_HML,delta_CMA,delta_Average,label
0,90.940002,9820200,0.010,-0.639999,-0.12,-0.21,-2.68,0.08,0.65,-0.328125,-0.089996
1,90.839996,10565700,0.010,1.849998,0.16,-0.13,6.00,-1.97,-1.49,-1.187500,-0.100006
2,91.910004,9012500,0.010,-0.369995,-0.61,0.43,-2.61,0.07,0.17,1.640625,1.070007
3,92.699997,9551300,0.010,0.339996,1.05,-0.38,0.07,0.14,0.32,3.523438,0.789993
4,92.690002,15015600,0.010,-1.510002,-0.26,0.07,-0.45,0.49,-0.09,0.789062,-0.009995
...,...,...,...,...,...,...,...,...,...,...,...
205,124.820000,6093300,0.007,-1.540001,0.48,0.93,0.25,1.04,0.61,1.039062,1.059998
206,123.480003,5459100,0.007,0.230003,-0.40,-0.15,0.12,-0.34,0.01,0.203125,-1.339996
207,123.199997,5659800,0.007,0.119995,0.06,0.13,-0.75,0.64,0.11,0.328125,-0.280006
208,123.989998,4734400,0.007,1.340004,-0.50,-1.06,0.40,-1.62,-0.08,0.257812,0.790001


In [96]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [108]:
X_train= df_train.drop(["label"],axis=1)
X_test= df_test.drop(["label"], axis=1)
Y_train = df_train[["label"]]
y_test=df_test[["label"]]

In [124]:
max_samples=0.3571428571428571* X_train.shape[0]

In [127]:
regressor = RandomForestRegressor(n_estimators=2,max_depth=3, max_features=3,bootstrap=True, max_samples, random_state=0)
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)

SyntaxError: positional argument follows keyword argument (<ipython-input-127-4694c9a71ed4>, line 1)

In [123]:
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R_2 Error:', metrics.r2_score(y_test, y_pred))


Mean Absolute Error: 0.8292044784035432
Mean Squared Error: 1.420776271681064
Root Mean Squared Error: 1.1919632006404661
R_2 Error: -0.0994280633664908
