In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [2]:
#Load the data
train_data = pd.read_csv('stock_data.csv',parse_dates=['date'])
test_data = pd.read_csv('test_stock_data.csv',parse_dates=['date'])
train_data

Unnamed: 0,code,date,open,close,high,low,volume
0,hs300,2015-01-05,3566.090,3641.540,3669.040,3551.510,451198098.0
1,hs300,2015-01-06,3608.430,3641.060,3683.230,3587.230,420962185.0
2,hs300,2015-01-07,3620.920,3643.790,3671.190,3601.700,320191232.0
3,hs300,2015-01-08,3650.070,3559.260,3659.950,3552.100,295003045.0
4,hs300,2015-01-09,3547.570,3546.720,3689.750,3536.400,349982672.0
...,...,...,...,...,...,...,...
64118,300001,2015-01-26,9.311,9.402,9.438,9.293,52650.0
64119,300001,2015-01-27,9.493,9.402,9.547,9.289,46393.0
64120,300001,2015-01-28,9.379,9.339,9.479,9.289,40671.0
64121,300001,2015-01-29,9.357,9.080,9.357,9.058,45559.0


In [3]:
#Compute statistics of the data
train_data.describe()

Unnamed: 0,open,close,high,low,volume
count,64123.0,64123.0,64123.0,64123.0,64123.0
mean,18.610258,18.707356,19.010598,18.329883,792989.9
std,151.012527,151.412967,152.887387,149.678863,11929820.0
min,1.397,1.391,1.415,1.379,245.0
25%,6.812,6.8505,6.972,6.702,45903.5
50%,9.884,9.943,10.124,9.717,101679.0
75%,14.9485,15.046,15.3205,14.679,246368.0
max,4138.88,4088.18,4166.02,4037.77,468581300.0


In [4]:
#increase or remain = 1, decrease = 0
train_data.sort_values(by=['code','date'])
train_data['next_close'] = train_data.groupby('code')['close'].shift(-1)
train_data['return'] = train_data['next_close'] - train_data['close']
train_data['ret_sign'] = (train_data['return']>=0) 
train_data['ret_sign'] = train_data['ret_sign'].astype(int)
train_data.dropna(inplace=True)
train_data

Unnamed: 0,code,date,open,close,high,low,volume,next_close,return,ret_sign
0,hs300,2015-01-05,3566.090,3641.540,3669.040,3551.510,451198098.0,3641.060,-0.480,0
1,hs300,2015-01-06,3608.430,3641.060,3683.230,3587.230,420962185.0,3643.790,2.730,1
2,hs300,2015-01-07,3620.920,3643.790,3671.190,3601.700,320191232.0,3559.260,-84.530,0
3,hs300,2015-01-08,3650.070,3559.260,3659.950,3552.100,295003045.0,3546.720,-12.540,0
4,hs300,2015-01-09,3547.570,3546.720,3689.750,3536.400,349982672.0,3513.580,-33.140,0
...,...,...,...,...,...,...,...,...,...,...
64117,300001,2015-01-23,9.538,9.311,9.597,9.307,74090.0,9.402,0.091,1
64118,300001,2015-01-26,9.311,9.402,9.438,9.293,52650.0,9.402,0.000,1
64119,300001,2015-01-27,9.493,9.402,9.547,9.289,46393.0,9.339,-0.063,0
64120,300001,2015-01-28,9.379,9.339,9.479,9.289,40671.0,9.080,-0.259,0


In [5]:
#increase or remain = 1, decrease = 0
train_data.sort_values(by=['code','date'])
train_data['next_close'] = train_data.groupby('code')['close'].shift(-1)
train_data['return'] = train_data['next_close'] - train_data['close']
train_data['ret_sign'] = (train_data['return']>=0) 
train_data['ret_sign'] = train_data['ret_sign'].astype(int)
train_data.dropna(inplace=True)
train_data

Unnamed: 0,code,date,open,close,high,low,volume,next_close,return,ret_sign
0,hs300,2015-01-05,3566.090,3641.540,3669.040,3551.510,451198098.0,3641.060,-0.480,0
1,hs300,2015-01-06,3608.430,3641.060,3683.230,3587.230,420962185.0,3643.790,2.730,1
2,hs300,2015-01-07,3620.920,3643.790,3671.190,3601.700,320191232.0,3559.260,-84.530,0
3,hs300,2015-01-08,3650.070,3559.260,3659.950,3552.100,295003045.0,3546.720,-12.540,0
4,hs300,2015-01-09,3547.570,3546.720,3689.750,3536.400,349982672.0,3513.580,-33.140,0
...,...,...,...,...,...,...,...,...,...,...
64116,300001,2015-01-22,9.475,9.624,9.692,9.402,80687.0,9.311,-0.313,0
64117,300001,2015-01-23,9.538,9.311,9.597,9.307,74090.0,9.402,0.091,1
64118,300001,2015-01-26,9.311,9.402,9.438,9.293,52650.0,9.402,0.000,1
64119,300001,2015-01-27,9.493,9.402,9.547,9.289,46393.0,9.339,-0.063,0


In [6]:
#Use Logistic Regression
LGR = LogisticRegression(fit_intercept=True)
LGR.fit(train_data[['open','close','high','low','volume']],train_data['ret_sign'])
predictions = LGR.predict(train_data[['open','close','high','low','volume']])
train_data['pred']=predictions
accuracy_score(train_data['ret_sign'],train_data['pred'])

0.5929100306216685

In [7]:
train_data[['ret_sign','pred']]

Unnamed: 0,ret_sign,pred
0,0,1
1,1,1
2,0,1
3,0,1
4,0,1
...,...,...
64116,0,1
64117,1,1
64118,1,1
64119,0,1


In [8]:
test_data.sort_values(by=['code','date'])
test_data['next_close'] = test_data.groupby('code')['close'].shift(-1)
test_data['return'] = test_data['next_close'] - test_data['close']
test_data['ret_sign'] = (test_data['return']>=0) 
test_data['ret_sign'] = test_data['ret_sign'].astype(int)
test_data.dropna(inplace=True)
test_data

Unnamed: 0,code,date,open,close,high,low,volume,next_close,return,ret_sign
0,hs300,2015-04-01,4057.500,4123.900,4139.500,4046.940,329512853.0,4124.780,0.880,1
1,hs300,2015-04-02,4149.950,4124.780,4156.840,4068.650,339070487.0,4170.540,45.760,1
2,hs300,2015-04-03,4104.670,4170.540,4170.560,4092.380,321303766.0,4260.040,89.500,1
3,hs300,2015-04-07,4213.890,4260.040,4260.470,4197.020,415362654.0,4295.800,35.760,1
4,hs300,2015-04-08,4277.450,4295.800,4304.780,4204.830,458676715.0,4262.140,-33.660,0
...,...,...,...,...,...,...,...,...,...,...
23106,300002,2015-04-23,21.454,21.928,22.665,21.204,343823.0,23.402,1.474,1
23107,300002,2015-04-24,21.520,23.402,23.461,21.257,495119.0,23.389,-0.013,0
23108,300002,2015-04-27,24.093,23.389,24.337,22.738,408427.0,22.237,-1.152,0
23109,300002,2015-04-28,23.152,22.237,23.382,21.724,396219.0,22.297,0.060,1


In [9]:
#Use Logistic Regression
predictions = LGR.predict(test_data[['open','close','high','low','volume']])
test_data['pred']=predictions
accuracy_score(test_data['ret_sign'],test_data['pred'])

0.5891490524781341

In [15]:
#Use Logistic Regression
KNNC = KNeighborsClassifier(n_neighbors=50)
KNNC.fit(train_data[['open','close','high','low','volume']],train_data['ret_sign'])
predictions = KNNC.predict(train_data[['open','close','high','low','volume']])
train_data['pred']=predictions
accuracy_score(train_data['ret_sign'],train_data['pred'])

0.600249509891285

In [16]:
#Use Logistic Regression
predictions = KNNC.predict(test_data[['open','close','high','low','volume']])
test_data['pred']=predictions
accuracy_score(test_data['ret_sign'],test_data['pred'])

0.560905612244898