# Stock Price Prediction

In [1]:
# The stock of a corporation signifies ownership in the corporation.

# For example, if an investor owns 50 shares of stock in a company that has a total of 1000 outstanding shares,
# that investor (or shareholder) owns, and has a claim on, 5% of the company's assets and earnings.

# Stocks of a company can be traded between shareholders and other parties via stock
# exchanges and organizations.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the daily price history, indexed by trading date.
# parse_dates=True converts the "Date" index from plain strings into a
# DatetimeIndex, so the date-range slices used further down are real
# chronological selections instead of lexicographic string comparisons.
data_raw = pd.read_csv("stocks.csv", index_col="Date", parse_dates=True)
data_raw.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1988-01-04,1952.589966,2030.01001,1950.76001,2015.25,2015.25,20880000
1988-01-05,2056.370117,2075.27002,2021.390015,2031.5,2031.5,27200000
1988-01-06,2036.469971,2058.189941,2012.77002,2037.800049,2037.800049,18800000
1988-01-07,2019.890015,2061.51001,2004.640015,2051.889893,2051.889893,21370000
1988-01-08,2046.579956,2058.689941,1898.040039,1911.310059,1911.310059,27440000


In [4]:
# Open: The starting price for a given trading day
# Close: The final price on that day
# High: The highest price at which the stock traded on that day
# Low: The lowest price at which the stock traded on that day
# Volume: The total number of shares traded on that day
#         before the market closed.

In [5]:
# Step 1: Create a function generate_features

In [6]:
def generate_features(df):
    """Derive lagged and rolling-window features plus the prediction target.

    Apart from ``open`` (the current row's opening price), every feature is
    shifted by one row, so it is built only from rows that precede the row
    whose close is being predicted.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history providing the columns ``Open``, ``High``, ``Low``,
        ``Close`` and ``Volume``.

    Returns
    -------
    pandas.DataFrame
        30 feature columns followed by the target column ``close``. Rows
        containing any missing value (e.g. the warm-up rows of the longest
        rolling window) are dropped.
    """
    features = pd.DataFrame()

    # Current row's opening price, then the previous row's raw values.
    features['open'] = df['Open']
    for lagged_name, source_col in (
        ('open_1', 'Open'),
        ('close_1', 'Close'),
        ('high_1', 'High'),
        ('low_1', 'Low'),
        ('volume_1', 'Volume'),
    ):
        features[lagged_name] = df[source_col].shift(1)

    # Column-name label -> rolling window length in rows. The labels read as
    # calendar spans (5 / 30 / 365 days) while the windows are 5 / 21 / 252
    # rows, because the market is closed on weekends.
    windows = (('5', 5), ('30', 21), ('365', 252))
    # (numerator label, denominator label) of each short/long ratio.
    ratio_pairs = (('5', '30'), ('5', '365'), ('30', '365'))

    # Order matters: it fixes the column order of the returned frame.
    for stat, kind, source_col in (
        ('avg', 'price', 'Close'),
        ('avg', 'volume', 'Volume'),
        ('std', 'price', 'Close'),
        ('std', 'volume', 'Volume'),
    ):
        prefix = '{}_{}'.format(stat, kind)
        aggregate = 'mean' if stat == 'avg' else 'std'

        for label, n_rows in windows:
            rolled = getattr(df[source_col].rolling(n_rows), aggregate)()
            features['{}_{}'.format(prefix, label)] = rolled.shift(1)

        for numer, denom in ratio_pairs:
            ratio_name = 'ratio_{}_{}_{}'.format(prefix, numer, denom)
            if ratio_name == 'ratio_avg_volume_30_365':
                # This one column is spelled without the underscore after
                # "volume"; keep that exact name in the output.
                ratio_name = 'ratio_avg_volume30_365'
            features[ratio_name] = (
                features['{}_{}'.format(prefix, numer)]
                / features['{}_{}'.format(prefix, denom)]
            )

    # Target: the closing price of the current row.
    features['close'] = df['Close']

    # axis=0 => drop every row that still has a missing value.
    return features.dropna(axis=0)

In [7]:
# Run the feature builder on the raw price frame and preview the first rows.
data = data_raw.pipe(generate_features)
data.head()

Unnamed: 0_level_0,open,open_1,close_1,high_1,low_1,volume_1,avg_price_5,avg_price_30,avg_price_365,ratio_avg_price_5_30,...,ratio_std_price_5_30,ratio_std_price_5_365,ratio_std_price_30_365,std_volume_5,std_volume_30,std_volume_365,ratio_std_volume_5_30,ratio_std_volume_5_365,ratio_std_volume_30_365,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1988-12-30,2183.389893,2169.110107,2182.679932,2193.040039,2165.179932,12220000.0,2168.215967,2146.005697,2061.050753,1.01035,...,0.372085,0.123335,0.331471,4260343.0,4572098.0,6461774.0,0.931814,0.659315,0.707561,2168.570068
1989-01-03,2163.209961,2183.389893,2168.570068,2193.75,2162.5,11140000.0,2169.857959,2148.579985,2061.659166,1.009903,...,0.332631,0.106706,0.320794,2605936.0,4627472.0,6476618.0,0.563145,0.402361,0.714489,2144.639893
1989-01-04,2153.75,2163.209961,2144.639893,2168.389893,2127.139893,17310000.0,2164.999951,2150.616176,2062.108134,1.006688,...,0.677084,0.191809,0.283287,3633685.0,4665697.0,6453401.0,0.778809,0.563065,0.722983,2177.679932
1989-01-05,2184.290039,2153.75,2177.679932,2183.389893,2146.610107,15710000.0,2167.999951,2154.682838,2062.663213,1.006181,...,0.914097,0.204474,0.22369,3057422.0,4671999.0,6455623.0,0.654414,0.473606,0.72371,2190.540039
1989-01-06,2195.889893,2184.290039,2190.540039,2205.179932,2173.040039,20310000.0,2172.821973,2157.86284,2063.213412,1.006932,...,1.089841,0.24502,0.224822,3744766.0,4825499.0,6454034.0,0.776037,0.580221,0.747672,2194.290039


In [8]:
# In total 31 columns have been created: 30 features plus the target column `close`

In [9]:
# Now specify the date ranges of the training set and the test set

In [10]:
# Inclusive date boundaries (YYYY-MM-DD) of the two chronological splits.
start_train, end_train = "1988-01-01", "2015-12-31"  # training period
start_test, end_test = "2016-01-01", "2016-12-31"    # held-out test period

In [11]:
# Build the training set: all rows dated from start_train to end_train.
data_train = data.loc[start_train:end_train]

# Inputs are every column except the target; the target is `close`.
X_train = data_train.drop(columns='close').to_numpy()
y_train = data_train['close'].to_numpy()

In [12]:
# from sklearn.model_selection import train_test_split
# (Not used: the data is split chronologically by date rather than at random.)

In [13]:
# Check the shapes of the training inputs and of the training target.
print(X_train.shape, y_train.shape, sep="\n")

(6806, 30)
(6806,)


In [14]:
# Similarly, we need to build the test set

In [15]:
# Build the test set: all rows dated from start_test to end_test.
data_test = data.loc[start_test:end_test]

# Inputs are every column except the target; the target is `close`.
X_test = data_test.drop(columns='close').to_numpy()
y_test = data_test['close'].to_numpy()

In [16]:
# First experiment: linear regression trained with stochastic gradient descent

# Step 1: Scale the data.
# The scaler is fitted on the training features only and then applied
# unchanged to the test features, so no test-set statistics reach training.
scaler = StandardScaler()

X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

# Step 2: Define the hyper-parameter grid to search.
#   alpha - strength of the regularisation penalty
#   eta0  - initial learning rate of the SGD updates
param_grid = {
    "alpha": [1e-5, 3e-5, 1e-4],
    "eta0": [0.01, 0.03, 0.1],
}

# Step 3: Search the grid with 5-fold cross-validation.
# With cv=5 the training data is split into 5 folds; each candidate is
# trained on 4 folds (80%) and validated on the remaining one (20%), rotating
# until every fold has served once as the validation set. GridSearchCV does
# this for every alpha/eta0 combination and keeps the one with the best mean
# R^2 score.
#
# penalty="l2" is the ridge-style penalty (squared coefficients); the lasso
# penalty would be "l1".
#
# random_state is fixed because SGD shuffles the training samples; without a
# seed the selected parameters and the reported metrics change between runs.
#
# NOTE(review): cv=5 yields contiguous, unshuffled folds, so some candidates
# are validated on rows that are older than their training rows. For
# time-ordered data consider sklearn.model_selection.TimeSeriesSplit.
lr = SGDRegressor(penalty="l2", max_iter=100, random_state=42)
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring="r2")
grid_search.fit(X_scaled_train, y_train)

print(grid_search.best_params_)



{'alpha': 1e-05, 'eta0': 0.03}


In [17]:
# Now take the model found with the best hyper-parameters and use it to predict on the test set

In [18]:
# best_estimator_ is the estimator GridSearchCV selected for the best
# hyper-parameter combination.
lr_best= grid_search.best_estimator_

# Predict the closing prices for the scaled test-set features.
predictions_lr=lr_best.predict(X_scaled_test)

In [19]:
# Once the predictions are available, print the evaluation metrics on the
# test set: mean squared error, mean absolute error and R^2.
for template, metric in (
    ('MSE:{0:.3f}', mean_squared_error),
    ('MAE:{0:.3f}', mean_absolute_error),
    ('R^2:{0:.3f}', r2_score),
):
    print(template.format(metric(y_test, predictions_lr)))

MSE:20991.119
MAE:105.854
R^2:0.976


In [20]:
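# Here we can see an R^2 of 0.976: the model explains about 97.6% of the variance in the
# 2016 closing prices. (R^2 is a goodness-of-fit measure for regression, not a classification accuracy.)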