# Stock Price Prediction

In [None]:
# A share of a corporation's stock represents partial ownership of that corporation.

# For example, an investor who owns 50 shares of a company that has 1,000 outstanding shares
# in total owns, and has a claim on, 5% of the company's assets and earnings.

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw daily price table, indexed by date.
# parse_dates=True turns the "Date" index into real timestamps, and sort_index()
# puts the rows in chronological order. Later cells slice by date range and
# compute shifted/rolling features, both of which rely on a time-ordered index
# rather than on lexicographic ordering of date strings.
data_raw = pd.read_csv("stocks.csv", index_col="Date", parse_dates=True).sort_index()
data_raw.head()

In [None]:
# Open: The starting price for a given trading day
# Close: The final price on that day
# High: The highest price at which the stock traded on that day
# Low: The lowest price at which the stock traded on that day
# Volume: The total number of shares traded on that day before the market closed

In [None]:
def generate_features(df):
    """Build the model's feature table from a raw price DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Open', 'High', 'Low', 'Close' and 'Volume'.
        Rows are assumed to be in chronological order (the function itself
        does not sort) -- TODO confirm with the caller.

    Returns
    -------
    pandas.DataFrame
        One row per input row that has no missing feature, with columns:
        the same-row 'open'; the previous row's open/close/high/low/volume;
        rolling mean and standard deviation of 'Close' and 'Volume' over
        5, 21 and 252 rows (each shifted by one row so the current row is
        excluded); pairwise ratios of those rolling statistics; and the
        target column 'close'.
    """
    # Feature-name label -> rolling window length in rows. The labels 30/365
    # are kept from the original column names; 21 and 252 are presumably the
    # number of trading days in a month and a year.
    horizons = (('5', 5), ('30', 21), ('365', 252))
    # (short window label, long window label) for each ratio feature
    ratio_pairs = (('5', '30'), ('5', '365'), ('30', '365'))
    # (feature-name stem, source column, rolling statistic)
    families = (
        ('avg_price', 'Close', 'mean'),
        ('avg_volume', 'Volume', 'mean'),
        ('std_price', 'Close', 'std'),
        ('std_volume', 'Volume', 'std'),
    )
    # One existing column name lacks an underscore; it is preserved as-is so
    # the output schema stays exactly the same.
    irregular_names = {'ratio_avg_volume_30_365': 'ratio_avg_volume30_365'}

    features = pd.DataFrame()
    features['open'] = df['Open']

    # Previous-row values of every raw column
    for source in ('Open', 'Close', 'High', 'Low', 'Volume'):
        features[source.lower() + '_1'] = df[source].shift(1)

    for stem, source, statistic in families:
        # Rolling statistic per window, lagged by one row
        for label, span in horizons:
            window = df[source].rolling(span)
            features[stem + '_' + label] = getattr(window, statistic)().shift(1)
        # Short-window / long-window ratios of the statistic just computed
        for short, long_ in ratio_pairs:
            regular = 'ratio_{}_{}_{}'.format(stem, short, long_)
            features[irregular_names.get(regular, regular)] = (
                features[stem + '_' + short] / features[stem + '_' + long_]
            )

    # the target
    features['close'] = df['Close']
    # Drop the leading rows whose rolling windows are not yet full
    return features.dropna(axis=0)

In [None]:
# Turn the raw price table into the lagged/rolling feature table and preview it
data = generate_features(df=data_raw)
data.head(n=5)

In [None]:
# Inclusive date bounds (as strings) for the training and test periods
start_train, end_train = "1988-01-01", "2015-12-31"
start_test, end_test = "2016-01-01", "2016-12-31"

In [None]:
# Rows whose index label falls inside the training period
data_train = data.loc[start_train:end_train]

# Target vector is the 'close' column; every other column is a feature
y_train = data_train['close'].values
X_train = data_train.drop(columns='close').values

In [None]:
# Sanity check: feature-matrix shape, then target-vector shape
print(X_train.shape, y_train.shape, sep="\n")

In [None]:
# Rows whose index label falls inside the test period
data_test = data.loc[start_test:end_test]

# Target vector is the 'close' column; every other column is a feature
y_test = data_test['close'].values
X_test = data_test.drop(columns='close').values

In [None]:
# First experiment: linear regression fitted with stochastic gradient descent

# Standardize the features. The scaler is fitted on the training set only and
# the same transform is then applied to the test set.
scaler=StandardScaler()

X_scaled_train= scaler.fit_transform(X_train)
X_scaled_test=scaler.transform(X_test)

# Candidate regularization strengths (alpha) and initial learning rates (eta0)
param_grid={
    "alpha":[1e-5,3e-5,1e-4],
    "eta0":[0.01,0.03,0.1]
}
# The penalty could also be "l1"; "l2" is used here.

# random_state pins the SGD data shuffling so a rerun reproduces the same model.
lr=SGDRegressor(penalty="l2", max_iter=100, random_state=42)

# Time-ordered cross-validation: every fold validates on rows that come after
# its training rows, so hyper-parameters are never chosen using future data.
# (Assumes the training rows are in chronological order.)
grid_search= GridSearchCV(lr, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring="r2")
grid_search.fit(X_scaled_train, y_train)

print(grid_search.best_params_)

In [None]:
# Take the best estimator found by the grid search and predict the test period
lr_best = grid_search.best_estimator_
predictions_lr = lr_best.predict(X_scaled_test)

In [None]:
# Report test-set error metrics for the linear model, one per line
for template, metric in (
    ('MSE:{0:.3f}', mean_squared_error),
    ('MAE:{0:.3f}', mean_absolute_error),
    ('R^2:{0:.3f}', r2_score),
):
    print(template.format(metric(y_test, predictions_lr)))

In [None]:
# Experiment with Random Forest

# Hyper-parameter candidates for the random forest grid search.
# max_features: 1.0 means "consider all features at each split". It replaces
# the former 'auto' option, which meant the same thing for the regressor but
# was deprecated in scikit-learn 1.1 and removed in 1.3.
param_grid={
    'max_depth':[5,10,15],
    'min_samples_split':[5,10],
    'max_features':[1.0,'sqrt'],
    'min_samples_leaf':[3,4]
}


In [None]:
# random_state pins the forest's random sampling so a rerun reproduces the
# same model and the same selected hyper-parameters.
rf=RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

# Time-ordered cross-validation: every fold validates on rows that come after
# its training rows, so hyper-parameters are never chosen using future data.
# (Assumes the training rows are in chronological order.)
grid_search= GridSearchCV(rf, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring="r2", n_jobs=-1)
grid_search.fit(X_scaled_train, y_train)

print(grid_search.best_params_)

In [None]:
# Take the best estimator found by the grid search and predict the test period
rf_best = grid_search.best_estimator_
predictions_rf = rf_best.predict(X_scaled_test)

In [None]:
# Report test-set error metrics for the random forest, one per line
for template, metric in (
    ('MSE:{0:.3f}', mean_squared_error),
    ('MAE:{0:.3f}', mean_absolute_error),
    ('R^2:{0:.3f}', r2_score),
):
    print(template.format(metric(y_test, predictions_rf)))