In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor



In [2]:
data=pd.read_csv("StockData.csv")
df=pd.DataFrame(data)
df


Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.0
1,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.0
2,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.0
3,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.0
4,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.0
...,...,...,...,...,...,...,...,...
112452,N100,2021-05-27,1241.119995,1251.910034,1241.119995,1247.069946,1247.069946,379696400.0
112453,N100,2021-05-28,1249.469971,1259.209961,1249.030029,1256.599976,1256.599976,160773400.0
112454,N100,2021-05-31,1256.079956,1258.880005,1248.140015,1248.930054,1248.930054,91173700.0
112455,N100,2021-06-01,1254.609985,1265.660034,1254.609985,1258.579956,1258.579956,155179900.0


In [3]:
data_NYA=df[(df["Index"]=="NYA") & (df['Date'] >= '2010-01-04')] #Selecting only the data of NYA, starting from the given date.
data_NYA

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
11077,NYA,2010-01-04,7184.979980,7331.120117,7184.979980,7326.740234,7326.740234,3.991400e+09
11078,NYA,2010-01-05,7326.740234,7359.459961,7313.600098,7354.870117,7354.870117,2.491020e+09
11079,NYA,2010-01-06,7354.850098,7389.279785,7342.490234,7377.700195,7377.700195,4.972660e+09
11080,NYA,2010-01-07,7377.700195,7398.209961,7325.509766,7393.930176,7393.930176,5.270680e+09
11081,NYA,2010-01-08,7393.930176,7426.410156,7367.810059,7425.350098,7425.350098,4.389590e+09
...,...,...,...,...,...,...,...,...
13943,NYA,2021-05-24,16375.000000,16508.519530,16375.000000,16464.689450,16464.689450,2.947400e+09
13944,NYA,2021-05-25,16464.689450,16525.810550,16375.150390,16390.189450,16390.189450,3.420870e+09
13945,NYA,2021-05-26,16390.189450,16466.339840,16388.320310,16451.960940,16451.960940,3.674490e+09
13946,NYA,2021-05-27,16451.960940,16546.359380,16451.960940,16531.949220,16531.949220,5.201110e+09


In [7]:
X = data_NYA[['Open', 'High', 'Low', 'Volume']] #Asssigning selected and relevent columns as independent variables
X.head()

Unnamed: 0,Open,High,Low,Volume
11077,7184.97998,7331.120117,7184.97998,3991400000.0
11078,7326.740234,7359.459961,7313.600098,2491020000.0
11079,7354.850098,7389.279785,7342.490234,4972660000.0
11080,7377.700195,7398.209961,7325.509766,5270680000.0
11081,7393.930176,7426.410156,7367.810059,4389590000.0


In [8]:
X.isna().sum()

Open      0
High      0
Low       0
Volume    0
dtype: int64

In [9]:
Y=data_NYA['Close'].shift(-1) #Shift the 'Close' values up by one day to represent the next day's closing price
Y

11077     7354.870117
11078     7377.700195
11079     7393.930176
11080     7425.350098
11081     7449.049805
             ...     
13943    16390.189450
13944    16451.960940
13945    16531.949220
13946    16555.660160
13947             NaN
Name: Close, Length: 2871, dtype: float64

In [10]:
X = X[:-1] #Dropping a row of X to match with Y
Y = Y.dropna()
X

Unnamed: 0,Open,High,Low,Volume
11077,7184.979980,7331.120117,7184.979980,3.991400e+09
11078,7326.740234,7359.459961,7313.600098,2.491020e+09
11079,7354.850098,7389.279785,7342.490234,4.972660e+09
11080,7377.700195,7398.209961,7325.509766,5.270680e+09
11081,7393.930176,7426.410156,7367.810059,4.389590e+09
...,...,...,...,...
13942,16350.450200,16475.000000,16344.759770,3.344620e+09
13943,16375.000000,16508.519530,16375.000000,2.947400e+09
13944,16464.689450,16525.810550,16375.150390,3.420870e+09
13945,16390.189450,16466.339840,16388.320310,3.674490e+09


In [11]:
Y.head()

11077    7354.870117
11078    7377.700195
11079    7393.930176
11080    7425.350098
11081    7449.049805
Name: Close, dtype: float64

In [12]:
Y.isna().sum()

0

In [13]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=42) #Test split not mentioned because value is 0.25

In [14]:
X_train.head()

Unnamed: 0,Open,High,Low,Volume
13723,12089.24023,12103.16016,11831.04004,4829020000.0
12990,11906.62988,11930.12988,11896.36035,3308060000.0
11515,7043.120117,7094.359863,6873.709961,4787920000.0
12733,10729.11035,10730.87988,10619.38965,3848750000.0
12488,10725.87988,10789.66016,10628.94043,4269130000.0


## Using linear regression

In [12]:
model1=LinearRegression() #Declaring model

In [13]:
model1.fit(X_train,Y_train) # Fitting the train datasets into the model

In [13]:
prediction=model1.predict(X_test)
prediction

array([ 6801.41696867, 13210.15147848,  8701.46008929,  7764.8386823 ,
       12909.59520579, 10854.55890682, 14417.31787848, 10721.63849647,
        8891.47770894, 12464.98114655, 10998.02165412, 10966.46328965,
       10474.13471875,  7869.97332489, 12364.0554345 ,  8383.90633657,
       10889.7298246 ,  7879.88275963, 10559.80391321, 10959.56768502,
        7560.90595881, 13028.04983987, 10303.80152753, 12951.32914081,
       11061.37192682,  7878.49924677, 13598.52552867,  9209.76689719,
       13816.48226546,  7577.86452303, 10914.02182554, 11519.15849814,
       12390.54973221,  8020.32635958, 16453.64519432, 10283.98808131,
        6653.53063618, 11324.87110256, 12677.47711222, 11221.33488665,
        9743.06319319, 12255.34728516, 12926.93321073, 12149.62278895,
       11426.22331973, 13102.71911449, 12948.78561947, 11123.94552596,
       10612.26519155, 12753.0861271 ,  8985.34204906, 13443.17119049,
        9903.56320175, 10597.74830884,  8434.75244014, 13099.99976277,
      

In [14]:
mae=mean_absolute_error(Y_test, prediction)
mse = mean_squared_error(Y_test, prediction)
r2=r2_score(Y_test,prediction)

In [38]:
print(f"The mean absolute error of the model is {mae}")
print(f"The mean squared error of the model is {mse}")
print(f"The R squared of the model is {r2}")


The mean absolute error of the model is 78.84083244011913
The mean squared error of the model is 15765.034028898419
The R squared of the model is 0.996770470971794


## Using Support Vector Machine


In [25]:
model2=SVR(kernel="rbf")

In [26]:
model2.fit(X_train,Y_train)

In [27]:
prediction=model2.predict(X_test)

In [28]:
mae=mean_absolute_error(Y_test, prediction)
mse = mean_squared_error(Y_test, prediction)
r2=r2_score(Y_test,prediction)

In [29]:
print(f"The mean absolute error of the model is {mae}")
print(f"The mean squared error of the model is {mse}")
print(f"The R squared of the model is {r2}")

The mean absolute error of the model is 1797.728694303074
The mean squared error of the model is 4914870.356943365
The R squared of the model is -0.006830461546831534


In [24]:
X_test


Unnamed: 0,Open,High,Low,Volume
11520,6722.979980,6856.879883,6674.290039,2.510620e+09
13471,13137.959960,13221.059570,13137.959960,3.154240e+09
11838,8714.700195,8716.120117,8684.769531,3.340650e+09
11728,7791.490234,7823.919922,7710.830078,4.193740e+09
13751,12967.940430,12975.540040,12877.190430,3.881310e+09
...,...,...,...,...
12458,10859.110350,10873.660160,10768.620120,4.078540e+09
13736,12515.839840,12563.450200,12482.269530,4.027890e+09
11559,7149.709961,7485.549805,7149.709961,5.801910e+09
13749,12864.740230,12932.889650,12856.690430,3.193400e+09


In [16]:
new=pd.DataFrame({"Open":[10141.769530],"High":[10873.660160],"Low":[10132.500000],"Volume":[407854000]})
new

Unnamed: 0,Open,High,Low,Volume
0,10141.76953,10873.66016,10132.5,407854000


## Using random forest


In [30]:
model3=RandomForestRegressor(n_estimators=40,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=6,
    max_features= 0.7)

In [31]:
model3.fit(X_train,Y_train)

In [41]:
prediction=model3.predict(X_test)


In [42]:
mae=mean_absolute_error(Y_test, prediction)
mse = mean_squared_error(Y_test, prediction)
r2=r2_score(Y_test,prediction)

In [43]:
print(f"The mean absolute error of the model is {mae}")
print(f"The mean squared error of the model is {mse}")
print(f"The R squared of the model is {r2}")

The mean absolute error of the model is 83.65029165985432
The mean squared error of the model is 16558.81134141496
The R squared of the model is 0.9966078625772924


In [None]:
527.210022 527.210022 527.210022 527.210022 527.210022

In [None]:
 527.840027 527.840027 527.840027 527.840027 527.840027

In [35]:
new

Unnamed: 0,Open,High,Low,Volume
1,1254.609985,1265.660034,1254.609985,155179900.0


In [25]:
new.at[1,"Volume"]= 155179900.0


In [34]:
new.drop(new.index[0],inplace=True)

In [47]:
new2=pd.DataFrame({"Open":[1258.489990],"High":[1263.709961],"Low":[1258.239990],"Volume":[407854000]})
1258.489990 1263.709961 1258.239990 1263.619995 1263.619995 148465000.0
1254.609985 1265.660034 1254.609985 1258.579956 1258.579956 155179900.0

In [29]:
predict=model3.predict(new)

NameError: name 'model3' is not defined

In [38]:
new_test=pd.DataFrame({"Close":[1258.579956]})

In [39]:
new_test

Unnamed: 0,Close
0,1258.579956


In [44]:
pred=model3.predict(new)

In [47]:
mae=mean_absolute_error(new_test, pred)
mse = mean_squared_error(new_test, pred)
r2=r2_score(new_test,pred)



In [48]:
mae,mse,r2

(5403.263878323676, 29195260.538797416, nan)