In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from datetime import datetime
import seaborn as sns
import math

# # Introduction


# Read the price-history CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
csv_path = r'C:\Users\mikel\OneDrive\Desktop\Python\Datasets\sphist.csv'
df = pd.read_csv(csv_path)

# Replace the raw 'Date' column with parsed pandas datetimes so it can be
# sorted and compared against datetime objects in later cells.
df = df.assign(Date=pd.to_datetime(df['Date']))

# Data Cleaning

In [54]:
# Put the rows in chronological order (oldest first). sort_values keeps the
# original index labels, so the checks below select rows by position.
df = df.sort_values(by='Date', ascending=True)

# Sanity check: show the two earliest rows, then the mean Close of the first
# four rows (explicitly positional via .iloc).
print(df.iloc[:2])
print('mean is: ' + str(df['Close'].iloc[:4].mean()))

            Date   Open   High    Low  Close     Volume  Adj Close
16589 1950-01-03  16.66  16.66  16.66  16.66  1260000.0      16.66
16588 1950-01-04  16.85  16.85  16.85  16.85  1890000.0      16.85
mean is: 16.855


In [55]:
# Create the three indicator columns, filled with a constant 0, so they
# exist on the frame before any values are computed for them.
for placeholder in ('day_5', 'day_30', 'day_365'):
    df[placeholder] = 0

# Show the first two rows to confirm the new columns were added.
print(df.iloc[:2])

            Date   Open   High    Low  Close     Volume  Adj Close  day_5  \
16589 1950-01-03  16.66  16.66  16.66  16.66  1260000.0      16.66      0   
16588 1950-01-04  16.85  16.85  16.85  16.85  1890000.0      16.85      0   

       day_30  day_365  
16589       0        0  
16588       0        0  


In [73]:
# Rolling mean of Close for each window, then shifted down one row so the
# value stored on a row is built only from earlier rows and never includes
# that row's own Close.
# The integer window counts rows of the frame, not calendar days.
# The first `window` rows of each column are NaN after the shift.
rolling_windows = {'day_5': 5, 'day_30': 30, 'day_365': 365}
close = df['Close']
for column, window in rolling_windows.items():
    df[column] = close.rolling(window).mean().shift()

print(df.head())

            Date       Open       High        Low      Close     Volume  \
16224 1951-06-19  22.020000  22.020000  22.020000  22.020000  1100000.0   
16223 1951-06-20  21.910000  21.910000  21.910000  21.910000  1120000.0   
16222 1951-06-21  21.780001  21.780001  21.780001  21.780001  1100000.0   
16221 1951-06-22  21.549999  21.549999  21.549999  21.549999  1340000.0   
16220 1951-06-25  21.290001  21.290001  21.290001  21.290001  2440000.0   

       Adj Close  day_5  day_30  day_365  
16224  22.020000    NaN     NaN      NaN  
16223  21.910000    NaN     NaN      NaN  
16222  21.780001    NaN     NaN      NaN  
16221  21.549999    NaN     NaN      NaN  
16220  21.290001    NaN     NaN      NaN  


In [57]:
# Discard every row that has a missing value in any column. This removes the
# leading rows whose rolling-mean indicators could not be computed.
df = df.dropna(axis='index', how='any')
print(df.head(5))

            Date       Open       High        Low      Close     Volume  \
16224 1951-06-19  22.020000  22.020000  22.020000  22.020000  1100000.0   
16223 1951-06-20  21.910000  21.910000  21.910000  21.910000  1120000.0   
16222 1951-06-21  21.780001  21.780001  21.780001  21.780001  1100000.0   
16221 1951-06-22  21.549999  21.549999  21.549999  21.549999  1340000.0   
16220 1951-06-25  21.290001  21.290001  21.290001  21.290001  2440000.0   

       Adj Close   day_5     day_30    day_365  
16224  22.020000  21.800  21.703333  19.447726  
16223  21.910000  21.900  21.683000  19.462411  
16222  21.780001  21.972  21.659667  19.476274  
16221  21.549999  21.960  21.631000  19.489562  
16220  21.290001  21.862  21.599000  19.502082  


# Linear Regression

In [58]:
# Chronological hold-out split: rows dated before the cutoff are used for
# training, rows on or after the cutoff are used for testing.
split_date = datetime(2013, 1, 1)
a = df['Date'] < split_date    # mask selecting the training rows
b = df['Date'] >= split_date   # mask selecting the test rows
train = df.loc[a]
test = df.loc[b]

# Preview the first ten rows of each partition.
print(train.head(10))
print(test.head(10))

            Date       Open       High        Low      Close     Volume  \
16224 1951-06-19  22.020000  22.020000  22.020000  22.020000  1100000.0   
16223 1951-06-20  21.910000  21.910000  21.910000  21.910000  1120000.0   
16222 1951-06-21  21.780001  21.780001  21.780001  21.780001  1100000.0   
16221 1951-06-22  21.549999  21.549999  21.549999  21.549999  1340000.0   
16220 1951-06-25  21.290001  21.290001  21.290001  21.290001  2440000.0   
16219 1951-06-26  21.299999  21.299999  21.299999  21.299999  1260000.0   
16218 1951-06-27  21.370001  21.370001  21.370001  21.370001  1360000.0   
16217 1951-06-28  21.100000  21.100000  21.100000  21.100000  1940000.0   
16216 1951-06-29  20.959999  20.959999  20.959999  20.959999  1730000.0   
16215 1951-07-02  21.100000  21.100000  21.100000  21.100000  1350000.0   

       Adj Close   day_5     day_30    day_365  
16224  22.020000  21.800  21.703333  19.447726  
16223  21.910000  21.900  21.683000  19.462411  
16222  21.780001  21.972  2

In [71]:
# Linear Regression
#
# Fit one single-feature model per rolling-mean indicator on the training
# rows and report its mean squared error against Close on the test rows.
# Each pass starts from a fresh estimator; after the loop, `lr`,
# `prediction` and `mse` hold the results of the last (day_365) model.
single_feature_runs = (
    ('MSE_5:', ['day_5']),
    ('MSE_30:', ['day_30']),
    ('MSE_365:', ['day_365']),
)
for label, features in single_feature_runs:
    lr = LinearRegression()
    lr.fit(train[features], train[['Close']])
    prediction = lr.predict(test[features])
    mse = mean_squared_error(test['Close'], prediction)
    print(label, mse)

MSE_5: 493.2698288743041
MSE_30: 1636.5809684230187
MSE_365: 24609.62854669091


In [74]:
# Additional indicators for Linear Regression
#
# Fit models on combinations of the rolling-mean indicators and report the
# test-set mean squared error for each. The printed label names exactly the
# windows used as features, so the day_5 + day_30 model is reported as
# 'MSE_5_30'.
# Each pass starts from a fresh estimator; after the loop, `lr`,
# `prediction` and `mse` hold the results of the last (three-feature) model.
multi_feature_runs = (
    ('MSE_5_30:', ['day_5', 'day_30']),
    ('MSE_5_30_365:', ['day_5', 'day_30', 'day_365']),
)
for label, features in multi_feature_runs:
    lr = LinearRegression()
    lr.fit(train[features], train[['Close']])
    prediction = lr.predict(test[features])
    mse = mean_squared_error(test['Close'], prediction)
    print(label, mse)

MSE_5_365: 493.8476589907151
MSE_5_30_365: 493.7313030125896
