In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import chart_studio.plotly as py
import plotly.graph_objs as go 
from plotly.offline import plot

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Switch plotly into offline notebook mode so the iplot() calls in later cells
# render inside the notebook. Per the plotly.offline documentation,
# connected=True loads plotly.js from the CDN rather than embedding it.
init_notebook_mode(connected=True)

In [2]:
# Load the raw IBM price history; the path is relative to the notebook's
# working directory.
ibm = pd.read_csv(filepath_or_buffer='ibm.csv')

In [24]:
# Parse the 'Date' column (month/day/year) into datetime values.
# Assign through the single-column setter: the list form `ibm[['Date']] = ...`
# is the multi-column setter, and newer pandas releases reject a plain Series
# there with "Columns must be same length as key".
ibm['Date'] = pd.to_datetime(ibm['Date'], format='%m/%d/%Y')

In [25]:
# Show column dtypes and non-null counts, to confirm that 'Date' now has a
# datetime dtype and that no column has missing values.
ibm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14059 entries, 0 to 14058
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Date     14059 non-null  datetime64[ns]
 1   Open     14059 non-null  float64       
 2   High     14059 non-null  float64       
 3   Low      14059 non-null  float64       
 4   Close    14059 non-null  float64       
 5   Volume   14059 non-null  int64         
 6   OpenInt  14059 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 769.0 KB


In [26]:
# Same conversion as the earlier date-parsing cell; kept so this cell can be
# run on its own. NOTE(review): 'Date' is already datetime64[ns] at this point
# (see the info() output), so this should leave the column unchanged.
ibm['Date'] = pd.to_datetime(ibm.Date, format='%m/%d/%Y')

In [27]:
# Report the date span covered by the data set.
# min/max are computed once and reused for both lines of the report.
first_day = ibm.Date.min()
last_day = ibm.Date.max()
# The original message printed the two dates back to back with no separator.
print(f'Dataframe contains stock prices between {first_day} and {last_day}')
print(f'Total days = {(last_day - first_day).days} days')

Dataframe contains stock prices between 1962-01-02 00:00:00 2017-11-10 00:00:00
Total days = 20401 days


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; displayed as the cell's output.
ibm.describe()

Unnamed: 0,Open,High,Low,Close,Volume,OpenInt
count,14059.0,14059.0,14059.0,14059.0,14059.0,14059.0
mean,48.535548,48.987464,48.111897,48.554195,5782966.0,0.0
std,49.270969,49.663765,48.912626,49.297698,5429533.0,0.0
min,3.3901,3.5655,3.3235,3.3901,0.0,0.0
25%,13.2385,13.358,13.145,13.234,1471590.0,0.0
50%,23.029,23.213,22.819,23.029,4801835.0,0.0
75%,77.3835,78.154,76.7435,77.4445,8224838.0,0.0
max,186.01,186.46,185.06,186.36,83165900.0,0.0


In [29]:
# Font used for both axis titles of the price chart.
axis_title_font = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

# Layout for the closing-price chart. Axis titles use the nested
# `title=dict(text=..., font=...)` form: the flat `titlefont` attribute is
# deprecated in plotly, and a later cell already addresses the title through
# `layout.xaxis.title.text`.
layout = go.Layout(
    title='Stock Prices of IBM',
    xaxis=dict(title=dict(text='Date', font=axis_title_font)),
    yaxis=dict(title=dict(text='Price', font=axis_title_font))
)

# Single trace: closing price against date.
ibm_data = [go.Scatter(x=ibm['Date'], y=ibm['Close'])]

# NOTE: the name `plot` rebinds the `plot` function imported from
# plotly.offline in the first cell. It is kept because the next cell renders
# the figure with iplot(plot).
plot = go.Figure(data=ibm_data, layout=layout)

In [30]:
# Render the closing-price figure built in the previous cell inline.
iplot(plot)

In [31]:
# Building the regression model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#For model evaluation
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

In [32]:
# Feature matrix: the DataFrame's row index as a single column of shape (n, 1).
X = np.array(ibm.index)[:, np.newaxis]
# Target: the closing price.
Y = ibm['Close']

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=101,
)

In [33]:
# Fit a standardising scaler on the training features only.
# NOTE(review): `scaler` is not applied to X_train/X_test in any cell visible
# here; the model below is trained on the unscaled index values.
scaler = StandardScaler()
scaler.fit(X_train)

In [34]:
from sklearn.linear_model import LinearRegression

In [35]:
# Ordinary least-squares fit of closing price against the row index.
lm = LinearRegression()
# fit() is left as the cell's last expression so the fitted estimator's repr
# is still shown as the cell output.
lm.fit(X=X_train, y=Y_train)

LinearRegression()

In [54]:
# Plot actual vs. predicted closing prices for the training split.
# X_train has a single feature column (built with reshape(-1, 1)), so column 0
# is the x axis for both traces.
trace0 = go.Scatter(
    x=X_train[:, 0],
    y=Y_train,
    mode='markers',
    name='Actual'
)
trace1 = go.Scatter(
    x=X_train[:, 0],
    # predict() returns a 1-D array for the 1-D target, so no transpose is needed.
    y=lm.predict(X_train),
    mode='lines',
    name='Predicted'
)
ibm_data = [trace0, trace1]

# Relabel the x axis on a copy of the shared layout rather than mutating
# `layout` in place, so the layout of the price chart defined earlier keeps
# its 'Date' title if that cell's figure is rebuilt.
layout_train = go.Layout(layout)
layout_train.xaxis.title.text = 'Day'
plot2 = go.Figure(data=ibm_data, layout=layout_train)

In [55]:
# Render the actual-vs-predicted training figure inline.
iplot(plot2)

In [38]:
#Calculate scores for model evaluation
# Predict once per split and reuse the result for both metrics.
train_pred = lm.predict(X_train)
test_pred = lm.predict(X_test)

# Small text table: one row per metric, train and test values separated by a
# tab. The MSE row previously had a literal 't' where the tab escape was
# intended, which glued the two numbers together in the output.
scores = f'''
{'Metric'.ljust(10)}{'Train'.center(20)}{'Test'.center(20)}
{'r2_score'.ljust(10)}{r2_score(Y_train, train_pred)}\t{r2_score(Y_test, test_pred)}
{'MSE'.ljust(10)}{mse(Y_train, train_pred)}\t{mse(Y_test, test_pred)}
'''

print(scores)


Metric           Train                Test        
r2_score  0.7288791430441117	0.7320575337999633
MSE       660.2304928354728]t647.9330502913741

