In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import quandl
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel


### Download stock data from Quandl 

In [40]:
stock_data=quandl.get("TC1/HDFC", authtoken="z3tQnw4f2yVky285bp6T")
stock_data

Unnamed: 0_level_0,Open Price,High Price,Low Price,Last Traded Price,Close Price,Total Traded Quantity,Turnover (in Lakhs)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-03,22.85,22.85,22.85,22.85,22.85,292150.95,66.75
2000-01-04,24.67,24.67,23.13,23.66,23.66,3278746.97,775.00
2000-01-05,22.58,23.65,22.19,22.97,22.80,3456472.99,792.54
2000-01-06,23.44,24.44,22.97,23.05,23.08,3929548.40,918.17
2000-01-07,22.58,23.07,21.88,22.36,22.31,2531002.91,569.05
...,...,...,...,...,...,...,...
2010-12-27,651.35,654.98,645.54,645.63,647.68,1113957.71,7249.07
2010-12-28,647.77,654.06,646.01,653.49,651.72,1079885.79,7023.75
2010-12-29,653.03,669.49,652.23,668.11,667.54,1849823.87,12249.31
2010-12-30,663.03,680.90,663.03,674.62,678.52,3029700.82,20473.61


### Convert data into Dataframe 

In [41]:
ds=pd.DataFrame(stock_data)
ds.to_csv('tc1.csv')
data=pd.read_csv('tc1.csv')

PermissionError: [Errno 13] Permission denied: 'tc1.csv'

### Data analysis 

In [None]:
data.shape

In [42]:
data.describe()

Unnamed: 0,Open Price,High Price,Low Price,Last Traded Price,Close Price,Total Traded Quantity,Turnover (in Lakhs)
count,2747.0,2747.0,2747.0,2747.0,2747.0,2747.0,2747.0
mean,220.615508,224.89142,216.157612,220.556604,220.595104,2865266.0,8306.436036
std,181.564336,184.862839,178.047946,181.362707,181.412885,3230842.0,11834.931533
min,0.0,0.0,0.0,19.26,22.1,35350.4,18.35
25%,54.52,55.33,53.68,54.53,54.625,979869.4,659.825
50%,147.51,149.43,146.11,148.08,148.02,1970621.0,2837.54
75%,356.58,363.615,350.615,357.935,357.08,3578130.0,13236.61
max,710.44,800.98,688.11,703.09,707.51,57842640.0,244662.7


In [43]:
data.isnull().sum()

Date                     0
Open Price               0
High Price               0
Low Price                0
Last Traded Price        0
Close Price              0
Total Traded Quantity    0
Turnover (in Lakhs)      0
dtype: int64

In [44]:
data.corr()

Unnamed: 0,Open Price,High Price,Low Price,Last Traded Price,Close Price,Total Traded Quantity,Turnover (in Lakhs)
Open Price,1.0,0.99943,0.999484,0.999154,0.999184,0.339953,0.662011
High Price,0.99943,1.0,0.999145,0.999506,0.999542,0.347801,0.669035
Low Price,0.999484,0.999145,1.0,0.999529,0.999558,0.331583,0.654559
Last Traded Price,0.999154,0.999506,0.999529,1.0,0.999966,0.340217,0.662214
Close Price,0.999184,0.999542,0.999558,0.999966,1.0,0.340485,0.662537
Total Traded Quantity,0.339953,0.347801,0.331583,0.340217,0.340485,1.0,0.845982
Turnover (in Lakhs),0.662011,0.669035,0.654559,0.662214,0.662537,0.845982,1.0


### Feature selection

In [45]:
# Since we predit Open prices, it becomes the dependent variable
# X is our independent and Y is dependent variable
X=data.drop(['Date', 'Open Price'], axis=1)     #  Date could impact our model but since it's categorical we drop it 
Y=data['Open Price']
print(X.head())
print('\n Dependent variable \n',Y.head())

   High Price  Low Price  Last Traded Price  Close Price  \
0       22.85      22.85              22.85        22.85   
1       24.67      23.13              23.66        23.66   
2       23.65      22.19              22.97        22.80   
3       24.44      22.97              23.05        23.08   
4       23.07      21.88              22.36        22.31   

   Total Traded Quantity  Turnover (in Lakhs)  
0              292150.95                66.75  
1             3278746.97               775.00  
2             3456472.99               792.54  
3             3929548.40               918.17  
4             2531002.91               569.05  

 Dependent variable 
 0    22.85
1    24.67
2    22.58
3    23.44
4    22.58
Name: Open Price, dtype: float64


#### Split data into training (80% of data) and testing(20%) data

In [46]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2,random_state = 0)

### Linear Regression model training and evaluation

In [47]:
lr=LinearRegression()
lr.fit(x_train,y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [48]:
lr.score(x_test,y_test)

0.9994676707787391

#### Predicting Open Price given a sample

In [49]:
sample_data=[[27.56,26.08,27.35,27.29,15672940,4213.63]]
lr.predict(sample_data)

array([26.61488234])

##### On that day TC1 Opens at price 26.24 and our model predicted 26.61, which is very close to the prediction