In [1]:
import pandas as pd
import numpy as np
import quandl, math, datetime, pickle
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import style

# Apply the 'ggplot' matplotlib style sheet to every figure drawn below.
plt.style.use('ggplot')

In [2]:
# Fetch the dataset through quandl. The cells below rely on it providing the
# 'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close' and 'Adj. Volume' columns
# and a date index.
DATASET_CODE = 'WIKI/GOOGL'
df = quandl.get(DATASET_CODE)

In [3]:
# Build the model's feature frame in one chain: keep the adjusted price/volume
# columns, derive two percentage features from them, then keep only the
# columns used further down.
df = (
    df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
    .assign(
        # Daily high-low spread, as a percentage of the adjusted close.
        HL_PCT=lambda d: (d['Adj. High'] - d['Adj. Low']) / d['Adj. Close'] * 100,
        # Open-to-close move, as a percentage of the adjusted open.
        PCT_change=lambda d: (d['Adj. Close'] - d['Adj. Open']) / d['Adj. Open'] * 100,
    )
    [['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
)
print(df.head())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  8.072956    0.324968     44659000
2004-08-20   54.322689  7.921706    7.227007     22834300
2004-08-23   54.869377  4.049360   -1.227880     18256100
2004-08-24   52.597363  7.657099   -5.726357     15247300
2004-08-25   53.164113  3.886792    1.183658      9188600


In [4]:
# Column whose future value the model is trained to predict.
forecast_col = 'Adj. Close'

# Replace missing feature values with a large sentinel instead of dropping rows.
# This must happen before the label column is created so that the NaNs produced
# by the shift below survive and can be used to identify the unlabeled rows.
df = df.fillna(-99999)

# Forecast horizon in rows: 1% of the data set, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))

# Each row's label is the forecast column's value `forecast_out` rows later;
# the final `forecast_out` rows have no future value and therefore get NaN.
df['label'] = df[forecast_col].shift(periods=-forecast_out)

In [5]:
# Feature matrix: every column except the label. `axis` is passed by keyword
# because newer pandas releases no longer accept it positionally.
X = np.array(df.drop(['label'], axis=1))

# Standardize all rows together so the rows used for forecasting are scaled
# exactly like the rows used for training.
X = preprocessing.scale(X)

# The last `forecast_out` rows have no label yet: keep them aside as the
# inputs to forecast from, and train/test on everything before them.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

# Drop the unlabeled rows from the frame so the labels line up with X.
df = df.dropna()

y = np.array(df['label'])

# Running this cell a second time would slice X from the already-shortened
# frame and silently misalign features and labels; fail loudly instead.
assert len(X) == len(y), "feature/label length mismatch; re-run the notebook from the top"



In [6]:
# Hold out 20% of the labeled samples for evaluation. The split is seeded so
# the reported score is reproducible across runs. `train_test_split` comes from
# sklearn.model_selection; the older sklearn.cross_validation module is deprecated.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# n_jobs=3 runs the fit on three workers; n_jobs=-1 would use all available processors.
clf = LinearRegression(n_jobs=3)
clf.fit(X_train, y_train)

# Persist the fitted model so it can be reloaded without retraining.
with open('linearregression.pickle', 'wb') as f:
    pickle.dump(clf, f)

In [7]:
# Reload the fitted model from disk. The context manager guarantees the file
# handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# this notebook (or another trusted source) produced.
with open('linearregression.pickle', 'rb') as pickle_in:
    clf = pickle.load(pickle_in)

# Score the reloaded model on the held-out split (for a regressor, scikit-learn's
# score() is the coefficient of determination, R^2). Printed in the next cell.
accuracy = clf.score(X_test, y_test)

In [8]:
# A support-vector regressor with a linear kernel was tried as an alternative
# model; the experiment is kept here for reference only and is not executed:
#   clf2 = svm.SVR(kernel='linear')
#   clf2.fit(X_train, y_train)
#   accuracy2 = clf2.score(X_test, y_test)
#   print(accuracy2)

# Predict a label for each of the most recent feature rows, i.e. the rows that
# were held back because their true label is not known yet.
forecast_set = clf.predict(X_lately)
print(forecast_set, '\n', accuracy, '\n', forecast_out)

[ 813.71331927  819.04387535  830.07495497  832.96039982  832.11459338
  835.68524916  835.26615071  837.01260522  833.20560135  835.19359941
  830.34902867  834.31401791  848.91380461  854.89710503  863.72671183
  861.77837079  848.51321106  828.2960321   825.71987284  820.64368623
  823.76100353  826.13864538  827.5499289   834.86687188  835.65995037
  836.15453251  840.68964574  844.89680088  845.99904671  843.26474983
  848.17767312  852.17502088] 
 0.969113935335 
 32


In [9]:
# Existing rows carry no forecast value.
df['forecast'] = np.nan

# Anchor the forecasts on the last row currently present in `df`.
# NOTE(review): if the unlabeled tail rows were dropped from `df` earlier, this
# anchor lies before the true end of the downloaded data - confirm that the
# forecasts are meant to start here.
last_date = df.index[-1]

# One calendar day per forecast value, starting the day after `last_date`.
# Pure date arithmetic is used instead of a POSIX-timestamp round trip, whose
# result depends on the machine's local time zone and DST rules.
# NOTE(review): weekends are included in this range; switch to a business-day
# frequency if the forecasts should only land on trading days.
future_dates = pd.date_range(
    start=last_date + pd.Timedelta(days=1),
    periods=len(forecast_set),
    freq='D',
    name=df.index.name,
)

# Build all forecast rows at once (NaN everywhere except 'forecast') and append
# them in a single concat rather than enlarging the frame row by row.
forecast_rows = pd.DataFrame(np.nan, index=future_dates, columns=df.columns)
forecast_rows['forecast'] = np.asarray(forecast_set)
df = pd.concat([df, forecast_rows])

In [10]:
# Draw the historical adjusted close and the forecast on one shared Axes.
ax = df['Adj. Close'].plot()
df['forecast'].plot(ax=ax)

# loc=4 places the legend in the lower-right corner.
ax.legend(loc=4)
ax.set_xlabel('Date')
ax.set_ylabel('Price')
plt.show()