In [1]:
import pandas as pd
import numpy as np

In [2]:
# Single place to change when the data directory moves; every CSV path below
# is derived from it.
DATA_DIR = "/users/nirmalanawale/nirmi/myworkspace/python/data"

nasdaqFile = DATA_DIR + "/nasdaq.csv"
spFile = DATA_DIR + "/sp_500.csv"
googFile = DATA_DIR + "/goog.csv"
oilFile = DATA_DIR + "/uso.csv"
xomFile = DATA_DIR + "/xom.csv"

In [3]:
def readFile(filename):
    """Read a CSV into a Date-indexed frame and add a "Returns" column.

    Parameters
    ----------
    filename : str or path-like
        Comma-separated file with a header row. Only the columns at
        positions 0 and 6 are read; they are renamed "Date" and "Price".

    Returns
    -------
    pandas.DataFrame
        Columns "Date", "Price" and "Returns", indexed by the "Date" values.
        Row i of "Returns" is Price[i] / Price[i + 1] - 1; the last row is
        NaN because it has no following row.
    """
    data = pd.read_csv(filename, sep=",", usecols=[0, 6], names=["Date", "Price"], header=0)
    # NOTE(review): column index 6 is taken as the price series - confirm this
    # is the intended column for the input files.
    # Builtin float replaces the np.float alias, which newer NumPy no longer has.
    prices = np.asarray(data["Price"], dtype=float)

    # Pre-size with NaN so the column length always matches the frame, even
    # for an empty or single-row file.
    returns = np.full(len(prices), np.nan)
    # NOTE(review): each row is divided by the NEXT row, so the result is a
    # conventional period return only if rows are ordered newest-first - confirm.
    returns[:-1] = prices[:-1] / prices[1:] - 1

    data["Returns"] = returns
    data.index = data["Date"]
    return data

In [4]:
from sklearn import datasets, linear_model

In [5]:
# Load the CSV at oilFile (uso.csv) and derive its "Returns" column via readFile.
oilData = readFile(oilFile)

In [6]:
# readFile leaves NaN in the final "Returns" row; keep every row except that one.
oilDataUnfilled = oilData["Returns"].iloc[:-1]

In [7]:
# Load the CSV at nasdaqFile (nasdaq.csv) and derive its "Returns" column via readFile.
nasdaqData = readFile(nasdaqFile)

In [11]:
# Drop the trailing NaN return that readFile appends as the last row.
nasdaqDataFilled = nasdaqData["Returns"].iloc[:-1]

In [12]:
# Cut the NASDAQ returns to the same number of rows as the oil returns so the
# two can be stacked into one array.
# NOTE(review): this matches rows by position, not by Date - it assumes both
# files cover the same dates in the same order; confirm.
nasdaqDataFilled = nasdaqDataFilled.iloc[:len(oilDataUnfilled)]

In [13]:
# Display the trimmed NASDAQ returns series (indexed by Date) as the cell output.
nasdaqDataFilled

Date
2018-07-27    0.015284
2018-07-30   -0.023526
2018-07-31   -0.012817
2018-08-01    0.068347
2018-08-02    0.031010
2018-08-03   -0.098058
2018-08-06    0.018466
2018-08-07    0.052561
2018-08-08    0.027005
2018-08-09   -0.029702
2018-08-10    0.094523
2018-08-13   -0.001627
2018-08-14   -0.161612
2018-08-15    0.155299
2018-08-16    0.073998
2018-08-17    0.084863
2018-08-20   -0.038299
2018-08-21    0.063940
2018-08-22   -0.116725
2018-08-23   -0.003557
2018-08-24   -0.186505
2018-08-27    0.182445
2018-08-28    0.033896
2018-08-29   -0.058304
2018-08-30    0.060287
2018-08-31   -0.147839
2018-09-04   -0.141429
2018-09-05    0.096298
2018-09-06    0.103570
2018-09-07    0.051400
                ...   
2018-09-14   -0.109634
2018-09-17   -0.014603
2018-09-18    0.074771
2018-09-19   -0.057023
2018-09-20   -0.382989
2018-09-21    0.534039
2018-09-24    0.056046
2018-09-25   -0.003870
2018-09-26    0.118446
2018-09-27   -0.116255
2018-09-28    0.017830
2018-10-01   -0.045221
2018-1

In [14]:
# Vertically stack the two return series: the result has two rows,
# row 0 = NASDAQ returns, row 1 = oil returns.
combined = np.stack([nasdaqDataFilled, oilDataUnfilled], axis=0)

In [15]:
# Display the stacked array: one row per series (NASDAQ first, then oil).
combined

array([[ 0.01528406, -0.02352566, -0.01281734,  0.06834671,  0.03101006,
        -0.09805779,  0.01846568,  0.05256125,  0.02700509, -0.02970151,
         0.09452315, -0.00162724, -0.16161192,  0.15529871,  0.07399832,
         0.08486318, -0.03829891,  0.06394011, -0.11672491, -0.00355666,
        -0.18650528,  0.18244532,  0.03389562, -0.05830376,  0.06028707,
        -0.147839  , -0.141429  ,  0.09629836,  0.10356973,  0.05139974,
        -0.12013671, -0.0092491 ,  0.00997533,  0.14013729, -0.10963426,
        -0.01460317,  0.0747706 , -0.05702293, -0.38298865,  0.53403852,
         0.05604552, -0.00387022,  0.11844559, -0.11625515,  0.01783   ,
        -0.04522094, -0.21525729, -0.06938969,  0.2250926 ,  0.19345073,
        -0.08922181, -0.20091768, -0.01973695,  0.18450811,  0.23335272,
        -0.18085999,  0.1093491 , -0.07118364,  0.00542212,  0.10945496,
        -0.16573459, -0.06803836,  0.07066135, -0.07520625],
       [-0.1262739 ,  0.06133995, -0.10323069,  0.04358397,  0.

In [16]:
# We want one row per observation and one column per series, i.e. the
# transpose of the row-wise stack of the two return series.
# Build that layout directly from the two inputs rather than reassigning
# `combined = combined.T`: the self-referential transpose flips the array back
# to two rows every second time this cell is executed.
combined = np.stack((nasdaqDataFilled, oilDataUnfilled), axis=1)

In [17]:
# Display the transposed array: one row per observation, with the NASDAQ
# return in column 0 and the oil return in column 1.
combined
[x11, ]

array([[ 0.01528406, -0.1262739 ],
       [-0.02352566,  0.06133995],
       [-0.01281734, -0.10323069],
       [ 0.06834671,  0.04358397],
       [ 0.03101006,  0.05180459],
       [-0.09805779, -0.19296564],
       [ 0.01846568,  0.52840315],
       [ 0.05256125, -0.53674194],
       [ 0.02700509,  0.78888751],
       [-0.02970151, -0.10519898],
       [ 0.09452315, -0.34105427],
       [-0.00162724,  0.44544749],
       [-0.16161192, -0.60178732],
       [ 0.15529871,  1.57557529],
       [ 0.07399832, -0.09549486],
       [ 0.08486318,  0.1653027 ],
       [-0.03829891,  0.63400351],
       [ 0.06394011, -0.47178548],
       [-0.11672491,  0.76059526],
       [-0.00355666, -0.19381541],
       [-0.18650528,  0.31614156],
       [ 0.18244532,  0.01281442],
       [ 0.03389562, -0.3944468 ],
       [-0.05830376,  0.23619531],
       [ 0.06028707,  0.30275021],
       [-0.147839  , -0.2682265 ],
       [-0.141429  ,  0.16795613],
       [ 0.09629836, -0.17153503],
       [ 0.10356973,

In [19]:
# Dependent variable (y): XOM returns without the trailing NaN row, cut to the
# same number of rows as the oil returns.
xomData = readFile(xomFile)
xomDataFilled = xomData["Returns"].iloc[:-1].iloc[:len(oilDataUnfilled)]

In [20]:
# scikit-learn linear regression with default settings; nothing is fitted yet.
xomNasdaqOilModel = linear_model.LinearRegression()

In [21]:
# Fit XOM returns (target) on the two-column [NASDAQ, oil] returns matrix (features).
xomNasdaqOilModel.fit(combined,xomDataFilled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [22]:
# Score the model on the very same data it was fitted on, so this is an
# in-sample figure, not an out-of-sample estimate.
xomNasdaqOilModel.score(combined,xomDataFilled)

0.31874619144429406

In [23]:
import statsmodels.api as sm

In [24]:
# Add a constant column to the [NASDAQ, oil] feature matrix so the
# statsmodels regression includes an intercept term.
x = sm.add_constant(combined)

In [26]:
# Set up the OLS model: XOM returns as the dependent variable, x (constant +
# NASDAQ and oil returns) as the regressors.
model = sm.OLS(xomDataFilled, x)

In [27]:
# Estimate the model and keep the fitted results object.
results = model.fit()

In [28]:
# Print the text regression report (coefficients, R-squared, F-statistic, ...).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                Returns   R-squared:                       0.319
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     14.27
Date:                Sun, 28 Oct 2018   Prob (F-statistic):           8.24e-06
Time:                        22:54:44   Log-Likelihood:                 1.1548
No. Observations:                  64   AIC:                             3.690
Df Residuals:                      61   BIC:                             10.17
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0292      0.031      0.947      0.3

In [29]:
# Multiple linear model with a categorical regressor: start by loading the
# CSV at googFile (goog.csv) and deriving its "Returns" column via readFile.
googData = readFile(googFile)

In [30]:
# Display the loaded frame.
# NOTE(review): the "Price" values shown are large integers that look like
# counts rather than prices - readFile reads CSV column index 6; confirm that
# is the intended column.
googData

Unnamed: 0_level_0,Date,Price,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-07-27,2018-07-27,2130600,0.151738
2018-07-30,2018-07-30,1849900,0.124764
2018-07-31,2018-07-31,1644700,0.049451
2018-08-01,2018-08-01,1567200,0.023444
2018-08-02,2018-08-02,1531300,0.405378
2018-08-03,2018-08-03,1089600,0.007303
2018-08-06,2018-08-06,1081700,-0.275971
2018-08-07,2018-08-07,1494000,0.090272
2018-08-08,2018-08-08,1370300,0.627821
2018-08-09,2018-08-09,841800,-0.240732


In [31]:
# Characters 5-6 of each "Date" string hold the two-digit month
# (e.g. "2018-07-27" -> 7); store it as an integer column.
month_numbers = [int(date_text[5:7]) for date_text in googData["Date"]]
googData["Months"] = month_numbers

In [32]:
# Display the frame again to check the newly added "Months" column.
googData

Unnamed: 0_level_0,Date,Price,Returns,Months
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-07-27,2018-07-27,2130600,0.151738,7
2018-07-30,2018-07-30,1849900,0.124764,7
2018-07-31,2018-07-31,1644700,0.049451,7
2018-08-01,2018-08-01,1567200,0.023444,8
2018-08-02,2018-08-02,1531300,0.405378,8
2018-08-03,2018-08-03,1089600,0.007303,8
2018-08-06,2018-08-06,1081700,-0.275971,8
2018-08-07,2018-08-07,1494000,0.090272,8
2018-08-08,2018-08-08,1370300,0.627821,8
2018-08-09,2018-08-09,841800,-0.240732,8


In [38]:
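# FIXME: this attempt raised the SyntaxError shown below because a "." was
# typed where a "," belongs before `drop`. Corrected punctuation (still
# disabled - the call itself has not been verified):
#dummy = sm.categorical(googData["Months"].values.reshape(1,-1), drop = True)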

SyntaxError: keyword can't be an expression (<ipython-input-38-6337adeba41c>, line 1)