In [2]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import statsmodels.api as sm
sns.set(style="ticks")
%matplotlib inline

In [3]:
df = pd.read_csv('ucidata.csv', sep=",", header=None)

#replaces question marks with "0"
for c in range(0, 127):
    for r in range(0, 1994):
        if df[c][r] == "?":
            df.set_value(r, c, 0)
            
#turns all columns that are numerics encoded as strings into floats
for c in range(4, 127):
    for r in range(0, 1994):
        if type(df[c][r]) == str:
            df.set_value(r, c, float(df[c][r]))

In [4]:
#takes out features columns
x = df.iloc[:, 4:126]

#takes of what we want to predict
target = df.iloc[:, 127]

In [11]:
#runs linear regression and returns summary of model
def reg_m(y, x):
    model = sm.OLS(y, x.astype(float)).fit()
    #fits simple ordinary least squares model
    predictions = model.predict(x)
    #makes predictions for y based on x
    return(model.summary())
reg_m(target, x)

0,1,2,3
Dep. Variable:,127,R-squared:,0.853
Model:,OLS,Adj. R-squared:,0.843
Method:,Least Squares,F-statistic:,88.95
Date:,"Fri, 30 Jun 2017",Prob (F-statistic):,0.0
Time:,10:41:05,Log-Likelihood:,1273.9
No. Observations:,1994,AIC:,-2304.0
Df Residuals:,1872,BIC:,-1621.0
Df Model:,122,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
4.0,-0.0013,0.001,-1.254,0.210,-0.003 0.001
5.0,0.1734,0.402,0.432,0.666,-0.614 0.961
6.0,-0.0173,0.087,-0.200,0.842,-0.187 0.153
7.0,0.2056,0.051,4.053,0.000,0.106 0.305
8.0,-0.0236,0.058,-0.411,0.681,-0.136 0.089
9.0,0.0007,0.034,0.022,0.983,-0.065 0.067
10.0,0.0933,0.054,1.741,0.082,-0.012 0.198
11.0,0.1785,0.104,1.723,0.085,-0.025 0.382
12.0,-0.0861,0.150,-0.575,0.566,-0.380 0.208

0,1,2,3
Omnibus:,389.508,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1263.113
Skew:,0.967,Prob(JB):,5.23e-275
Kurtosis:,6.385,Cond. No.,49100.0


In [5]:
x_train, x_test, y_train, y_test= train_test_split(x, target, test_size=0.30, random_state=0)

In [21]:
"""Multivariate Linear Regression with Sk Learn"""

multi_regression_model = LinearRegression(fit_intercept=True, normalize = True)
multi_regression_model.fit(x_train, y_train)

train_MSE = np.mean((y_train - multi_regression_model.predict(x_train))**2)
test_MSE = np.mean((y_test - multi_regression_model.predict(x_test))**2)
print ('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))

train_R_sq = multi_regression_model.score(x_train, y_train)
test_R_sq = multi_regression_model.score(x_test, y_test)
print ('The train R^2 is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))

The train MSE is 0.015565029548165637, the test MSE is 0.019588942468657524
The train R^2 is 0.7144096927883361, the test R^2 is 0.6349076121953989


In [29]:
"""Linear least squares with l2 regularization."""
ridge_regression = Ridge(alpha=1.0, fit_intercept=True)
ridge_regression.fit(x_train, y_train)

train_MSE = np.mean((y_train - ridge_regression.predict(x_train))**2)
test_MSE = np.mean((y_test - ridge_regression.predict(x_test))**2)
print ('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))

train_R_sq = ridge_regression.score(x_train, y_train)
test_R_sq = ridge_regression.score(x_test, y_test)
print ('The train R^2 is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))

The train MSE is 0.015939528670830955, the test MSE is 0.019392751497559074
The train R^2 is 0.7075383072145718, the test R^2 is 0.6385641561981515


In [30]:
#print 'Ridge regression model:\n {} + {}^T . x'.format(ridge_regression.intercept_, ridge_regression.coef_)

In [61]:
clf = BayesianRidge(alpha_1 = 6, alpha_2 = 1/25, lambda_1 = 6, lambda_2 = 1/25)
clf.fit(x_train, y_train)

train_MSE = np.mean((y_train - clf.predict(x_train))**2)
test_MSE = np.mean((y_test - clf.predict(x_test))**2)
print ('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))

train_R_sq = clf.score(x_train, y_train)
test_R_sq = clf.score(x_test, y_test)
print ('The train R^2 is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))

The train MSE is 0.01643020496376541, the test MSE is 0.01939902425115551
The train R^2 is 0.6985352794460089, the test R^2 is 0.6384472466410227


In [16]:
lasso_regression = Lasso(alpha=0.02, fit_intercept=True)
lasso_regression.fit(x_train, y_train)

train_MSE = np.mean((y_train - lasso_regression.predict(x_train))**2)
test_MSE = np.mean((y_test - lasso_regression.predict(x_test))**2)
print ('The train MSE is {}, the test MSE is {}'.format(train_MSE, test_MSE))

train_R_sq = lasso_regression.score(x_train, y_train)
test_R_sq = lasso_regression.score(x_test, y_test)
print ('The train R^2 is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))
print ('Lasso regression model:\n {} + {}^T . x'.format(lasso_regression.intercept_, lasso_regression.coef_))

The train MSE is 0.03130122039137473, the test MSE is 0.030706615363592576
The train R^2 is 0.4256788835504513, the test R^2 is 0.42770001277870073
Lasso regression model:
 0.26962628198172317 + [-0.00131359  0.         -0.          0.         -0.09486575  0.          0.
 -0.          0.          0.          0.          0.          0.         -0.
 -0.         -0.         -0.          0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.         -0.
  0.          0.          0.          0.         -0.          0.         -0.
 -0.         -0.          0.         -0.          0.          0.          0.
  0.          0.         -0.         -0.02875738 -0.         -0.         -0.
 -0.          0.          0.26750297  0.          0.          0.          0.
  0.          0.          0.          0.          0.         -0.          0.
  0.          0.         -0.         -0.          0.         -0.          0.
  0.         -0.          0.       

In [58]:
#find out the mean of the y train, set the gamma parameters
#run again and check for better results?
from fractions import Fraction
x=np.mean(y_train)
print ("the mean of the y_train is approximately 6/25")

the mean of the y_train is approximately 6/25


In [14]:
gen_cross_terms = PolynomialFeatures(interaction_only=True)

#fit interaction terms using x train values
cross_terms = gen_cross_terms.fit_transform(x_train)

array([[ 1.  ,  3.  ,  0.03, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  5.  ,  0.04, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  2.  ,  0.01, ...,  0.  ,  0.  ,  0.  ],
       ..., 
       [ 1.  ,  9.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  3.  ,  0.02, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  4.  ,  0.05, ...,  0.  ,  0.  ,  0.  ]])

In [15]:
#Interaction Terms with Ridge Regression
#create instance of PolynomialFeatures
gen_cross_terms = PolynomialFeatures(interaction_only=True)

#fit interaction terms using x train values
cross_terms = gen_cross_terms.fit_transform(x_train)

#combine x train terms with new interaction terms
X_train_with_cross = np.hstack((x_train, cross_terms))

#fit interaction terms using x test values
cross_terms = gen_cross_terms.fit_transform(x_test)

#combine x test terms with new interaction terms
X_test_with_cross = np.hstack((x_test, cross_terms))

#Create instance of ridge regression to use with train interaction terms 
ridge_regression_inter = Ridge(alpha=1.0, fit_intercept=True)

#fit ridge regression with all x train and interaction terms and y train terms
ridge_regression_inter.fit(X_train_with_cross, y_train)

ridge_regression_inter.predict(X_test_with_cross)

#Create instance of ridge regression to use with test interaction terms 
#ridge_regression_inter2 = Ridge(alpha=1.0, fit_intercept=True)

#fit ridge regression with all x test and interaction terms and y test terms
#ridge_regression_inter2.fit(X_test_with_cross, y_test)

#calculate MSE and R squared values
train_MSE = np.mean((y_train - ridge_regression_inter.predict(X_train_with_cross))**2)
test_MSE = np.mean((y_test - ridge_regression_inter.predict(X_test_with_cross))**2)
print ('The train MSE with interaction terms is {}, the test MSE is {}'.format(train_MSE, test_MSE))
train_R_sq = ridge_regression_inter.score(X_train_with_cross, y_train)
test_R_sq = ridge_regression_inter.score(X_test_with_cross, y_test)
print ('The train R^2 with interaction terms is {}, the test R^2 is {}'.format(train_R_sq, test_R_sq))

The train MSE with interaction terms is 0.0048158637569161765, the test MSE is 0.027020815744616775
The train R^2 with interaction terms is 0.9116375586971309, the test R^2 is 0.49639475656153775
