In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 
import sklearn as sk
from matplotlib import style  
style.use('dark_background')

### Read csv file

In [2]:
df = pd.read_csv('housing.data.csv',sep="\s+")
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


### Create independent variables

In [3]:
X = df['RM']
Y = df['MEDV']

### Split train and test set

In [4]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(np.array(X),np.array(Y),test_size = 0.2, random_state = 99, shuffle = True)

trainX = trainX.reshape(-1,1)
testX = testX.reshape(-1,1)
testY = testY.reshape(-1,1)
trainY = trainY.reshape(-1,1)

print(trainX.shape)
print(testX.shape)
print(trainY.shape)
print(testY.shape)

(404, 1)
(102, 1)
(404, 1)
(102, 1)


## Order 1 Polynominal

In [5]:
from sklearn.linear_model import LinearRegression
linReg = LinearRegression()

linReg

linReg.fit(trainX, trainY)

LinearRegression()

In [6]:
print('Intercept:', linReg.intercept_)
print('Coefficient/Theta', linReg.coef_)

Intercept: [-35.51592711]
Coefficient/Theta [[9.19589002]]


In [7]:
from sklearn.metrics import mean_squared_error

pred_Lin = linReg.predict(testX)
print('Mean squared error ', mean_squared_error(testY, pred_Lin))

Mean squared error  48.89573259954015


## Order 4 Polynominal 

In [8]:
from sklearn.preprocessing import PolynomialFeatures

# Create the Order 4 Polynominal object
poly_feat = PolynomialFeatures(degree = 4, include_bias = True)

# Use the poly_feat object to transform/fit your features
poly_feat.fit(trainX)

# Tranform X for train and test set
trainX_poly = poly_feat.transform(trainX)
testX_poly = poly_feat.transform(testX)

# Fit a Order 4 (Linear Regressoin object)
linear_4 = LinearRegression()
linear_4.fit(trainX_poly, trainY)


LinearRegression()

In [9]:
print('Intercept:', linReg.intercept_)
print('Coefficient/Theta', linReg.coef_)

Intercept: [-35.51592711]
Coefficient/Theta [[9.19589002]]


In [10]:
pred_4 = linear_4.predict(testX_poly)
print('Mean squared error ', mean_squared_error(testY, pred_4))

Mean squared error  47.841368354184965


## Lasso Polynomial Regression

In [12]:
from sklearn.linear_model import Lasso 

# A list to hold different values of alpha
lRegPara = [0.01,0.05,0.1,0.25,0.5,0.75,1]

for regPara in lRegPara:
    # Create the polynominal regression object
    polyLassoReg = Lasso(alpha = regPara, normalize = True)
    
    # Create, then fit and transform at the same time trainX by using poly_feat object
    polyFitTrainX = poly_feat.fit_transform(trainX)
    polyFitTestX = poly_feat.fit_transform(testX)
    
    # Fit the model
    polyLassoReg.fit(polyFitTrainX, trainY)
    
    # Predict
    preds = polyLassoReg.predict(polyFitTestX)
    
    # MSE
    mse = mean_squared_error(testY, preds)
    print("Alpha",regPara,":",mse)
    

Alpha 0.01 : 43.452825570089935
Alpha 0.05 : 45.10427471743117
Alpha 0.1 : 49.09674940842307
Alpha 0.25 : 73.5947099242971
Alpha 0.5 : 94.2823146327446
Alpha 0.75 : 94.2823146327446
Alpha 1 : 94.2823146327446


## K-fold cross validation

In [14]:
from sklearn.model_selection import KFold
kFold = KFold(n_splits=5, shuffle=True)