In [105]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline

#### Read The Data

In [106]:
header = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'car name']
input_data = pd.read_csv('auto-mpg.data', names=header)

In [107]:
input_data.loc[:5]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500


#### Preprocess the data

In [108]:
input_data = input_data.dropna()

#### Split Data into Training Set and Test Set

In [109]:
training_set = input_data.sample(frac=0.8)

test_set_index = input_data.index.difference(training_set.index)
test_set = input_data.loc[test_set_index]

#### Construct X and Y matrix

In [110]:
# Get the regressors matrix
# Ignore the last column which contains the name of the cars
df_X = training_set[training_set.columns.tolist()[1:-1]]
df_X = df_X.astype(float)
X = np.mat(DataFrame.as_matrix(df_X))

# Prepend the bias column to the matrix X
bias_vector = np.ones((X.shape[0],1))
X = np.hstack((bias_vector, X))

X.shape

(314, 8)

In [111]:
# Get the response vector
Y = np.mat(DataFrame.as_matrix(training_set[training_set.columns.tolist()[0]])).T
Y.astype(float)

Y.shape

(314, 1)

#### Print The Equation for The Linear Model
Note: This model discards the last variable: car name

$$ mpg = \beta_0 + \beta_1\times(cylinders) + \beta_2\times(díplacement) + \beta_3\times(horsepower) + \beta_4\times(weight) + \beta_5\times(acceleration) + \beta_6\times(model\_year) + \beta_7\times(origin) $$ 

#### Fit Linear Model

In [119]:
# Create a linear regression model
lr = linear_model.LinearRegression()

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation:
predicted = lr.fit(X,Y)

#### Test The Model on The Test Set

In [113]:
test_X = test_set[test_set.columns.tolist()[1:-1]].astype(float)
test_X = np.mat(DataFrame.as_matrix(test_X))

# Prepend the bias column to the matrix X
bias_vector = np.ones((test_X.shape[0],1))
test_X = np.hstack((bias_vector, test_X))

In [114]:
# Get the response vector of test set
test_Y = np.mat(DataFrame.as_matrix(test_set[test_set.columns.tolist()[0]])).T
test_Y = test_Y.astype(float)

#### Make Prediction Using The Test Set and The Fitted Model

In [115]:
# Make predictions using the testing set
mpg_pred = lr.predict(test_X)

#### Display The Coefficients

In [116]:
# The coefficients
print('Coefficients: \n', lr.coef_)

Coefficients: 
 [[ 0.         -0.54294227  0.02202568 -0.02121832 -0.00644247  0.07159726
   0.7655258   1.51979032]]


#### Report on The Performance

In [117]:
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(test_Y, mpg_pred))

SSRes = mean_squared_error(test_Y, mpg_pred)*(len(test_Y))
print("Sum squared error: %.2f" % SSRes)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test_Y, mpg_pred))

Mean squared error: 5.80
Sum squared error: 452.29
Variance score: 0.89


In [118]:
print("The sum squared error obtained from using least squares linear regression with Scikit-learn seems to be roughly the same as the one obtained from the normal equation method in homework4")

The sum squared error obtained from using least squares linear regression with Scikit-learn seems to be roughly the same as the one obtained from the normal equation method in homework4
