In [51]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import seaborn as sns

from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Display the value of every top-level expression in a cell, not only the
# last one. Cells further down that show several results from a single cell
# (e.g. two consecutive .head() calls) depend on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [58]:
# Report the directory that relative paths resolve against, then read the
# cars CSV into `df3` and preview its first two rows.
print("Current Working Directory ", os.getcwd())
df3 = pd.read_csv(filepath_or_buffer="data/cars_data_df2.csv")
df3.head(n=2)

Current Working Directory  /home/jovyan/work


Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
1,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


## some slicing by column names

In [59]:
# Target: the "price" column on its own.
y_data = df3["price"]

# Predictors: every column from "symboling" through "highway-mpg".
# A label-based .loc slice includes both endpoints.
x_data = df3.loc[:, "symboling":"highway-mpg"]

# Narrower alternative kept from the original draft:
# x_data = df3.loc[:, ["horsepower", "peak-rpm"]]
x_data.head(n=2)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,four,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30
1,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,five,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22


## splitting data in train and test
using *train_test_split()*

In [21]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# shuffle repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.3,
    random_state=0,
)

# Preview the first two training rows of the predictors and of the target.
x_train.head(n=2)
y_train.head(n=2)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
118,0,91.0,toyota,diesel,std,four,sedan,fwd,front,95.7,...,four,110,idi,3.27,3.35,22.5,56.0,4500.0,34,36
95,2,104.0,saab,gas,std,four,sedan,fwd,front,99.1,...,four,121,mpfi,3.54,3.07,9.3,110.0,5250.0,21,28


118     7898.0
95     15510.0
Name: price, dtype: float64

## cross-validation 
reference:

- https://scikit-learn.org/stable/
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html

course python jupyter notebook:
- https://labs.cognitiveclass.ai/tools/jupyterlab/lab/tree/labs/DA0101EN/edx/Review-Exploratory-Data-Analysis.ipynb

In [62]:
# Fit a linear regression of price on four predictor columns, using the
# training split only. fit() returns the estimator itself, so `lr` is the
# fitted model and the trailing expression shows its parameters.
lr = LinearRegression().fit(
    x_train[["horsepower", "curb-weight", "engine-size", "highway-mpg"]],
    y_train,
)
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

prediction using train data

In [63]:
# In-sample predictions: score the same rows the model was fitted on and
# preview the first five predicted prices.
yhat_train = lr.predict(
    x_train[["horsepower", "curb-weight", "engine-size", "highway-mpg"]]
)
yhat_train[:5]

array([  8969.47194235,  14168.33124468,  10498.17498863,  12971.03656183,
        12132.46883687])

predict using test data

In [64]:
# Out-of-sample predictions: score the held-out test rows with the fitted
# model and preview the first five predicted prices.
yhat_test = lr.predict(
    x_test[["horsepower", "curb-weight", "engine-size", "highway-mpg"]]
)
yhat_test[:5]

array([  5980.19773393,   6241.13261664,   7923.97012176,  16249.69487877,
        14508.23225674])

## ridge regression

In [52]:
# Earlier draft on the cars data, kept for reference:
# ridge_model = Ridge(alpha = 0.1)
# ridge_model.fit(x_train[["horsepower"]], y_train)
# y_hat = ridge_model.predict(x_train)
# NOTE(review): the draft fits on "horsepower" only but predicts on all of
# x_train — the column selections would need to match before re-enabling it.

# Ridge regression on a small synthetic problem: 10 samples, 5 features,
# drawn from a standard normal with a fixed seed so the data is repeatable.
# The target is drawn before the feature matrix; keep that order, because it
# determines which random values each one receives.
n_samples = 10
n_features = 5
np.random.seed(0)
y = np.random.randn(n_samples)
X = np.random.randn(n_samples, n_features)

# fit() returns the estimator itself; the trailing expression displays it.
clf = Ridge(alpha=1.0).fit(X, y)
clf

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [60]:
# Search over the ridge penalty strength with 4-fold cross-validation.
# NOTE(review): the search is fitted on the full x_data / y_data rather than
# on the x_train / y_train split — confirm that this is intended.
rr = Ridge()
parameters = [{"alpha": [1, 10, 100, 1000]}]
grid = GridSearchCV(estimator=rr, param_grid=parameters, cv=4)
grid.fit(
    x_data[["horsepower", "curb-weight", "engine-size", "highway-mpg"]],
    y_data,
)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'alpha': [1, 10, 100, 1000]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [61]:
# Keep the full cross-validation report (timings, per-split and mean test
# scores for each alpha candidate) in `scores`, then display it.
scores = grid.cv_results_
scores



{'mean_fit_time': array([ 0.00367719,  0.00300854,  0.00323045,  0.00304157]),
 'std_fit_time': array([ 0.00052565,  0.00021984,  0.00029498,  0.00025564]),
 'mean_score_time': array([ 0.00148559,  0.00135392,  0.0017277 ,  0.0013805 ]),
 'std_score_time': array([ 0.00012696,  0.00010922,  0.00030943,  0.00012264]),
 'param_alpha': masked_array(data = [1 10 100 1000],
              mask = [False False False False],
        fill_value = ?),
 'params': [{'alpha': 1}, {'alpha': 10}, {'alpha': 100}, {'alpha': 1000}],
 'split0_test_score': array([ 0.75719355,  0.75718998,  0.75715627,  0.75692478]),
 'split1_test_score': array([ 0.77259426,  0.77261999,  0.7728676 ,  0.7747935 ]),
 'split2_test_score': array([ 0.62243871,  0.62252846,  0.62335428,  0.62784211]),
 'split3_test_score': array([ 0.70002411,  0.70010714,  0.70089342,  0.70646333]),
 'mean_test_score': array([ 0.71306266,  0.71311139,  0.71356789,  0.71650593]),
 'std_test_score': array([ 0.05889451,  0.05886122,  0.05855668,  0.