In [49]:
%matplotlib inline


# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score

# to handle data in form of rows and columns 
import pandas as pd    

# importing plotting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

In [50]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso

In [51]:
# Load the car mileage data. The original cell treated '?' as the marker for
# a missing value but only swapped it for NaN after parsing, which leaves the
# affected column string-typed. Declaring the marker at read time lets pandas
# parse that column as numeric with NaN gaps.
mpg_df = pd.read_csv("car-mpg.csv", na_values="?")

# The car name is dropped; it is not used as a predictor.
mpg_df = mpg_df.drop(columns=["car_name"])

# Decode the numeric origin codes into labels, then one-hot encode them
# (this yields the origin_america / origin_asia / origin_europe columns).
mpg_df["origin"] = mpg_df["origin"].replace({1: "america", 2: "europe", 3: "asia"})
mpg_df = pd.get_dummies(mpg_df, columns=["origin"])

# Fill every remaining gap with the median of its own column.
mpg_df = mpg_df.fillna(mpg_df.median(numeric_only=True))

# separate independent and dependent variables

In [52]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
# Predictors: every column of mpg_df except the target 'mpg'.
X = mpg_df.drop(columns=['mpg'])

# Target: 'mpg' selected with a list so that y stays a one-column DataFrame.
y = mpg_df.loc[:, ['mpg']]


In [53]:
from sklearn import preprocessing

# Standardise every predictor column in X. preprocessing.scale hands back a
# plain numpy array, so wrap it in a DataFrame again to keep the column names.
# NOTE: scaling is applied to the full data set before the train/test split;
# ideally the training and test sets should be scaled separately (the original
# note was cut off mid-sentence - confirm the intended wording).
X_scaled = pd.DataFrame(preprocessing.scale(X), columns=X.columns)

In [54]:

from sklearn.model_selection import train_test_split

# Hold out 30% of the scaled rows for testing; the fixed random_state keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.30,
    random_state=1,
)

In [28]:

from sklearn.model_selection import train_test_split

# Split of the *unscaled* predictors, stored under its own names.
# This cell previously assigned to X_train / X_test / y_train / y_test, so in
# top-to-bottom order it silently replaced the scaled split created just
# before it, and every model below would have been fit on unscaled data.
# The execution counts show the scaled split was the one actually used for
# the recorded outputs, so the raw split must not clobber it.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.30, random_state=1
)

In [None]:
### Forward and Backward Regression

In [7]:
from statsmodels.api import add_constant
# Training predictors passed through statsmodels' add_constant.
# NOTE(review): X2 is not referenced by any other cell visible in this notebook.
X2 = add_constant(X_train)

In [8]:
import statsmodels.formula.api as smf
def forward_selected(data, response):
    """Linear model designed by forward selection.

    Greedy search: starting from no predictors (baseline score 0.0), every
    round fits one OLS model per remaining column, keeps the column whose
    addition gives the highest adjusted R-squared, and stops as soon as no
    remaining column strictly improves on the current score.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response.
           Column names are pasted verbatim into the model formula.

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared.
           When no column improves on the baseline, the intercept-only
           model is returned.

    Raises:
    -------
    KeyError: if response is not one of data's columns
    """
    def build_formula(predictors):
        # With nothing selected, joining would leave a dangling '+' in the
        # formula, so spell out the intercept-only model explicitly.
        if not predictors:
            return "{} ~ 1".format(response)
        return "{} ~ {} + 1".format(response, ' + '.join(predictors))

    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = 0.0
    while remaining:
        # Adjusted R-squared of the current selection plus each candidate in turn.
        scores_with_candidates = [
            (smf.ols(build_formula(selected + [candidate]), data).fit().rsquared_adj,
             candidate)
            for candidate in remaining
        ]
        # Highest score wins; equal scores fall back to comparing the names.
        best_new_score, best_candidate = max(scores_with_candidates)
        if not best_new_score > current_score:
            # No strict improvement. Stopping here also covers an exact tie,
            # which the previous `current_score == best_new_score` loop
            # condition would have retried forever without making progress.
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    return smf.ols(build_formula(selected), data).fit()

# fit a simple linear model

In [55]:
# Unregularised least-squares baseline fitted on the training split.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# coef_ is indexed as a 2-D array here; row 0 holds one weight per predictor,
# in the same order as the training columns.
for col_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for cyl is 2.5059518049385052
The coefficient for disp is 2.535708286056052
The coefficient for hp is -1.7889335736325254
The coefficient for wt is -5.551819873098726
The coefficient for acc is 0.11485734803440907
The coefficient for yr is 2.931846548211611
The coefficient for car_type is 2.977869737601943
The coefficient for origin_america is -0.5832955290165979
The coefficient for origin_asia is 0.3474931380432245
The coefficient for origin_europe is 0.37741646808688323


# Create a regularized RIDGE model and note the coefficients

In [70]:
# Regularised Ridge regressor with penalty strength alpha=10 (not fitted yet).
ridge_model=Ridge(alpha=10)

In [71]:
# Fit the ridge model on the same training split as the plain linear model.
ridge_model.fit(X_train,y_train)

Ridge(alpha=10)

In [72]:
# Side-by-side view per predictor: [plain linear weight, ridge weight].
paired_weights = zip(X_train.columns, regression_model.coef_[0], ridge_model.coef_[0])
for col_name, linear_weight, ridge_weight in paired_weights:
    print("The coefficient for {} is {}".format(col_name, [linear_weight, ridge_weight]))

The coefficient for cyl is [2.5059518049385052, 1.4520179020254593]
The coefficient for disp is [2.535708286056052, 0.9300879960464672]
The coefficient for hp is [-1.7889335736325254, -1.6535616174264334]
The coefficient for wt is [-5.551819873098726, -3.967353383122223]
The coefficient for acc is [0.11485734803440907, -0.11819965764346413]
The coefficient for yr is [2.931846548211611, 2.7280885802012826]
The coefficient for car_type is [2.977869737601943, 2.2019288357544795]
The coefficient for origin_america is [-0.5832955290165979, -0.5223468241979613]
The coefficient for origin_asia is [0.3474931380432245, 0.3451933718440451]
The coefficient for origin_europe is [0.37741646808688323, 0.30234911395905173]


In [73]:
# Each printed list is [plain linear score, ridge score] from the estimators' score().
fitted_models = (regression_model, ridge_model)
print(' training  Score', [model.score(X_train, y_train) for model in fitted_models])
print(' test Score', [model.score(X_test, y_test) for model in fitted_models])


 training  Score [0.8343770256960538, 0.8275133545583995]
 test Score [0.8513421387780066, 0.8569374430520706]


In [37]:
# A second ridge fit with a much smaller penalty (alpha=0.3) than ridge_model's alpha=10.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)
    

Ridge model: [[ 2.47057467  2.44494419 -1.78573889 -5.47285499  0.10115618  2.92319984
   2.94492098 -0.57949986  0.34667456  0.37344909]]


# Create a regularized LASSO model and note the coefficients

In [74]:
# Lasso fit with penalty strength alpha=0.05 on the training split.
lasso = Lasso(alpha=0.05).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)

# Observe that several coefficients are now exactly 0, i.e. those dimensions are dropped from the model

Lasso model: [ 1.83851731  1.19548187 -1.32260539 -4.8020951   0.          2.82349284
  2.33448624 -0.82674844  0.          0.        ]


In [75]:
# lasso.coef_ is indexed as a flat array here, so it pairs directly with the column names.
for col_name, weight in zip(X_train.columns, lasso.coef_):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for cyl is 1.8385173123867278
The coefficient for disp is 1.1954818725345313
The coefficient for hp is -1.3226053859414477
The coefficient for wt is -4.8020951012605035
The coefficient for acc is 0.0
The coefficient for yr is 2.8234928410713223
The coefficient for car_type is 2.3344862378800255
The coefficient for origin_america is -0.8267484380841996
The coefficient for origin_asia is 0.0
The coefficient for origin_europe is 0.0


## Let us compare their scores

In [76]:
# Plain linear model: score on the training split first, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(regression_model.score(features, target))


0.8343770256960538
0.8513421387780066


In [47]:
# Ridge (the alpha=.3 fit stored in `ridge`): training score, then test score.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))

NameError: name 'ridge' is not defined

In [46]:
# Lasso: training score, then test score.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))

0.8322076784726787
0.854995582934768


In [13]:
# The results are more or less similar, but come from less complex models. Complexity is a function of the number of variables and the size of their coefficients.
## Note - with Lasso we get an equally good result on the test set, though not quite on the training set. Further, the number of dimensions is much smaller
# in the LASSO model than in the ridge or un-regularized model

# Let us generate polynomial models reflecting the non-linear interaction between some dimensions

In [48]:
from sklearn.preprocessing import PolynomialFeatures

In [88]:
# Degree-2 expansion restricted to interaction terms (interaction_only=True).
poly = PolynomialFeatures(degree = 2, interaction_only=True)
# Disabled alternative, a degree-2 expansion without the interaction_only restriction:
#poly = PolynomialFeatures(2)

In [89]:
# Dimensions of the scaled predictor frame before the polynomial expansion.
X_scaled.shape

(398, 10)

In [90]:
# Expand the scaled predictors with the interaction terms, then redo the 70/30
# split on the expanded matrix. This rebinds X_train / X_test / y_train / y_test,
# so every model fitted below sees the polynomial features.
X_poly = poly.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.30,
    random_state=1,
)
X_train.shape

(278, 56)

In [87]:
# Peek at the first row of the expanded training matrix.
X_train[0]

array([ 1.        , -0.85632057, -0.8491159 , -1.08197717, -0.89317209,
       -0.24256954,  1.35119925,  0.94141167,  0.77355903, -0.49764335,
       -0.46196822,  0.72711541,  0.92651931,  0.76484164,  0.20771729,
       -1.15705972, -0.80615018, -0.66241451,  0.42614224,  0.39559289,
        0.91872401,  0.75840662,  0.20596966, -1.14732476, -0.79936761,
       -0.65684127,  0.42255688,  0.39226456,  0.96639182,  0.26245471,
       -1.46196675, -1.01858593, -0.83697321,  0.53843874,  0.49983907,
        0.21665635, -1.20685347, -0.84084263, -0.69092134,  0.44448115,
        0.41261712, -0.32775979, -0.2283578 , -0.18764186,  0.12071312,
        0.11205942,  1.27203474,  1.04523239, -0.67241532, -0.62421111,
        0.7282375 , -0.46848725, -0.43490227, -0.38495651, -0.35735969,
        0.22989541])

# Fit a simple non regularized linear model on poly features-

In [93]:
# Refit the existing LinearRegression instance on the polynomial training
# features; this replaces the coefficients from its earlier fit.
regression_model.fit(X_train, y_train)
# Without regularisation several of the printed weights are on the order of 1e12.
print(regression_model.coef_[0])


[ 3.24082770e-13 -1.14204220e+12 -4.43738735e+00 -2.24947964e+00
 -2.98166341e+00 -1.56730367e+00  3.00442772e+00 -1.52060575e+12
 -7.80788356e+11  3.71375223e+12 -3.23609457e+12 -1.15918732e+00
 -1.43925476e+00 -3.57818604e-03  2.58444214e+00 -1.91918182e+00
 -3.65891647e+12 -6.45319147e+12 -2.39436996e+12 -2.28543203e+12
  3.90441895e-01  2.09503174e-01 -4.23446655e-01  3.58471680e+00
 -2.02703094e+00 -9.03672940e+11 -7.44778888e+11 -7.10893285e+11
  2.47772217e-01 -6.70440674e-01 -1.92620850e+00 -7.47558594e-01
 -2.15947171e+11 -1.77976884e+11 -1.69879374e+11 -1.72500610e-01
  5.30212402e-01 -3.32050323e+00  1.69388998e+12  1.39605098e+12
  1.33253411e+12  5.85876465e-01  1.53894043e+00  4.76389633e+11
  3.92625390e+11  3.74761903e+11  4.00207520e-01 -1.27131857e+10
 -1.04778089e+10 -1.00010944e+10 -1.09798815e+12  8.13175594e+11
  7.76178109e+11  2.20248210e+11 -5.15971535e+12  2.83957085e+12]


In [94]:
# Ridge with alpha=0.3, this time fitted on the polynomial training features.
ridge = Ridge(alpha=0.3).fit(X_train, y_train)
print("Ridge model:", ridge.coef_)

Ridge model: [[ 0.          3.73512981 -2.93500874 -2.13974194 -3.56547812 -1.28898893
   3.01290805  2.04739082  0.0786974   0.21972225 -0.3302341  -1.46231096
  -1.17221896  0.00856067  2.48054694 -1.67596093  0.99537516 -2.29024279
   4.7699338  -2.08598898  0.34009408  0.35024058 -0.41761834  3.06970569
  -2.21649433  1.86339518 -2.62934278  0.38596397  0.12088534 -0.53440382
  -1.88265835 -0.7675926  -0.90146842  0.52416091  0.59678246 -0.26349448
   0.5827378  -3.02842915 -0.36548074  0.5956112  -0.15941014  0.49168856
   1.45652375 -0.43819158 -0.20964198  0.77665496  0.36489921 -0.4750838
   0.3551047   0.23188557 -1.42941282  2.06831543 -0.34986402 -0.32320394
   0.39054656  0.06283411]]


In [95]:
# Ridge on polynomial features: training score, then test score.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge.score(features, target))


0.9143225702003365
0.8613398053698541


In [109]:
# Lasso with alpha=0.09, fitted on the polynomial training features.
lasso = Lasso(alpha=0.09).fit(X_train, y_train)
print("Lasso model:", lasso.coef_)


Lasso model: [ 0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -1.57994956e+00
 -5.25925497e+00 -0.00000000e+00  2.87734427e+00  1.04842906e-02
 -1.21504098e-01  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  3.35421507e-01 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  1.70974264e-01 -0.00000000e+00
  0.00000000e+00  1.15621326e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  3.63420040e-02  0.00000000e+00 -6.84544028e-01 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -7.61560567e-01  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  1.96273379e-01  0.00000000e+00 -6.52677925e-01
  0.00000000e+00  3.59800528e-01  0.00000000e+00 -3.85308960e-01
  4.51963503e-03  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  1.65720500e-01 -0.00000000e+00  0.00000000e+00 -0.00000000e+00]


In [110]:
# Lasso on polynomial features: training score, then test score.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(lasso.score(features, target))


0.8908510504987864
0.8798857278694309
