In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../datasets/cars_clean.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,age,origin_Europe,origin_Japan,origin_US
0,18.0,8,307.0,130,3504,12.0,50,0,0,1
1,15.0,8,350.0,165,3693,11.5,50,0,0,1
2,18.0,8,318.0,150,3436,11.0,50,0,0,1
3,16.0,8,304.0,150,3433,12.0,50,0,0,1
4,17.0,8,302.0,140,3449,10.5,50,0,0,1


In [3]:
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [10]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'age', 'origin_Europe', 'origin_Japan', 'origin_US'],
      dtype='object')

In [14]:
names = ['cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'age']
X = pd.DataFrame(StandardScaler().fit_transform(
    df.drop(['mpg', 'origin_Europe', 'origin_Japan', 'origin_US'],
            axis=1).astype('float64')), columns=names)
Y = df['mpg']

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [38]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [39]:
linear_model = LinearRegression(normalize=True).fit(x_train, y_train)
y_pred = linear_model.predict(x_test)

print('Training score: ', linear_model.score(x_train, y_train),
      '\nTesting Score: ', r2_score(y_test, y_pred))

Training score:  0.8021731249481205 
Testing Score:  0.8309341130858503


In [44]:
def adjusted_r2(r_square, labels, features):
    adj_r_square = 1 - ((1 - r_square) * (len(labels) - 1)) / (len(labels) - features.shape[1] - 1)
    return adj_r_square

In [45]:
print('Adjuested r2_score: ', adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

Adjuested r2_score:  0.8170382867641394


In [48]:
features_corr = X.corr()
features_corr

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,age
cylinders,1.0,0.950721,0.839061,0.896017,-0.505419,0.348746
displacement,0.950721,1.0,0.89376,0.932824,-0.543684,0.370164
horsepower,0.839061,0.89376,1.0,0.860676,-0.684376,0.41175
weight,0.896017,0.932824,0.860676,1.0,-0.417457,0.306564
acceleration,-0.505419,-0.543684,-0.684376,-0.417457,1.0,-0.288137
age,0.348746,0.370164,0.41175,0.306564,-0.288137,1.0


In [49]:
abs(features_corr) > 0.8

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,age
cylinders,True,True,True,True,False,False
displacement,True,True,True,True,False,False
horsepower,True,True,True,True,False,False
weight,True,True,True,True,False,False
acceleration,False,False,False,False,True,False
age,False,False,False,False,False,True


In [50]:
trimmed_features = X.drop(['cylinders', 'displacement', 'weight'], axis=1)
trimmed_features_corr = trimmed_features.corr()
trimmed_features_corr

Unnamed: 0,horsepower,acceleration,age
horsepower,1.0,-0.684376,0.41175
acceleration,-0.684376,1.0,-0.288137
age,0.41175,-0.288137,1.0


In [51]:
X2 = trimmed_features

In [55]:
x_train, x_test, y_train, y_test = train_test_split(X2, Y, test_size=0.2)

linear_model = LinearRegression(normalize=True).fit(x_train, y_train)
y_pred = linear_model.predict(x_test)

print('Training score: ', linear_model.score(x_train, y_train),
      '\nTesting Score: ', r2_score(y_test, y_pred),
      '\nAdjuested r2_score: ', adjusted_r2(r2_score(y_test, y_pred), y_test, x_test))

Training score:  0.6961901545015534 
Testing Score:  0.7029045741917492 
Adjuested r2_score:  0.6911771231730025


In [57]:
!pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable
Collecting statsmodels
  Downloading statsmodels-0.11.1-cp37-none-win_amd64.whl (8.2 MB)
Collecting patsy>=0.5
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.11.1


In [59]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [63]:
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif.round(2)
vif

Unnamed: 0,VIF Factor,features
0,10.589025,cylinders
1,19.722593,displacement
2,8.661728,horsepower
3,10.445886,weight
4,2.507414,acceleration
5,1.239368,age


In [64]:
X3 = X.drop(['cylinders', 'displacement', 'weight'], axis=1)

In [None]:
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X3.values, i) for i in range(X.shape[1])]
vif['features'] = X3.columns
vif.round(2)
vif