In [96]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestClassifier

In [97]:
df = pd.read_csv("https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv")

In [98]:
df.head(2)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [99]:
df.drop("model", axis=1, inplace=True)

In [100]:
#pd.get_dummies(df.mod
df.head(2)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 11 columns):
mpg     32 non-null float64
cyl     32 non-null int64
disp    32 non-null float64
hp      32 non-null int64
drat    32 non-null float64
wt      32 non-null float64
qsec    32 non-null float64
vs      32 non-null int64
am      32 non-null int64
gear    32 non-null int64
carb    32 non-null int64
dtypes: float64(5), int64(6)
memory usage: 2.8 KB


In [102]:
df.columns

Index(['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear',
       'carb'],
      dtype='object')

In [103]:
print(df.nunique())
print(df.groupby("cyl").mpg.count())

mpg     25
cyl      3
disp    27
hp      22
drat    22
wt      29
qsec    30
vs       2
am       2
gear     3
carb     6
dtype: int64
cyl
4    11
6     7
8    14
Name: mpg, dtype: int64


In [104]:
y = "hp"
X = [x for x in df.columns if x != y]

X_train, X_test, y_train, y_test = train_test_split(df[X], df[y], test_size=0.30, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(22, 10) (10, 10) (22,) (10,)


In [105]:
reg = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = reg.predict(X_test)
yh  = [x for x in zip(y_test, y_pred)]
print(yh)
rootMeanSquaredError = sqrt(mean_squared_error(y_test, y_pred))
print(rootMeanSquaredError)

[(175, 164.89320493372372), (215, 220.36408411244835), (175, 192.27685132386171), (66, 53.937477566656639), (95, 1.8273192481138949), (123, 147.79206995634917), (335, 251.61713672838761), (66, 71.411240041774022), (180, 159.23445944053344), (110, 142.44530365932087)]
42.82125491580876


## Variable Importance

In [106]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
df_X = df[X].copy()
df_X['randomVar'] = np.random.randint(1, 6, df_X.shape[0])
clf = clf.fit(df_X, df[y])
features = pd.DataFrame()
features['feature'] = df_X.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features = features.sort_values(by="importance", ascending=False).reset_index(drop=False)
features

Unnamed: 0,feature,importance
0,mpg,0.166527
1,drat,0.147681
2,qsec,0.133459
3,wt,0.131451
4,disp,0.126264
5,carb,0.084156
6,randomVar,0.066386
7,cyl,0.047771
8,gear,0.045335
9,vs,0.029004


In [107]:
randomVarIndex = features[features.feature=="randomVar"].index.values[0]

In [108]:
feat_positive = list(features[features.index < randomVarIndex].feature.values)
feat_positive

['mpg', 'drat', 'qsec', 'wt', 'disp', 'carb']

In [109]:
reg = LinearRegression(fit_intercept=True).fit(X_train[feat_positive], y_train)
y_pred = reg.predict(X_test[feat_positive])
yh  = [x for x in zip(y_test, map(int, y_pred))]
print(yh)
rootMeanSquaredError = sqrt(mean_squared_error(y_test, y_pred))
print(rootMeanSquaredError)

[(175, 176), (215, 224), (175, 193), (66, 46), (95, 47), (123, 130), (335, 255), (66, 66), (180, 162), (110, 152)]
34.080896886757586


In [110]:
# Compare variable importance with predictive capacity of each var with intercept, Mean RMSE with train-test loop 