In [62]:
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv")

In [3]:
df.head(2)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [4]:
df.drop("model", axis=1, inplace=True)

In [5]:
#pd.get_dummies(df.mod
df.head(2)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 11 columns):
mpg     32 non-null float64
cyl     32 non-null int64
disp    32 non-null float64
hp      32 non-null int64
drat    32 non-null float64
wt      32 non-null float64
qsec    32 non-null float64
vs      32 non-null int64
am      32 non-null int64
gear    32 non-null int64
carb    32 non-null int64
dtypes: float64(5), int64(6)
memory usage: 2.8 KB


In [7]:
df.columns

Index(['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear',
       'carb'],
      dtype='object')

In [8]:
print(df.nunique())
print(df.groupby("cyl").mpg.count())

mpg     25
cyl      3
disp    27
hp      22
drat    22
wt      29
qsec    30
vs       2
am       2
gear     3
carb     6
dtype: int64
cyl
4    11
6     7
8    14
Name: mpg, dtype: int64


In [9]:
y = "hp"
X = [x for x in df.columns if x != y]

X_train, X_test, y_train, y_test = train_test_split(df[X], df[y], test_size=0.30, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(22, 10) (10, 10) (22,) (10,)


In [10]:
reg = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = reg.predict(X_test)
yh  = [x for x in zip(y_test, y_pred)]
print(yh)
rootMeanSquaredError = sqrt(mean_squared_error(y_test, y_pred))
print(rootMeanSquaredError)

[(175, 164.89320493372372), (215, 220.36408411244835), (175, 192.27685132386171), (66, 53.937477566656639), (95, 1.8273192481138949), (123, 147.79206995634917), (335, 251.61713672838761), (66, 71.411240041774022), (180, 159.23445944053344), (110, 142.44530365932087)]
42.82125491580876


## Variable Importance

In [11]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
df_X = df[X].copy()
df_X['randomVar'] = np.random.randint(1, 6, df_X.shape[0])
clf = clf.fit(df_X, df[y])
features = pd.DataFrame()
features['feature'] = df_X.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features = features.sort_values(by="importance", ascending=False).reset_index(drop=False)
features

Unnamed: 0,feature,importance
0,wt,0.162954
1,qsec,0.151038
2,disp,0.150997
3,mpg,0.149162
4,drat,0.13695
5,carb,0.087067
6,randomVar,0.064252
7,cyl,0.035814
8,vs,0.0273
9,gear,0.018944


In [12]:
randomVarIndex = features[features.feature=="randomVar"].index.values[0]

In [13]:
feat_positive = list(features[features.index < randomVarIndex].feature.values)
feat_positive

['wt', 'qsec', 'disp', 'mpg', 'drat', 'carb']

In [14]:
reg = LinearRegression(fit_intercept=True).fit(X_train[feat_positive], y_train)
y_pred = reg.predict(X_test[feat_positive])
yh  = [x for x in zip(y_test, map(int, y_pred))]
print(yh)
rootMeanSquaredError = sqrt(mean_squared_error(y_test, y_pred))
print(rootMeanSquaredError)

[(175, 176), (215, 224), (175, 193), (66, 46), (95, 47), (123, 130), (335, 255), (66, 66), (180, 162), (110, 152)]
34.08089688675762


In [15]:
# Compare variable importance with predictive capacity of each var with intercept, Mean RMSE with train-test loop 

## Linear regression brute force eval

In [16]:
y = "hp"
X = [x for x in df.columns if x != y]

In [17]:
X

['mpg', 'cyl', 'disp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']

In [99]:
def split_fit_eval(df, y):
    X = [x for x in df.columns if x != y]
    res = []
    elements = np.arange(2,len(X)+1,1)
    ucombin=[]
    for e in elements:
        ucombin.append(list(itertools.combinations(X, e)))
    comb_flat_list = [list(item) for sublist in ucombin for item in sublist]
    for enum, x in enumerate(comb_flat_list):
        if enum % 100 == 0:
            print(enum)
        rmse = []
        df_X = df[x].copy()
        df_X["intercept"] = 1.0
        for rs in range(10):
            X_train, X_test, y_train, y_test = train_test_split(df_X, df[y], test_size=0.30, random_state=rs)
            reg = LinearRegression(fit_intercept=False).fit(X_train, y_train)
            y_pred = reg.predict(X_test)
            rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
        res.append((x, np.mean(rmse)))
    res = pd.DataFrame(res, columns=["var", "rmse"])
    res = res.sort_values(by="rmse").reset_index(drop=True)
    return res

In [100]:
r = split_fit_eval(df=df, y="hp")

0
100
200
300
400
500
600
700
800
900
1000


In [101]:
r.head(10)

Unnamed: 0,var,rmse
0,"[cyl, disp, qsec, vs]",25.69181
1,"[disp, qsec]",25.888003
2,"[disp, qsec, vs]",26.227388
3,"[cyl, disp, qsec]",26.380458
4,"[cyl, disp, wt, qsec, vs]",26.929386
5,"[cyl, wt, qsec, vs]",27.273564
6,"[disp, drat, qsec]",27.429389
7,"[disp, wt, qsec, vs]",27.501642
8,"[disp, qsec, am]",27.682395
9,"[disp, drat, qsec, vs]",27.765474


In [59]:
features

Unnamed: 0,feature,importance
0,wt,0.162954
1,qsec,0.151038
2,disp,0.150997
3,mpg,0.149162
4,drat,0.13695
5,carb,0.087067
6,randomVar,0.064252
7,cyl,0.035814
8,vs,0.0273
9,gear,0.018944


## Var Importance - Correlations

In [98]:
df_corr = df.corr()
df_corr = df_corr["hp"]
df_corr = df_corr.reset_index(drop=False)
df_corr["hp"] = df_corr["hp"].apply(lambda x : abs(x))
df_corr.sort_values(by="hp", ascending=False).reset_index(drop=True)

Unnamed: 0,index,hp
0,hp,1.0
1,cyl,0.832447
2,disp,0.790949
3,mpg,0.776168
4,carb,0.749812
5,vs,0.723097
6,qsec,0.708223
7,wt,0.658748
8,drat,0.448759
9,am,0.243204
