### Code Snippets for Feature Importance

Please rename your variables to match the names below:
* x_train = dataframe with final training features
* y_train = series with final training targets
* x_test = dataframe with final test features
* y_test = series with final test target data
* best_model = model with hyperparameters selected by ML pipeline
* best_model_poly = polynomial degree selected by the ML pipeline for the best model

In [None]:
# Fit the best model.
# This step may already have been taken care of elsewhere in the pipeline;
# if so, skip the fitting and just rename your objects to match the schema above.

from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures

# Placeholder: replace with the polynomial degree selected by the ML pipeline.
best_model_poly = 2

# Expand the raw features into polynomial terms. The expansion is fitted on
# the training features only and then re-applied unchanged to the test features.
poly = PolynomialFeatures(degree=best_model_poly)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

# Placeholder: replace with the estimator selected by the ML pipeline.
best_model = linear_model.LinearRegression()
best_model.fit(x_train_poly, y_train)

# Test-set predictions; the t-score cell further down consumes these.
y_hat = best_model.predict(x_test_poly)


In [None]:
# plot feature importance

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever one this installation provides.
get_names = getattr(poly, "get_feature_names_out", None) or poly.get_feature_names

# One row ("coef") with one column per polynomial feature.
features = pd.DataFrame([best_model.coef_], columns=get_names(x_train.columns), index=["coef"])

# Rank the coefficients from most positive to most negative.
ranked = features.T.sort_values(by="coef", ascending=False)

# Keep the five most positive and the five most negative coefficients.
# With ten or fewer features head/tail would overlap and duplicate bars,
# so in that case every feature is plotted once.
if len(ranked) > 10:
    ranked = pd.concat((ranked.head(5), ranked.tail(5)))

# Back to wide form (one column per feature) so barplot draws one bar each.
top_ten = ranked.T


sns.set(style='white', rc={'figure.figsize':(20, 8)})

ax = sns.barplot(data=top_ten, palette="coolwarm_r")

plt.title('Top Ten Most Important Features by Magnitude \n', fontsize=20)
plt.ylabel('Coefficient Magnitude', fontsize=18)
plt.xlabel("")

plt.xticks(rotation=45, fontsize=16)
plt.yticks(fontsize=14)

# plt.show() takes no positional arguments; it renders the current figure.
plt.show()

In [None]:
# define t-scores

def t_scores(y_hat, y, x_test, model):
    '''
    Compute t-scores for the coefficients of a fitted linear model.

    Based on https://gist.github.com/brentp/5355925
    Accessed 4/30/2020.

    Uses the formulas (X is ``x_test`` with n rows and p columns):

        mse     = sum((y_hat - y)^2) / (n - p)
        std err = sqrt(diag(mse * (X^T X)^-1))
        t       = (coef - 0) / std err

    Parameters
    ----------
    y_hat : array-like
        Model predictions, one per row of ``x_test``.
    y : array-like
        Observed targets in the same row order as ``y_hat``. Values are
        paired by position; any pandas index is ignored.
    x_test : array-like of shape (n, p)
        Design matrix whose columns line up with ``model.coef_``.
    model : object
        Fitted estimator exposing a ``coef_`` attribute.

    Returns
    -------
    numpy.ndarray
        ``model.coef_`` divided by the standard errors; the standard errors
        are held in an array of shape (1, p).
    '''
    # Work on plain arrays so that pandas index alignment cannot pair the
    # wrong rows (or inject NaNs that a skip-NaN sum would silently drop).
    design = np.asarray(x_test, dtype=float)
    residuals = np.asarray(y_hat) - np.asarray(y)

    n_obs, n_params = design.shape

    # Residual variance estimate with n - p degrees of freedom.
    mse = np.sum(residuals ** 2, axis=0) / float(n_obs - n_params)

    # The pseudo-inverse matches the ordinary inverse for a full-rank design
    # but does not raise when X^T X is singular, which collinear polynomial
    # features can easily cause.
    xtx_inv = np.linalg.pinv(np.dot(design.T, design))
    se = np.array([np.sqrt(np.diagonal(mse * xtx_inv))])

    t = model.coef_ / se

    return t


# get scores
t_vals = t_scores(y_hat, y_test, x_test_poly, best_model)

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever one this installation provides.
get_names = getattr(poly, "get_feature_names_out", None) or poly.get_feature_names

# Coefficients labelled with their polynomial feature names.
x_test_poly_df = pd.DataFrame([best_model.coef_], columns=get_names(x_train.columns))


# One row per polynomial feature: its name, its t-score and a significance flag.
# NOTE: the "coef" column holds the feature *name*, not the coefficient value;
# the key is kept as-is because the plot below refers to it.
t_score_df = pd.DataFrame({"coef": list(x_test_poly_df.columns),
                           "t_score": t_vals[0]})

# |t| > 2 is the usual rule-of-thumb cut-off (roughly the 5% two-sided level
# for large samples).
t_score_df["sig"] = t_score_df["t_score"].abs() > 2.0


# visualize top ten: the five largest and five smallest significant t-scores
significant = t_score_df[t_score_df["sig"]].sort_values(by="t_score", ascending=False)

# With ten or fewer significant features head/tail would overlap and draw
# duplicate bars, so in that case every significant feature is plotted once.
if len(significant) > 10:
    top_ten_t_scores = pd.concat((significant.head(), significant.tail()))
else:
    top_ten_t_scores = significant

sns.set(style='white', rc={'figure.figsize':(20, 8)})

ax = sns.barplot(x="coef", y="t_score", data=top_ten_t_scores,
                 palette="coolwarm_r")

plt.title('Top Ten Most Significant Features \n', fontsize=20)
plt.ylabel('t-score value', fontsize=18)
plt.xlabel("")

plt.xticks(rotation=45, fontsize=16)
plt.yticks(fontsize=14)

# plt.show() takes no positional arguments; it renders the current figure.
plt.show()