# Feature Selection
- Numerical vs. Target (Correlation)
- Numerical vs. Numerical (Correlation)
- Categorical vs. Target (Chi-Squared)
- Categorical vs. Categorical (Chi-Squared)

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline


from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, KFold

In [5]:
# Read the four train/test CSV splits from ../data, in the same order as the
# names on the left-hand side.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train_prepared.csv',
        '../data/y_train.csv',
        '../data/X_test_prepared.csv',
        '../data/y_test.csv',
    )
)

In [6]:
# Preview the first rows of the training features.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, which looks
# like a saved row index being read back as a feature -- confirm.
X_train.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,...,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,missing_marital_status,missing_education_level,missing_income_category
0,55,M,2,Graduate,Married,$120K +,Blue,50,3,2,...,1536,9380.0,1.317,1592,34,1.0,0.141,0,0,0
1,32,M,0,Graduate,Single,,Blue,27,6,3,...,0,9238.0,0.809,2522,68,0.478,0.0,0,0,1
2,35,F,4,Uneducated,Married,,Blue,25,2,3,...,587,2792.0,0.67,5121,80,0.702,0.174,0,0,1
3,40,F,2,,Married,Less than $40K,Blue,36,3,1,...,2376,2717.0,0.822,2341,57,0.541,0.467,0,1,0
4,45,F,4,High School,Married,$40K - $60K,Blue,36,5,2,...,2232,716.0,0.858,3635,79,0.58,0.757,0,0,0


In [5]:
# gender (1 column), one hot encode, drop_first
# card_category (1 column), label encode (Blue=0, Silver=1, Gold=2, Platinum=3)
# marital_status (2 columns), one hot encode, drop_first
# income_category (1 column), label encode
# education_level (1 column), label encode

def categorical_to_numerical(df):
    """Encode the categorical customer columns of ``df`` as numbers.

    * ``Gender`` and, when present, ``Marital_Status`` are nominal: they are
      one-hot encoded with ``drop_first=True`` and the source columns removed.
      The dummy columns are appended after the remaining original columns.
    * ``Card_Category`` (when present), ``Education_Level`` and
      ``Income_Category`` are ordinal: each label is mapped to an integer
      rank.  Values not found in a mapping (including missing values)
      become ``NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Gender``, ``Education_Level`` and ``Income_Category``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If a required column is missing (e.g. when called a second time on
        an already-encoded frame, which no longer has ``Gender``).
    """
    # nominal: one hot encode.  Always hand get_dummies a DataFrame (never a
    # bare Series) so the dummy columns carry the "<column>_<level>" prefix,
    # e.g. ``Gender_M``, whether or not Marital_Status is present.
    nominal_cols = ['Gender']
    if 'Marital_Status' in df.columns:
        nominal_cols.append('Marital_Status')
    df_dummies = pd.get_dummies(df[nominal_cols], drop_first=True)
    df = pd.concat([df.drop(columns=nominal_cols), df_dummies], axis=1)

    # ordinal: label encode
    if 'Card_Category' in df.columns:
        card_mapping = {'Blue' : 0, 'Silver' : 1, 'Gold' : 2, 'Platinum' : 3}
        df['Card_Category'] = df['Card_Category'].map(card_mapping)

    edu_mapping = {'Uneducated' : 0, 'High School' : 1, 'College' : 2, 'Graduate' : 3, 'Post-Graduate' : 4, 'Doctorate' : 5}
    df['Education_Level'] = df['Education_Level'].map(edu_mapping)

    inc_mapping = {'Less than $40K' : 0, '$40K - $60K' : 1, '$60K - $80K' : 2, '$80K - $120K' : 3, '$120K +' : 4}
    df['Income_Category'] = df['Income_Category'].map(inc_mapping)
    return df

In [6]:
# Replace X_train with its numerically encoded version.
# Not idempotent: the encoder removes 'Gender', so running this cell a second
# time on the already-encoded frame raises KeyError.
X_train = categorical_to_numerical(X_train)

In [17]:
# Replace X_test with its numerically encoded version (same encoder as the
# training features). Not idempotent: a second run raises KeyError on 'Gender'.
X_test = categorical_to_numerical(X_test)

In [11]:
def drop_features(df):
    """Remove, in place, the columns excluded by feature selection.

    Columns from the exclusion list that are not present in ``df`` are
    skipped rather than raising ``KeyError``; the encoder treats
    ``Marital_Status`` and ``Card_Category`` as optional, so their derived
    columns may legitimately be absent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place.

    Returns
    -------
    None
    """
    excluded_columns = [
        'Marital_Status_Single',
        'Gender_M',
        'Marital_Status_Married',
        'Card_Category',
        'Dependent_count',
        'missing_marital_status',
        'missing_education_level',
        'missing_income_category',
        'Contacts_Count_12_mon',
        'Credit_Limit',
        'Months_on_book',
        'Avg_Open_To_Buy',
        'Total_Trans_Amt',
        'Avg_Utilization_Ratio',
        'Total_Amt_Chng_Q4_Q1',
        'Total_Relationship_Count',
    ]
    df.drop(columns=excluded_columns, inplace=True, errors='ignore')

`Total_Trans_Ct`: This variable explains approximately 14% of the variation of bank attrition. Surprisingly, `Total_Trans_Amt` has a much lower percentage of variation of bank attrition explained (approximately 5%) even though both variables are highly correlated with each other. Therefore, we will keep `Total_Trans_Ct` and drop `Total_Trans_Amt`.

The other variables not included in the first list do not explain a large percentage of variation in churn so we will drop them.

**3.4.2 Key Findings**

`Credit_Limit`,`Avg_Open_To_Buy`,0.872472: Since Avg_Open_To_Buy is part of the equation for Credit_Limit, this correlation is not surprising. We will drop both features since neither is highly correlated with the target variable.

`Total_Trans_Ct`,`Total_Trans_Amt`,0.771498: Since Total_Trans_Ct is more correlated with the target variable, we will keep it and drop Total_Trans_Amt.

`Customer_Age`,`Months_on_book`,0.591667: We will drop Months_on_book.

`Total_Revolving_Bal`,`Avg_Utilization_Ratio`,0.507283: We will just keep Total_Revolving_Bal since it is used in the equation to find Avg_Utilization_Ratio and it is more correlated with the target variable.

`Avg_Open_To_Buy`,`Avg_Utilization_Ratio`,0.459306: We will drop both features because Avg_Open_To_Buy isn't highly correlated with the target variable and Avg_Utilization_Ratio is correlated with another feature that is highly correlated with the target.

In [9]:
# Build the feature-selected training frame on a copy, because drop_features
# mutates its argument in place and X_train must keep every column.
X_train_fs= X_train.copy()
drop_features(X_train_fs)
X_train_fs.head()

In [18]:
# Build the feature-selected test frame on a copy, because drop_features
# mutates its argument in place and X_test must keep every column.
X_test_fs= X_test.copy()
drop_features(X_test_fs)

Unnamed: 0,Customer_Age,Education_Level,Income_Category,Months_Inactive_12_mon,Total_Revolving_Bal,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1
0,44,0.0,2.0,2,1597,59,0.903
1,44,,,3,0,60,0.538
2,37,1.0,0.0,1,2216,67,0.489
3,34,3.0,1.0,2,1285,33,1.2
4,51,1.0,0.0,4,2071,71,0.972


# Feature Scaling

We will perform normalization.

In [13]:
# Min-max scale the features. Each scaler is fit on the TRAINING data only and
# then applied to the matching test frame, so the test sets saved and scored
# later in the notebook are on the same scale the models were trained on.
# Test values outside the training min/max fall outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
# Select the training columns explicitly so the column order matches the fit.
X_test = pd.DataFrame(scaler.transform(X_test[X_train.columns]), columns=X_train.columns, index=X_test.index)

scaler_fs = MinMaxScaler()
X_train_fs = pd.DataFrame(scaler_fs.fit_transform(X_train_fs.values), columns=X_train_fs.columns, index=X_train_fs.index)
X_test_fs = pd.DataFrame(scaler_fs.transform(X_test_fs[X_train_fs.columns].values), columns=X_train_fs.columns, index=X_test_fs.index)

In [14]:
def fit_df(scaler_type, df):
    """Fit ``scaler_type`` on ``df`` and return the scaled data as a DataFrame.

    ``scaler_type`` is any object with a ``fit_transform`` method; it receives
    ``df.values``. The result keeps the column labels and index of ``df``.
    """
    scaled_values = scaler_type.fit_transform(df.values)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [19]:
# Write the four feature frames to CSV in the working directory, without the
# index column.
for frame, csv_name in (
    (X_train, 'X_train_scaled.csv'),
    (X_train_fs, 'X_train_fs_scaled.csv'),
    (X_test_fs, 'X_test_fs_scaled.csv'),
    (X_test, 'X_test_scaled.csv'),
):
    frame.to_csv(csv_name, index=False)

# Model Building

### Important Metrics:

We care most about recall because, as the credit card company, we are most concerned with identifying customers who are likely to attrite. False positives are customers we predict will attrite but who actually stay. False negatives are customers we predict will stay but who actually leave. We care more about minimizing false negatives because they are more costly. We therefore want recall for the churn class to be high: a large proportion of the customers who truly churn should be classified as positive, and only a small proportion as negative.

In [None]:
def plot_precision_recall_curve(y_train_alg,y_pred_alg, alg):
    """Plot one model's precision-recall curve and mark its best-F2 threshold.

    Parameters
    ----------
    y_train_alg : array-like
        True binary labels (1 = positive class).
    y_pred_alg : array-like
        Predicted scores/probabilities for the positive class.
    alg : str
        Model name, printed and used as the legend label.

    Prints the model name, the threshold that maximises the F2 score and that
    score, then shows a figure with the curve, a "No Skill" baseline (the
    positive-class rate of ``y_train_alg``) and a marker at the best point.
    Returns None.
    """
    # calculate p-r curves
    precision, recall, thresholds = precision_recall_curve(y_train_alg, y_pred_alg)
    # convert to F-beta with beta=2 (recall weighted above precision):
    # F2 = 5*P*R / (4*P + R). Points where P = R = 0 score 0 instead of NaN.
    denominator = 4 * precision + recall
    fscore = np.divide(5 * precision * recall, denominator,
                       out=np.zeros_like(denominator, dtype=float),
                       where=denominator > 0)
    # locate the index of the largest F2 score. `thresholds` has one entry
    # fewer than precision/recall (the final curve point has no threshold),
    # so the search excludes that last point to keep `thresholds[ix]` valid.
    ix = int(np.argmax(fscore[:-1]))
    print(alg)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    # baseline: precision of a classifier that flags everyone, i.e. the
    # positive rate of the labels that were passed in. np.ravel makes this
    # work for Series, arrays and single-column DataFrames alike.
    no_skill = float(np.mean(np.ravel(y_train_alg) == 1))
    # plot the precision-recall curve for the model
    plt.plot(recall, precision, marker='.', label=alg)
    plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
    plt.scatter(recall[ix], precision[ix], marker='o', color='black', label=alg+' Best',zorder=10)
    # axis labels
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    # show the plot
    plt.show()


In [None]:
# Shared 5-fold splitter passed as `cv=` to the randomized searches below.
# shuffle=False keeps the rows in file order, so the folds are deterministic.
kf = KFold(n_splits=5,shuffle=False)

In [None]:
### Logistic Regression

In [None]:
# Exhaustive 10-fold grid search over C and penalty for a plain
# LogisticRegression (default scoring, reported below as accuracy).
# NOTE(review): X_train_res / y_train_res are not assigned anywhere in the
# visible notebook -- presumably a SMOTE-resampled training set left over from
# a deleted cell; confirm, or this cell fails on a fresh kernel.
# NOTE(review): 'l1' and 'elasticnet' are presumably not supported by
# LogisticRegression's default solver -- confirm those candidates fit at all.
grid={"C":np.logspace(-3,3,7), "penalty":['l1','l2','elasticnet']}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(X_train_res,y_train_res)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
# Recall-scored randomized search for SMOTE + LogisticRegression on the full
# feature set. SMOTE is a pipeline step, so it is applied when the pipeline is
# fit on each fold's training part rather than to the whole data set up front.
random_grid={"C":np.logspace(-3,3,7), "penalty":['l1','l2','elasticnet']}# l1 lasso l2 ridge
# `make_pipeline` is the imblearn.pipeline version imported at the top of the
# notebook; the bare module name `imblearn` is never imported, so the previous
# `imblearn.pipeline.make_pipeline(...)` spelling raised NameError.
imba_pipeline = make_pipeline(SMOTE(),
                              LogisticRegression())
# Prefix each key with the pipeline step name so the search can route it.
new_params = {'logisticregression__' + key: random_grid[key] for key in random_grid}
grid_imba = RandomizedSearchCV(imba_pipeline, param_distributions=new_params, cv=kf, scoring='recall',
                        return_train_score=True, n_iter = 100, verbose=2, n_jobs = -1)
grid_imba.fit(X_train, y_train);
# grid_imba.best_params_
# {'logisticregression__penalty': 'l2', 'logisticregression__C': 1.0}

In [None]:
# Refit SMOTE + LogisticRegression with the selected hyper-parameters on the
# full feature set and plot its precision-recall curve.
# Note: the curve is computed on the same rows the model was fit on.
logreg_best=LogisticRegression(C=1.0,penalty="l2")
# Use the imported imblearn `make_pipeline`; the bare name `imblearn` is never
# imported, so `imblearn.pipeline.make_pipeline(...)` raised NameError.
logreg_pipeline = make_pipeline(SMOTE(),
                              logreg_best)
logreg_pipeline.fit(X_train,y_train)
# Column 1 of predict_proba is the probability of the positive class.
y_pred_logreg = logreg_pipeline.predict_proba(X_train)[:, 1]

plot_precision_recall_curve(y_train,y_pred_logreg,'Logistic Regression')

In [None]:
## Feature Selection Logistic Regression

In [None]:
# Recall-scored randomized search for SMOTE + LogisticRegression on the
# feature-selected training set.
random_grid={"C":np.logspace(-3,3,7), "penalty":['l1','l2','elasticnet']}# l1 lasso l2 ridge
# `make_pipeline` is the imblearn.pipeline version imported at the top of the
# notebook; the bare module name `imblearn` is never imported, so the previous
# `imblearn.pipeline.make_pipeline(...)` spelling raised NameError.
imba_pipeline_fs = make_pipeline(SMOTE(),
                              LogisticRegression())
# Prefix each key with the pipeline step name so the search can route it.
new_params = {'logisticregression__' + key: random_grid[key] for key in random_grid}
grid_imba_fs = RandomizedSearchCV(imba_pipeline_fs, param_distributions=new_params, cv=kf, scoring='recall',
                        return_train_score=True, n_iter = 100, verbose=2, n_jobs = -1)
grid_imba_fs.fit(X_train_fs, y_train);
# NOTE(review): the recorded result below names `grid_imba` and C=1.0, which
# looks copied from the full-feature search -- confirm against
# grid_imba_fs.best_params_.
# grid_imba.best_params_
# {'logisticregression__penalty': 'l2', 'logisticregression__C': 1.0}

In [None]:
# Refit SMOTE + LogisticRegression on the feature-selected training set and
# plot its precision-recall curve.
# Note: the curve is computed on the same rows the model was fit on.
logreg_best_fs=LogisticRegression(C=0.1,penalty="l2")
# Use the imported imblearn `make_pipeline`; the bare name `imblearn` is never
# imported, so `imblearn.pipeline.make_pipeline(...)` raised NameError.
logreg_pipeline_fs = make_pipeline(SMOTE(),
                              logreg_best_fs)
logreg_pipeline_fs.fit(X_train_fs,y_train)
# Column 1 of predict_proba is the probability of the positive class.
y_pred_logreg_fs = logreg_pipeline_fs.predict_proba(X_train_fs)[:, 1]

plot_precision_recall_curve(y_train,y_pred_logreg_fs,'Logistic Regression')

In [None]:
## Gradient Boost

In [None]:
# Candidate values for the GradientBoostingClassifier randomized search.
# NOTE(review): 'deviance', 'auto' and 'mse' are presumably legacy spellings in
# scikit-learn -- confirm the installed version still accepts them.
# Loss function to optimise
loss = ['deviance','exponential']
# Shrinkage applied to each tree's contribution
learning_rate = [.001,.01,.1]
# Number of boosting stages (trees)
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50,5)]
# Minimum number of samples required to split a node
min_samples_split = [20,30,50,100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [20,30,50,100]
# Create the random grid
random_grid = {'loss': loss,
               'learning_rate':learning_rate,
               'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['friedman_mse', 'mse']}

In [None]:
# Recall-scored randomized search for SMOTE + gradient boosting on the full
# feature set, using the shared `kf` folds.
imba_pipeline = make_pipeline(SMOTE(), GradientBoostingClassifier())
# Prefix every grid key with the pipeline step name.
new_params = {
    'gradientboostingclassifier__' + name: candidates
    for name, candidates in random_grid.items()
}
grid_imba = RandomizedSearchCV(
    imba_pipeline,
    param_distributions=new_params,
    cv=kf,
    scoring='recall',
    return_train_score=True,
)
grid_imba.fit(X_train, y_train);

# Recorded result of grid_imba.best_params_:
# {'gradientboostingclassifier__n_estimators': 200,
#  'gradientboostingclassifier__min_samples_split': 30,
#  'gradientboostingclassifier__min_samples_leaf': 50,
#  'gradientboostingclassifier__max_features': 'auto',
#  'gradientboostingclassifier__max_depth': 30,
#  'gradientboostingclassifier__loss': 'deviance',
#  'gradientboostingclassifier__learning_rate': 0.1,
#  'gradientboostingclassifier__criterion': 'friedman_mse'}

In [None]:
# Refit SMOTE + gradient boosting with fixed hyper-parameters on the full
# feature set, then plot the precision-recall curve on the training rows.
gboost_best = GradientBoostingClassifier(
    n_estimators=200,
    min_samples_split=30,
    min_samples_leaf=50,
    max_features='auto',
    max_depth=30,
    loss='deviance',
    learning_rate=.1,
    criterion='friedman_mse',
)

imba_pipeline = make_pipeline(SMOTE(), gboost_best)
imba_pipeline.fit(X_train, y_train)
# Keep only the positive-class probability column.
y_pred_gboost = imba_pipeline.predict_proba(X_train)[:, 1]

plot_precision_recall_curve(y_train, y_pred_gboost, 'GBoost')

In [None]:
# Recall-scored randomized search for SMOTE + gradient boosting on the
# feature-selected training set.
imba_pipeline = make_pipeline(SMOTE(), GradientBoostingClassifier())
# Prefix every grid key with the pipeline step name.
new_params = {
    'gradientboostingclassifier__' + name: candidates
    for name, candidates in random_grid.items()
}
grid_imba_fs = RandomizedSearchCV(
    imba_pipeline,
    param_distributions=new_params,
    cv=kf,
    scoring='recall',
    return_train_score=True,
)
grid_imba_fs.fit(X_train_fs, y_train);

In [None]:
# Refit SMOTE + gradient boosting on the feature-selected training set and
# plot its precision-recall curve on the training rows.
gboost_best_fs = GradientBoostingClassifier(n_estimators=200,min_samples_split=30,min_samples_leaf=50,max_features='auto',max_depth=30,loss='deviance',learning_rate=.1,criterion='friedman_mse')

imba_pipeline_fs = make_pipeline(SMOTE(),
                              gboost_best_fs)
# Fit on X_train_fs: this pipeline predicts on X_train_fs below, so fitting on
# the full-feature X_train (as before) left the two column sets mismatched.
imba_pipeline_fs.fit(X_train_fs, y_train)
# Column 1 of predict_proba is the probability of the positive class.
y_pred_gboost_fs = imba_pipeline_fs.predict_proba(X_train_fs)[:,1]

plot_precision_recall_curve(y_train,y_pred_gboost_fs,'GBoost')

In [None]:
### ADABoost

In [None]:
# Candidate values for the AdaBoostClassifier randomized search.
# Number of boosting rounds
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 5)]
# Weak learner. None (the Python object, not the string 'none') asks AdaBoost
# for its built-in default learner; the string is not an estimator and makes
# every candidate that draws it fail to fit.
base_estimator = [None,LogisticRegression()]
# Weight applied to each weak learner's contribution
learning_rate = [0.0001, 0.001, 0.01, 0.1, 1.0]
# Boosting algorithm variant
algorithm = ['SAMME', 'SAMME.R']
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'base_estimator': base_estimator,
               'learning_rate': learning_rate,
              'algorithm': algorithm}

# Recall-scored randomized search for SMOTE + AdaBoost on the full feature set.
imba_pipeline = make_pipeline(SMOTE(),
                              AdaBoostClassifier())
# Prefix each key with the pipeline step name so the search can route it.
new_params = {'adaboostclassifier__' + key: random_grid[key] for key in random_grid}
grid_imba = RandomizedSearchCV(imba_pipeline, param_distributions=new_params, cv=kf, scoring='recall',
                        return_train_score=True)
grid_imba.fit(X_train,y_train)
grid_imba.best_params_

# {'n_estimators': 300,
#  'learning_rate': 0.1,
#  'base_estimator': LogisticRegression(),
#  'algorithm': 'SAMME.R'}

In [None]:
# Refit SMOTE + AdaBoost (logistic-regression weak learner) on the full
# feature set and plot its precision-recall curve on the training rows.
# NOTE(review): learning_rate=1.0 / algorithm='SAMME' differ from the best
# parameters recorded in the search cell (0.1 / 'SAMME.R') -- confirm which
# setting is intended.
ada_best = AdaBoostClassifier(n_estimators=300,base_estimator=LogisticRegression(),learning_rate=1.0,algorithm='SAMME')
ada_pipeline = make_pipeline(SMOTE(),
                              ada_best)
ada_pipeline.fit(X_train, y_train)
# Predict through the fitted pipeline, as every other model cell does,
# instead of reaching into the inner estimator.
y_pred_ada = ada_pipeline.predict_proba(X_train)[:,1]

plot_precision_recall_curve(y_train,y_pred_ada,'ADA Boost')

In [None]:
# Recall-scored randomized search for SMOTE + AdaBoost on the feature-selected
# training set.
imba_pipeline = make_pipeline(SMOTE(), AdaBoostClassifier())
# Prefix every grid key with the pipeline step name.
new_params = {
    'adaboostclassifier__' + name: candidates
    for name, candidates in random_grid.items()
}
grid_imba_fs = RandomizedSearchCV(
    imba_pipeline,
    param_distributions=new_params,
    cv=kf,
    scoring='recall',
    return_train_score=True,
)
grid_imba_fs.fit(X_train_fs, y_train)
grid_imba_fs.best_params_

# Recorded result:
# {'adaboostclassifier__n_estimators': 250,
#  'adaboostclassifier__learning_rate': 0.01,
#  'adaboostclassifier__base_estimator': LogisticRegression(),
#  'adaboostclassifier__algorithm': 'SAMME'}

In [None]:
# Refit SMOTE + AdaBoost (logistic-regression weak learner) on the
# feature-selected training set and plot its precision-recall curve on the
# training rows.
ada_best_fs = AdaBoostClassifier(n_estimators=250,base_estimator=LogisticRegression(),learning_rate=.01,algorithm='SAMME')
ada_pipeline_fs = make_pipeline(SMOTE(),
                              ada_best_fs)
ada_pipeline_fs.fit(X_train_fs, y_train)
# Predict through the fitted pipeline, as every other model cell does,
# instead of reaching into the inner estimator.
y_pred_ada_fs = ada_pipeline_fs.predict_proba(X_train_fs)[:,1]

plot_precision_recall_curve(y_train,y_pred_ada_fs,'ADA Boost')

In [None]:
### RandomForestClassifier

In [None]:
# Candidate values for the RandomForestClassifier randomized search.
# NOTE(review): 'auto' is presumably a legacy max_features spelling in
# scikit-learn -- confirm the installed version still accepts it.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50,5)]
# Minimum number of samples required to split a node
min_samples_split = [20,30,50,100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [20,30,50,100]
# Create the random grid (this rebinds the random_grid name used by the
# earlier search cells)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

In [None]:
# Recall-scored randomized search for SMOTE + random forest on the full
# feature set, using the shared `kf` folds.
imba_pipeline = make_pipeline(SMOTE(), RandomForestClassifier())
# Prefix every grid key with the pipeline step name.
new_params = {
    'randomforestclassifier__' + name: candidates
    for name, candidates in random_grid.items()
}
grid_imba = RandomizedSearchCV(
    imba_pipeline,
    param_distributions=new_params,
    cv=kf,
    scoring='recall',
    return_train_score=True,
)
grid_imba.fit(X_train, y_train);

# Recorded result of grid_imba.best_params_:
# {'randomforestclassifier__n_estimators': 200,
#  'randomforestclassifier__min_samples_split': 30,
#  'randomforestclassifier__min_samples_leaf': 20,
#  'randomforestclassifier__max_features': 'log2',
#  'randomforestclassifier__max_depth': 50,
#  'randomforestclassifier__criterion': 'gini'}

In [None]:
# Refit SMOTE + random forest with fixed hyper-parameters on the full feature
# set, then plot the precision-recall curve on the training rows.
rf_best = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=30,
    min_samples_leaf=20,
    max_features='log2',
    max_depth=50,
    criterion='gini',
)
rf_pipeline = make_pipeline(SMOTE(), rf_best)
rf_pipeline.fit(X_train, y_train)

# Keep only the positive-class probability column.
y_pred_rf = rf_pipeline.predict_proba(X_train)[:, 1]
plot_precision_recall_curve(y_train, y_pred_rf, 'Random Forest')

In [None]:
# Recall-scored randomized search for SMOTE + random forest on the
# feature-selected training set.
imba_pipeline = make_pipeline(SMOTE(), RandomForestClassifier())
# Prefix every grid key with the pipeline step name.
new_params = {
    'randomforestclassifier__' + name: candidates
    for name, candidates in random_grid.items()
}
grid_imba_fs = RandomizedSearchCV(
    imba_pipeline,
    param_distributions=new_params,
    cv=kf,
    scoring='recall',
    return_train_score=True,
)
grid_imba_fs.fit(X_train_fs, y_train);


In [None]:
# Refit SMOTE + random forest on the feature-selected training set and plot
# its precision-recall curve on the training rows.
# Bound to `rf_best_fs` (matching logreg_best_fs / gboost_best_fs /
# ada_best_fs) so it no longer overwrites `rf_best`, the forest fit on the
# full feature set that is later scored on the full-feature X_test.
rf_best_fs = RandomForestClassifier(n_estimators=250,min_samples_split=50,min_samples_leaf=50,max_features='log2',max_depth=20,criterion='entropy')
rf_pipeline_fs = make_pipeline(SMOTE(), rf_best_fs)
rf_pipeline_fs.fit(X_train_fs,y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_pred_rf_fs = rf_pipeline_fs.predict_proba(X_train_fs)[:,1]
plot_precision_recall_curve(y_train,y_pred_rf_fs,'Random Forest')

In [None]:
# No Feature Selection Precision_Recall Curves

In [None]:
# Overlay the training-set precision-recall curves of the four models fit on
# the full feature set, marking each model's best-F2 point in black.
full_feature_scores = (
    ('Logistic Regression', y_pred_logreg),
    ('Random Forest', y_pred_rf),
    ('ADA Boost', y_pred_ada),
    ('Gradient Boost', y_pred_gboost),
)

# Baseline precision of flagging everyone = positive rate of the labels.
# np.ravel makes this work when y_train is a single-column DataFrame, where
# boolean-mask indexing would not filter any rows.
no_skill = float(np.mean(np.ravel(y_train) == 1))

for position, (alg, y_scores) in enumerate(full_feature_scores):
    # calculate p-r curves
    precision, recall, thresholds = precision_recall_curve(y_train, y_scores)
    # convert to F2 score (recall weighted above precision); 0/0 points score 0
    denominator = 4 * precision + recall
    fscore = np.divide(5 * precision * recall, denominator,
                       out=np.zeros_like(denominator, dtype=float),
                       where=denominator > 0)
    # locate the index of the largest F2 score; the final curve point has no
    # threshold, so it is excluded to keep thresholds[ix] valid
    ix = int(np.argmax(fscore[:-1]))
    print(alg)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    # plot the precision-recall curve for the model
    plt.plot(recall, precision, marker='.', label=alg)
    if position == 0:
        # draw the baseline once, right after the first curve
        plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
    plt.scatter(recall[ix], precision[ix], marker='o', color='black',zorder=10)

# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
# Feature Selection Precision_Recall Curves

In [None]:
# Overlay the training-set precision-recall curves of the models fit on the
# feature-selected data, marking each model's best-F2 point in black.
# (Gradient boosting is not part of this comparison, as before.)
feature_selected_scores = (
    ('Logistic Regression', y_pred_logreg_fs),
    ('Random Forest', y_pred_rf_fs),
    ('ADA Boost', y_pred_ada_fs),
)

# Baseline precision of flagging everyone = positive rate of the labels.
# np.ravel makes this work when y_train is a single-column DataFrame, where
# boolean-mask indexing would not filter any rows.
no_skill = float(np.mean(np.ravel(y_train) == 1))

for position, (alg, y_scores) in enumerate(feature_selected_scores):
    # calculate p-r curves
    precision, recall, thresholds = precision_recall_curve(y_train, y_scores)
    # convert to F2 score (recall weighted above precision); 0/0 points score 0
    denominator = 4 * precision + recall
    fscore = np.divide(5 * precision * recall, denominator,
                       out=np.zeros_like(denominator, dtype=float),
                       where=denominator > 0)
    # locate the index of the largest F2 score; the final curve point has no
    # threshold, so it is excluded to keep thresholds[ix] valid
    ix = int(np.argmax(fscore[:-1]))
    print(alg)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    # plot the precision-recall curve for the model
    plt.plot(recall, precision, marker='.', label=alg)
    if position == 0:
        # draw the baseline once, right after the first curve
        plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
    plt.scatter(recall[ix], precision[ix], marker='o', color='black',zorder=10)

# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
# Final Model

In [None]:
def plt_curves(y_train_alg,y_pred_alg,model):
    """Add one model's precision-recall curve to the current matplotlib axes.

    Parameters
    ----------
    y_train_alg : array-like
        True binary labels (1 = positive class).
    y_pred_alg : array-like
        Predicted scores/probabilities for the positive class.
    model : str
        Model name, printed and used as the legend label.

    Prints the model name with the threshold that maximises the F2 score,
    draws the curve and marks that point in black. Axis labels, the legend
    and ``show()`` are left to the caller so several curves can share one
    figure. Returns None.
    """
    # calculate p-r curves
    precision, recall, thresholds = precision_recall_curve(y_train_alg, y_pred_alg)
    # convert to F2 score (recall weighted above precision):
    # F2 = 5*P*R / (4*P + R). Points where P = R = 0 score 0 instead of NaN.
    denominator = 4 * precision + recall
    fscore = np.divide(5 * precision * recall, denominator,
                       out=np.zeros_like(denominator, dtype=float),
                       where=denominator > 0)
    # locate the index of the largest F2 score; the final curve point has no
    # threshold, so it is excluded to keep thresholds[ix] valid
    ix = int(np.argmax(fscore[:-1]))
    print(model)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    # plot the precision-recall curve for the model
    plt.plot(recall, precision, marker='.', label=model)
    plt.scatter(recall[ix], precision[ix], marker='o', color='black',zorder=10)

In [None]:
# Figure 1: models trained on the feature-selected data.
for scores, label in (
    (y_pred_logreg_fs, 'Logistic Regression FS'),
    (y_pred_ada_fs, 'ADA Boost FS'),
    (y_pred_gboost_fs, 'Gradient Boost FS'),
    (y_pred_rf_fs, 'Random Forest FS'),
):
    plt_curves(y_train, scores, label)

# axis labels (via `plt`, the alias matplotlib.pyplot is imported under; the
# bare name `pyplot` is never imported)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

# Figure 2: models trained on the full feature set.
for scores, label in (
    (y_pred_logreg, 'Logistic Regression'),
    (y_pred_ada, 'ADA Boost'),
    (y_pred_gboost, 'Gradient Boost'),
    (y_pred_rf, 'Random Forest'),
):
    plt_curves(y_train, scores, label)

# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
# show the plot
plt.show()

In [None]:
## Identifying Key Factors
# Positive-class probabilities from rf_best for the test features.
# NOTE(review): X_test must have exactly the columns (and scaling) that the
# model currently bound to rf_best was fit on -- confirm which fit is live here.
rf_best.predict_proba(X_test)[:,1]

In [None]:
# Precision-recall curve of rf_best on the held-out test set.
# NOTE(review): plt_curves only draws onto the current axes; no axis labels,
# legend or show() call follows in this cell -- confirm that is intended.
plt_curves(y_test,rf_best.predict_proba(X_test)[:,1],'Random Forest')