In [8]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from dataset import Dataset
from feature_selection import get_features

In [9]:
# Build the (preprocessed) dataset and run its preparation step
# (preprocessing and feature engineering) before pulling out the splits.
dataset = Dataset()
dataset.apply_preparation()

# train_set: training set without labels   (train.csv)
# target:    labels for the training set   (train.csv[Deceased])
# test_set:  unlabeled test set            (test.csv)
train_set, target, test_set = dataset.train_data, dataset.target, dataset.test_data

In [10]:
# Scaling
# Fit the chosen scaler on the training set only and apply the same fitted
# transform to both sets, then rebuild DataFrames with the original column
# names and row indexes (the scalers return plain arrays).
SCALERS = {
    "RobustScaler": RobustScaler,
    "MinMaxScaler": MinMaxScaler,
    "StandardScaler": StandardScaler,
}
scale_type = "StandardScaler"  # one of the SCALERS keys, or None to skip scaling

cols = train_set.columns.values
train_idx = train_set.index.values
test_idx = test_set.index.values

if scale_type is not None:
    if scale_type not in SCALERS:
        # A misspelled name used to fall through every branch and silently
        # leave the data unscaled; fail loudly instead.
        raise ValueError(
            "Unknown scale_type %r; expected one of %s or None"
            % (scale_type, sorted(SCALERS))
        )
    scaler = SCALERS[scale_type]().fit(train_set)
    train_set = scaler.transform(train_set)
    test_set = scaler.transform(test_set)

# Restore the index on both frames (previously only the test index was kept,
# so the training rows were renumbered from 0).
train_set = pd.DataFrame(train_set, columns=cols, index=train_idx)
test_set = pd.DataFrame(test_set, columns=cols, index=test_idx)

In [11]:
# Disabled feature-selection runs, one per method (RFE, Boruta, SFS), each
# using model="rf" on the scaled training set.
# NOTE(review): presumably these calls produced the hard-coded `rfe`, `boruta`
# and `sfs` lists in the next cell — confirm before re-enabling or removing.
#rfe = get_features(train_set, target, method="rfe", model="rf", n_features="auto", verbose=1)
#boruta = get_features(train_set, target, method="boruta", model="rf", n_features="auto", verbose=1)
#sfs = get_features(train_set, target, method="sfs", model="rf", n_features="auto", verbose=1)

In [12]:
# Hard-coded feature subsets, one list per feature-selection method.
# Each list is used to select columns from train_set.

# Boruta: 7 features.
boruta = [
    "Birthday_year",
    "Medical_Expenses_Family",
    "Sev_by_gender",
    "Gender_M",
    "spending_vs_severity",
    "spending_family_member",
    "spending_family_severity",
]

# RFE: 8 features.
rfe = [
    "Birthday_year",
    "Medical_Expenses_Family",
    "Sev_by_gender",
    "Gender_M",
    "spending_vs_severity",
    "spending_family_member",
    "severity_against_avg_city",
    "spending_family_severity",
]

# SFS: 8 features.
sfs = [
    "Severity",
    "Sev_by_city",
    "Sev_by_gender",
    "Medical_Tent_D",
    "Medical_Tent_E",
    "Medical_Tent_F",
    "City_Albuquerque",
    "severity_against_avg_city",
]

In [13]:
# Set to True to re-run the grid search and overwrite the result CSVs;
# with False the cell does nothing and later cells read the saved CSVs.
is_training = False

if is_training:
    # GridSearchCV: Random Forest
    # Create the output folder up front so a finished search cannot fail
    # at the very end when its results are written.
    os.makedirs("results", exist_ok=True)

    # The grid is identical for every feature set, so build it once:
    # 3 * 2 * 3 * 2 * 3 * 2 * 2 = 432 candidates, each fitted on 5 folds.
    parameters = dict(
        n_estimators=[100, 200, 300],
        criterion=["gini", "entropy"],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 3],
        class_weight=[None, "balanced", {0: 1, 1: 1.3}],
        ccp_alpha=[0.0, 0.0001],
        max_samples=[None, 0.8],
    )

    fs_names = ["boruta", "rfe", "sfs"]
    for fs_name, fs in zip(fs_names, [boruta, rfe, sfs]):
        X_train_grid = train_set[fs]

        # Fixed seed so repeated searches rank the candidates identically.
        rf = RandomForestClassifier(random_state=1)
        # scoring is a list (multi-metric form), so the cv_results_ columns
        # carry the metric name, e.g. "rank_test_accuracy"; refit=False
        # because only the score table is needed, not a fitted best model.
        rf_gs = GridSearchCV(rf, parameters, cv=5, scoring=["accuracy"], refit=False, verbose=1, n_jobs=-1)
        rf_gs.fit(X_train_grid, target)

        results = pd.DataFrame(rf_gs.cv_results_)
        print("Finished ", fs_name)
        results.to_csv("results/" + fs_name + "_rf_gridsearch_results.csv")

In [14]:
# For each feature set, load its saved grid-search table and show the
# three rows with the best accuracy rank.
for fs in ("boruta", "rfe", "sfs"):
    print(fs, "\n")
    csv_path = "results/" + fs + "_rf_gridsearch_results.csv"
    rf_results = pd.read_csv(csv_path).sort_values("rank_test_accuracy")
    display(rf_results.head(3))

boruta 



Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_samples,...,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
269,269,0.949062,0.057544,0.069813,0.008556,0.0001,,entropy,10.0,,...,300,"{'ccp_alpha': 0.0001, 'class_weight': None, 'c...",0.838889,0.844444,0.872222,0.821229,0.821229,0.839603,0.018779,1
57,57,0.280649,0.010808,0.019348,0.00185,0.0,,entropy,10.0,0.8,...,100,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.844444,0.838889,0.883333,0.810056,0.821229,0.83959,0.025088,2
381,381,0.464957,0.011238,0.026529,0.007637,0.0001,"{0: 1, 1: 1.3}",gini,10.0,0.8,...,100,"{'ccp_alpha': 0.0001, 'class_weight': {0: 1, 1...",0.838889,0.838889,0.866667,0.804469,0.832402,0.836263,0.019819,3


rfe 



Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_samples,...,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
23,23,0.973197,0.041321,0.076595,0.006747,0.0,,gini,10.0,0.8,...,300,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.855556,0.838889,0.877778,0.826816,0.821229,0.844053,0.020564,1
376,376,0.593014,0.030385,0.040892,0.008967,0.0001,"{0: 1, 1: 1.3}",gini,10.0,,...,200,"{'ccp_alpha': 0.0001, 'class_weight': {0: 1, 1...",0.844444,0.844444,0.877778,0.826816,0.810056,0.840708,0.022515,2
121,121,0.924925,0.037089,0.063431,0.010845,0.0,balanced,entropy,10.0,,...,200,"{'ccp_alpha': 0.0, 'class_weight': 'balanced',...",0.861111,0.833333,0.877778,0.815642,0.815642,0.840701,0.024905,3


sfs 



Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_class_weight,param_criterion,param_max_depth,param_max_samples,...,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
215,215,0.555514,0.016521,0.053458,0.003816,0.0,"{0: 1, 1: 1.3}",entropy,20.0,0.8,...,300,"{'ccp_alpha': 0.0, 'class_weight': {0: 1, 1: 1...",0.827778,0.788889,0.866667,0.793296,0.804469,0.81622,0.028597,1
386,386,0.603985,0.009641,0.05605,0.008681,0.0001,"{0: 1, 1: 1.3}",gini,20.0,,...,300,"{'ccp_alpha': 0.0001, 'class_weight': {0: 1, 1...",0.827778,0.788889,0.866667,0.793296,0.804469,0.81622,0.028597,1
378,378,0.207444,0.00708,0.016755,0.001163,0.0001,"{0: 1, 1: 1.3}",gini,10.0,0.8,...,100,"{'ccp_alpha': 0.0001, 'class_weight': {0: 1, 1...",0.827778,0.788889,0.866667,0.793296,0.804469,0.81622,0.028597,1


In [15]:
# Show the hyperparameters of one grid-search candidate for the RFE feature set.
rf_results = pd.read_csv("results/rfe_rf_gridsearch_results.csv")
rf_results = rf_results.sort_values("rank_test_accuracy")
# Position 1 after sorting by rank is the second-ranked row, not the top one.
# NOTE(review): confirm that picking the runner-up is intentional.
# Select the hyperparameter columns by their "param_" prefix instead of a
# positional slice: the number of leading columns depends on how many unnamed
# index columns the CSV carries, which would shift a fixed slice. The anchored
# regex excludes the "params" dict column.
rf_results.iloc[1].filter(regex=r"^param_")

param_ccp_alpha                    0.0001
param_class_weight         {0: 1, 1: 1.3}
param_criterion                      gini
param_max_depth                        10
param_max_samples                     NaN
param_min_samples_split                 3
param_n_estimators                    200
Name: 376, dtype: object

In [16]:
# Fit a random forest with fixed hyperparameters on the RFE feature subset.
rf_params = dict(
    n_estimators=200,
    min_samples_split=3,
    class_weight={0: 1, 1: 1.3},
    criterion="gini",
    max_depth=10,
    ccp_alpha=0.0001,
)
clf = RandomForestClassifier(random_state=1, **rf_params)

X_rfe = train_set[rfe]
clf.fit(X_rfe, target)

# Accuracy on the same data the model was fitted on (training accuracy).
clf.score(X_rfe, target)

0.9309576837416481