# Load Dependencies 

In [348]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import Data

In [349]:
train = pd.read_csv('Data/train_features.csv')
test = pd.read_csv('Data/test_features.csv')
target = pd.read_csv('Data/train_labels.csv')
train.shape, test.shape

((59400, 40), (14358, 40))

# Select Features

In [350]:
selected_features = ['amount_tsh',
    'date_recorded',
    'gps_height',
    'basin',
    'region',
    'district_code',
    'population',
    'public_meeting',
    'scheme_management',
    'permit',
    'construction_year',
    'extraction_type_class',
    'management_group',
    'payment',
    'quality_group',
    'quantity',
    'source_type',
    'source_class', 
    'waterpoint_type',
    'funder',
    'installer', 
    'latitude',
    'longitude']

def select_features(df, features):
    '''
    Subsets dataframe based on list of columns names accepted 
    as a parameter.
    '''
    return df[features]

In [351]:
train = select_features(train, features=selected_features)
test = select_features(test, features=selected_features)
train.shape, test.shape

((59400, 23), (14358, 23))

# Encode Target

In [352]:
target['encoded'] = target['status_group'].replace({
    'functional': 1,
    'non functional': 0,
    'functional needs repair':0
})

# Process Features 

### Impute Features

In [353]:
def smart_impute(X):
    ''' Adapted directly from Dakota P.'s awesome work '''
    X = X.copy()
    
    # Convert all strings in object columns to lowercase
    cat_features = X.select_dtypes('object').columns.tolist()
    for feature in cat_features:
        X[feature] = X[feature].str.lower()
    
    # Replace -2.000000e-08 with np.nan (not showing as zero due to datatype)
    X['latitude'] = X['latitude'].replace( -2.000000e-08, np.nan)
    
    impute_features = ['gps_height', 'population', 'amount_tsh', 'construction_year', 'latitude', 'longitude']

    for feature in impute_features:
        # Replace values=0.0 with np.nan (0.0 appears to indicate missing values in dataset)
        X[feature] = X[feature].replace(0, np.nan)
        
        # Note, hardcoded train when calculating mean to avoid leakage into test data
        # If District code available replace NA's with mean value of other well in the same district 
        X[feature] = X[feature].fillna( X.groupby(['region', 'district_code'])[feature].transform('mean') )
        # If no district code, replace NA's  with mean value of other wells in the same region
        X[feature] = X[feature].fillna( X.groupby(['region'])[feature].transform('mean') )
        # If no district and no region, replace NA's with mean value of all wells
        X[feature] = X[feature].fillna( X[feature].mean() )
    
    return X

In [354]:
train = smart_impute(train)
test = smart_impute(test)

train.shape, test.shape

((59400, 23), (14358, 23))

### Wrangle Features

In [355]:
def wrangle_features(X):
    X = X.copy()
    
    # Create month and year features from the recorded data feature
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])
    X['date_recorded_month'] = X['date_recorded'].dt.month
    X['date_recorded_year'] = X['date_recorded'].dt.year
    
    # Create feature for seasons in Tanzania
    X['Hot_Dry_Season'] = (X['date_recorded_month'] == 12) | (X['date_recorded_month'] < 3)
    X['Heavy_Rain_Season'] = (X['date_recorded_month'] > 4) & (X['date_recorded_month'] < 6)
    X['Cool_Dry_Season'] = (X['date_recorded_month'] > 5) & (X['date_recorded_month'] < 11)
    X['Moderate_Rain'] = (X['date_recorded_month'] == 11) | (X['date_recorded_month'] == 3)
    
    # Bin low freq. categories into 'other' 
    X['scheme_management'] = X['scheme_management'].replace({
        'SWC':'Other',
        'Trust':'Other',
        'None':'Other'
    })
    
    # Create unkown category for missing values in scheme management column
    X['scheme_management'] = X['scheme_management'].fillna('unknown')
    X['permit'] = X['permit'].fillna(False)
    X['public_meeting'] = X['public_meeting'].fillna(True)
    
    # Create age category out of construction_year
    # Bin 0 values as -1
    X['pump_age'] = ( 2014 - X['construction_year'] )
    X['pump_age'] = X['pump_age'].replace({2014:-1})
    
    # Create Installer Features
    X['DWE_Installer'] = (X['installer'] == 'DWE')
    X['Gov_Installer'] = (X['installer'] == 'Government')

    one_time_install = train['installer'].value_counts()[train['installer'].value_counts() == 1]
    X['One_Time_Installer'] = X['installer'].isin(one_time_install.index)

    small_install = train['installer'].value_counts()[ (train['installer'].value_counts() < 10) & (train['installer'].value_counts() > 1) ]
    X['Small_Installer'] = X['installer'].isin(small_install.index)

    big_install = (( train['installer'].value_counts() >= 10 ) == True)
    X['Big_Installer'] = X['installer'].isin(big_install.index)
    
    # Create Funder Features
    X['Tanzania_Gov_Funder'] = (X['funder'] == 'Government Of Tanzania')

    one_time_funder = train['funder'].value_counts()[train['funder'].value_counts() == 1]
    X['One_Time_Funder'] = X['funder'].isin(one_time_funder.index)

    small_funder = train['funder'].value_counts()[ (train['funder'].value_counts() < 10) & (train['funder'].value_counts() > 1) ]
    X['Small_Funder'] = X['funder'].isin(small_funder.index)

    big_funder = (( train['funder'].value_counts() >= 10 ) == True)
    X['Big_Funder'] = X['funder'].isin(big_funder.index)
    
    # Replace population 0 with median population of Train
    median_pop = train['population'].median()
    X['population'] = X['population'].replace(0, median_pop)
    
    # Create interaction between amount of water avaialable and population
    X['pop*amount_tsh'] = X['population'] * X['amount_tsh']
    X['pop/amount_tsh'] = X['population'] / X['amount_tsh']
    X['pop/amount_tsh'] = X['pop/amount_tsh'].replace(np.inf, 3000)
    
    # Create interaction between amount of water avaialable and height
    X['gps_height*amount_tsh'] = X['gps_height'] * X['amount_tsh']
    
    # Polynomial gps_Height
    X['gps_height**2'] = X['gps_height'] ** 2
    X['gps_height**3'] = X['gps_height'] ** 3
    
    # Interaction between latitude and height
    X['latitude*height*amount'] = X['latitude'] * X['gps_height'] * X['amount_tsh']
    X['latitude*height'] = X['latitude'] * X['gps_height']
    
    # Create Binned Features
    labels = [1,2,3,4,5,6,7,8,9,10]
    X['gps_height_binned'] = pd.cut(x=X['gps_height'], bins=10, labels=labels)
    X['pump_age_binned'] = pd.cut(x=X['pump_age'], bins=10, labels=labels)
    X['amount_tsh_binned'] = pd.cut(x=X['amount_tsh'], bins=10, labels=labels)
    X['longitude_binned'] = pd.cut(x=X['longitude'], bins=10, labels=labels)
    X['latitude_binned'] = pd.cut(x=X['latitude'], bins=10, labels=labels)
    
    # Convert Binned Features to int datatype
    X['gps_height_binned'] = X['gps_height_binned'].astype(int)
    X['pump_age_binned'] = X['pump_age_binned'].astype(int)
    X['amount_tsh_binned'] = X['amount_tsh_binned'].astype(int)
    X['longitude_binned'] = X['longitude_binned'].astype(int)
    X['latitude_binned'] = X['latitude_binned'].astype(int)
    
    # Pumps Funded and Built by Tanzania Government
    X['Gov_Funded_Gov_Built'] = X['Tanzania_Gov_Funder'] * X['DWE_Installer'].astype(int)
    
    return X

In [356]:
test = wrangle_features(test)
train = wrangle_features(train)

train.shape, test.shape

((59400, 52), (14358, 52))

In [357]:
def create_score(X, feature):
    rel_score = pd.crosstab(train[feature], target['status_group'])
    rel_score = pd.DataFrame(rel_score)
    
    new_feature = feature + '_rel_score'
    total = rel_score['functional'] + rel_score['functional needs repair'] + rel_score['non functional']
    rel_score[new_feature] = rel_score['functional'] / total
    
    rel_score = rel_score.reset_index()
    rel_score = rel_score[ [feature, new_feature] ]
    
    X = pd.merge(X, rel_score, how='left', on=feature)
    X[new_feature] = X[new_feature].fillna(0)
    
    return X

In [358]:
def composite_scores(X):
    X = X.copy()
    
    # Create Reliability Scores
    X = create_score(X, 'installer')
    X = create_score(X, 'scheme_management')
    X = create_score(X, 'management_group')
    X = create_score(X, 'payment')
    X = create_score(X, 'extraction_type_class')
    X = create_score(X, 'waterpoint_type')
    X = create_score(X, 'pump_age_binned')
    X = create_score(X, 'gps_height_binned')
    X = create_score(X, 'amount_tsh_binned')
    X = create_score(X, 'longitude_binned')
    X = create_score(X, 'latitude_binned')
    X = create_score(X, 'region')
    X = create_score(X, 'basin')
    X = create_score(X, 'district_code')
    
    # Create Composite Scores
    X['Management_Score'] = X['installer_rel_score'] + X['scheme_management_rel_score'] + X['management_group_rel_score']
    X['Condition_Score'] = X['gps_height_binned_rel_score'] + X['amount_tsh_binned_rel_score'] + X['latitude_binned_rel_score'] + X['longitude_binned_rel_score']
    X['Tech_Score'] = X['payment_rel_score'] + X['pump_age_binned_rel_score'] + X['extraction_type_class_rel_score'] + X['waterpoint_type_rel_score']
    
    X['Overall_Reliability_Score'] = X['Management_Score'] + X['Condition_Score'] + X['Tech_Score']
    X['Overall_Reliability_Score**2'] = X['Overall_Reliability_Score'] ** 2 
    
    # Public Support Score 
    X['Public_Support'] = X['payment_rel_score'] + X['public_meeting'] + X['permit']
    
    # Location Score
    X['Location_Score'] = X['basin_rel_score'] + X['district_code_rel_score'] + X['region_rel_score']
    
    # Location * Condition
    X['Location*Condition'] = X['Location_Score'] + X['Condition_Score']
    X['Location*Condition**2'] = X['Location*Condition'] ** 2
    
    # Public Support and Location
    X['Public_Support+Loc'] = X['Public_Support'] + X['Location_Score']
    
    # Site Score
    X['Site_Score'] = X['Location_Score'] + X['Public_Support'] + X['Overall_Reliability_Score']
    X['Site_Score**2'] = X['Site_Score'] ** 2
    
    # Negative Correlated Features
    X['dry&high'] = (X['Hot_Dry_Season'] | X['Cool_Dry_Season']) * X['latitude*height']
    X['Pump_Age*Pop/Amount_tsg'] = X['pump_age'] * X['pop/amount_tsh']
    X['pump_age_binned**2'] = X['pump_age_binned'] ** 2
    
    # Drop unecessary columns
    drop_cols = ['date_recorded', 'funder', 'installer', 'construction_year']
    X = X.drop(columns=drop_cols)
    
    return X

In [359]:
test = composite_scores(test)
train = composite_scores(train)

train.shape, test.shape

((59400, 77), (14358, 77))

In [360]:
train.columns.tolist() == (test.columns.tolist())

True

### Encode Features 

In [361]:
import category_encoders as ce

def one_hot(X):
    # Features to one hot encode
    one_hot_cols = ['extraction_type_class', 'payment', 'quality_group', 
                    'quantity', 'source_class', 'waterpoint_type']
    
    # Convert all relevant cols to category datatype (for encoder)
    X[one_hot_cols] = X[one_hot_cols].astype('category')
    
    # Initialize and transform relevant features
    encoder = ce.OneHotEncoder(use_cat_names=True)
    
    # Note, train hardcoded to avoid overfitting test data
    encoder.fit(train[one_hot_cols])
    encoded = encoder.transform(X[one_hot_cols])
    
    X = X.drop(columns=one_hot_cols)
    X = pd.concat([X, encoded], axis=1)
    
    return X

In [362]:
import category_encoders as ce

def ordinal(X):
    ord_cols = ['basin', 'region', 'scheme_management',
                    'management_group', 'source_type']
    
    # Initialize and transform relevant features
    encoder = ce.OrdinalEncoder(cols=ord_cols)
    
    # Note, train hardcoded to avoid overfitting test data
    encoder.fit(train)
    X = encoder.transform(X)
    
    return X

In [363]:
test = one_hot(test)
train = one_hot(train)
processed_train = ordinal(train)
processed_test = ordinal(test)
processed_train.shape, processed_test.shape

((59400, 106), (14358, 106))

### Create Additional Features Based on Encoded Features 

- Only intended to be used with OneHotEncoding

In [364]:
def non_func_features(X):
    X = X.copy()
    
    X['quantity_dry*pump_age/amount'] = X['quantity_dry'] / X['Pump_Age*Pop/Amount_tsg']
    
    X['Condition_Risk'] = X['quantity_dry'] + X['waterpoint_type_other'] + X['extraction_type_class_other'] + X['quality_group_unknown']
    X['Condition**2'] = X['Condition_Risk'] ** 2
    
    X['Tech_Risk'] = X['extraction_type_class_motorpump'] + X['waterpoint_type_communal standpipe multiple'] + X['waterpoint_type_other']
    X['Tech_Risk**2'] = X['Tech_Risk'] ** 2
    
    X['Non_Tech_Risk*Pump_age'] = X['Tech_Risk'] * X['pump_age']
    
    X['Management_Risk'] = X['Tanzania_Gov_Funder'] + X['Gov_Installer'] + X['payment_never pay'] + X['payment_unknown']
    X['Management_Risk**2'] = X['Management_Risk'] ** 2
    
    X['Non_Func_Risk'] = X['Condition_Risk'] + X['Tech_Risk'] + X['Management_Risk']
    X['Non_Func_Risk*Pump_age'] = X['Non_Func_Risk'] * X['pump_age']
    X['Non_Func_Risk**2'] = X['Non_Func_Risk'] ** 2
    
    X['Tech_Repair_Score'] = X['source_class_surface'] + X['extraction_type_class_gravity']
    X['Tech_Repair_Score**2'] = X['Tech_Repair_Score'] ** 2
    
    return X 

In [365]:
processed_train = non_func_features(processed_train)
processed_test = non_func_features(processed_test)
processed_train.shape, processed_test.shape

  .format(op=op_str, alt_op=unsupported[op_str]))
  .format(op=op_str, alt_op=unsupported[op_str]))


((59400, 119), (14358, 119))

In [366]:
pd.set_option('display.max_columns', 500)

In [368]:
processed_train, processed_test = processed_train.align(processed_test, join='left', axis=1)

In [369]:
processed_train.columns.tolist() == processed_test.columns.tolist()

True

# Testing Model

### Create Submission

In [370]:
def create_submission(y_test_pred):
    sample_submission = pd.read_csv('Data/sample_submission.csv')
    submission = sample_submission.copy()
    submission['status_group'] = y_test_pred
    
    now = pd.to_datetime('now')
    filename = 'MB_' + str(now).replace(' ','_')[0:-7] 
    
    submission.to_csv(f'Submissions/{filename}.csv', index=False)
    print(f'Submissions/{filename}.csv')

In [371]:
# Filenames of your submissions you want to ensemble
files = ['submission-01.csv', 'submission-02.csv', 'submission-03.csv']

def create_ensemble_submission(files):
    submissions = (pd.read_csv(file)[['status_group']] for file in files)
    ensemble = pd.concat(submissions, axis='columns')
    majority_vote = ensemble.mode(axis='columns')[0]

    sample_submission = pd.read_csv('sample_submission.csv')
    submission = sample_submission.copy()
    submission['status_group'] = majority_vote
    
    now = pd.to_datetime('now')
    filename = 'MB_' + str(now).replace(' ','_')[0:-7]
    
    submission.to_csv('Submissions/{filename}.csv', index=False)
    print(f'Submissions/{filename}.csv')

### Classification Metrics 

In [372]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, log_loss, f1_score, mean_absolute_error, mean_squared_error 
import matplotlib.pyplot as plt

def classification_metrics(fit_estimator, X, y):
    '''
    Evaluates classifiers performance across several metrics (accuracy, roc_auc, confusion matrix,
                                                              log loss, f1, mean absolute error, 
                                                              mean squared error)
    Parameters:
        fit_estimator: an sklearn estimator that has been fitted to train data
        X: the features with which predictions will be based on
        y: the labels with which performance will be evaluated
    Returns: 
        Prints summary of performance across metrics and visualizaiton of ROC curve
    '''
    score = fit_estimator.score(X, y)
    y_pred = fit_estimator.predict(X)
    y_pred_proba = fit_estimator.predict_proba(X)[:,1]
    
    name = fit_estimator.__class__.__name__
    print(name)
    print('Accuracy Score:', score)
    print('F1 Score:', f1_score(y, y_pred, average='weighted'))

    conf_mat = pd.DataFrame(confusion_matrix(y, y_pred), columns=['Predicted Functional', 'Predicted Needs Repair', 'Predicted Non-Functional'],
                           index=['Actual Functional','Actual Needs Repair', 'Actual Non-Functional'])
    print('Confusion Matrix:')
    print(conf_mat)

### Random Forest Classifier

In [373]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=10,
                       warm_start=False)

scores = cross_validate(model, 
                        processed_train, 
                        target['encoded'], 
                        return_train_score=True,
                        return_estimator=True,
                        scoring='accuracy', 
                        n_jobs=-1,
                        cv=3)

In [374]:
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,24.386263,1.366607,"(DecisionTreeClassifier(class_weight=None, cri...",0.797121,0.818434
1,24.255038,1.190345,"(DecisionTreeClassifier(class_weight=None, cri...",0.796667,0.817727
2,24.52641,1.45256,"(DecisionTreeClassifier(class_weight=None, cri...",0.795707,0.82053


In [381]:
model.fit(processed_train, target['encoded'])
train_pred = model.predict(processed_train)
test_pred = model.predict(processed_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 250building tree 2 of 250

building tree 3 of 250
building tree 4 of 250
building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250
building tree 9 of 250


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s


building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.7s


building tree 14 of 250
building tree 15 of 250
building tree 16 of 250
building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.1s


building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.5s


building tree 29 of 250
building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250
building tree 37 of 250


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.0s


building tree 38 of 250
building tree 39 of 250
building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 250
building tree 47 of 250


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.5s


building tree 48 of 250
building tree 49 of 250
building tree 50 of 250
building tree 51 of 250
building tree 52 of 250
building tree 53 of 250
building tree 54 of 250
building tree 55 of 250
building tree 56 of 250
building tree 57 of 250


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    3.1s


building tree 58 of 250
building tree 59 of 250
building tree 60 of 250
building tree 61 of 250
building tree 62 of 250
building tree 63 of 250
building tree 64 of 250
building tree 65 of 250
building tree 66 of 250
building tree 67 of 250
building tree 68 of 250
building tree 69 of 250


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.8s


building tree 70 of 250
building tree 71 of 250
building tree 72 of 250
building tree 73 of 250
building tree 74 of 250
building tree 75 of 250
building tree 76 of 250
building tree 77 of 250
building tree 78 of 250
building tree 79 of 250
building tree 80 of 250
building tree 81 of 250


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    4.6s


building tree 82 of 250
building tree 83 of 250
building tree 84 of 250
building tree 85 of 250
building tree 86 of 250
building tree 87 of 250
building tree 88 of 250
building tree 89 of 250
building tree 90 of 250building tree 91 of 250

building tree 92 of 250
building tree 93 of 250


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    5.5s


building tree 94 of 250
building tree 95 of 250
building tree 96 of 250
building tree 97 of 250
building tree 98 of 250
building tree 99 of 250
building tree 100 of 250
building tree 101 of 250
building tree 102 of 250
building tree 103 of 250
building tree 104 of 250
building tree 105 of 250
building tree 106 of 250
building tree 107 of 250
building tree 108 of 250
building tree 109 of 250
building tree 110 of 250


[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    6.2s


building tree 111 of 250
building tree 112 of 250
building tree 113 of 250
building tree 114 of 250
building tree 115 of 250
building tree 116 of 250
building tree 117 of 250
building tree 118 of 250
building tree 119 of 250
building tree 120 of 250
building tree 121 of 250
building tree 122 of 250
building tree 123 of 250
building tree 124 of 250
building tree 125 of 250


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    7.0s


building tree 126 of 250
building tree 127 of 250
building tree 128 of 250
building tree 129 of 250
building tree 130 of 250
building tree 131 of 250
building tree 132 of 250
building tree 133 of 250
building tree 134 of 250
building tree 135 of 250
building tree 136 of 250
building tree 137 of 250
building tree 138 of 250
building tree 139 of 250
building tree 140 of 250
building tree 141 of 250
building tree 142 of 250


[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:    8.1s


building tree 143 of 250building tree 144 of 250

building tree 145 of 250
building tree 146 of 250
building tree 147 of 250
building tree 148 of 250
building tree 149 of 250
building tree 150 of 250
building tree 151 of 250
building tree 152 of 250
building tree 153 of 250
building tree 154 of 250
building tree 155 of 250
building tree 156 of 250
building tree 157 of 250
building tree 158 of 250
building tree 159 of 250


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    9.0s


building tree 160 of 250
building tree 161 of 250
building tree 162 of 250
building tree 163 of 250
building tree 164 of 250
building tree 165 of 250
building tree 166 of 250
building tree 167 of 250
building tree 168 of 250
building tree 169 of 250
building tree 170 of 250
building tree 171 of 250
building tree 172 of 250
building tree 173 of 250
building tree 174 of 250
building tree 175 of 250
building tree 176 of 250


[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   10.0s


building tree 177 of 250
building tree 178 of 250
building tree 179 of 250
building tree 180 of 250
building tree 181 of 250
building tree 182 of 250
building tree 183 of 250
building tree 184 of 250
building tree 185 of 250
building tree 186 of 250
building tree 187 of 250
building tree 188 of 250
building tree 189 of 250
building tree 190 of 250
building tree 191 of 250
building tree 192 of 250
building tree 193 of 250
building tree 194 of 250
building tree 195 of 250
building tree 196 of 250
building tree 197 of 250
building tree 198 of 250


[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   11.0s


building tree 199 of 250
building tree 200 of 250
building tree 201 of 250
building tree 202 of 250
building tree 203 of 250
building tree 204 of 250
building tree 205 of 250
building tree 206 of 250
building tree 207 of 250
building tree 208 of 250
building tree 209 of 250
building tree 210 of 250
building tree 211 of 250
building tree 212 of 250
building tree 213 of 250
building tree 214 of 250
building tree 215 of 250
building tree 216 of 250
building tree 217 of 250
building tree 218 of 250
building tree 219 of 250


[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   12.1s


building tree 220 of 250
building tree 221 of 250
building tree 222 of 250
building tree 223 of 250
building tree 224 of 250
building tree 225 of 250
building tree 226 of 250
building tree 227 of 250
building tree 228 of 250
building tree 229 of 250
building tree 230 of 250
building tree 231 of 250
building tree 232 of 250
building tree 233 of 250
building tree 234 of 250
building tree 235 of 250
building tree 236 of 250
building tree 237 of 250
building tree 238 of 250
building tree 239 of 250


[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   13.4s


building tree 240 of 250
building tree 241 of 250
building tree 242 of 250
building tree 243 of 250
building tree 244 of 250
building tree 245 of 250
building tree 246 of 250
building tree 247 of 250
building tree 248 of 250
building tree 249 of 250
building tree 250 of 250


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   14.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    

In [385]:
y_test = pd.Series(test_pred).replace({0:'non functional', 1:'functional'})

In [386]:
create_submission(y_test)

Submissions/MB_2019-05-22_20:45:47.csv


#### Randomized SearchCV 

In [209]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [8, 10, 12, 15],
}

search = RandomizedSearchCV(model, 
                           param_distributions=param_distributions,
                           n_iter=8,
                           scoring='accuracy', 
                           verbose=10,
                           cv=3,
                           n_jobs=-1,
                           return_train_score=True)

In [210]:
rf_search = search.fit(processed_train, target['status_group'])

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:  6.1min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  7.3min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 500building tree 2 of 500

building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s


building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.9s


building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.5s


building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.9s


building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.7s


building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.4s


building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    4.2s


building tree 59 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65 of 500
building tree 66 of 500
building tree 67 of 500
building tree 68 of 500
building tree 69 of 500
building tree 70 of 500


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    5.2s


building tree 71 of 500
building tree 72 of 500
building tree 73 of 500
building tree 74 of 500
building tree 75 of 500
building tree 76 of 500
building tree 77 of 500
building tree 78 of 500
building tree 79 of 500
building tree 80 of 500
building tree 81 of 500
building tree 82 of 500
building tree 83 of 500building tree 84 of 500



[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    6.2s


building tree 85 of 500
building tree 86 of 500
building tree 87 of 500
building tree 88 of 500
building tree 89 of 500
building tree 90 of 500
building tree 91 of 500
building tree 92 of 500
building tree 93 of 500
building tree 94 of 500
building tree 95 of 500
building tree 96 of 500


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    7.2s


building tree 97 of 500
building tree 98 of 500
building tree 99 of 500
building tree 100 of 500
building tree 101 of 500
building tree 102 of 500
building tree 103 of 500
building tree 104 of 500
building tree 105 of 500
building tree 106 of 500
building tree 107 of 500
building tree 108 of 500


[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    8.5s


building tree 109 of 500
building tree 110 of 500
building tree 111 of 500
building tree 112 of 500
building tree 113 of 500
building tree 114 of 500
building tree 115 of 500
building tree 116 of 500
building tree 117 of 500
building tree 118 of 500
building tree 119 of 500
building tree 120 of 500
building tree 121 of 500
building tree 122 of 500
building tree 123 of 500
building tree 124 of 500


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    9.6s


building tree 125 of 500
building tree 126 of 500
building tree 127 of 500
building tree 128 of 500
building tree 129 of 500
building tree 130 of 500
building tree 131 of 500
building tree 132 of 500
building tree 133 of 500
building tree 134 of 500
building tree 135 of 500
building tree 136 of 500
building tree 137 of 500
building tree 138 of 500
building tree 139 of 500
building tree 140 of 500


[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   11.1s


building tree 141 of 500
building tree 142 of 500
building tree 143 of 500
building tree 144 of 500
building tree 145 of 500
building tree 146 of 500
building tree 147 of 500
building tree 148 of 500
building tree 149 of 500
building tree 150 of 500
building tree 151 of 500
building tree 152 of 500
building tree 153 of 500
building tree 154 of 500
building tree 155 of 500
building tree 156 of 500
building tree 157 of 500
building tree 158 of 500
building tree 159 of 500
building tree 160 of 500


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   12.5s


building tree 161 of 500
building tree 162 of 500
building tree 163 of 500
building tree 164 of 500
building tree 165 of 500
building tree 166 of 500
building tree 167 of 500
building tree 168 of 500
building tree 169 of 500
building tree 170 of 500
building tree 171 of 500
building tree 172 of 500
building tree 173 of 500
building tree 174 of 500
building tree 175 of 500
building tree 176 of 500


[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   14.0s


building tree 177 of 500
building tree 178 of 500
building tree 179 of 500
building tree 180 of 500
building tree 181 of 500
building tree 182 of 500
building tree 183 of 500
building tree 184 of 500
building tree 185 of 500
building tree 186 of 500
building tree 187 of 500
building tree 188 of 500
building tree 189 of 500
building tree 190 of 500
building tree 191 of 500building tree 192 of 500

building tree 193 of 500
building tree 194 of 500
building tree 195 of 500
building tree 196 of 500


[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   15.4s


building tree 197 of 500
building tree 198 of 500
building tree 199 of 500
building tree 200 of 500
building tree 201 of 500
building tree 202 of 500
building tree 203 of 500
building tree 204 of 500
building tree 205 of 500
building tree 206 of 500
building tree 207 of 500
building tree 208 of 500
building tree 209 of 500
building tree 210 of 500
building tree 211 of 500
building tree 212 of 500
building tree 213 of 500
building tree 214 of 500
building tree 215 of 500
building tree 216 of 500
building tree 217 of 500


[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   17.2s


building tree 218 of 500
building tree 219 of 500
building tree 220 of 500
building tree 221 of 500
building tree 222 of 500
building tree 223 of 500
building tree 224 of 500
building tree 225 of 500
building tree 226 of 500
building tree 227 of 500
building tree 228 of 500
building tree 229 of 500
building tree 230 of 500
building tree 231 of 500
building tree 232 of 500
building tree 233 of 500
building tree 234 of 500
building tree 235 of 500
building tree 236 of 500
building tree 237 of 500
building tree 238 of 500
building tree 239 of 500
building tree 240 of 500


[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   18.9s


building tree 241 of 500
building tree 242 of 500
building tree 243 of 500
building tree 244 of 500
building tree 245 of 500
building tree 246 of 500
building tree 247 of 500
building tree 248 of 500
building tree 249 of 500
building tree 250 of 500
building tree 251 of 500
building tree 252 of 500
building tree 253 of 500
building tree 254 of 500
building tree 255 of 500
building tree 256 of 500
building tree 257 of 500
building tree 258 of 500
building tree 259 of 500
building tree 260 of 500
building tree 261 of 500

[Parallel(n_jobs=-1)]: Done 257 tasks      | elapsed:   20.6s



building tree 262 of 500
building tree 263 of 500
building tree 264 of 500
building tree 265 of 500
building tree 266 of 500
building tree 267 of 500
building tree 268 of 500
building tree 269 of 500
building tree 270 of 500
building tree 271 of 500
building tree 272 of 500
building tree 273 of 500
building tree 274 of 500
building tree 275 of 500
building tree 276 of 500
building tree 277 of 500
building tree 278 of 500
building tree 279 of 500
building tree 280 of 500
building tree 281 of 500
building tree 282 of 500
building tree 283 of 500
building tree 284 of 500
building tree 285 of 500


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   22.4s


building tree 286 of 500
building tree 287 of 500
building tree 288 of 500
building tree 289 of 500
building tree 290 of 500
building tree 291 of 500
building tree 292 of 500
building tree 293 of 500
building tree 294 of 500
building tree 295 of 500
building tree 296 of 500
building tree 297 of 500
building tree 298 of 500
building tree 299 of 500
building tree 300 of 500
building tree 301 of 500
building tree 302 of 500
building tree 303 of 500
building tree 304 of 500
building tree 305 of 500
building tree 306 of 500
building tree 307 of 500
building tree 308 of 500


[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:   24.6s


building tree 309 of 500
building tree 310 of 500
building tree 311 of 500
building tree 312 of 500
building tree 313 of 500
building tree 314 of 500
building tree 315 of 500
building tree 316 of 500
building tree 317 of 500
building tree 318 of 500
building tree 319 of 500
building tree 320 of 500
building tree 321 of 500
building tree 322 of 500
building tree 323 of 500
building tree 324 of 500
building tree 325 of 500
building tree 326 of 500
building tree 327 of 500
building tree 328 of 500
building tree 329 of 500
building tree 330 of 500
building tree 331 of 500
building tree 332 of 500
building tree 333 of 500
building tree 334 of 500


[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed:   26.7s


building tree 335 of 500
building tree 336 of 500
building tree 337 of 500
building tree 338 of 500
building tree 339 of 500
building tree 340 of 500
building tree 341 of 500
building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500


[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   28.7s


building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500
building tree 382 of 500
building tree 383 of 500
building tree 384 of 500
building tree 385 of 500
building tree 386 of 500
building tree 387 of 500
building tree 388 of 500


[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:   30.9s


building tree 389 of 500
building tree 390 of 500
building tree 391 of 500
building tree 392 of 500
building tree 393 of 500
building tree 394 of 500
building tree 395 of 500
building tree 396 of 500
building tree 397 of 500
building tree 398 of 500
building tree 399 of 500
building tree 400 of 500
building tree 401 of 500
building tree 402 of 500
building tree 403 of 500
building tree 404 of 500
building tree 405 of 500
building tree 406 of 500
building tree 407 of 500
building tree 408 of 500
building tree 409 of 500
building tree 410 of 500
building tree 411 of 500
building tree 412 of 500
building tree 413 of 500
building tree 414 of 500
building tree 415 of 500
building tree 416 of 500
building tree 417 of 500
building tree 418 of 500
building tree 419 of 500


[Parallel(n_jobs=-1)]: Done 413 tasks      | elapsed:   33.2s


building tree 420 of 500
building tree 421 of 500
building tree 422 of 500
building tree 423 of 500
building tree 424 of 500
building tree 425 of 500
building tree 426 of 500
building tree 427 of 500
building tree 428 of 500
building tree 429 of 500
building tree 430 of 500
building tree 431 of 500
building tree 432 of 500
building tree 433 of 500
building tree 434 of 500
building tree 435 of 500
building tree 436 of 500
building tree 437 of 500
building tree 438 of 500
building tree 439 of 500
building tree 440 of 500
building tree 441 of 500
building tree 442 of 500
building tree 443 of 500
building tree 444 of 500
building tree 445 of 500
building tree 446 of 500
building tree 447 of 500
building tree 448 of 500


[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   35.5s


building tree 449 of 500
building tree 450 of 500
building tree 451 of 500
building tree 452 of 500
building tree 453 of 500
building tree 454 of 500
building tree 455 of 500
building tree 456 of 500
building tree 457 of 500
building tree 458 of 500
building tree 459 of 500
building tree 460 of 500
building tree 461 of 500
building tree 462 of 500
building tree 463 of 500
building tree 464 of 500
building tree 465 of 500
building tree 466 of 500
building tree 467 of 500
building tree 468 of 500
building tree 469 of 500
building tree 470 of 500
building tree 471 of 500
building tree 472 of 500
building tree 473 of 500
building tree 474 of 500
building tree 475 of 500
building tree 476 of 500
building tree 477 of 500
building tree 478 of 500
building tree 479 of 500
building tree 480 of 500


[Parallel(n_jobs=-1)]: Done 473 tasks      | elapsed:   38.3s


building tree 481 of 500
building tree 482 of 500
building tree 483 of 500
building tree 484 of 500
building tree 485 of 500
building tree 486 of 500
building tree 487 of 500
building tree 488 of 500
building tree 489 of 500
building tree 490 of 500
building tree 491 of 500
building tree 492 of 500
building tree 493 of 500
building tree 494 of 500
building tree 495 of 500
building tree 496 of 500
building tree 497 of 500
building tree 498 of 500
building tree 499 of 500
building tree 500 of 500


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   40.4s finished


In [215]:
rf_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None,
                       verbose=10, warm_start=False)

In [220]:
pd.DataFrame(rf_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,61.777111,0.022307,2.737904,0.068379,500,8,"{'n_estimators': 500, 'max_depth': 8}",0.755657,0.755354,0.755808,0.755606,0.000189,8,0.766414,0.765606,0.766162,0.766061,0.000338
1,100.827378,0.467727,4.587985,0.067031,500,15,"{'n_estimators': 500, 'max_depth': 15}",0.80197,0.80197,0.799343,0.801094,0.001238,1,0.883561,0.888737,0.887071,0.886456,0.002158
2,70.707527,0.473404,3.120538,0.404885,400,12,"{'n_estimators': 400, 'max_depth': 12}",0.790707,0.791667,0.790152,0.790842,0.000626,3,0.83404,0.833081,0.835657,0.834259,0.001063
3,34.235129,2.088073,1.664827,0.296758,200,12,"{'n_estimators': 200, 'max_depth': 12}",0.790152,0.790909,0.78803,0.789697,0.001218,5,0.832601,0.833283,0.835303,0.833729,0.001147
4,74.363536,1.035506,3.643329,0.160455,500,10,"{'n_estimators': 500, 'max_depth': 10}",0.776162,0.774495,0.776111,0.775589,0.000774,6,0.800758,0.798763,0.801919,0.80048,0.001304
5,32.092283,1.133835,1.47288,0.488336,200,10,"{'n_estimators': 200, 'max_depth': 10}",0.776515,0.77404,0.774697,0.775084,0.001047,7,0.801061,0.798838,0.802045,0.800648,0.001341
6,43.240282,0.685561,1.922619,0.194793,200,15,"{'n_estimators': 200, 'max_depth': 15}",0.801616,0.801212,0.799495,0.800774,0.00092,2,0.882904,0.888561,0.887222,0.886229,0.002414
7,70.019189,0.910262,2.52259,0.461599,500,12,"{'n_estimators': 500, 'max_depth': 12}",0.79,0.792273,0.790253,0.790842,0.001017,3,0.833131,0.833409,0.836111,0.834217,0.001344


#### RF Model

In [76]:
model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=10,
                       warm_start=False)

model.fit(processed_train, target['status_group'])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 400
building tree 2 of 400building tree 3 of 400

building tree 4 of 400
building tree 5 of 400
building tree 6 of 400
building tree 7 of 400
building tree 8 of 400


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s


building tree 9 of 400
building tree 10 of 400
building tree 11 of 400
building tree 12 of 400
building tree 13 of 400
building tree 14 of 400
building tree 15 of 400
building tree 16 of 400


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.9s


building tree 17 of 400
building tree 18 of 400
building tree 19 of 400
building tree 20 of 400


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.5s


building tree 21 of 400
building tree 22 of 400
building tree 23 of 400
building tree 24 of 400
building tree 25 of 400
building tree 26 of 400
building tree 27 of 400
building tree 28 of 400


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.9s


building tree 29 of 400
building tree 30 of 400
building tree 31 of 400
building tree 32 of 400
building tree 33 of 400
building tree 34 of 400
building tree 35 of 400
building tree 36 of 400


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.8s


building tree 37 of 400
building tree 38 of 400
building tree 39 of 400
building tree 40 of 400
building tree 41 of 400
building tree 42 of 400
building tree 43 of 400
building tree 44 of 400
building tree 45 of 400


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.7s


building tree 46 of 400
building tree 47 of 400
building tree 48 of 400
building tree 49 of 400
building tree 50 of 400
building tree 51 of 400
building tree 52 of 400
building tree 53 of 400
building tree 54 of 400
building tree 55 of 400
building tree 56 of 400
building tree 57 of 400


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    4.7s


building tree 58 of 400
building tree 59 of 400
building tree 60 of 400
building tree 61 of 400
building tree 62 of 400
building tree 63 of 400
building tree 64 of 400
building tree 65 of 400
building tree 66 of 400
building tree 67 of 400
building tree 68 of 400
building tree 69 of 400


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    5.6s


building tree 70 of 400
building tree 71 of 400
building tree 72 of 400
building tree 73 of 400
building tree 74 of 400
building tree 75 of 400
building tree 76 of 400
building tree 77 of 400
building tree 78 of 400
building tree 79 of 400
building tree 80 of 400
building tree 81 of 400


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    6.6s


building tree 82 of 400
building tree 83 of 400
building tree 84 of 400
building tree 85 of 400
building tree 86 of 400
building tree 87 of 400
building tree 88 of 400
building tree 89 of 400
building tree 90 of 400
building tree 91 of 400
building tree 92 of 400
building tree 93 of 400


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    7.6s


building tree 94 of 400
building tree 95 of 400
building tree 96 of 400
building tree 97 of 400
building tree 98 of 400
building tree 99 of 400
building tree 100 of 400
building tree 101 of 400
building tree 102 of 400
building tree 103 of 400
building tree 104 of 400
building tree 105 of 400
building tree 106 of 400
building tree 107 of 400
building tree 108 of 400
building tree 109 of 400
building tree 110 of 400


[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    8.8s


building tree 111 of 400
building tree 112 of 400
building tree 113 of 400
building tree 114 of 400
building tree 115 of 400
building tree 116 of 400
building tree 117 of 400
building tree 118 of 400
building tree 119 of 400
building tree 120 of 400
building tree 121 of 400
building tree 122 of 400
building tree 123 of 400


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   10.1s


building tree 124 of 400
building tree 125 of 400
building tree 126 of 400
building tree 127 of 400
building tree 128 of 400
building tree 129 of 400
building tree 130 of 400
building tree 131 of 400
building tree 132 of 400
building tree 133 of 400
building tree 134 of 400
building tree 135 of 400
building tree 136 of 400
building tree 137 of 400
building tree 138 of 400
building tree 139 of 400
building tree 140 of 400


[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   11.8s


building tree 141 of 400
building tree 142 of 400
building tree 143 of 400
building tree 144 of 400
building tree 145 of 400
building tree 146 of 400
building tree 147 of 400
building tree 148 of 400
building tree 149 of 400
building tree 150 of 400
building tree 151 of 400
building tree 152 of 400
building tree 153 of 400
building tree 154 of 400
building tree 155 of 400
building tree 156 of 400
building tree 157 of 400
building tree 158 of 400
building tree 159 of 400
building tree 160 of 400


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   13.1s


building tree 161 of 400
building tree 162 of 400
building tree 163 of 400
building tree 164 of 400
building tree 165 of 400
building tree 166 of 400
building tree 167 of 400
building tree 168 of 400
building tree 169 of 400
building tree 170 of 400
building tree 171 of 400
building tree 172 of 400
building tree 173 of 400
building tree 174 of 400
building tree 175 of 400
building tree 176 of 400
building tree 177 of 400


[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   14.7s


building tree 178 of 400
building tree 179 of 400
building tree 180 of 400
building tree 181 of 400
building tree 182 of 400
building tree 183 of 400
building tree 184 of 400
building tree 185 of 400
building tree 186 of 400
building tree 187 of 400
building tree 188 of 400
building tree 189 of 400
building tree 190 of 400
building tree 191 of 400
building tree 192 of 400
building tree 193 of 400
building tree 194 of 400
building tree 195 of 400
building tree 196 of 400
building tree 197 of 400


[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   16.6s


building tree 198 of 400
building tree 199 of 400
building tree 200 of 400
building tree 201 of 400
building tree 202 of 400
building tree 203 of 400
building tree 204 of 400
building tree 205 of 400
building tree 206 of 400
building tree 207 of 400
building tree 208 of 400
building tree 209 of 400
building tree 210 of 400
building tree 211 of 400
building tree 212 of 400
building tree 213 of 400
building tree 214 of 400
building tree 215 of 400
building tree 216 of 400
building tree 217 of 400


[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   18.2s


building tree 218 of 400
building tree 219 of 400
building tree 220 of 400
building tree 221 of 400
building tree 222 of 400
building tree 223 of 400
building tree 224 of 400
building tree 225 of 400
building tree 226 of 400
building tree 227 of 400
building tree 228 of 400
building tree 229 of 400
building tree 230 of 400
building tree 231 of 400
building tree 232 of 400
building tree 233 of 400
building tree 234 of 400
building tree 235 of 400
building tree 236 of 400
building tree 237 of 400
building tree 238 of 400


[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   20.2s


building tree 239 of 400
building tree 240 of 400
building tree 241 of 400
building tree 242 of 400
building tree 243 of 400
building tree 244 of 400
building tree 245 of 400
building tree 246 of 400
building tree 247 of 400
building tree 248 of 400
building tree 249 of 400
building tree 250 of 400
building tree 251 of 400
building tree 252 of 400
building tree 253 of 400
building tree 254 of 400
building tree 255 of 400
building tree 256 of 400
building tree 257 of 400
building tree 258 of 400
building tree 259 of 400
building tree 260 of 400
building tree 261 of 400
building tree 262 of 400


[Parallel(n_jobs=-1)]: Done 257 tasks      | elapsed:   22.5s


building tree 263 of 400
building tree 264 of 400
building tree 265 of 400
building tree 266 of 400
building tree 267 of 400
building tree 268 of 400
building tree 269 of 400
building tree 270 of 400
building tree 271 of 400
building tree 272 of 400
building tree 273 of 400
building tree 274 of 400
building tree 275 of 400
building tree 276 of 400
building tree 277 of 400
building tree 278 of 400
building tree 279 of 400
building tree 280 of 400
building tree 281 of 400
building tree 282 of 400
building tree 283 of 400
building tree 284 of 400


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   24.9s


building tree 285 of 400
building tree 286 of 400
building tree 287 of 400
building tree 288 of 400
building tree 289 of 400
building tree 290 of 400
building tree 291 of 400
building tree 292 of 400
building tree 293 of 400
building tree 294 of 400
building tree 295 of 400
building tree 296 of 400
building tree 297 of 400
building tree 298 of 400
building tree 299 of 400
building tree 300 of 400
building tree 301 of 400
building tree 302 of 400
building tree 303 of 400building tree 304 of 400

building tree 305 of 400
building tree 306 of 400
building tree 307 of 400
building tree 308 of 400


[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed:   27.2s


building tree 309 of 400
building tree 310 of 400
building tree 311 of 400
building tree 312 of 400
building tree 313 of 400
building tree 314 of 400
building tree 315 of 400
building tree 316 of 400
building tree 317 of 400
building tree 318 of 400
building tree 319 of 400
building tree 320 of 400
building tree 321 of 400
building tree 322 of 400
building tree 323 of 400
building tree 324 of 400
building tree 325 of 400
building tree 326 of 400
building tree 327 of 400
building tree 328 of 400
building tree 329 of 400
building tree 330 of 400
building tree 331 of 400
building tree 332 of 400
building tree 333 of 400
building tree 334 of 400


[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed:   29.5s


building tree 335 of 400
building tree 336 of 400
building tree 337 of 400
building tree 338 of 400
building tree 339 of 400
building tree 340 of 400
building tree 341 of 400
building tree 342 of 400
building tree 343 of 400
building tree 344 of 400
building tree 345 of 400
building tree 346 of 400
building tree 347 of 400
building tree 348 of 400
building tree 349 of 400
building tree 350 of 400
building tree 351 of 400
building tree 352 of 400
building tree 353 of 400
building tree 354 of 400
building tree 355 of 400
building tree 356 of 400
building tree 357 of 400
building tree 358 of 400
building tree 359 of 400
building tree 360 of 400
building tree 361 of 400


[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   31.4s


building tree 362 of 400
building tree 363 of 400
building tree 364 of 400
building tree 365 of 400
building tree 366 of 400
building tree 367 of 400
building tree 368 of 400
building tree 369 of 400
building tree 370 of 400
building tree 371 of 400
building tree 372 of 400
building tree 373 of 400
building tree 374 of 400
building tree 375 of 400
building tree 376 of 400
building tree 377 of 400
building tree 378 of 400
building tree 379 of 400
building tree 380 of 400
building tree 381 of 400
building tree 382 of 400
building tree 383 of 400
building tree 384 of 400
building tree 385 of 400
building tree 386 of 400
building tree 387 of 400
building tree 388 of 400
building tree 389 of 400


[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:   33.3s


building tree 390 of 400
building tree 391 of 400
building tree 392 of 400
building tree 393 of 400
building tree 394 of 400
building tree 395 of 400
building tree 396 of 400
building tree 397 of 400
building tree 398 of 400
building tree 399 of 400
building tree 400 of 400


[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   34.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    

In [86]:
y_test_pred = model.predict(processed_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    0.3s
[Para

In [89]:
y_train_pred = model.predict(processed_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    1.4s
[Para

In [90]:
accuracy_score(target['status_group'], y_train_pred)

0.8180976430976431

In [88]:
create_submission( y_test_pred )

In [346]:
model.fit(processed_train, target['status_group'])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 250building tree 2 of 250

building tree 3 of 250
building tree 4 of 250
building tree 5 of 250
building tree 6 of 250
building tree 7 of 250
building tree 8 of 250


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s


building tree 9 of 250
building tree 10 of 250
building tree 11 of 250
building tree 12 of 250
building tree 13 of 250
building tree 14 of 250
building tree 15 of 250
building tree 16 of 250


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.7s


building tree 17 of 250
building tree 18 of 250
building tree 19 of 250
building tree 20 of 250


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s


building tree 21 of 250
building tree 22 of 250
building tree 23 of 250
building tree 24 of 250
building tree 25 of 250
building tree 26 of 250
building tree 27 of 250
building tree 28 of 250


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.6s


building tree 29 of 250
building tree 30 of 250
building tree 31 of 250
building tree 32 of 250
building tree 33 of 250
building tree 34 of 250
building tree 35 of 250
building tree 36 of 250


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.2s


building tree 37 of 250
building tree 38 of 250building tree 39 of 250

building tree 40 of 250
building tree 41 of 250
building tree 42 of 250
building tree 43 of 250
building tree 44 of 250
building tree 45 of 250
building tree 46 of 250
building tree 47 of 250
building tree 48 of 250


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.6s


building tree 49 of 250
building tree 50 of 250
building tree 51 of 250
building tree 52 of 250
building tree 53 of 250
building tree 54 of 250
building tree 55 of 250
building tree 56 of 250


[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    3.3s


building tree 57 of 250
building tree 58 of 250
building tree 59 of 250
building tree 60 of 250
building tree 61 of 250
building tree 62 of 250
building tree 63 of 250
building tree 64 of 250
building tree 65 of 250
building tree 66 of 250
building tree 67 of 250
building tree 68 of 250


[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    3.9s


building tree 69 of 250
building tree 70 of 250
building tree 71 of 250
building tree 72 of 250
building tree 73 of 250
building tree 74 of 250
building tree 75 of 250
building tree 76 of 250
building tree 77 of 250
building tree 78 of 250
building tree 79 of 250
building tree 80 of 250


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    4.8s


building tree 81 of 250
building tree 82 of 250
building tree 83 of 250
building tree 84 of 250
building tree 85 of 250
building tree 86 of 250
building tree 87 of 250
building tree 88 of 250
building tree 89 of 250
building tree 90 of 250
building tree 91 of 250
building tree 92 of 250
building tree 93 of 250
building tree 94 of 250
building tree 95 of 250
building tree 96 of 250


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    5.5s


building tree 97 of 250
building tree 98 of 250
building tree 99 of 250
building tree 100 of 250
building tree 101 of 250
building tree 102 of 250
building tree 103 of 250
building tree 104 of 250
building tree 105 of 250
building tree 106 of 250
building tree 107 of 250
building tree 108 of 250


[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    6.4s


building tree 109 of 250
building tree 110 of 250
building tree 111 of 250
building tree 112 of 250
building tree 113 of 250
building tree 114 of 250building tree 115 of 250

building tree 116 of 250
building tree 117 of 250
building tree 118 of 250
building tree 119 of 250
building tree 120 of 250
building tree 121 of 250
building tree 122 of 250
building tree 123 of 250
building tree 124 of 250


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    7.4s


building tree 125 of 250
building tree 126 of 250
building tree 127 of 250
building tree 128 of 250
building tree 129 of 250
building tree 130 of 250
building tree 131 of 250
building tree 132 of 250
building tree 133 of 250
building tree 134 of 250
building tree 135 of 250
building tree 136 of 250
building tree 137 of 250
building tree 138 of 250
building tree 139 of 250
building tree 140 of 250


[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:    8.6s


building tree 141 of 250
building tree 142 of 250
building tree 143 of 250
building tree 144 of 250
building tree 145 of 250
building tree 146 of 250
building tree 147 of 250
building tree 148 of 250
building tree 149 of 250
building tree 150 of 250
building tree 151 of 250
building tree 152 of 250
building tree 153 of 250
building tree 154 of 250
building tree 155 of 250
building tree 156 of 250
building tree 157 of 250
building tree 158 of 250
building tree 159 of 250
building tree 160 of 250


[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   10.0s


building tree 161 of 250
building tree 162 of 250
building tree 163 of 250
building tree 164 of 250
building tree 165 of 250
building tree 166 of 250
building tree 167 of 250
building tree 168 of 250
building tree 169 of 250
building tree 170 of 250
building tree 171 of 250
building tree 172 of 250
building tree 173 of 250
building tree 174 of 250
building tree 175 of 250
building tree 176 of 250
building tree 177 of 250


[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   11.5s


building tree 178 of 250
building tree 179 of 250
building tree 180 of 250
building tree 181 of 250
building tree 182 of 250
building tree 183 of 250
building tree 184 of 250
building tree 185 of 250
building tree 186 of 250
building tree 187 of 250
building tree 188 of 250
building tree 189 of 250
building tree 190 of 250
building tree 191 of 250
building tree 192 of 250
building tree 193 of 250
building tree 194 of 250
building tree 195 of 250
building tree 196 of 250
building tree 197 of 250
building tree 198 of 250


[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   12.7s


building tree 199 of 250
building tree 200 of 250
building tree 201 of 250
building tree 202 of 250
building tree 203 of 250
building tree 204 of 250
building tree 205 of 250
building tree 206 of 250
building tree 207 of 250
building tree 208 of 250
building tree 209 of 250
building tree 210 of 250
building tree 211 of 250
building tree 212 of 250
building tree 213 of 250
building tree 214 of 250
building tree 215 of 250
building tree 216 of 250


[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:   14.1s


building tree 217 of 250
building tree 218 of 250
building tree 219 of 250
building tree 220 of 250
building tree 221 of 250
building tree 222 of 250
building tree 223 of 250
building tree 224 of 250
building tree 225 of 250
building tree 226 of 250
building tree 227 of 250
building tree 228 of 250
building tree 229 of 250
building tree 230 of 250
building tree 231 of 250
building tree 232 of 250
building tree 233 of 250
building tree 234 of 250
building tree 235 of 250
building tree 236 of 250
building tree 237 of 250
building tree 238 of 250
building tree 239 of 250


[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:   15.5s


building tree 240 of 250
building tree 241 of 250
building tree 242 of 250
building tree 243 of 250
building tree 244 of 250
building tree 245 of 250
building tree 246 of 250
building tree 247 of 250
building tree 248 of 250
building tree 249 of 250
building tree 250 of 250


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   16.4s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=-1, oob_score=False, random_state=None,
                       verbose=10, warm_start=False)

In [347]:
classification_metrics(model, processed_train, target['status_group'])

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    0.8s
[Para

RandomForestClassifier
Accuracy Score: 0.7934848484848485
F1 Score: 0.772256639138132
Confusion Matrix:
                       Predicted Functional  Predicted Needs Repair  \
Actual Functional                     30675                     101   
Actual Needs Repair                    3133                     560   
Actual Non-Functional                  6854                      72   

                       Predicted Non-Functional  
Actual Functional                          1483  
Actual Needs Repair                         624  
Actual Non-Functional                     15898  


## XGBoost

In [316]:
from xgboost import XGBClassifier
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [375]:
grboost = XGBClassifier(n_estimators=250,
                        booster='dart',
                        max_depth=5,
                        learning_rate=0.07,
                        silent=False,
                        n_jobs=-1,
)

# param = {'booster': 'dart',
#          'max_depth': 5, 'learning_rate': 0.07,
#          'objective': 'binary:logistic', 'silent': False,
#          'sample_type': 'uniform',
#          'normalize_type': 'tree', 'n_jobs':-1,
#          'rate_drop': 0.1,
#          'skip_drop': 0.5}

grboost.fit(processed_train, target['encoded'])

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.07, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=None, n_estimators=250,
              n_jobs=-1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=False, subsample=1)

In [376]:
y_train_pred_boost= grboost.predict(processed_train)

In [377]:
y_test_pred_boost = grboost.predict(processed_test, ntree_limit=50)

In [378]:
accuracy_score(target['encoded'], y_train_pred_boost)

0.824023569023569

In [339]:
create_submission(y_test_pred_boost)

SubmissionsMB_2019-05-22_20:26:28.csv


In [100]:
y_test_pred_boost = grboost.predict_proba(processed_test)

### AdaBoostClassifier 

In [108]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=100)

search = cross_validate(ada, 
                        processed_train, 
                        target['status_group'], 
                        return_train_score=True,
                        return_estimator=True,
                        scoring='accuracy', 
                        n_jobs=-1,
                        verbose=10,
                        cv=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min finished


In [109]:
pd.DataFrame( search )

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,101.621589,17.280218,"(DecisionTreeClassifier(class_weight=None, cri...",0.742071,0.742071
1,100.635195,17.111658,"(DecisionTreeClassifier(class_weight=None, cri...",0.739596,0.741995
2,100.662782,17.122066,"(DecisionTreeClassifier(class_weight=None, cri...",0.741212,0.742551


### Voting Classifier

In [121]:
from sklearn.ensemble import VotingClassifier

estimators = [
    ('rf', model),
    ('grboost', grboost),
]

ensemble = VotingClassifier(estimators, n_jobs=-1, voting='soft', weights=[1, 1])

In [122]:
ensemble.fit(processed_train, target['status_group'])

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=12,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=400,
                                                     n_jobs=-1, oob_score=Fals

In [123]:
ensemble_train_predict = ensemble.predict(processed_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    1.5s
[Para

In [114]:
ensemble_test_predict = ensemble.predict(processed_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    0.5s
[Para

In [115]:
ensemble_test_predict

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [124]:
accuracy_score(target['status_group'], ensemble_train_predict)

0.8261447811447812

In [118]:
create_submission(ensemble_test_predict)