# Load Dependencies 

In [341]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import Data

In [342]:
from sklearn.model_selection import train_test_split

train = pd.read_csv('Data/train_features.csv')
X_test = pd.read_csv('Data/test_features.csv')
target = pd.read_csv('Data/train_labels.csv')
label = target['status_group']

X_train, X_val, y_train, y_val = train_test_split(train, label, test_size = 0.2, random_state = 42)
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (14358, 40), (47520,), (11880,))

# Data Dictionary 

In [372]:
data_dict = pd.read_excel('Data_Dictionary.xlsx', skipfooter=3)
data_dict

Unnamed: 0,Index,Feature,Description,Dtype,% NA?,Unique values,Correlation w/Target,Hypothesis,Baseline Approach,Notes
0,1,amount_tsh,Amount of water available to waterpoint,float,,98,0.053,Important,Standardize Data,"Highly Positive Skew (70% = 0), should address..."
1,2,date_recorded,The data the observation was entered,object,,356,,Important,Create columns for month and year,Recording from 10/2002 to 12/2013
2,3,funder,who funded the wall,object,0.061,1897,,Moderate,Ignore,"Long Tail of funders, consider binning, invest..."
3,4,gps_height,altitude of the well,int,,2428,0.11,Important,Standardize Data,Distribution positively skewed
4,5,installer,orgaization that installed the well,object,0.061,2145,,Important,Ignore,"31% installed by same group, long-tail of smal..."
5,6,longitude,GPS coordinate,float,,57516,-0.004,Moderate,Ignore,Weird distribution of values near zero (error?...
6,7,latitude,GPS coordinate,float,,57517,0.014,Moderate,Ignore,
7,8,wpt_name,Name of the waterpoint if there is one,object,,37400,,Low,Ignore,6% of obs. Have value of 'none'
8,9,num_private,?,int,,65,0.0005,Low,Ignore,98% of values = 0
9,10,basin,Geographic water basin,object,,9,,Low,One Hot Encode,Majority category is 17% of population


# Select Features

In [343]:
selected_features = ['amount_tsh',
    'date_recorded',
    'gps_height',
    'basin',
    'region',
    'district_code',
    'population',
    'public_meeting',
    'scheme_management',
    'permit',
    'construction_year',
    'extraction_type_class',
    'management_group',
    'payment',
    'quality_group',
    'quantity',
    'source_type',
    'source_class', 
    'waterpoint_type',
    'funder',
    'installer', 
    'latitude',
    'longitude']

def select_features(df, features):
    '''
    Subsets dataframe based on list of columns names accepted 
    as a parameter.
    '''
    return df[features]

In [344]:
X_train = select_features(X_train, features=selected_features)
X_val = select_features(X_val, features=selected_features)
X_test = select_features(X_test, features=selected_features)
X_train.shape, X_val.shape, X_test.shape

((47520, 23), (11880, 23), (14358, 23))

# Encode Target

In [345]:
# Only used to test collapsing problem to binomial classification
target['encoded'] = target['status_group'].replace({
    'functional': 1,
    'non functional': 0,
    'functional needs repair':0
})

# Process Features 

### Impute Features

In [346]:
def smart_impute(X):
    ''' Adapted directly from Dakota P.'s awesome work '''
    X = X.copy()
    
    # Convert all strings in object columns to lowercase
    cat_features = X.select_dtypes('object').columns.tolist()
    for feature in cat_features:
        X[feature] = X[feature].str.lower()
    
    # Replace -2.000000e-08 with np.nan (not showing as zero due to datatype)
    X['latitude'] = X['latitude'].replace( -2.000000e-08, np.nan)
    
    impute_features = ['gps_height', 'population', 'amount_tsh', 'construction_year', 'latitude', 'longitude']

    for feature in impute_features:
        # Replace values=0.0 with np.nan (0.0 appears to indicate missing values in dataset)
        X[feature] = X[feature].replace(0, np.nan)
        
        # Note, hardcoded train when calculating mean to avoid leakage into test data
        # If District code available replace NA's with mean value of other well in the same district 
        X[feature] = X[feature].fillna( X.groupby(['region', 'district_code'])[feature].transform('mean') )
        # If no district code, replace NA's  with mean value of other wells in the same region
        X[feature] = X[feature].fillna( X.groupby(['region'])[feature].transform('mean') )
        # If no district and no region, replace NA's with mean value of all wells
        X[feature] = X[feature].fillna( X[feature].mean() )
    
    # Convert Boolean Features to int
    
    
    return X

In [347]:
X_train = smart_impute(X_train)
X_val = smart_impute(X_val)
X_test = smart_impute(X_test)
X_train.shape, X_val.shape, X_test.shape

((47520, 23), (11880, 23), (14358, 23))

# Wrangle Features

In [348]:
def wrangle_features(X):
    X = X.copy()
    
    # Create month and year features from the recorded data feature
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])
    X['date_recorded_month'] = X['date_recorded'].dt.month
    X['date_recorded_year'] = X['date_recorded'].dt.year
    
    # Create feature for seasons in Tanzania
    X['Hot_Dry_Season'] = (X['date_recorded_month'] == 12) | (X['date_recorded_month'] < 3)
    X['Heavy_Rain_Season'] = (X['date_recorded_month'] > 4) & (X['date_recorded_month'] < 6)
    X['Cool_Dry_Season'] = (X['date_recorded_month'] > 5) & (X['date_recorded_month'] < 11)
    X['Moderate_Rain'] = (X['date_recorded_month'] == 11) | (X['date_recorded_month'] == 3)
    
    # Bin low freq. categories into 'other' 
    X['scheme_management'] = X['scheme_management'].replace({
        'SWC':'Other',
        'Trust':'Other',
        'None':'Other'
    })
    
    # Create unkown category for missing values in scheme management column
    X['scheme_management'] = X['scheme_management'].fillna('unknown')
    X['permit'] = X['permit'].fillna(False)
    X['public_meeting'] = X['public_meeting'].fillna(True)
    
    # Create age category out of construction_year
    # Bin 0 values as -1
    X['pump_age'] = ( 2014 - X['construction_year'] )
    X['pump_age'] = X['pump_age'].replace({2014:-1})
    
    # Create Installer Features
    X['DWE_Installer'] = (X['installer'] == 'DWE')
    X['Gov_Installer'] = (X['installer'] == 'Government')

    one_time_install = train['installer'].value_counts()[train['installer'].value_counts() == 1]
    X['One_Time_Installer'] = X['installer'].isin(one_time_install.index)

    small_install = train['installer'].value_counts()[ (train['installer'].value_counts() < 10) & (train['installer'].value_counts() > 1) ]
    X['Small_Installer'] = X['installer'].isin(small_install.index)

    big_install = (( train['installer'].value_counts() >= 10 ) == True)
    X['Big_Installer'] = X['installer'].isin(big_install.index)
    
    # Create Funder Features
    X['Tanzania_Gov_Funder'] = (X['funder'] == 'Government Of Tanzania')

    one_time_funder = train['funder'].value_counts()[train['funder'].value_counts() == 1]
    X['One_Time_Funder'] = X['funder'].isin(one_time_funder.index)

    small_funder = train['funder'].value_counts()[ (train['funder'].value_counts() < 10) & (train['funder'].value_counts() > 1) ]
    X['Small_Funder'] = X['funder'].isin(small_funder.index)

    big_funder = (( train['funder'].value_counts() >= 10 ) == True)
    X['Big_Funder'] = X['funder'].isin(big_funder.index)
    
    # Create interaction between amount of water avaialable and population
    X['pop*amount_tsh'] = X['population'] * X['amount_tsh']
    X['pop/amount_tsh'] = X['population'] / X['amount_tsh']
    X['pop/amount_tsh'] = X['pop/amount_tsh'].replace(np.inf, 3000)
    
    # Create interaction between amount of water avaialable and height
    X['gps_height*amount_tsh'] = X['gps_height'] * X['amount_tsh']
    
    # Polynomial gps_Height
    X['gps_height**2'] = X['gps_height'] ** 2
    X['gps_height**3'] = X['gps_height'] ** 3
    
    # Interaction between latitude and height
    X['latitude*height*amount'] = X['latitude'] * X['gps_height'] * X['amount_tsh']
    X['latitude*height'] = X['latitude'] * X['gps_height']
    
    # Create Binned Features
    labels = [1,2,3,4,5,6,7,8,9,10]
    X['gps_height_binned'] = pd.cut(x=X['gps_height'], bins=10, labels=labels)
    X['pump_age_binned'] = pd.cut(x=X['pump_age'], bins=10, labels=labels)
    X['amount_tsh_binned'] = pd.cut(x=X['amount_tsh'], bins=10, labels=labels)
    X['longitude_binned'] = pd.cut(x=X['longitude'], bins=10, labels=labels)
    X['latitude_binned'] = pd.cut(x=X['latitude'], bins=10, labels=labels)
    
    # Convert Binned Features to int datatype
    X['gps_height_binned'] = X['gps_height_binned'].astype(int)
    X['pump_age_binned'] = X['pump_age_binned'].astype(int)
    X['amount_tsh_binned'] = X['amount_tsh_binned'].astype(int)
    X['longitude_binned'] = X['longitude_binned'].astype(int)
    X['latitude_binned'] = X['latitude_binned'].astype(int)
    
    # Pumps Funded and Built by Tanzania Government
    X['Gov_Funded_Gov_Built'] = X['Tanzania_Gov_Funder'] * X['DWE_Installer'].astype(int)
    
    return X

In [349]:
X_test = wrangle_features(X_test)
X_val = wrangle_features(X_val)
X_train = wrangle_features(X_train)

X_train.shape, X_val.shape, X_test.shape

((47520, 52), (11880, 52), (14358, 52))

# Create Composite Scores (Additional Feature Engineering) 

In [350]:
def create_score(X, feature):
    rel_score = pd.crosstab(X_train[feature], target['status_group'])
    rel_score = pd.DataFrame(rel_score)
    
    new_feature = feature + '_rel_score'
    total = rel_score['functional'] + rel_score['functional needs repair'] + rel_score['non functional']
    rel_score[new_feature] = rel_score['functional'] / total
    
    rel_score = rel_score.reset_index()
    rel_score = rel_score[ [feature, new_feature] ]
    
    X = pd.merge(X, rel_score, how='left', on=feature)
    X[new_feature] = X[new_feature].fillna(0)
    
    return X

In [351]:
def composite_scores(X):
    X = X.copy()
    
    # Create Reliability Scores
    rel_cols = ['installer', 'scheme_management', 'management_group', 'payment', 'extraction_type_class',
               'waterpoint_type', 'pump_age_binned', 'gps_height_binned', 'amount_tsh_binned', 'longitude_binned',
               'latitude_binned', 'region', 'basin', 'district_code']
    
    for col in rel_cols:
        X = create_score(X, col)
    
    # Create Composite Scores
    X['Management_Score'] = X['installer_rel_score'] + X['scheme_management_rel_score'] + X['management_group_rel_score']
    X['Condition_Score'] = X['gps_height_binned_rel_score'] + X['amount_tsh_binned_rel_score'] + X['latitude_binned_rel_score'] + X['longitude_binned_rel_score']
    X['Tech_Score'] = X['payment_rel_score'] + X['pump_age_binned_rel_score'] + X['extraction_type_class_rel_score'] + X['waterpoint_type_rel_score']
    
    X['Overall_Reliability_Score'] = X['Management_Score'] + X['Condition_Score'] + X['Tech_Score']
    X['Overall_Reliability_Score**2'] = X['Overall_Reliability_Score'] ** 2 
    
    # Public Support Score 
    X['Public_Support'] = X['payment_rel_score'] + X['public_meeting'] + X['permit']
    
    # Location Score
    X['Location_Score'] = X['basin_rel_score'] + X['district_code_rel_score'] + X['region_rel_score']
    
    # Location * Condition
    X['Location*Condition'] = X['Location_Score'] + X['Condition_Score']
    X['Location*Condition**2'] = X['Location*Condition'] ** 2
    
    # Public Support and Location
    X['Public_Support+Loc'] = X['Public_Support'] + X['Location_Score']
    
    # Site Score
    X['Site_Score'] = X['Location_Score'] + X['Public_Support'] + X['Overall_Reliability_Score']
    X['Site_Score**2'] = X['Site_Score'] ** 2
    
    # Negative Correlated Features
    X['dry&high'] = (X['Hot_Dry_Season'] | X['Cool_Dry_Season']) * X['latitude*height']
    X['Pump_Age*Pop/Amount_tsg'] = X['pump_age'] * X['pop/amount_tsh']
    X['pump_age_binned**2'] = X['pump_age_binned'] ** 2
    
    return X

In [352]:
X_train = composite_scores(X_train)
X_val = composite_scores(X_val)
X_test = composite_scores(X_test)

X_train.shape, X_val.shape, X_test.shape

((47520, 81), (11880, 81), (14358, 81))

In [353]:
def drop_cols(X):
    X = X.copy()
    
    # Drop unecessary columns
    drop_cols = ['date_recorded', 'funder', 'installer', 'construction_year']
    X = X.drop(columns=drop_cols)
    
    return X

In [354]:
X_train = drop_cols(X_train)
X_val = drop_cols(X_val)
X_test = drop_cols(X_test)

X_train.shape, X_val.shape, X_test.shape

((47520, 77), (11880, 77), (14358, 77))

# Encode Features 

### One Hot Encoding

In [355]:
import category_encoders as ce

def one_hot(X_train, X_val, X_test):
    # Features to one hot encode
    one_hot_cols = ['extraction_type_class', 'payment', 'quality_group', 
                    'quantity', 'source_class', 'waterpoint_type']
    
    # Initialize and transform relevant features
    encoder = ce.OneHotEncoder(cols=one_hot_cols, use_cat_names=True)
    
    # Note, train hardcoded to avoid overfitting test data
    encoder.fit(X_train)
    train_encoded = encoder.transform(X_train)
    val_encoded = encoder.transform(X_val)
    test_encoded = encoder.transform(X_test)
    
    train_encoded, val_encoded = train_encoded.align(val_encoded, join='left', axis=1)
    train_encoded, test_encoded = train_encoded.align(test_encoded, join='left', axis=1)
    
    return train_encoded, val_encoded, test_encoded

In [356]:
X_train, X_val, X_test = one_hot(X_train, X_val, X_test)

X_train.shape, X_val.shape, X_test.shape

((47520, 106), (11880, 106), (14358, 106))

In [357]:
X_train.columns.tolist() == X_val.columns.tolist() == X_test.columns.tolist()

True

### Ordinal Encoding

In [358]:
import category_encoders as ce

def ordinal(X):
    ord_cols = ['basin', 'source_type', 'region', 'scheme_management',
                    'management_group']
    
    # Initialize and transform relevant features
    encoder = ce.OrdinalEncoder(cols=ord_cols)
    
    # Note, train hardcoded to avoid overfitting test data
    encoder.fit(X_train)
    X = encoder.transform(X)
    
    return X

In [359]:
X_train = ordinal(X_train)
X_val = ordinal(X_val)
X_test = ordinal(X_test)

X_train.shape, X_val.shape, X_test.shape

((47520, 106), (11880, 106), (14358, 106))

### Create Additional Features Based on Encoded Features 

- Only intended to be used with OneHotEncoding

In [360]:
def non_func_features(X):
    X = X.copy()
    
    # Convert boolean cols to ints
    X = X.applymap(lambda x: 1 if x == True else x)
    X = X.applymap(lambda x: 0 if x == False else x)
    
    X['quantity_dry*pump_age/amount'] = X['quantity_dry'] / X['Pump_Age*Pop/Amount_tsg']
    
    X['Condition_Risk'] = X['quantity_dry'] + X['waterpoint_type_other'] + X['extraction_type_class_other'] + X['quality_group_unknown']
    X['Condition**2'] = X['Condition_Risk'] ** 2
    
    X['Tech_Risk'] = X['extraction_type_class_motorpump'] + X['waterpoint_type_communal standpipe multiple'] + X['waterpoint_type_other']
    X['Tech_Risk**2'] = X['Tech_Risk'] ** 2
    
    X['Non_Tech_Risk*Pump_age'] = X['Tech_Risk'] * X['pump_age']
    
    X['Management_Risk'] = X['Tanzania_Gov_Funder'] + X['Gov_Installer'] + X['payment_never pay'] + X['payment_unknown']
    X['Management_Risk**2'] = X['Management_Risk'] ** 2
    
    X['Non_Func_Risk'] = X['Condition_Risk'] + X['Tech_Risk'] + X['Management_Risk']
    X['Non_Func_Risk*Pump_age'] = X['Non_Func_Risk'] * X['pump_age']
    X['Non_Func_Risk**2'] = X['Non_Func_Risk'] ** 2
    
    X['Tech_Repair_Score'] = X['source_class_surface'] + X['extraction_type_class_gravity']
    X['Tech_Repair_Score**2'] = X['Tech_Repair_Score'] ** 2
    
    return X 

In [361]:
X_train = non_func_features(X_train)
X_val = non_func_features(X_val)
X_test = non_func_features(X_test)

X_train.shape, X_val.shape, X_test.shape

((47520, 119), (11880, 119), (14358, 119))

# Testing Model

### Create Submission Functions

In [362]:
def create_submission(y_test_pred):
    sample_submission = pd.read_csv('Data/sample_submission.csv')
    submission = sample_submission.copy()
    submission['status_group'] = y_test_pred
    
    now = pd.to_datetime('now')
    filename = 'MB_' + str(now).replace(' ','_')[0:-7] 
    
    submission.to_csv(f'Submissions/{filename}.csv', index=False)
    print(f'Submissions/{filename}.csv')

### Classification Metrics 
- Function to capture snapshot of model performance

In [363]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, log_loss, f1_score, mean_absolute_error, mean_squared_error 
import matplotlib.pyplot as plt

def classification_metrics(fit_estimator, X, y):
    '''
    Evaluates classifiers performance across several metrics (accuracy, roc_auc, confusion matrix,
                                                              log loss, f1, mean absolute error, 
                                                              mean squared error)
    Parameters:
        fit_estimator: an sklearn estimator that has been fitted to train data
        X: the features with which predictions will be based on
        y: the labels with which performance will be evaluated
    Returns: 
        Prints summary of performance across metrics and visualizaiton of ROC curve
    '''
    score = fit_estimator.score(X, y)
    y_pred = fit_estimator.predict(X)
    y_pred_proba = fit_estimator.predict_proba(X)[:,1]
    
    name = fit_estimator.__class__.__name__
    print(name)
    print('Accuracy Score:', score)
    print('F1 Score:', f1_score(y, y_pred, average='weighted'))

    conf_mat = pd.DataFrame(confusion_matrix(y, y_pred), columns=['Predicted Functional', 'Predicted Needs Repair', 'Predicted Non-Functional'],
                           index=['Actual Functional','Actual Needs Repair', 'Actual Non-Functional'])
    print('Confusion Matrix:')
    print(conf_mat)

# Random Forest Classifier

In [374]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [375]:
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   14.9s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=6, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [376]:
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    2.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.8s finished


In [377]:
test_accuracy = accuracy_score(y_train, train_pred)
val_accuracy = accuracy_score(y_val, val_pred)
test_accuracy, val_accuracy

(0.7468434343434344, 0.7001683501683502)

In [379]:
test_pred = model.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.4s finished


In [380]:
create_submission(test_pred)

Submissions/MB_2019-05-23_00:27:41.csv


In [381]:
classification_metrics(model, X_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    1.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    1.3s finished


RandomForestClassifier
Accuracy Score: 0.7468434343434344


  'precision', 'predicted', average, warn_for)


F1 Score: 0.7126186127677527
Confusion Matrix:
                       Predicted Functional  Predicted Needs Repair  \
Actual Functional                     24151                       0   
Actual Needs Repair                    2863                       0   
Actual Non-Functional                  6913                       0   

                       Predicted Non-Functional  
Actual Functional                          1651  
Actual Needs Repair                         603  
Actual Non-Functional                     11339  


# XGBoost

In [110]:
from xgboost import XGBClassifier
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [113]:
grboost = XGBClassifier(learning_rate=0.07, booster = 'dart', n_estimators=200, max_depth = 8)

grboost.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='dart', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.07, max_delta_step=0,
              max_depth=8, min_child_weight=1, missing=None, n_estimators=200,
              n_jobs=1, nthread=None, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [114]:
y_train_pred_boost= grboost.predict(X_train)

In [119]:
y_val_pred_boost = grboost.predict(X_val, ntree_limit=50)

In [117]:
accuracy_score(y_train, y_train_pred_boost)

0.8440656565656566

In [120]:
accuracy_score(y_val, y_val_pred_boost)

0.7787878787878788

In [100]:
y_test_pred_boost = grboost.predict_proba(X_test)

In [339]:
create_submission(y_test_pred_boost)

SubmissionsMB_2019-05-22_20:26:28.csv


# AdaBoostClassifier 

In [108]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=100)

search = cross_validate(ada, 
                        X_train, 
                        y_train, 
                        return_train_score=True,
                        return_estimator=True,
                        scoring='accuracy', 
                        n_jobs=-1,
                        verbose=10,
                        cv=3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.7min finished


In [109]:
pd.DataFrame( search )

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,101.621589,17.280218,"(DecisionTreeClassifier(class_weight=None, cri...",0.742071,0.742071
1,100.635195,17.111658,"(DecisionTreeClassifier(class_weight=None, cri...",0.739596,0.741995
2,100.662782,17.122066,"(DecisionTreeClassifier(class_weight=None, cri...",0.741212,0.742551


# Voting Classifier

In [290]:
# Filenames of  submissions to ensemble
files = ['submission-01.csv', 'submission-02.csv', 'submission-03.csv']

def create_ensemble_submission(files):
    submissions = (pd.read_csv(file)[['status_group']] for file in files)
    ensemble = pd.concat(submissions, axis='columns')
    majority_vote = ensemble.mode(axis='columns')[0]

    sample_submission = pd.read_csv('sample_submission.csv')
    submission = sample_submission.copy()
    submission['status_group'] = majority_vote
    
    now = pd.to_datetime('now')
    filename = 'MB_' + str(now).replace(' ','_')[0:-7]
    
    submission.to_csv('Submissions/{filename}.csv', index=False)
    print(f'Submissions/{filename}.csv')

In [121]:
from sklearn.ensemble import VotingClassifier

estimators = [
    ('rf', model),
    ('grboost', grboost),
]

ensemble = VotingClassifier(estimators, n_jobs=-1, voting='soft', weights=[1, 1])

In [122]:
ensemble.fit(processed_train, target['status_group'])

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=12,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=400,
                                                     n_jobs=-1, oob_score=Fals

In [123]:
ensemble_train_predict = ensemble.predict(processed_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    1.5s
[Para

In [114]:
ensemble_test_predict = ensemble.predict(processed_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:    0.5s
[Para

In [115]:
ensemble_test_predict

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'non functional'], dtype=object)

In [124]:
accuracy_score(target['status_group'], ensemble_train_predict)

0.8261447811447812

In [118]:
create_submission(ensemble_test_predict)