# Load Dependencies 

In [150]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import Data

In [227]:
train = pd.read_csv('Data/train_features.csv')
test = pd.read_csv('Data/test_features.csv')
target = pd.read_csv('Data/train_labels.csv')
train.shape, test.shape

((59400, 40), (14358, 40))

# Select Features

In [228]:
selected_features = ['amount_tsh',
    'date_recorded',
    'gps_height',
    'basin',
    'region',
    'population',
    'public_meeting',
    'scheme_management',
    'permit',
    'construction_year',
    'extraction_type_class',
    'management_group',
    'payment',
    'quality_group',
    'quantity',
    'source_type',
    'source_class', 
    'waterpoint_type',
    'funder',
    'installer', 
    'latitude',
    'longitude']

def select_features(df, features):
    '''
    Subsets dataframe based on list of columns names accepted 
    as a parameter.
    '''
    return df[features]

In [229]:
train = select_features(train, features=selected_features)
test = select_features(test, features=selected_features)
train.shape, test.shape

((59400, 22), (14358, 22))

# Encode Target

In [230]:
target['encoded'] = target['status_group'].replace({
    'functional':1,
    'non functional':-1,
    'functional needs repair':0
})

# Process Features 

In [231]:
def wrangle_features(X):
    X = X.copy()
    
    # Create month and year features from the recorded data feature
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])
    X['date_recorded_month'] = X['date_recorded'].dt.month
    X['date_recorded_year'] = X['date_recorded'].dt.year
    
    # Bin low freq. categories into 'other'
    X['scheme_management'] = X['scheme_management'].replace({
        'SWC':'Other',
        'Trust':'Other',
        'None':'Other'
    })
    
    # Create age category out of construction_year
    # Bin 0 values as -1
    X['pump_age'] = ( 2014 - X['construction_year'] )
    X['pump_age'].replace({2014:-1}).value_counts()
    
    # Alter longitude values so all points are reasonable
    X['longitude'] = X['longitude'].replace({
        0.000000:train['longitude'].median()
    })
    
    # Create Installer Features
    X['DWE_Installer'] = (X['installer'] == 'DWE')

    one_time_install = train['installer'].value_counts()[train['installer'].value_counts() == 1]
    X['One_Time_Installer'] = X['installer'].isin(one_time_install.index)

    small_install = train['installer'].value_counts()[ (train['installer'].value_counts() < 10) & (train['installer'].value_counts() > 1) ]
    X['Small_Installer'] = X['installer'].isin(small_install.index)

    big_install = (( train['installer'].value_counts() >= 10 ) == True)
    X['Big_Installer'] = X['installer'].isin(big_install.index)
    
    # Create Funder Features
    X['Tanzania_Gov_Funder'] = (X['funder'] == 'Government Of Tanzania')

    one_time_funder = train['funder'].value_counts()[train['funder'].value_counts() == 1]
    X['One_Time_Funder'] = X['funder'].isin(one_time_funder.index)

    small_funder = train['funder'].value_counts()[ (train['funder'].value_counts() < 10) & (train['funder'].value_counts() > 1) ]
    X['Small_Funder'] = X['funder'].isin(small_funder.index)

    big_funder = (( train['funder'].value_counts() >= 10 ) == True)
    X['Big_Funder'] = X['funder'].isin(big_funder.index)
    
    # Replace population 0 with median population of Train
    median_pop = train['population'].median()
    X['population'] = X['population'].replace(0, median_pop)
    
    # Create interaction between amount of water avaialable and population
    X['pop*amount_tsh'] = X['population'] * X['amount_tsh']
    
    # Drop unecessary columns
    drop_cols = ['date_recorded', 'funder', 'installer', 'construction_year']
    X = X.drop(columns=drop_cols)
    
    return X

In [232]:
test = wrangle_features(test)
train = wrangle_features(train)

train.shape, test.shape

((59400, 30), (14358, 30))

In [233]:
from sklearn.preprocessing import StandardScaler, RobustScaler
def standardize(X):
    # Features to standardize
    standardize_cols = ['amount_tsh', 'gps_height', 'longitude', 
                        'latitude', 'population', ]
    
    # Silence Data Conversion warning
    X[standardize_cols] = X[standardize_cols].astype(float)
    
    # Fit and transform scaler
    scaler = RobustScaler()
    scaler.fit(train[standardize_cols])
    scaled = pd.DataFrame( scaler.transform(X[standardize_cols]) )
    
    # Add back column names
    for i in range(len(standardize_cols)):
        scaled = scaled.rename(columns={i:standardize_cols[i]})
        
    # Drop non-standardized columns
    X = X.drop(columns=standardize_cols)
    
    # Concat scaled features with rest of features
    X = pd.concat([X, scaled], axis=1)
    
    return X

In [234]:
train = standardize(train)
test = standardize(test)

train.shape, test.shape

((59400, 30), (14358, 30))

In [235]:
import category_encoders as ce

def one_hot(X):
    # Features to one hot encode
    one_hot_cols = ['date_recorded_month', 'date_recorded_year', 
                    'basin', 'region', 'extraction_type_class', 
                    'management_group', 'payment', 'quality_group', 
                    'quantity', 'source_type', 'source_class', 
                    'waterpoint_type']
    
    # Convert all relevant cols to category datatype (for encoder)
    X[one_hot_cols] = X[one_hot_cols].astype('category')
    
    # Initialize and transform relevant features
    encoder = ce.OneHotEncoder(use_cat_names=True)
    
    # Note, train hardcoded to avoid overfitting test data
    encoder.fit(train[one_hot_cols])
    X = encoder.transform(X[one_hot_cols])
    
    return X

In [236]:
def target_encode(X):
    # Features to target encode
    target_cols = ['date_recorded_month', 'date_recorded_year', 
                    'basin', 'region', 'extraction_type_class', 
                    'management_group', 'payment', 'quality_group', 
                    'quantity', 'source_type', 'source_class', 
                    'waterpoint_type', 'scheme_management']
    
    # Convert all relevant cols to category datatype (for encoder)
    X[target_cols] = X[target_cols].astype('category')
    
    # Initialize and transform relevant features
    encoder = ce.TargetEncoder(smoothing=5, min_samples_leaf=5)
    
    # Note, train hardcoded to avoid overfitting test data
    encoder.fit(train[target_cols], target['encoded'])
    encoded = encoder.transform(X[target_cols])
    
    # Add Target Encoded features back to features DataFrame
    X = X.drop(columns=target_cols)
    X = pd.concat([X, encoded], axis=1)
    
    return X

In [237]:
processed_test = target_encode(test)
processed_train = target_encode(train)
processed_train.shape, processed_test.shape

((59400, 30), (14358, 30))

In [238]:
from sklearn.impute import SimpleImputer

def impute(X):
    imputer = SimpleImputer(strategy='most_frequent')
    imputer.fit(processed_train)
    X = imputer.transform(X)
    return X

In [239]:
processed_test = pd.DataFrame( impute(processed_test), columns=processed_test.columns)
processed_train = pd.DataFrame( impute(processed_train), columns=processed_train.columns)

In [240]:
# processed_train.to_csv(f'processed_train.csv', index=False)
# processed_test.to_csv(f'processed_test.csv', index=False)

## Polynomial Features

In [241]:
from sklearn.preprocessing import PolynomialFeatures

In [242]:
poly = PolynomialFeatures(interaction_only=False)
poly.fit(processed_train)
poly_train = poly.transform(processed_train)
poly_test = poly.transform(processed_test)
poly_train.shape, poly_test.shape

((59400, 496), (14358, 496))

## PCA

In [243]:
from sklearn.decomposition import PCA

pca = PCA(n_components=15)

pca.fit(poly_train)

pca_train = pca.transform(poly_train)
pca_test = pca.transform(poly_test)

In [244]:
def pca_to_df(pca_array):
    pca_df = pd.DataFrame(pca_array)
    for col in pca_df.columns:
        pca_df = pca_df.rename(columns={
            col:'PCA_Poly' + str(col) 
        })
    return pca_df

pca_train = pca_to_df(pca_train)
pca_test = pca_to_df(pca_test)

## Clustering

In [245]:
from sklearn.cluster import KMeans

K = [100]
sum_of_squared_distances = []

km = KMeans(n_clusters=k)
km = km.fit(processed_train)
sum_of_squared_distances.append(km.inertia_)

train_clusters = km.predict(processed_train)
test_clusters = km.predict(processed_test)

In [246]:
test_clusters = pd.Series(test_clusters, name='clusters')
train_clusters = pd.Series(train_clusters, name='clusters')

## Concatenating DataFrame

In [247]:
final_train = pd.concat([processed_train, pca_train, train_clusters], axis=1)
final_test = pd.concat([processed_test, pca_test, test_clusters], axis=1)

In [248]:
final_train.shape, final_test.shape

((59400, 46), (14358, 46))

# Testing Model

### Random Forest Classifier

In [253]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

model = RandomForestClassifier(n_estimators=300,
                              max_depth=None, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.2min finished


In [255]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': [200, 300, 400],
    'max_depth': [15, 25, 30],
}

search = RandomizedSearchCV(model, 
                           param_distributions=param_distributions,
                           n_iter=8,
                           scoring='accuracy', 
                           verbose=10,
                           cv=3,
                           n_jobs=-1,
                           return_train_score=True)

search.fit(final_train, target['status_group'])

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:  5.2min remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  6.0min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=300, n_jobs=-1,
                                                    oob

In [257]:
search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=25, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [259]:
search.best_score_

0.793989898989899

### GradientBoostingClassifier 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(verbose=10, n_iter_no_change=10)

param_distributions = {
    'n_estimators': [200, 300],
    'max_depth': [15, 25],
}

search = RandomizedSearchCV(gbc, 
                           param_distributions=param_distributions,
                           n_iter=8,
                           scoring='accuracy', 
                           verbose=10,
                           cv=3,
                           n_jobs=-1,
                           return_train_score=True)

search.fit(final_train, target['status_group'])

In [None]:
search.best_estimator_

### SVC

In [None]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')

param_distributions = {
    'C': [0.1, 1],
    'gamma': [0.01, 'auto'],
    'kernel': ['linear', 'rbf']
}

svc_search = RandomizedSearchCV(svc, 
                           param_distributions=param_distributions,
                           n_iter=8,
                           scoring='accuracy', 
                           verbose=10,
                           cv=3,
                           n_jobs=-1,
                           return_train_score=True)

svc_search.fit(final_train, target['status_group'])

In [None]:
svc.best_estimator_

### K Nearest Neighbors

In [265]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
kn = KNeighborsClassifier(n_jobs=-1)

param_distributions = {
    'n_neighbors': [3, 5, 7],
    'p': [1, 2]
}

kn_search = RandomizedSearchCV(kn, 
                           param_distributions=param_distributions,
                           n_iter=8,
                           scoring='accuracy', 
                           verbose=10,
                           cv=3,
                           n_jobs=-1,
                           return_train_score=True)

kn_search.fit(final_train, target['status_group'])

## Best Estimators

In [266]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=25, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

gbc = GradientBoostingClassifier(n_estimators=300, max_depth=15, n_iter_no_change=10)

knn = KNeighborsClassifier(n_neighbors=7, p=1, n_jobs=-1)

svc = SVC(gamma='auto', C=1.0)

## Voting Classifier 

In [271]:
from sklearn.ensemble import VotingClassifier

estimators = [
    ('knn', km),
    ('svc', svc),
    ('gbc', gbc),
    ('rf', rf)
]

ensemble = VotingClassifier(estimators, n_jobs=-1, voting='hard')

In [272]:
ensemble.fit(final_train, target['status_group'])
ensemble_train_predict = ensemble.predict(final_train)
ensemble_test_predict = ensemble.predict(final_test)

In [274]:
accuracy_score(target['status_group'], ensemble_train_predict)

0.96493265993266

# Create Submission

In [223]:
def create_submission(y_test_pred):
    sample_submission = pd.read_csv('Data/sample_submission.csv')
    submission = sample_submission.copy()
    submission['status_group'] = y_test_pred
    
    now = pd.to_datetime('now')
    filename = 'MB_' + str(now).replace(' ','_')[0:-7] 
    
    submission.to_csv(f'Submissions/{filename}.csv', index=False)

In [276]:
create_submission(ensemble_test_predict)