In [611]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score,SCORERS
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier

In [612]:
# Load the training features, training labels and test features, in that order.
train_x, train_y, test_x = (
    pd.read_csv(csv_path)
    for csv_path in (
        "training_set_values.csv",
        "training_set_labels.csv",
        "test_set_values.csv",
    )
)

# Dropping Features
- Dropping features with too many missing values - scheme_name
- Dropping unusable/unnecessary features - num_private, recorded_by
- Dropping features that are highly correlated with another feature, keeping only one feature from each such group

In [613]:
# Columns removed from both the training and the test frame:
#   drop_features - too many missing values, or unusable
#   repeated      - very highly correlated with a column that is kept
drop_features = ["scheme_name", "num_private", "recorded_by"]
repeated = [
    "payment",
    "quality_group",
    "quantity_group",
    "source",
    "source_class",
    "region",
    "extraction_type",
    "extraction_type_group",
    "waterpoint_type_group",
]

# DataFrame.drop returns a new frame, so train_x itself is left untouched.
X = train_x.drop(columns=drop_features).drop(columns=repeated)
test_x = test_x.drop(columns=drop_features).drop(columns=repeated)

In [614]:
# X.isnull().sum()

In [615]:
# pd.set_option('display.max_rows', 3000)
# train_x['basin'].value_counts()

In [616]:
# Restore pandas' default row-display limit (undoes any earlier display.max_rows override).
pd.reset_option("display.max_rows")

# Filling in null values

In [617]:
# Columns whose nulls are imputed with the most frequent TRAINING value.
missing_null = ["funder", "installer", "subvillage", "public_meeting", "scheme_management", "permit"]

for col in missing_null:
    # Compute the training mode once and use it for both frames, so the test set is
    # imputed with exactly the value learned from the training set.
    fill_value = X[col].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on a column
    # selection: the in-place form is chained assignment, which newer pandas warns
    # about and which leaves the frame unchanged under copy-on-write.
    X[col] = X[col].fillna(fill_value)
    test_x[col] = test_x[col].fillna(fill_value)

# Filling in other types of missing values, such as 0, "None" and "unknown"

In [618]:
# Replace sentinel "missing" markers (the strings "None" / "unknown" and the number 0)
# with values derived from the TRAINING data, and apply the same replacement to the
# test data. Results are assigned back to the column rather than using inplace=True on
# a column selection, which newer pandas warns about and which does not modify the
# frame under copy-on-write.

# "None" (string) -> training mode
missing_None_mode = ["scheme_management"]
for col in missing_None_mode:
    mode = X[col].mode()[0]
    X[col] = X[col].replace("None", mode)
    test_x[col] = test_x[col].replace("None", mode)

# "unknown" (string) -> training mode
missing_unknown_values_mode = ["payment_type", "water_quality", "quantity", "management", "management_group"]
for col in missing_unknown_values_mode:
    mode = X[col].mode()[0]
    X[col] = X[col].replace("unknown", mode)
    # Apply the same replacement to the test frame so that train and test share the
    # same set of categories when they are encoded later.
    test_x[col] = test_x[col].replace("unknown", mode)

# 0 -> training mean (truncated to int), computed with the zeros excluded
missing_zero_mean_int = ["gps_height", "population", "construction_year"]
for col in missing_zero_mean_int:
    # Turn zeros into NaN first so they do not pull the mean down.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    X[col] = X[col].replace(0, np.nan)
    mean = int(X[col].mean(skipna=True))
    X[col] = X[col].fillna(mean)
    test_x[col] = test_x[col].replace(0, mean)

# wpt_name is dropped from both frames rather than imputed.
missing_none_drop_temp = ["wpt_name"]
X = X.drop(missing_none_drop_temp, axis=1)
test_x = test_x.drop(missing_none_drop_temp, axis=1)

In [619]:
# pd.set_option('display.max_rows', 3000)
# test_x['funder'].value_counts()

In [620]:
# X.isnull().sum()

In [621]:
# Encoder choice per categorical feature:
#   - one-hot for features with few categories;
#   - ordinal for genuinely ordered features, and also for unordered features whose
#     number of categories is too high for one-hot encoding.
one_hot = [
    "basin",
    "scheme_management",
    "extraction_type_class",
    "management_group",
    "quantity",
    "source_type",
    "waterpoint_type",
]
ordinal = [
    "funder",
    "installer",
    "subvillage",
    "lga",
    "ward",
    "public_meeting",
    "permit",
    "management",
    "payment_type",
    "water_quality",
    "date_recorded",
]

In [622]:
# Get list of categorical variables
# s = (X.dtypes == 'object')
# object_cols = list(s[s].index)
# object_cols

In [623]:
# Tag every row with its origin (0 = training, 1 = test) and stack both frames so the
# ordinal encoder can be fitted on the union of their categories.
for origin_flag, frame in enumerate((X, test_x)):
    frame["test"] = origin_flag

frames = [X, test_x]
combined_dataset = pd.concat(frames, axis=0)

# Ordinal Encoding categorical features

In [624]:
# Fit the ordinal encoder on the combined train+test rows, then overwrite the ordinal
# columns with their encoded values.
ordEnc = OrdinalEncoder()
ordEnc.fit(combined_dataset[ordinal])
combined_dataset[ordinal] = ordEnc.transform(combined_dataset[ordinal])

In [625]:
# Split the combined frame back into test and training rows using the origin flag,
# then discard the flag column from each part.
is_test_row = combined_dataset["test"] == 1
is_train_row = combined_dataset["test"] == 0

test_x = combined_dataset.loc[is_test_row].drop(columns="test")
X = combined_dataset.loc[is_train_row].drop(columns="test")

# OneHot Encoding categorical features

In [626]:
# One-hot encode each column listed in `one_hot`.
# The encoder is re-fitted per column on the TRAINING frame only and then applied to the
# test frame; handle_unknown='ignore' encodes categories not seen during fit as all zeros.
ohe = OneHotEncoder(handle_unknown='ignore')
for col in one_hot:
    ohot_train = ohe.fit_transform(X[[col]]).toarray().astype(int)
    ohot_test = ohe.transform(test_x[[col]]).toarray().astype(int)

    # Build "<column>_<category>" names from the fitted categories. This avoids
    # OneHotEncoder.get_feature_names(), which no longer exists in recent scikit-learn
    # releases, and avoids the generic "x0_<category>" names that collide when several
    # features share a category label (e.g. "other").
    ohot_columns = [f"{col}_{category}" for category in ohe.categories_[0]]

    # Reuse the source frame's index so the column-wise concat aligns row for row.
    df_ohot = pd.DataFrame(ohot_train, columns=ohot_columns, index=X.index)
    df_ohot_test = pd.DataFrame(ohot_test, columns=ohot_columns, index=test_x.index)

    # Replace the original categorical column with its indicator columns (appended last).
    X = pd.concat([X.drop(columns=col), df_ohot], axis=1)
    test_x = pd.concat([test_x.drop(columns=col), df_ohot_test], axis=1)

In [627]:
# train_y.head()

# Label Encoding categorical labels

In [628]:
# Encode the string class labels as integers; `le` is kept so that predictions can be
# mapped back to the original label strings later.
le = LabelEncoder()
train_y["status_group"] = le.fit_transform(train_y["status_group"])

# Training Model

In [629]:
# Feature matrix: the first 69 columns of the encoded training frame, as a NumPy array.
# NOTE(review): 69 is hard-coded; presumably it equals the total column count after
# encoding - confirm against X.shape.
x = X.iloc[:, 0:69].to_numpy()

# Target vector: the second column (position 1) of the labels frame.
y = train_y.iloc[:, 1].to_numpy()

## Training and testing using train data

In [630]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
# model=RandomForestClassifier(n_estimators=600,max_depth=40)
# model.fit(x_train,y_train)
# predictions=model.predict(x_test)
# confusion_matrix(y_test,predictions)

## Training full dataset with train data and predicting for test data

In [631]:
# Train on the full training set, then predict the test set.
# random_state pins the forest's bootstrap sampling and feature sub-sampling so that a
# fresh "Restart & Run All" reproduces the same predictions.
model = RandomForestClassifier(n_estimators=600, max_depth=40, random_state=0)
model.fit(x, y)

# Predict on a plain array, matching how the model was fitted (x is a NumPy array);
# passing the DataFrame itself makes scikit-learn warn about feature names that were
# not present at fit time.
# NOTE(review): assumes test_x holds the same columns, in the same order, as the
# columns used to build x - confirm.
predictions = model.predict(test_x.values)

In [632]:
# accuracy_score(y_test,predictions)

In [633]:
# Map the integer predictions back to their original label strings.
out_preds = le.inverse_transform(predictions)

# Row identifiers: the first column of the raw test file.
raw_test = pd.read_csv("test_set_values.csv")
indexes = raw_test.iloc[:, 0].to_numpy()

# Write the submission file with one (id, status_group) row per test sample.
out_data = pd.DataFrame({"id": indexes, "status_group": out_preds})
out_data.to_csv("output.csv", index=False, encoding='utf-8')

## Hyperparameter optimization

In [634]:
# params={"n_estimators":[100,400,600,800],
#         "max_depth":[10,20,40,80]}
# model=RandomForestClassifier()
# cv=KFold(n_splits=10,shuffle=True)
# gsearch = GridSearchCV(model, params,cv=cv, verbose=2, n_jobs=-1)
# results = gsearch.fit(x_train, y_train)
# results.best_params_