In [294]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,confusion_matrix,accuracy_score,SCORERS, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from xgboost import XGBClassifier

In [295]:
train_x = pd.read_csv("training_set_values.csv")
train_y = pd.read_csv("training_set_labels.csv")
test_x=pd.read_csv("test_set_values.csv")

# Dropping Features
- Dropping features with too many missing values - scheme_name
- Dropping unusable/unnecessary features - num_private, recorded_by
-  Dropping features with very high correlation and leaving only one of each

In [296]:
# Dropping features with too many missing values, unusable features and features with very high correlation
drop_features = ["scheme_name", "num_private", "recorded_by"]
repeated = ["payment", "quality_group", "quantity_group", "source", "source_class",
            "region", "extraction_type", "extraction_type_group", "waterpoint_type_group"]

X = train_x.copy()
X=X.drop(drop_features, axis=1)
X=X.drop(repeated, axis=1)

test_x = test_x.drop(drop_features, axis=1)
test_x = test_x.drop(repeated, axis=1)

In [297]:
# X.isnull().sum()

In [298]:
# pd.set_option('display.max_rows', 3000)
# train_x['date_recorded'].value_counts().sort_index()

In [299]:
# pd.reset_option('display.max_rows')

# Filling in null values

In [300]:
missing_null=["funder", "installer", "subvillage", "public_meeting", "scheme_management",  "permit",]

for col in missing_null:
    X[col].fillna(X[col].mode()[0], inplace=True)
    test_x[col].fillna(X[col].mode()[0], inplace=True)

# Filling in other types of missing values such as 0, None, unkown

In [301]:
# boolean=["public_meeting", "permit"]
missing_None_mode=["scheme_management"]
for col in missing_None_mode:
    mode = X[col].mode()[0]
    X[col].replace("None", mode, inplace=True)
    test_x[col].replace("None", mode, inplace=True)
    
missing_unknown_values_mode = ["payment_type", "water_quality", "quantity", "management", "management_group"]
for col in missing_unknown_values_mode:
    mode = X[col].mode()[0]
    X[col].replace("unknown", mode, inplace=True)
    
missing_zero_mean_int = ["gps_height", "population", "construction_year"]
for col in missing_zero_mean_int:
    X[col].replace(0, np.NaN, inplace=True)
    mean = int(X[col].mean(skipna=True))
    X[col].fillna(mean, inplace=True)
    test_x[col].replace(0, mean, inplace=True)
    
missing_none_drop_temp = ["wpt_name"]
# for col in missing_unknown_values_mode:
#     X[col].replace("none", X[col].mode()[0], inplace=True)

X["construction_year"] = X.construction_year.apply(lambda x: pd.to_datetime(x,format='%Y'))
test_x["construction_year"] = test_x.construction_year.apply(lambda x: pd.to_datetime(x,format='%Y'))

X = X.drop(missing_none_drop_temp, axis=1)
test_x = test_x.drop(missing_none_drop_temp, axis=1)

In [302]:
pd.set_option('display.max_rows', 3000)
# combined_dataset['construction_year'].value_counts().sort_index()

In [303]:
# X.isnull().sum()

In [304]:
# Get list of categorical variables
# s = (X.dtypes == 'object')
# object_cols = list(s[s].index)
# object_cols

In [305]:
X["test"] = 0
test_x["test"] = 1
frames = [X, test_x]
combined_dataset = pd.concat(frames)

## Feature Engineering
### Added new active_time feature
- Represents the time the pump has been active
- Created by difference between construction_year and date_recorded

In [306]:
combined_dataset['active_time']=(combined_dataset.date_recorded.apply(pd.to_datetime)-combined_dataset.construction_year).dt.days
# combined_dataset['active_time']=combined_dataset.active_time.apply(lambda x: float(x)/ (365.25*24*60*60*1e9))
combined_dataset.loc[combined_dataset['active_time'] < 0,combined_dataset.columns=='active_time'] = 5000
combined_dataset['construction_year'] = pd.DatetimeIndex(combined_dataset['construction_year']).year

In [307]:
# combined_dataset = combined_dataset.drop(['active_time'], axis=1)

In [308]:
pd.set_option('display.max_rows', 3000)
# combined_dataset['active_time'].value_counts(bins=100)

In [309]:
# Onehot and Ordinal Encoders chosen based on whether features are ordinal or not and the number of categories.
# Ordinal used for non ordinal features when number of categories are very high.
one_hot=["basin", "scheme_management", "extraction_type_class",
         "management_group", "quantity", "source_type", "waterpoint_type"]
ordinal=["funder","installer", "subvillage", "lga", "ward", "public_meeting",
         "permit", "management", "payment_type", "water_quality", 'date_recorded']

# Ordinal Encoding categorical features

In [310]:
ordEnc=OrdinalEncoder()
combined_dataset[ordinal] = ordEnc.fit_transform(combined_dataset[ordinal])

In [311]:
test_x = combined_dataset.loc[combined_dataset['test'] == 1]
test_x = test_x.drop('test', axis=1)
X = combined_dataset.loc[combined_dataset['test'] == 0]
X = X.drop('test', axis=1)

# OneHot Encoding categorical features

In [312]:
ohe = OneHotEncoder(handle_unknown='ignore')
for col in one_hot:
    ohot_encoded = ohe.fit_transform(X[col].values.reshape(len(X[col].values),1)).toarray()
    ohot_encoded2 = ohot_encoded[:,:].astype(int)
    df_ohot = pd.DataFrame(ohot_encoded2, columns=ohe.get_feature_names())
    X = pd.concat([X,df_ohot], axis=1)
    X.drop(col, axis=1, inplace=True)
    ohot_encoded_test = ohe.transform(test_x[col].values.reshape(len(test_x[col].values),1)).toarray()
    ohot_encoded2_test = ohot_encoded_test[:,:].astype(int)
    df_ohot_test = pd.DataFrame(ohot_encoded2_test, columns=ohe.get_feature_names())
    test_x = pd.concat([test_x,df_ohot_test], axis=1)
    test_x.drop(col, axis=1, inplace=True)

In [313]:
# train_y.head()

# Label Encoding categorical labels

In [314]:
le = LabelEncoder()
train_y[["status_group"]]=train_y[["status_group"]].apply(lambda col : le.fit_transform(col))

# Training Model

In [315]:
X

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,subvillage,region_code,...,x0_river/lake,x0_shallow well,x0_spring,x0_cattle trough,x0_communal standpipe,x0_communal standpipe multiple,x0_dam,x0_hand pump,x0_improved spring,x0_other
0,69572,6000.0,54.0,1548.0,1390.0,1706.0,34.938093,-9.856322,13116.0,11,...,0,0,1,0,1,0,0,0,0,0
1,8776,0.0,322.0,522.0,1399.0,610.0,34.698766,-2.147466,17596.0,20,...,0,0,0,0,1,0,0,0,0,0
2,34310,25.0,313.0,924.0,686.0,2296.0,37.460664,-3.821329,10096.0,21,...,0,0,0,0,0,1,0,0,0,0
3,67743,0.0,285.0,1961.0,263.0,2078.0,38.486161,-11.155298,9998.0,90,...,0,0,0,0,0,1,0,0,0,0
4,19728,0.0,111.0,20.0,1018.0,133.0,31.130847,-1.825359,8583.0,18,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,351.0,486.0,1210.0,229.0,37.169807,-3.253847,6315.0,3,...,0,0,1,0,1,0,0,0,0,0
59396,27263,4700.0,97.0,196.0,1212.0,299.0,35.249991,-9.070629,3323.0,11,...,1,0,0,0,1,0,0,0,0,0
59397,37057,0.0,82.0,507.0,1018.0,442.0,34.017087,-8.750434,9784.0,12,...,0,0,0,0,0,0,0,1,0,0
59398,31282,0.0,48.0,992.0,1018.0,1360.0,35.861315,-6.378573,15553.0,1,...,0,1,0,0,0,0,0,1,0,0


In [316]:
x = X.iloc[:,:70].values
y = train_y.iloc[:,1].values

## Training and testing using train data

In [317]:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
# model=RandomForestClassifier(n_estimators=600, max_depth=40, min_samples_split=10)
# model.fit(x_train,y_train)
# predictions=model.predict(x_test)
# confusion_matrix(y_test,predictions)

array([[5890,   87,  467],
       [ 491,  253,  123],
       [ 999,   41, 3529]], dtype=int64)

In [318]:
# accuracy_score(y_test,predictions)
# f1_score(y_test, predictions, average='weighted')

0.8141414141414142

## Training full dataset with train data and predicting for test data

In [319]:
# Test Data Set
model=RandomForestClassifier(n_estimators=600,max_depth=40, min_samples_split=10)
model.fit(x,y)
predictions=model.predict(test_x)

In [320]:
out_preds=le.inverse_transform(predictions)
indexes=pd.read_csv("test_set_values.csv").iloc[:,0].values
out_data=pd.DataFrame({"id":indexes,"status_group":out_preds})
out_data.to_csv("output.csv",encoding='utf-8',index=False)

## Hyperparameter optimization

In [321]:
# params={"n_estimators":[100,400,600,800],
#         "max_depth":[10,20,40,80]}
# model=RandomForestClassifier()
# cv=KFold(n_splits=10,shuffle=True)
# gsearch = GridSearchCV(model, params,cv=cv, verbose=2, n_jobs=-1)
# results = gsearch.fit(x_train, y_train)
# results.best_params_