In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 

In [2]:
afr = pd.read_csv(r"../share/train.csv")
afr_test = pd.read_csv(r"../share/test.csv")

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelBinarizer, Imputer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

import re
import warnings
warnings.filterwarnings('ignore')

#from fuzzywuzzy import fuzz
#from fuzzywuzzy import process

import math
import pickle

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
# cnf_test
def error_met(y_test, y_check):
    
    cnf = confusion_matrix(y_test, y_check)
    
    fpr = cnf[1,0]/cnf[1,:].sum()
    fnr = cnf[0,1]/cnf[0,:].sum()
    return 0.9*fpr+0.1*fnr

In [18]:
# error_met(cnf_train)
# cnf_test
def error(y_test, y_check):
    
    cnf = confusion_matrix(y_test, y_check)
    
    fpr = cnf[1,0]/cnf[1,:].sum()
    fnr = cnf[0,1]/cnf[0,:].sum()
    return -(0.9*fpr+0.1*fnr)*accuracy_score(y_test, y_check)

In [19]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [38]:
raw_df = afr
test = afr_test

In [39]:
cat_columns_missing = ['funder', 'installer', 'subvillage', 'scheme_management', 'scheme_name']
bool_columns_missing = ['public_meeting', 'permit']

In [40]:
## Replacing missing in categorical by field "Missing"
for col in cat_columns_missing:
    raw_df[col] = raw_df[col].fillna("Missing")
    test[col] = test[col].fillna("Missing")

In [41]:
## Replacing missing in boolean columns by 99
raw_df[bool_columns_missing] *= 1
test[bool_columns_missing] *= 1

for col in bool_columns_missing:
    raw_df[col] = raw_df[col].fillna(99)
    test[col] = test[col].fillna(99)

In [42]:
## Getting year, month, day out of date recorded
date_col = ['date_recorded']

for col in date_col:
    raw_df[col] = pd.to_datetime(raw_df[col])
    test[col] = pd.to_datetime(test[col])
    
    raw_df[col+'year'] = raw_df[col].dt.year
    raw_df[col+'month'] = raw_df[col].dt.month
    raw_df[col+'day'] = raw_df[col].dt.day
    
    test[col+'year'] = test[col].dt.year
    test[col+'month'] = test[col].dt.month
    test[col+'day'] = test[col].dt.day

In [43]:
## Coverting target to 1,0
raw_df["defective"] = raw_df["defective"].map(lambda x: 1 if x=="yes" else 0)
test["defective"] = 0

In [44]:
## Converting num_private to boolean as 99% are 0's
raw_df["num_private"] = raw_df["num_private"].map(lambda x: 0 if x==0 else 1)
test["num_private"] = test["num_private"].map(lambda x: 0 if x==0 else 1)

In [45]:
## Derived column = construction year - recorded year
raw_df["construction_minus_recorded"] = raw_df["date_recordedyear"] - raw_df["construction_year"]
raw_df.loc[raw_df["construction_year"]==0,'construction_minus_recorded'] = 99999
test["construction_minus_recorded"] = test["date_recordedyear"] - test["construction_year"]

In [46]:
perc_95 = pd.DataFrame(raw_df.amount_tsh.describe(percentiles=[0.95])).iloc[5][0]
raw_df.loc[raw_df["amount_tsh"]>perc_95,'amount_tsh'] = perc_95

In [47]:
cols_to_drop = ['new_ids', 'region_code', 'district_code', 'recorded_by', 'construction_year', 
                'date_recordedyear', 'date_recordedmonth', 'date_recordedday', 'wpt_name'] + date_col
raw_df = raw_df.drop(cols_to_drop,axis=1)
test = test.drop(cols_to_drop,axis=1)

In [48]:
raw_df.columns

Index(['amount_tsh', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'num_private', 'basin', 'subvillage', 'region', 'lga',
       'ward', 'population', 'public_meeting', 'scheme_management',
       'scheme_name', 'permit', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'payment_type', 'water_quality', 'quality_group', 'quantity',
       'quantity_group', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group', 'defective',
       'construction_minus_recorded'],
      dtype='object')

In [49]:
X = raw_df.drop("defective", axis=1)
y = raw_df["defective"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [50]:
temp = pd.DataFrame(X_train.dtypes).reset_index()
cat_features = list(temp.index[temp[0]=="object"])

In [51]:
model = ct.CatBoostClassifier(iterations=250, 
                           depth=9, 
                           class_weights = [1,8],
                           learning_rate=0.1,
                           loss_function='Logloss')

model.fit(X_train, y_train,cat_features=cat_features)

preds_cat_val = model.predict(X_val)
preds_cat_train = model.predict(X_train)

#print(confusion_matrix(y_val, preds_cat))
print(error_met(y_val, preds_cat_val))
print(error_met(y_train, preds_cat_train))

0:	learn: 0.5895366	total: 203ms	remaining: 50.7s
1:	learn: 0.5259532	total: 374ms	remaining: 46.4s
2:	learn: 0.4663044	total: 606ms	remaining: 49.9s
3:	learn: 0.4320168	total: 845ms	remaining: 52s
4:	learn: 0.4037437	total: 940ms	remaining: 46.1s
5:	learn: 0.3771304	total: 1.19s	remaining: 48.5s
6:	learn: 0.3598724	total: 1.38s	remaining: 48s
7:	learn: 0.3445778	total: 1.63s	remaining: 49.2s
8:	learn: 0.3342532	total: 1.86s	remaining: 49.9s
9:	learn: 0.3258657	total: 2.11s	remaining: 50.7s
10:	learn: 0.3196463	total: 2.22s	remaining: 48.2s
11:	learn: 0.3113043	total: 2.47s	remaining: 49s
12:	learn: 0.3055328	total: 2.71s	remaining: 49.5s
13:	learn: 0.3001747	total: 2.95s	remaining: 49.8s
14:	learn: 0.2951696	total: 3.05s	remaining: 47.9s
15:	learn: 0.2908993	total: 3.29s	remaining: 48s
16:	learn: 0.2871501	total: 3.53s	remaining: 48.3s
17:	learn: 0.2844886	total: 3.8s	remaining: 49s
18:	learn: 0.2822904	total: 3.97s	remaining: 48.3s
19:	learn: 0.2806250	total: 4.12s	remaining: 47.3s
2

161:	learn: 0.2352546	total: 29s	remaining: 15.7s
162:	learn: 0.2349918	total: 29.2s	remaining: 15.6s
163:	learn: 0.2345400	total: 29.5s	remaining: 15.5s
164:	learn: 0.2344454	total: 29.7s	remaining: 15.3s
165:	learn: 0.2342619	total: 30s	remaining: 15.2s
166:	learn: 0.2340629	total: 30.3s	remaining: 15s
167:	learn: 0.2338317	total: 30.5s	remaining: 14.9s
168:	learn: 0.2334834	total: 30.8s	remaining: 14.7s
169:	learn: 0.2334626	total: 31s	remaining: 14.6s
170:	learn: 0.2332320	total: 31.3s	remaining: 14.4s
171:	learn: 0.2331167	total: 31.5s	remaining: 14.3s
172:	learn: 0.2330491	total: 31.7s	remaining: 14.1s
173:	learn: 0.2329220	total: 32s	remaining: 14s
174:	learn: 0.2326123	total: 32.2s	remaining: 13.8s
175:	learn: 0.2321452	total: 32.5s	remaining: 13.7s
176:	learn: 0.2318759	total: 32.8s	remaining: 13.5s
177:	learn: 0.2317455	total: 33s	remaining: 13.3s
178:	learn: 0.2316074	total: 33.3s	remaining: 13.2s
179:	learn: 0.2315217	total: 33.5s	remaining: 13s
180:	learn: 0.2314142	total:

In [71]:
y_train_predict = model.predict_proba(X_train)[:,1]
y_test_predict = model.predict_proba(X_val)[:,1]
y_train_predict = [1 if i > 0.4 else 0 for i in y_train_predict]
y_test_predict = [1 if i > 0.4 else 0 for i in y_test_predict]
print("train error = ", error_met(y_train, y_train_predict))
print("test error = ", error_met(y_val, y_test_predict))
print("train acc = ", accuracy_score(y_train, y_train_predict))
print("test acc = ", accuracy_score(y_val, y_test_predict))

train error =  0.0548474580194481
test error =  0.07710409873865429
train acc =  0.7029333333333333
test acc =  0.65256


In [52]:
model.fit(raw_df.drop("defective",axis=1), raw_df.defective,cat_features=cat_features)
p3 = model.predict_proba(test)[:,1]
ct2_ts = [1 if i > 0.4 else 0 for i in p3]

0:	learn: 0.5898661	total: 126ms	remaining: 31.4s
1:	learn: 0.5177157	total: 241ms	remaining: 29.8s
2:	learn: 0.4732768	total: 277ms	remaining: 22.8s
3:	learn: 0.4393138	total: 308ms	remaining: 19s
4:	learn: 0.4013645	total: 412ms	remaining: 20.2s
5:	learn: 0.3742588	total: 522ms	remaining: 21.2s
6:	learn: 0.3604675	total: 572ms	remaining: 19.9s
7:	learn: 0.3420366	total: 695ms	remaining: 21s
8:	learn: 0.3283501	total: 815ms	remaining: 21.8s
9:	learn: 0.3170553	total: 936ms	remaining: 22.5s
10:	learn: 0.3084364	total: 1.04s	remaining: 22.6s
11:	learn: 0.3049144	total: 1.07s	remaining: 21.2s
12:	learn: 0.2983574	total: 1.19s	remaining: 21.6s
13:	learn: 0.2920301	total: 1.31s	remaining: 22.1s
14:	learn: 0.2878740	total: 1.42s	remaining: 22.2s
15:	learn: 0.2854811	total: 1.46s	remaining: 21.3s
16:	learn: 0.2818350	total: 1.57s	remaining: 21.6s
17:	learn: 0.2782348	total: 1.69s	remaining: 21.8s
18:	learn: 0.2755050	total: 1.8s	remaining: 21.9s
19:	learn: 0.2731668	total: 1.92s	remaining: 2

162:	learn: 0.2025790	total: 19.1s	remaining: 10.2s
163:	learn: 0.2024872	total: 19.3s	remaining: 10.1s
164:	learn: 0.2022924	total: 19.4s	remaining: 9.99s
165:	learn: 0.2020293	total: 19.5s	remaining: 9.88s
166:	learn: 0.2017983	total: 19.6s	remaining: 9.76s
167:	learn: 0.2014788	total: 19.8s	remaining: 9.65s
168:	learn: 0.2010933	total: 19.9s	remaining: 9.53s
169:	learn: 0.2008900	total: 20s	remaining: 9.41s
170:	learn: 0.2008412	total: 20.1s	remaining: 9.29s
171:	learn: 0.2006006	total: 20.2s	remaining: 9.18s
172:	learn: 0.2004064	total: 20.4s	remaining: 9.06s
173:	learn: 0.2002010	total: 20.5s	remaining: 8.95s
174:	learn: 0.1998810	total: 20.6s	remaining: 8.84s
175:	learn: 0.1995788	total: 20.8s	remaining: 8.73s
176:	learn: 0.1994259	total: 20.9s	remaining: 8.62s
177:	learn: 0.1992435	total: 21s	remaining: 8.5s
178:	learn: 0.1990360	total: 21.1s	remaining: 8.38s
179:	learn: 0.1987461	total: 21.3s	remaining: 8.26s
180:	learn: 0.1985351	total: 21.4s	remaining: 8.15s
181:	learn: 0.198

In [103]:
r = [1 if i == "yes" else 0 for i in rf_ts]
c = [1 if i>=2 else 0 for i in (np.array(r) + np.array(ct_ts) + np.array(ct2_ts))]

In [108]:
# ((np.array(r) + np.array(ct_ts) + np.array(ct2_ts)) == 1).sum()
# (lab_act.defective == "no").sum()
(np.array(c) == 0).sum()

1591

In [111]:
(np.array(c) == np.array(ct2_ts)).sum()

4936

In [114]:
df = pd.DataFrame(columns=["id","defective"])
df.id = afr_test.new_ids
df.defective = ct2_ts
df.to_csv("user.csv",index=False)