In [1]:
!pip install category_encoders==2.*
!pip install pandas_profiling==2.*



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from pandas_profiling import ProfileReport
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from category_encoders import OrdinalEncoder , OneHotEncoder
from sklearn.model_selection import cross_val_score, validation_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


In [3]:
train = pd.read_csv('train_features.csv', na_values=[0, -2.000000e-08], parse_dates=['date_recorded'])
label_df = pd.read_csv('train_labels.csv', na_values=[0, -2.000000e-08])

df = train.merge(label_df, left_index=False, right_index=False)

def wrangle(df):
  # set the id columns as an index
  df = df.set_index('id')

  # Drop High Cardinality Columns
  high_cordonality = [col for col in df.select_dtypes('object').columns if df[col].nunique() > 100]
  df = df.drop(columns = high_cordonality)

  # Drop Constant Column 
  one_value_col= [col for col in df.columns if df[col].nunique() < 2]
  df = df.drop(columns = one_value_col)

  # Drop columns with high proportion of null values (60% nan value of the colume)
  high_nan_value = [ col for col in df if (df[col].isnull().sum()/df.shape[0]) >= 0.90]
  df = df.drop(columns = high_nan_value)

  # Drop Duplicate Column
  rows_to_check = 100
  dupe_cols = [col for col in df.head(rows_to_check).T.duplicated().index
               if df.head(rows_to_check).T.duplicated()[col]]
  df.drop(columns=dupe_cols, inplace=True) 

  # Create age feature
  df['pump_age'] = df['date_recorded'].dt.year - df['construction_year']
  df.drop(columns='date_recorded', inplace=True)

  

  return df

In [4]:
train = wrangle(df)
print(train.shape)
train.head()

(47520, 30)


Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,population,public_meeting,...,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,pump_age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
454.0,50.0,2092.0,35.42602,-4.227446,Internal,Manyara,21,1.0,160.0,True,...,soft,good,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,functional,15.0
510.0,,,35.510074,-5.724555,Internal,Dodoma,1,6.0,,True,...,soft,good,enough,shallow well,shallow well,groundwater,hand pump,hand pump,functional,
14146.0,,,32.499866,-9.081222,Lake Rukwa,Mbeya,12,6.0,,True,...,soft,good,enough,shallow well,shallow well,groundwater,other,other,non functional,
47410.0,,,34.060484,-8.830208,Rufiji,Mbeya,12,7.0,,True,...,soft,good,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,non functional,
1288.0,300.0,1023.0,37.03269,-6.040787,Wami / Ruvu,Morogoro,5,1.0,120.0,True,...,salty,salty,enough,shallow well,shallow well,groundwater,other,other,non functional,14.0


In [5]:
test= pd.read_csv('test_features.csv', na_values=[0, -2.000000e-08],parse_dates=['date_recorded'])

X_test = wrangle(test)
print(X_test.shape)
X_test.head()

(11880, 29)


Unnamed: 0_level_0,amount_tsh,gps_height,longitude,latitude,basin,region,region_code,district_code,population,public_meeting,...,payment_type,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,waterpoint_type_group,pump_age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37098,,,31.985658,-3.59636,Lake Tanganyika,Shinyanga,17,5.0,,True,...,unknown,soft,good,dry,shallow well,shallow well,groundwater,other,other,
14530,,,32.832815,-4.944937,Lake Tanganyika,Tabora,14,6.0,,True,...,never pay,milky,milky,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,
62607,10.0,1675.0,35.488289,-4.242048,Internal,Manyara,21,1.0,148.0,True,...,per bucket,soft,good,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe,5.0
46053,,,33.140828,-9.059386,Lake Rukwa,Mbeya,12,6.0,,False,...,never pay,soft,good,seasonal,shallow well,shallow well,groundwater,hand pump,hand pump,
47083,50.0,1109.0,34.217077,-4.430529,Internal,Singida,13,1.0,235.0,True,...,per bucket,soft,good,enough,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,2.0


In [78]:
print(train.shape)
train.isnull().sum()

(47520, 30)


amount_tsh               33331
gps_height               16275
longitude                 1433
latitude                  1433
basin                        0
region                       0
region_code                  0
district_code               19
population               17048
public_meeting            2689
scheme_management         3102
permit                    2439
construction_year        16503
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
waterpoint_type_group        0
status_group                 0
pump_age                 16503
dtype: int64

In [79]:
! pip install missingpy

Collecting missingpy
  Downloading missingpy-0.2.0-py3-none-any.whl (49 kB)
Installing collected packages: missingpy
Successfully installed missingpy-0.2.0


In [6]:
X= train.drop(columns=['status_group'])
y= train['status_group']

print(f'X shape {X.shape} \ny shape {y.shape}')

X shape (47520, 29) 
y shape (47520,)


In [7]:
XS =X.sample(100)
yS =y.sample(100)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state =42)

print(f'X_train shape {X_train.shape} ----> y_train shape  {y_train.shape} \nX_test shape {X_val.shape} ----> y_test shape  {y_val.shape}')

X_train shape (42768, 29) ----> y_train shape  (42768,) 
X_test shape (4752, 29) ----> y_test shape  (4752,)


### Decision Trees

In [9]:
model_df =make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    DecisionTreeClassifier()
)

In [10]:
cv_score_model_dt =cross_val_score(model_df,
                                   X,
                                   y,
                                   cv=5,
                                   n_jobs=-2)

In [11]:
# finding the accuracy for the Decision Trees model

print(f'the Decision Trees model accuracy mean score is {round(cv_score_model_dt.mean()*100,2)}%')

the Decision Trees model accuracy mean score is 74.89%


### random forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
model_RF =make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier()
)

In [14]:
cv_score_model_RF =cross_val_score(model_RF,
                                   X,
                                   y,
                                   cv=5,
                                   verbose=2,
                                   n_jobs=-2)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   5 | elapsed:   12.6s remaining:   19.0s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   12.8s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   12.8s finished


In [15]:
# finding the accuracy for the random forest model

print(f'therandom forest model accuracy mean score is {round(cv_score_model_RF.mean()*100,2)}%')

therandom forest model accuracy mean score is 79.95%


## tune the model

In [16]:
pramas_grid={
    "randomforestclassifier__max_depth": np.arange(20,30,1),
    "randomforestclassifier__n_estimators":np.arange(400,510,10),
    'randomforestclassifier__min_samples_leaf':[2],
    'randomforestclassifier__min_samples_split':[5],
    'randomforestclassifier__max_features': ['auto', 'sqrt'],
}

In [17]:
model_dt_tune = GridSearchCV(model_RF, 
                             param_grid=pramas_grid, 
                             n_jobs=-2, 
                             cv=3, 
                             verbose=2)

model_dt_tune.fit(X, y);

In [18]:
finding the accuracy for the train and val data

print(f'the new model best score is {round((model_dt_tune.best_score_)*100,2)}%')
print('the model best paremater is')
model_dt_tune.best_params_



In [19]:
model_RF =make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier()
)


In [20]:
cv_score_model_RF =cross_val_score(model_RF,
                                   X,
                                   y,
                                   cv=5,
                                   verbose=2,
                                   n_jobs=-2)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   5 | elapsed:    9.4s remaining:   14.2s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   12.0s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:   12.0s finished


In [21]:
# finding the accuracy for the random forest model

print(f'therandom forest model accuracy mean score is {round(cv_score_model_RF.mean()*100,2)}%')

therandom forest model accuracy mean score is 80.05%


In [70]:
pramas_grid={
    "randomforestclassifier__max_depth":[25],
    "randomforestclassifier__n_estimators":[195],
    'randomforestclassifier__min_samples_leaf':[2],
    'randomforestclassifier__min_samples_split':[5],
    'randomforestclassifier__max_features': ['auto'],
    'randomforestclassifier__bootstrap': [False],
    'randomforestclassifier__criterion': ['gini'],
    'randomforestclassifier__max_leaf_nodes': [4010],
}

In [71]:
model_dt_tune = GridSearchCV(model_RF, 
                             param_grid=pramas_grid, 
                             n_jobs=-2, 
                             cv=5, 
                             verbose=2)

model_dt_tune.fit(X, y);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [72]:
# finding the accuracy for the train and val data

print(f'the new model best score is {round((model_dt_tune.best_score_)*100,2)}%')
print('the model best paremater is')
model_dt_tune.best_params_



the new model best score is 80.62%
the model best paremater is


{'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__max_depth': 25,
 'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__max_leaf_nodes': 4010,
 'randomforestclassifier__min_samples_leaf': 2,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 195}

In [73]:
y_pred = model_dt_tune.predict(X_test)
submission = pd.DataFrame({'status_group':y_pred}, index=X_test.index)

In [74]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

In [75]:
datestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H_%M_')
filename=f'{datestamp}submission.csv'

def create_download_link(df, title = "Download CSV file", filename = filename):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [76]:
create_download_link(submission)

### XGBoost

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
model_XGB =make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='mean'),
    XGBClassifier()
)

In [None]:
cv_score_model_XGB =cross_val_score(model_XGB,
                                    X,
                                    y,
                                    cv=3,
                                    verbose=2,
                                    n_jobs=-2)

In [None]:
# # finding the accuracy for the XGBoost model

print(f'the XGBoost model accuracy mean score is {round(cv_score_model_XGB.mean()*100,2)}%')

#### Tune the model

In [None]:
pramas_grid={
    'xgbclassifier__booster':['gbtree', 'dart'],
    'xgbclassifier__nthread':range(100,160,10),
    'xgbclassifier__eta': np.arange(0.0,0.6,0.1),
    'xgbclassifier__gamma':range(100,160,10),
    'xgbclassifier__alpha':[0]
} 

In [None]:
model_XGB_tune = GridSearchCV(model_XGB, 
                              param_grid=pramas_grid, 
                              n_jobs=-2, 
                              cv=3, 
                              verbose=2)

model_XGB_tune.fit(X, y);

In [None]:
print('model best score is ',model_XGB_tune.best_score_)
print('model best params is ,')
model_XGB_tune.best_params_

### extraTrees classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
model_ETC =make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    ExtraTreesClassifier()
)

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv_score_model_ETC =cross_val_score(model_ETC,
                                   X,
                                   y,
                                   cv=3,
                                    verbose=2,
                                   n_jobs=-2)

In [None]:
# finding the accuracy for the extraTrees classifier model

print(f'the extraTrees classifier model accuracy mean score is {round(cv_score_model_ETC.mean()*100,2)}%')

#### Tune the model

In [None]:
pramas_grid={
    'extraeteesclassifier__n_estimators':np.arange(0,3000,500),
    'extraeteesclassifier__min_samples_split': [5],
    'extraeteesclassifier__min_samples_leaf':[2],
    'extraeteesclassifier__max_depth': np.arange(20,30,1),
    'extraeteesclassifier__max_samples':np.arange(0.3,0.9,0.1),
} 

In [None]:
model_ETC_tune = GridSearchCV(model_ETC, 
                              param_grid=pramas_grid, 
                              n_jobs=-2, 
                              cv=5, 
                              verbose=2)

model_ETC_tune.fit(X, y);

In [None]:
print('model best score is ',model_ETC_tune.best_score_)
print('model best params is ,')
model_ETC_tune.best_params_

### stacking ensemble

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import StandardScaler

In [None]:

 # define the base models
level0 = list()
level0.append(('RF',make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(bootstrap = False,
                          max_depth=20,
                          max_features='auto',
                          min_samples_leaf =2,
                          min_samples_split =5,
                          n_estimators=170,)
)))
level0.append(('XGB',make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='mean'),
    XGBClassifier()
)))
level0.append(('ETC',make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    ExtraTreesClassifier()
)))
    
level1 = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    DecisionTreeClassifier())

# define the stacking ensemble
model_ss = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)


In [None]:

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_ss, 
                             X, 
                             y, 
                             cv=3, 
                             n_jobs=-2,
                             verbose=2)


In [None]:
scores.mean()

In [None]:
param_grid={
    'xgbclassifier__nthread':range(100,140,10),
    'extraeteesclassifier__max_depth': np.arange(20,24,1),
    'extraeteesclassifier__max_samples':np.arange(0.3,0.5,0.1)
}

In [None]:
model_rs = RandomizedSearchCV(model_ss, 
                        param_distributions=param_grid, 
                        n_iter=3,
                        n_jobs=-2, 
                        cv=3, 
                        verbose=1)

model_rs.fit(X, y);

In [None]:
model_rs.best_score_

In [None]:
model_rs.best_params_

In [None]:
y_pred = model_rs.predict(X_test)
submission = pd.DataFrame({'status_group':y_pred}, index=X_test.index)
datestamp = pd.Timestamp.now().strftime('%Y-%m-%d_%H_%M_')
submission.to_csv(f'{datestamp}submission.csv')

In [None]:
# generate CSV
prediction.to_csv('new_submission_1.csv')