In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [3]:
values=pd.read_csv('training_values.csv')
labels=pd.read_csv('training_labels.csv')
df = values.merge(labels, on='id')

In [4]:
df['gps_height'].replace(0.0, np.nan, inplace=True)
df['population'].replace(0.0, np.nan, inplace=True)
df['amount_tsh'].replace(0.0, np.nan, inplace=True)
df['longitude'].replace(0.0, np.nan, inplace=True)
df['latitude'].replace(0.0, np.nan, inplace=True)
df['construction_year'].replace(0.0, np.nan, inplace=True)

In [5]:
df.isnull().sum()

id                           0
amount_tsh               41639
date_recorded                0
funder                    3635
gps_height               20438
installer                 3655
longitude                 1812
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population               21381
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year        20709
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [6]:
df.groupby(['region','region_code']).size() 

region         region_code
Arusha         2              3024
               24              326
Dar es Salaam  7               805
Dodoma         1              2201
Iringa         11             5294
Kagera         18             3316
Kigoma         16             2816
Kilimanjaro    3              4379
Lindi          8               300
               18                8
               80             1238
Manyara        21             1583
Mara           20             1969
Mbeya          12             4639
Morogoro       5              4006
Mtwara         9               390
               90              917
               99              423
Mwanza         17               55
               19             3047
Pwani          6              1609
               40                1
               60             1025
Rukwa          15             1808
Ruvuma         10             2640
Shinyanga      11                6
               14               20
               17           

In [7]:
# fill null values for numerical variables with mean of regional value

df["gps_height"].fillna(df.groupby(['region'])["gps_height"].transform("mean"), inplace=True)
df["gps_height"].fillna(df["gps_height"].mean(), inplace=True)

df["population"].fillna(df.groupby(['region'])["population"].transform("mean"), inplace=True)
df["population"].fillna(df["population"].mean(), inplace=True)

df["amount_tsh"].fillna(df.groupby(['region'])["amount_tsh"].transform("mean"), inplace=True)
df["amount_tsh"].fillna(df["amount_tsh"].mean(), inplace=True)


df["construction_year"].fillna(df.groupby(['region'])["construction_year"].transform("mean"), inplace=True)
df["construction_year"].fillna(df["construction_year"].mean(), inplace=True)

df["longitude"].fillna(df.groupby(['region'])["longitude"].transform("mean"), inplace=True)

In [8]:
df["construction_year"]

0        1999.00000
1        2010.00000
2        2009.00000
3        1986.00000
4        1997.25665
            ...    
59395    1999.00000
59396    1996.00000
59397    1997.25665
59398    1997.25665
59399    2002.00000
Name: construction_year, Length: 59400, dtype: float64

In [9]:
import datetime
current_year = datetime.datetime.now().year
df['waterpoint_age'] = current_year - df['construction_year']

In [10]:
df['waterpoint_age']

0        25.00000
1        14.00000
2        15.00000
3        38.00000
4        26.74335
           ...   
59395    25.00000
59396    28.00000
59397    26.74335
59398    26.74335
59399    22.00000
Name: waterpoint_age, Length: 59400, dtype: float64

In [11]:
df['permit']

0        False
1         True
2         True
3         True
4         True
         ...  
59395     True
59396     True
59397    False
59398     True
59399     True
Name: permit, Length: 59400, dtype: object

In [12]:
df['public_meeting']

0        True
1         NaN
2        True
3        True
4        True
         ... 
59395    True
59396    True
59397    True
59398    True
59399    True
Name: public_meeting, Length: 59400, dtype: object

In [13]:
df['permit'] = df['permit'].fillna(False)
df['public_meeting'] = df['public_meeting'].fillna(False)

In [14]:
string_columns = df.select_dtypes(include='object').columns

In [15]:
df[string_columns] = df[string_columns].apply(lambda x: x.str.lower())

In [16]:
df["funder"].fillna("other", inplace=True)
df["subvillage"].fillna("other", inplace=True)
df["wpt_name"].fillna("other", inplace=True)

In [17]:
unique = df['installer'].unique()
print(unique.size)
print(df['installer'].size)
for x in unique:
    print(x)

1936
59400
roman
grumeti
world vision
unicef
artisan
dwe
dwsp
water aid
private
danida
lawatefuka water sup
wedeco
danid
twe
isf
kilolo star
district council
water
wu
nan
not known
central government
cefa
commu
accra
lga
muwsa
kkkt _ konde and dwe
government
olgilai village community
kkkt
rwe
adra /community
sema
shipo
hesawa
acra
community
ifad
sengerema water department
he
isf and tacare
kokeni
da
adra
allys
aict
kiuma
ces
district counci
ruthe
adra/community
tulawaka gold mine
kkt c
water board
local contract
wfp
lips
tasaf
world
0
sw
fini water
kanisa
oxfarm
village council orpha
villagers
idara ya maji
fpct
wvt
ir
angli
secondary school
amref
jbg
dadis
international aid services
rw
dmdd
tcrs
rc church
jica
gwasco l
af
wananchi
fw
mwe &
gove
tdft
rwe/dwe
central govt
world bank
twesa
norad
hans
finw
fin water
oxfam
plan internationa
rwedwe
cdt
north
oikos e .africa
shawasa
un
save the rain
john gemuta co
tlc
rc churc
plan int
phase
lvia
rhobi
makonde water population
rwe/ community

In [18]:
df["scheme_management"].fillna("other", inplace=True)
df['installer'].fillna("other", inplace=True)

In [19]:
df["scheme_name"].fillna("other", inplace=True)

In [20]:
df['date_recorded'] = pd.to_datetime(df['date_recorded'])

In [21]:
current_date = datetime.datetime.now()
df['days_since_recorded'] = (current_date - df['date_recorded']).dt.days

In [22]:
df['days_since_recorded']

0        4796
1        4073
2        4082
3        4110
4        4675
         ... 
59395    4015
59396    4742
59397    4768
59398    4802
59399    4787
Name: days_since_recorded, Length: 59400, dtype: int64

In [23]:
df.drop('date_recorded',axis=1,inplace=True)

In [24]:
train = df.copy()

In [25]:
label = train.pop('status_group')

In [26]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in train.columns:
    if train[column_name].dtype == object:
        train[column_name] = le.fit_transform(train[column_name])
    else:
        pass

In [29]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(train)

In [30]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
pca_result = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2','PC3','PC4','PC5'])

In [31]:
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-2.488577,0.203567,0.415276,-0.014891,0.006312
1,-1.140987,-0.142796,-2.073506,0.937359,-0.745314
2,0.498118,-1.329491,-2.368571,-1.736529,0.040702
3,5.108214,-3.340505,3.029975,-1.870433,-0.110942
4,-1.422050,-2.136594,-2.849328,4.159600,0.404690
...,...,...,...,...,...
59395,-2.293409,-0.289440,0.065023,-1.924672,-1.289455
59396,-2.475371,-2.368808,0.802725,-0.378408,0.845446
59397,3.212907,0.652691,0.544656,-0.535537,1.226903
59398,-0.142695,0.356293,0.683824,1.363492,-0.486682


In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, label, test_size=0.33, random_state=42)

In [33]:
from sklearn.ensemble import RandomForestClassifier
model_rfc = RandomForestClassifier(n_estimators=1000)

In [34]:
model_rfc.fit(X_train,y_train)

In [35]:
y_pred = model_rfc.predict(X_test)

In [36]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8099173553719008


In [40]:
from sklearn import svm
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_train, y_train)

In [41]:
y_pred_rbf= rbf.predict(X_test)

In [43]:
accuracy = accuracy_score(y_test, y_pred_rbf)
print("Accuracy:", accuracy)

Accuracy: 0.546831955922865


In [62]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(splitter="random",criterion="entropy",max_depth=9)
clf = clf.fit(X_train,y_train)
y_pred_dt = clf.predict(X_test)

In [63]:
accuracy = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy)

Accuracy: 0.7336496275890215


In [65]:
from sklearn.ensemble import AdaBoostClassifier
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,learning_rate=1)
# Train Adaboost Classifer
ada_model = abc.fit(X_train, y_train)

#Predict the response for test dataset
y_pred_ada = ada_model.predict(X_test)



In [66]:
accuracy = accuracy_score(y_test, y_pred_ada)
print("Accuracy:", accuracy)

Accuracy: 0.7228854198551168


In [73]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators = 1000)
gb_model = gb.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_gb)
print("Accuracy:", accuracy)

Accuracy: 0.7929292929292929


In [75]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=500).fit(X_train, y_train)
y_test_mlp = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_test_mlp)
print("Accuracy:", accuracy)

Accuracy: 0.5287215590245893


In [81]:
from catboost import CatBoostClassifier
catboost = CatBoostClassifier()
catboost.fit(X_train, y_train)
y_pred_cb = catboost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_cb)
print("Accuracy:", accuracy)

Learning rate set to 0.095481
0:	learn: 1.0322192	total: 12.9ms	remaining: 12.9s
1:	learn: 0.9789083	total: 25.4ms	remaining: 12.7s
2:	learn: 0.9352114	total: 37.1ms	remaining: 12.3s
3:	learn: 0.8997287	total: 48ms	remaining: 12s
4:	learn: 0.8704405	total: 59.2ms	remaining: 11.8s
5:	learn: 0.8447615	total: 71ms	remaining: 11.8s
6:	learn: 0.8227287	total: 82.8ms	remaining: 11.7s
7:	learn: 0.8042613	total: 94ms	remaining: 11.7s
8:	learn: 0.7881212	total: 106ms	remaining: 11.7s
9:	learn: 0.7744020	total: 116ms	remaining: 11.5s
10:	learn: 0.7624938	total: 128ms	remaining: 11.5s
11:	learn: 0.7514015	total: 139ms	remaining: 11.5s
12:	learn: 0.7417996	total: 149ms	remaining: 11.3s
13:	learn: 0.7325951	total: 160ms	remaining: 11.2s
14:	learn: 0.7243816	total: 171ms	remaining: 11.2s
15:	learn: 0.7174918	total: 182ms	remaining: 11.2s
16:	learn: 0.7107020	total: 193ms	remaining: 11.2s
17:	learn: 0.7054970	total: 204ms	remaining: 11.1s
18:	learn: 0.7002896	total: 215ms	remaining: 11.1s
19:	learn: 

170:	learn: 0.5440200	total: 1.89s	remaining: 9.17s
171:	learn: 0.5433703	total: 1.91s	remaining: 9.17s
172:	learn: 0.5429488	total: 1.92s	remaining: 9.16s
173:	learn: 0.5425607	total: 1.93s	remaining: 9.16s
174:	learn: 0.5420674	total: 1.94s	remaining: 9.15s
175:	learn: 0.5416956	total: 1.95s	remaining: 9.15s
176:	learn: 0.5412504	total: 1.97s	remaining: 9.14s
177:	learn: 0.5408912	total: 1.98s	remaining: 9.14s
178:	learn: 0.5405101	total: 1.99s	remaining: 9.13s
179:	learn: 0.5403543	total: 2s	remaining: 9.12s
180:	learn: 0.5399525	total: 2.01s	remaining: 9.11s
181:	learn: 0.5397595	total: 2.02s	remaining: 9.09s
182:	learn: 0.5394446	total: 2.03s	remaining: 9.08s
183:	learn: 0.5391418	total: 2.05s	remaining: 9.09s
184:	learn: 0.5387505	total: 2.06s	remaining: 9.08s
185:	learn: 0.5384500	total: 2.08s	remaining: 9.11s
186:	learn: 0.5381659	total: 2.1s	remaining: 9.11s
187:	learn: 0.5377295	total: 2.11s	remaining: 9.11s
188:	learn: 0.5374268	total: 2.12s	remaining: 9.11s
189:	learn: 0.53

334:	learn: 0.4991347	total: 3.79s	remaining: 7.52s
335:	learn: 0.4989235	total: 3.8s	remaining: 7.5s
336:	learn: 0.4987823	total: 3.81s	remaining: 7.49s
337:	learn: 0.4986249	total: 3.82s	remaining: 7.47s
338:	learn: 0.4984774	total: 3.83s	remaining: 7.46s
339:	learn: 0.4981785	total: 3.84s	remaining: 7.45s
340:	learn: 0.4980737	total: 3.85s	remaining: 7.43s
341:	learn: 0.4979064	total: 3.85s	remaining: 7.42s
342:	learn: 0.4976662	total: 3.87s	remaining: 7.4s
343:	learn: 0.4973966	total: 3.87s	remaining: 7.39s
344:	learn: 0.4971723	total: 3.88s	remaining: 7.38s
345:	learn: 0.4969282	total: 3.9s	remaining: 7.36s
346:	learn: 0.4967917	total: 3.91s	remaining: 7.35s
347:	learn: 0.4966519	total: 3.92s	remaining: 7.34s
348:	learn: 0.4964003	total: 3.93s	remaining: 7.33s
349:	learn: 0.4961341	total: 3.94s	remaining: 7.31s
350:	learn: 0.4959101	total: 3.95s	remaining: 7.3s
351:	learn: 0.4956608	total: 3.96s	remaining: 7.29s
352:	learn: 0.4954626	total: 3.97s	remaining: 7.28s
353:	learn: 0.495

501:	learn: 0.4701739	total: 5.49s	remaining: 5.45s
502:	learn: 0.4700479	total: 5.5s	remaining: 5.43s
503:	learn: 0.4700092	total: 5.51s	remaining: 5.42s
504:	learn: 0.4698575	total: 5.52s	remaining: 5.41s
505:	learn: 0.4697793	total: 5.53s	remaining: 5.4s
506:	learn: 0.4696399	total: 5.54s	remaining: 5.39s
507:	learn: 0.4695604	total: 5.55s	remaining: 5.38s
508:	learn: 0.4693973	total: 5.57s	remaining: 5.37s
509:	learn: 0.4691272	total: 5.58s	remaining: 5.36s
510:	learn: 0.4690199	total: 5.59s	remaining: 5.35s
511:	learn: 0.4689135	total: 5.6s	remaining: 5.33s
512:	learn: 0.4687937	total: 5.61s	remaining: 5.32s
513:	learn: 0.4685704	total: 5.62s	remaining: 5.31s
514:	learn: 0.4684386	total: 5.63s	remaining: 5.3s
515:	learn: 0.4683244	total: 5.64s	remaining: 5.29s
516:	learn: 0.4681805	total: 5.65s	remaining: 5.28s
517:	learn: 0.4679783	total: 5.66s	remaining: 5.27s
518:	learn: 0.4678882	total: 5.67s	remaining: 5.25s
519:	learn: 0.4677000	total: 5.68s	remaining: 5.24s
520:	learn: 0.46

667:	learn: 0.4475091	total: 7.22s	remaining: 3.59s
668:	learn: 0.4474186	total: 7.24s	remaining: 3.58s
669:	learn: 0.4473305	total: 7.25s	remaining: 3.57s
670:	learn: 0.4471990	total: 7.26s	remaining: 3.56s
671:	learn: 0.4470872	total: 7.27s	remaining: 3.55s
672:	learn: 0.4470006	total: 7.28s	remaining: 3.54s
673:	learn: 0.4467441	total: 7.29s	remaining: 3.52s
674:	learn: 0.4465956	total: 7.3s	remaining: 3.51s
675:	learn: 0.4464008	total: 7.31s	remaining: 3.5s
676:	learn: 0.4463156	total: 7.32s	remaining: 3.49s
677:	learn: 0.4462032	total: 7.33s	remaining: 3.48s
678:	learn: 0.4461034	total: 7.34s	remaining: 3.47s
679:	learn: 0.4459969	total: 7.35s	remaining: 3.46s
680:	learn: 0.4458878	total: 7.36s	remaining: 3.45s
681:	learn: 0.4457646	total: 7.37s	remaining: 3.44s
682:	learn: 0.4456167	total: 7.38s	remaining: 3.42s
683:	learn: 0.4454535	total: 7.39s	remaining: 3.41s
684:	learn: 0.4453612	total: 7.4s	remaining: 3.4s
685:	learn: 0.4452925	total: 7.41s	remaining: 3.39s
686:	learn: 0.44

831:	learn: 0.4284831	total: 8.89s	remaining: 1.79s
832:	learn: 0.4283851	total: 8.9s	remaining: 1.78s
833:	learn: 0.4283345	total: 8.91s	remaining: 1.77s
834:	learn: 0.4281849	total: 8.92s	remaining: 1.76s
835:	learn: 0.4280729	total: 8.93s	remaining: 1.75s
836:	learn: 0.4279856	total: 8.94s	remaining: 1.74s
837:	learn: 0.4278192	total: 8.95s	remaining: 1.73s
838:	learn: 0.4277531	total: 8.96s	remaining: 1.72s
839:	learn: 0.4276308	total: 8.97s	remaining: 1.71s
840:	learn: 0.4275115	total: 8.98s	remaining: 1.7s
841:	learn: 0.4274013	total: 8.99s	remaining: 1.69s
842:	learn: 0.4272250	total: 9s	remaining: 1.68s
843:	learn: 0.4271321	total: 9.01s	remaining: 1.67s
844:	learn: 0.4270301	total: 9.02s	remaining: 1.65s
845:	learn: 0.4269659	total: 9.03s	remaining: 1.64s
846:	learn: 0.4268659	total: 9.04s	remaining: 1.63s
847:	learn: 0.4267862	total: 9.05s	remaining: 1.62s
848:	learn: 0.4266386	total: 9.06s	remaining: 1.61s
849:	learn: 0.4265234	total: 9.07s	remaining: 1.6s
850:	learn: 0.4264

996:	learn: 0.4115108	total: 10.6s	remaining: 31.9ms
997:	learn: 0.4114132	total: 10.6s	remaining: 21.2ms
998:	learn: 0.4112582	total: 10.6s	remaining: 10.6ms
999:	learn: 0.4111443	total: 10.6s	remaining: 0us
Accuracy: 0.7995102540557086


In [77]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp39-cp39-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.5-cp39-cp39-win_amd64.whl (101.2 MB)
   ---------------------------------------- 0.0/101.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.2 MB ? eta -:--:--
   ---------------------------------------- 0.1/101.2 MB 2.0 MB/s eta 0:00:52
   ---------------------------------------- 0.4/101.2 MB 4.6 MB/s eta 0:00:23
   ---------------------------------------- 0.8/101.2 MB 6.0 MB/s eta 0:00:17
    --------------------------------------- 1.4/101.2 MB 7.1 MB/s eta 0:00:14
    --------------------------------------- 1.9/101.2 MB 8.2 MB/s eta 0:00:13
   - -------------------------------------- 2.7/101.2 MB 9.4 MB/s eta 0:00:11
   - -------------------------------------- 3.6/101.2 MB 10.8 MB/s eta 0:00:10
   - -------------------------------------- 4.6/101.2 MB 1