In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,KFold,train_test_split,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix,classification_report

In [2]:
data=pd.read_csv('data1.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 143 entries, Id to Target
dtypes: float64(8), int64(130), object(5)
memory usage: 10.4+ MB


In [4]:
data.columns

Index(['Id', 'v2a1', 'hacdor', 'rooms', 'hacapo', 'v14a', 'refrig', 'v18q',
       'v18q1', 'r4h1',
       ...
       'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
       'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq', 'Target'],
      dtype='object', length=143)

In [5]:
pd.set_option('display.max_rows',None)
data.isnull().sum()

Id                    0
v2a1               6860
hacdor                0
rooms                 0
hacapo                0
v14a                  0
refrig                0
v18q                  0
v18q1              7342
r4h1                  0
r4h2                  0
r4h3                  0
r4m1                  0
r4m2                  0
r4m3                  0
r4t1                  0
r4t2                  0
r4t3                  0
tamhog                0
tamviv                0
escolari              0
rez_esc            7928
hhsize                0
paredblolad           0
paredzocalo           0
paredpreb             0
pareddes              0
paredmad              0
paredzinc             0
paredfibras           0
paredother            0
pisomoscer            0
pisocemento           0
pisoother             0
pisonatur             0
pisonotiene           0
pisomadera            0
techozinc             0
techoentrepiso        0
techocane             0
techootro             0
cielorazo       

In [6]:
data.drop(['Id','v2a1','v18q1','rez_esc'],axis=1,inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 139 entries, hacdor to Target
dtypes: float64(5), int64(130), object(4)
memory usage: 10.1+ MB


In [8]:
data.dropna(inplace=True)

In [9]:
data['edjefe']=np.where(data['edjefe']=='yes',1,data['edjefe'])

In [10]:
data['edjefe']=np.where(data['edjefe']=='no',0,data['edjefe'])

In [11]:
data['edjefa']=np.where(data['edjefa']=='yes',1,data['edjefa'])
data['edjefa']=np.where(data['edjefa']=='no',0,data['edjefa'])

In [12]:
data['dependency']=np.where(data['dependency']=='yes',1,data['dependency'])
data['dependency']=np.where(data['dependency']=='no',0,data['dependency'])

In [13]:
data.drop('female',axis=1,inplace=True)

In [14]:
from sklearn.preprocessing import LabelEncoder
Le=LabelEncoder()
data['idhogar']=Le.fit_transform(data['idhogar'])

In [15]:
X=data.drop('Target',axis=1)
y=data.Target

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
sc=StandardScaler()
X=sc.fit_transform(X)

In [18]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)

In [19]:
param={'n_neighbors':[1,202,402],'weights':['uniform','distance']}
grid=GridSearchCV(estimator=KNeighborsClassifier(),param_grid=param,cv=15)

In [20]:
grid.fit(x_train,y_train)

GridSearchCV(cv=15, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 202, 402],
                         'weights': ['uniform', 'distance']})

In [21]:
grid.best_estimator_

KNeighborsClassifier(n_neighbors=1)

In [23]:
LR=LogisticRegression()
NB=GaussianNB()
KNN=KNeighborsClassifier(n_neighbors=1)
RF=RandomForestClassifier(criterion='gini',random_state=1,n_estimators=10)
DTC=DecisionTreeClassifier(criterion='gini',random_state=1)
boost_LR=AdaBoostClassifier(base_estimator=LR,n_estimators=50)
boost_NB=AdaBoostClassifier(base_estimator=NB,n_estimators=50)
boost_DTC=AdaBoostClassifier(base_estimator=DTC,n_estimators=50)
boost_RF=AdaBoostClassifier(base_estimator=RF,n_estimators=50)

In [24]:
models=[]
models.append(('KNN',KNN))
models.append(('LR',LR))
models.append(('NB',NB))
models.append(('RF',RF))
models.append(('DTC',DTC))
models.append(('bag_LR',bag_LR))
models.append(('bag_NB',bag_NB))
models.append(('bag_DTC',bag_DTC))
models.append(('bag_KNN',bag_KNN))

In [25]:
results=[]
names=[]
kfold=KFold(n_splits=10,shuffle=True,random_state=0,)
for name,model  in models:
    cv_result=cross_val_score(model,x_train,y_train,cv=kfold,scoring='f1_weighted')
    results.append(cv_result)
    names.append(name)
    print(name,':',cv_result)

KNN : [0.7963993  0.83935966 0.83822152 0.83207142 0.8295982  0.80260536
 0.81684085 0.8245612  0.83839316 0.82367015]
LR : [0.61297111 0.63010078 0.62905229 0.64347032 0.65581248 0.61321972
 0.61294697 0.64074379 0.64760189 0.66898448]
NB : [0.11637509 0.10864741 0.12545787 0.10985369 0.12187738 0.11477825
 0.15087555 0.1369084  0.10806798 0.12593196]
RF : [0.85927843 0.86693335 0.86298574 0.86353585 0.87340299 0.85102111
 0.85239907 0.85922427 0.85316816 0.86103642]
DTC : [0.89476709 0.90307649 0.86344512 0.91739612 0.8750887  0.87287352
 0.88626072 0.89598578 0.88723854 0.90009746]
bag_LR : [0.61369774 0.63551141 0.61855529 0.65150823 0.66239271 0.61606259
 0.61649305 0.63812409 0.65197767 0.66620674]
bag_NB : [0.11447714 0.09536586 0.10796053 0.08496196 0.10291405 0.11454765
 0.11778371 0.11323815 0.09748694 0.12117881]
bag_DTC : [0.89567626 0.91186426 0.89743774 0.92879966 0.90117303 0.88955434
 0.90639084 0.90391141 0.89333744 0.91909219]
bag_KNN : [0.7979677  0.81553794 0.822410

In [26]:
for i in range(len(results)):
    print(names[i],':',np.mean(results[i]),np.var(results[i],ddof=1))

KNN : 0.8241720823287233 0.0002236368648818333
LR : 0.635490383719246 0.00037273443330886725
NB : 0.12187735935718258 0.00018745097539605112
RF : 0.8602985395291786 4.8389144378367464e-05
DTC : 0.88962295360691 0.00025811846591469696
bag_LR : 0.6370529513924286 0.00040868955351178936
bag_NB : 0.10699148169799774 0.00013313996934731664
bag_DTC : 0.9047237143823489 0.00014975965947092046
bag_KNN : 0.815749743692473 0.00018953682294534354


In [27]:
bag_DTC.fit(x_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1))

In [28]:
y_predict=bag_DTC.predict(x_test)

In [29]:
print(bag_DTC.score(x_test,y_test))

0.9253314724354501


In [30]:
confusion_matrix(y_test,y_predict)

array([[ 183,   17,    0,   17],
       [  14,  402,    7,   44],
       [   1,   37,  277,   47],
       [   0,   15,   15, 1790]], dtype=int64)

In [31]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           1       0.92      0.84      0.88       217
           2       0.85      0.86      0.86       467
           3       0.93      0.77      0.84       362
           4       0.94      0.98      0.96      1820

    accuracy                           0.93      2866
   macro avg       0.91      0.86      0.89      2866
weighted avg       0.92      0.93      0.92      2866

