In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn import ensemble 
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier

In [None]:
# Read the raw training and test tables from the input directory.
train_df, test_df = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

In [None]:
## data cleaning
def _head_education_years(column):
    """Turn an ``edjefe``/``edjefa`` column into numbers ('yes' -> 1, 'no' -> 0)."""
    return pd.to_numeric(column.replace({'yes': '1', 'no': '0'}))


def data_cleaning(data):
    """Return a cleaned copy of ``data``; the frame passed in is not modified.

    Steps performed:
      * ``dependency`` is set to the square root of ``SQBdependency``.
      * Missing ``rez_esc``, ``v18q1`` and ``v2a1`` values are filled with 0.
      * ``edjefe``/``edjefa`` are merged into one integer column ``edjefx``
        and then dropped.
      * Missing ``meaneduc`` is filled with the mean ``escolari`` of the rows
        of the same ``idhogar`` that are also missing ``meaneduc``;
        ``SQBmeaned`` is set to its square on those rows.
      * ``idhogar`` is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``SQBdependency``, ``rez_esc``, ``v18q1``,
        ``v2a1``, ``edjefe``, ``edjefa``, ``meaneduc``, ``idhogar`` and
        ``escolari``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    # Work on a copy so the caller's frame keeps its original columns and the
    # notebook cell that calls this function can be re-run safely.
    data = data.copy()

    data['dependency'] = np.sqrt(data['SQBdependency'])
    for column in ('rez_esc', 'v18q1', 'v2a1'):
        data[column] = data[column].fillna(0)

    # edjefx: when edjefa is 'no' take the edjefe value, otherwise when edjefe
    # is 'no' take the edjefa value. Rows where neither column is 'no' get 0.
    edjefe_is_no = data['edjefe'] == 'no'
    edjefa_is_no = data['edjefa'] == 'no'
    data['edjefx'] = np.select(
        [edjefa_is_no, edjefe_is_no],
        [_head_education_years(data['edjefe']),
         _head_education_years(data['edjefa'])],
        default=0,
    )
    data['edjefx'] = data['edjefx'].astype(int)
    data = data.drop(['edjefe', 'edjefa'], axis=1)

    # Fill missing meaneduc with the household mean of escolari, computed over
    # the rows that are missing meaneduc (transform keeps the row order).
    missing = data['meaneduc'].isnull()
    if missing.any():
        household_mean = (
            data.loc[missing]
            .groupby('idhogar')['escolari']
            .transform('mean')
            .to_numpy()
        )
        data.loc[missing, 'meaneduc'] = household_mean
        data.loc[missing, 'SQBmeaned'] = household_mean * household_mean

    return data.drop('idhogar', axis=1)

In [None]:
# Clean the training data, then discard the row identifier column.
x_train = data_cleaning(train_df).drop(columns='Id')

In [None]:
# Separate the label column from the feature columns.
y_train = x_train['Target']
x_train = x_train.drop(columns=['Target'])

In [None]:
# Clean the test data and pull the ids out of the feature frame;
# they are needed again when the submission file is written.
test = data_cleaning(test_df)
ids = test.pop('Id')

# Empty frame that receives one prediction column per model.
preds = pd.DataFrame()

In [None]:
%%time
from lightgbm import LGBMClassifier

m1 = ensemble.AdaBoostClassifier(ensemble.ExtraTreesClassifier(n_estimators=500), n_estimators=250, learning_rate=0.01, algorithm='SAMME')  
m1.fit(x_train, y_train) 
preds["Model1"] = m1.predict(test)

m2 = ensemble.ExtraTreesClassifier(n_estimators=550)  
m2.fit(x_train, y_train)
preds["Model2"] = m2.predict(test)

m3 = XGBClassifier(max_depth=20, n_estimators=2000)  
m3.fit(x_train, y_train)
preds["Model3"] = m3.predict(test)

m5 = ensemble.AdaBoostClassifier(ensemble.GradientBoostingClassifier(n_estimators=1000, max_depth=10), n_estimators=1000, learning_rate=0.01, algorithm="SAMME")
m5.fit(x_train, y_train)
preds["Model5"] = m5.predict(test)

m6 = LGBMClassifier(n_estimators=3000, max_depth=15)
m6.fit(x_train, y_train)
preds["Model6"] = m6.predict(test)

In [None]:

# Row-wise mode across the model columns; column 0 holds the first mode of each row.
pred = preds.mode(axis='columns')
voted_target = pred[0].astype('int').values

# Pair each test id with its voted class and write the submission file.
submit = pd.DataFrame({'Id': ids}).assign(Target=voted_target)
submit.to_csv('Vsubmit.csv', index=False)