In [415]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier

In [416]:
train = pd.read_csv("./train_data.csv")
test = pd.read_csv("./test_data.csv")

In [417]:
train.target.value_counts()

0    98868
2    36854
1    33585
Name: target, dtype: int64

In [418]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
train['cont_1'] = min_max_scaler.fit_transform(train['cont_1'].reshape(-1,1))
test['cont_1'] = min_max_scaler.transform(test['cont_1'].reshape(-1,1))

train['cont_2'] = min_max_scaler.fit_transform(train['cont_2'].reshape(-1,1))
test['cont_2'] = min_max_scaler.transform(test['cont_2'].reshape(-1,1))

train['cont_3'] = min_max_scaler.fit_transform(train['cont_3'].reshape(-1,1))
test['cont_3'] = min_max_scaler.transform(test['cont_3'].reshape(-1,1))

  app.launch_new_instance()


In [419]:
drop_list = ['index' , 'connection_id']

In [420]:
y_train = train.target
train = train.drop(drop_list + ['target'] , axis=1)

In [421]:
conn = test['connection_id']
test = test.drop(['connection_id'] , axis=1)

In [422]:
cont_features = train.columns[train.columns.str.startswith('cont')] 
cat_features = train.columns[train.columns.str.startswith('cat')] 


In [423]:
X_train = train[cont_features]
X_test = test[cont_features]

In [424]:
from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder(sparse=False)
for col in cat_features:
    data=train[[col]].append(test[[col]])
    enc.fit(data)
    # Fitting One Hot Encoding on train data
    temp = enc.transform(train[[col]])
    # Changing the encoded features into a data frame with new column names
    temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
            .value_counts().index])
    # In side by side concatenation index values should be same
    # Setting the index values similar to the X_train data frame
    temp=temp.set_index(train.index.values)
    # adding the new One Hot Encoded varibales to the train data frame
    X_train=pd.concat([X_train,temp],axis=1)
    # fitting One Hot Encoding on test data
    temp = enc.transform(test[[col]])
    # changing it into data frame and adding column names
    temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
            .value_counts().index])
    # Setting the index for proper concatenation
    temp=temp.set_index(test.index.values)
    # adding the new One Hot Encoded varibales to test data frame
    X_test=pd.concat([X_test,temp],axis=1)


In [425]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]

                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3)
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res , S_train , S_test

In [426]:
lgb_params = {}
lgb_params['learning_rate'] = 0.02
lgb_params['n_estimators'] = 650
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8   
lgb_params['min_child_samples'] = 500
lgb_params['feature_fraction'] = 0.9
lgb_params['bagging_freq'] = 1
lgb_params['seed'] = 200

lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16

lgb_params2['feature_fraction'] = 0.9
lgb_params2['bagging_freq'] = 1
lgb_params2['seed'] = 200


lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['feature_fraction'] = 0.9
lgb_params3['bagging_freq'] = 1
lgb_params3['seed'] = 200

In [427]:
lgb_model = LGBMClassifier(**lgb_params)

lgb_model2 = LGBMClassifier(**lgb_params2)

lgb_model3 = LGBMClassifier(**lgb_params3)

In [428]:
log_model = LogisticRegression()      
stack = Ensemble(n_splits=3,
        stacker = log_model,
        base_models = (lgb_model, lgb_model2, lgb_model3))        
        
y_pred , S_train , S_test = stack.fit_predict(train, y_train, test) 


Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Fit LGBMClassifier fold 1
Fit LGBMClassifier fold 2
Fit LGBMClassifier fold 3
Stacker score: 0.78097


In [429]:
log_model.fit(S_train, y_train)
y_pred = log_model.predict(S_test)

In [430]:
test['target'] = y_pred

In [431]:
test['connection_id'] = conn

In [432]:
test[[ 'connection_id' , 'target' ]].to_csv('./result2.csv' , index=False)