In [1]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import utils 

plt.style.use('ggplot')
%matplotlib inline

In [6]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train)
test = utils.clean_data(test)

train = utils.new_features(train)
test = utils.new_features(test)

TARGET = 'smoke'

test['cardio'] = np.nan

X = pd.concat((train, test), axis=0)
X_train = X.loc[~X[TARGET].isnull()]
y_train = X_train[TARGET].values.ravel()
X_train = X_train.drop(TARGET, axis=1)

X_test1 = train.loc[train[TARGET].isnull()].drop(TARGET, axis=1)
X_test2 = test.loc[test[TARGET].isnull()].drop(TARGET, axis=1)

In [10]:
use_columns = [
'age_group', 'gender', 'weight', 'ap_hi','alco', 'active', 'ap_lo', 'cholesterol', 'gluc','BMI', 'ap_dif', 'MAP'
]
mean_columns = [
    'alco'
]

In [18]:
import gc
gc.collect()

136

In [None]:
def get_ctr_features(data, test, y, ctr_cols, dctr, num):
        data["target"] = y
        dcols = set(test.columns)
        kf = cross_validation.StratifiedKFold(y, n_folds=4, shuffle=True, random_state=11)
        tr = np.zeros((data.shape[0], len(ctr_cols)))
        for kfold, (itr, icv) in enumerate(kf):
            data_tr = data.iloc[itr]
            data_te = data.iloc[icv]
            for t, col in enumerate(ctr_cols):
                if col not in dcols:
                    continue
                ctr_df = data_tr[[col, "target"]].groupby(col).agg(["count", "sum"])
                ctr_dict = ctr_df.apply(lambda x: calc_ctr(x, num), axis=1).to_dict()
                tr[icv, t] = data_te[col].apply(lambda x: ctr_dict.get(x, dctr))

        te = np.zeros((test.shape[0], len(ctr_cols)))
        for t, col in enumerate(ctr_cols):
            if col not in dcols:
                    continue
            ctr_df = data[[col, "target"]].groupby(col).agg(["count", "sum"])
            ctr_dict = ctr_df.apply(lambda x: calc_ctr(x, num), axis=1).to_dict()
            te[:, t] = test[col].apply(lambda x: ctr_dict.get(x, dctr))
        del data["target"]
        return tr, te

In [226]:
from sklearn.pipeline import Pipeline, FeatureUnion
    
class ColumnsFilter:
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
#         print(X.__class__)
        return self
    
    def transform(self, X):
        return X[self.columns]
    
class LogLog:
    def __init__(self):
        self.columns = 1
    
    def fit(self, X, y=None):
        print(X[:4])
        return self
    
    def transform(self, X):
        return X
    
class SmoothLikelihood:
    def __init__(self, columns, glob_mean_value, kf, alpha=13):
        self.columns = columns
        self.glob_mean_value = glob_mean_value
        self.alpha = alpha
        self.kf = kf
        if isinstance(columns, (list, tuple)):
            self.new_column = '_'.join(columns) + '_target_mean'
            self.columns = columns
        else:
            self.new_column = columns + '_target_mean'
            self.columns = [columns]
    
    def fit_transform(self, X, y):
        X=X.copy()
        X['target'] = y
        
        def calc(x):
            return (x['sum'] + self.glob_mean_value * self.alpha) / (x['count'] + self.alpha)
        result = np.zeros(X.shape[0])
        for itr, ite in self.kf.split(X, y):
            tr = X.iloc[itr]
            te = X.iloc[ite]
            
            temp = tr.groupby(self.columns)['target'].agg(["count", "sum"])
            value_dict = temp.apply(calc, axis=1).to_dict()

            result[ite] = te[self.columns].apply(lambda x: value_dict.get(tuple(x.values), self.glob_mean_value), axis=1)

        result = pd.DataFrame(result, columns=[self.new_column])
        
        temp = X.groupby(self.columns)['target'].agg(["count", "sum"])
        self.value_dict = temp.apply(calc, axis=1).to_dict()
       
#         X.drop('target', axis=1, inplace=True)
        return result
    
    def transform(self, X):
        result = pd.DataFrame()
        result[self.new_column] = X[self.columns].apply(lambda x: self.value_dict.get(tuple(x.values), self.glob_mean_value), axis=1)
        return result

In [227]:
# SmoothLikelihood('alco', 0.08804784985046922, kf=kf, alpha=13).fit_transform(X_train, y_train)

In [245]:
from sklearn.model_selection import StratifiedKFold

# a = SmoothLikelihood(['alco', 'gluc'], 0.08804784985046922, )
# a.fit_transform(X_train, y_train)

kf = StratifiedKFold(random_state=44444, n_splits=10, shuffle=True)
combined_features = FeatureUnion([("filter", ColumnsFilter([
    'age_group', 'gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol','alco', 'active', 'gluc', 'BMI','error_group', 'ap_dif', 'MAP'
])),
#                                   ("mean_1", SmoothLikelihood(['alco', 'active'], 0.08804784985046922, kf=kf, alpha=13))
                                 ])

# # Use combined features to transform dataset:
# X_features = combined_features.fit(X, y).transform(X)

# svm = SVC(kernel="linear")

# # Do grid search over k, n_components and C:

pipeline = Pipeline([ ("features", combined_features), ("model", xgb.XGBClassifier(**{
                                                                    'colsample_bytree': 0.8,
                                                                    'learning_rate': 0.1,
                                                                    'n_estimators': 200,
                                                                    'subsample': 0.8,
                                                                    'n_jobs': 1,
                                                                    'random_state': 2707,
                                                                    'silent': True,}))])
from sklearn.model_selection import cross_val_score
print(np.mean(cross_val_score(pipeline, X_train, y_train, cv=kf)))

0.922470865881


In [None]:
0.922450250517

In [14]:
y_train.mean()

0.08804784985046922

In [12]:
params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'n_estimators': 224,
    'subsample': 0.8,
    
    'n_jobs': 1,
    'random_state': 2707,
    'silent': True,
}

utils.execute_model(xgb.XGBClassifier(**params),
              X_train,
              y_train,
              mean_columns = mean_columns,
              use_columns = use_columns,
              n_folds = 7,
#               stratification_groups=rew,
              alpha = 13,
             )
gc.collect()

ValueError: cannot reindex from a duplicate axis