In [1]:
import feather
import scipy as sp
import numpy as np
import pandas as pd
import lightgbm as lgb

from collections import Counter
from functools import partial
from math import sqrt
from scipy.stats import rankdata

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold, GroupKFold

import matplotlib.pyplot as plt
import seaborn as sns

def get_score(y_true, y_pred):
    """Return the quadratic-weighted Cohen's kappa between two label vectors.

    Thin wrapper around ``cohen_kappa_score`` with ``weights='quadratic'``.
    """
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def get_y():
    """Load the target column from the competition ``train.csv`` as a flat array.

    Only the column named by the module-level ``target`` variable is read.
    """
    train_csv = '../input/petfinder-adoption-prediction/train/train.csv'
    labels = pd.read_csv(train_csv, usecols=[target])
    return labels.values.flatten()
    
def run_model(X_train, y_train, X_valid, y_valid, X_test,
            categorical_features, numerical_features,
            predictors, maxvalue_dict, fold_id):
    """Train one LightGBM model on a single fold and predict valid/test.

    Training is configured by the module-level ``MODEL_PARAMS`` and
    ``FIT_PARAMS``; the validation set is the only evaluation set.

    ``numerical_features``, ``maxvalue_dict`` and ``fold_id`` are accepted
    but not used inside this function.

    Returns:
        (y_pred_valid, y_pred_test): predictions for ``X_valid`` and
        ``X_test`` from the trained booster.
    """
    # Both datasets share the same feature naming / categorical declaration.
    dataset_kwargs = {
        'categorical_feature': categorical_features,
        'feature_name': predictors,
    }
    dtrain = lgb.Dataset(X_train, y_train, **dataset_kwargs)
    dvalid = lgb.Dataset(X_valid, y_valid, **dataset_kwargs)

    # Filled by LightGBM with the per-iteration metric history.
    history = {}
    booster = lgb.train(
        MODEL_PARAMS,
        dtrain,
        valid_sets=[dvalid],
        valid_names=['valid'],
        evals_result=history,
        **FIT_PARAMS
    )

    return booster.predict(X_valid), booster.predict(X_test)
 
def plot_mean_feature_importances(feature_importances, max_num=50, importance_type='gain', path=None):
    """Attach the per-feature mean importance and optionally save a bar plot.

    Args:
        feature_importances: DataFrame with a ``'feature'`` column and an
            ``importance_type`` column, typically one row per (fold, feature).
        max_num: maximum number of distinct features shown in the plot.
        importance_type: name of the importance column to average.
        path: if given, the bar plot is written to this path; if ``None``
            no figure is created.

    Returns:
        The same ``feature_importances`` object, with a
        ``'mean_<importance_type>'`` column added in place.
    """
    mean_col = 'mean_' + importance_type
    mean_importance = feature_importances.groupby('feature')[importance_type].mean()
    feature_importances[mean_col] = feature_importances['feature'].map(mean_importance)

    if path is not None:
        # Select the top features first, then keep all of their rows, so that
        # `max_num` counts features rather than (fold, feature) rows.
        top_features = mean_importance.sort_values(ascending=False).index[:max_num]
        is_top = feature_importances['feature'].isin(top_features)
        data = feature_importances[is_top].sort_values(mean_col, ascending=False)

        # A dedicated figure avoids clearing whatever figure was current.
        fig, ax = plt.subplots(figsize=(16, 8))
        sns.barplot(x=importance_type, y='feature', data=data, ax=ax)
        fig.tight_layout()
        fig.savefig(path)

    return feature_importances

def to_bins(x, borders):
    """Return the index of the first border that ``x`` does not exceed.

    Borders are scanned left to right (they are not assumed to be sorted).
    If ``x`` is greater than every border, ``len(borders)`` is returned.
    """
    return next(
        (pos for pos, border in enumerate(borders) if x <= border),
        len(borders),
    )

class OptimizedRounder_(object):
    """Fit four thresholds that bin continuous predictions into classes 0-4.

    Thresholds are tuned one at a time, holding the others fixed, with an
    interval-shrinking search (ratio 0.618) that minimises the negative
    ``get_score``. The search windows are fixed in ``fit`` and span 1 to 3.5.
    """

    def __init__(self):
        # Placeholder until fit() stores {'x': [t0, t1, t2, t3]}.
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        """Return the negative score of ``X`` binned with ``coef`` against ``y``.

        ``idx`` is not used; it is kept so the signature stays unchanged.
        """
        X_p = np.array([to_bins(pred, coef) for pred in X])
        return -get_score(y, X_p)

    def fit(self, X, y):
        """Search the four thresholds and store them in ``self.coef_['x']``."""
        coef = [1.5, 2.0, 2.5, 3.0]
        golden1 = 0.618
        golden2 = 1 - golden1
        # Per-threshold search window (a, b); restarted on every sweep.
        ab_start = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]
        for sweep in range(10):
            for idx in range(4):
                # golden section search
                a, b = ab_start[idx]
                # calc losses at both ends of the window
                coef[idx] = a
                la = self._loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._loss(coef, X, y, idx)
                for step in range(20):
                    # Move the worse end towards the better one; each step
                    # shrinks the window to 0.618 of its previous width.
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef=None):
        """Bin ``X`` with ``coef``; defaults to the fitted thresholds.

        Passing ``coef`` explicitly behaves exactly as before. When it is
        omitted, the thresholds stored by ``fit`` are used.
        """
        if coef is None:
            coef = self.coefficients()
        return np.array([to_bins(pred, coef) for pred in X])

    def coefficients(self):
        """Return the fitted thresholds (fails if ``fit`` has not been called)."""
        return self.coef_['x']
    
class OptimizedRounder(object):
    """Fit four thresholds that bin continuous predictions into classes 0-4.

    Same search as ``OptimizedRounder_`` but with starting values and search
    windows inside 0.01-0.9, i.e. for predictions scaled to roughly [0, 1].
    Thresholds are tuned one at a time, holding the others fixed, with an
    interval-shrinking search (ratio 0.618) that minimises the negative
    ``get_score``.
    """

    def __init__(self):
        # Placeholder until fit() stores {'x': [t0, t1, t2, t3]}.
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        """Return the negative score of ``X`` binned with ``coef`` against ``y``.

        ``idx`` is not used; it is kept so the signature stays unchanged.
        """
        X_p = np.array([to_bins(pred, coef) for pred in X])
        return -get_score(y, X_p)

    def fit(self, X, y):
        """Search the four thresholds and store them in ``self.coef_['x']``."""
        coef = [0.2, 0.4, 0.6, 0.8]
        golden1 = 0.618
        golden2 = 1 - golden1
        # Per-threshold search window (a, b); restarted on every sweep.
        ab_start = [(0.01, 0.3), (0.15, 0.56), (0.35, 0.75), (0.6, 0.9)]
        for sweep in range(10):
            for idx in range(4):
                # golden section search
                a, b = ab_start[idx]
                # calc losses at both ends of the window
                coef[idx] = a
                la = self._loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._loss(coef, X, y, idx)
                for step in range(20):
                    # Move the worse end towards the better one; each step
                    # shrinks the window to 0.618 of its previous width.
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef=None):
        """Bin ``X`` with ``coef``; defaults to the fitted thresholds.

        Passing ``coef`` explicitly behaves exactly as before. When it is
        omitted, the thresholds stored by ``fit`` are used.
        """
        if coef is None:
            coef = self.coefficients()
        return np.array([to_bins(pred, coef) for pred in X])

    def coefficients(self):
        """Return the fitted thresholds (fails if ``fit`` has not been called)."""
        return self.coef_['x']
    
class StratifiedGroupKFold():
    """Group K-fold split, stratified by each group's rounded mean label.

    Every sample is tagged with the rounded mean of ``y`` over its group.
    Within each such stratum the groups are distributed over ``n_splits``
    folds with ``GroupKFold``, so a group never spans two folds.
    """

    def __init__(self, n_splits=5):
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, valid_index)`` positional index arrays.

        Args:
            X: unused except by convention; only ``y`` and ``groups`` drive
                the split.
            y: integer-castable labels, one per sample (required).
            groups: group identifier per sample (required).

        Note:
            ``GroupKFold`` raises if a stratum contains fewer distinct groups
            than ``n_splits``.
        """
        # Build the working frame positionally so that a non-default index on
        # `groups` cannot misalign labels and groups.
        fold = pd.DataFrame({
            'y': np.asarray(y).astype(int),
            'groups': np.asarray(groups),
        })
        fold['y_mean'] = fold.groupby('groups')['y'].transform('mean').round()
        fold['fold_id'] = 0

        for unique_y in fold['y_mean'].unique():
            mask = (fold['y_mean'] == unique_y).values
            stratum_groups = fold.loc[mask, 'groups'].values
            # Use the instance's n_splits, not a same-named notebook global.
            cv = GroupKFold(n_splits=self.n_splits)
            stratum_fold_id = np.zeros(len(stratum_groups), dtype=int)
            splits = cv.split(range(len(stratum_groups)), y=None, groups=stratum_groups)
            for i, (_, valid_index) in enumerate(splits):
                stratum_fold_id[valid_index] = i
            fold.loc[mask, 'fold_id'] = stratum_fold_id

        indices = np.arange(len(fold))
        fold_ids = fold['fold_id'].values
        for i in range(self.n_splits):
            yield indices[fold_ids != i], indices[fold_ids == i]
            
def merge(train, test, path, add_cols):
    """Append the columns of a feather feature file to ``train`` and ``test``.

    The file at ``path`` is expected to hold the train rows first, followed
    by the test rows; it is split at ``len(train)``.

    Args:
        train: training frame; its row count decides where the file is split.
        test: test frame.
        path: feather file with the extra feature columns.
        add_cols: list of feature names collected so far. It is extended
            in place with the new column names and also returned.

    Returns:
        (train, test, add_cols) with the new columns concatenated column-wise.
    """
    df_ = feather.read_dataframe(path)
    add_cols += list(df_.columns)
    # Split on the actual train length rather than a notebook-level global,
    # so the function does not depend on cell execution order.
    n_rows_train = len(train)
    train = pd.concat((train, df_[:n_rows_train]), axis=1)
    test = pd.concat((test, df_[n_rows_train:].reset_index(drop=True)), axis=1)
    return train, test, add_cols

In [2]:
# ===============
# Params
# ===============
target = 'AdoptionSpeed'  # label column
len_train, len_test = 14993, 3948

seed, n_splits = 777, 5
np.random.seed(seed)

# feature engineering
n_components, img_size, batch_size = 5, 256, 256

# model
MODEL_PARAMS = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    learning_rate=0.01,
    num_leaves=63,
    subsample=0.9,
    subsample_freq=1,
    #colsample_bytree=0.6,
    max_depth=9,
    max_bin=127,
    reg_alpha=0.11,
    reg_lambda=0.01,
    min_child_weight=0.2,
    min_child_samples=20,
    min_gain_to_split=0.02,
    min_data_in_bin=3,
    bin_construct_sample_cnt=5000,
    cat_l2=10,
    verbose=-1,
    nthread=16,
    seed=seed,  # same value as the global seed above
)
FIT_PARAMS = dict(
    num_boost_round=5000,
    early_stopping_rounds=100,
    verbose_eval=10000,
)

# define
maxvalue_dict = {}
categorical_features = [
    'Breed1', 'Breed2',
    'Color1', 'Color2', 'Color3',
    'Dewormed', 'FurLength', 'Gender', 'Health', 'MaturitySize',
    'State', 'Sterilized', 'Type', 'Vaccinated',
    'Type_main_breed', 'BreedName_main_breed',
    'Type_second_breed', 'BreedName_second_breed',
]
numerical_features = []
text_features = ['Name', 'Description']
# Columns that must never be used as model inputs.
remove = [
    'index', 'seq_text', 'PetID', 'Name', 'Description',
    'RescuerID', 'StateName', 'annots_top_desc', 'Description_Emb',
]

In [26]:
# Base feature matrices (v16).
train = feather.read_dataframe('X_trainv16.feather')
n_train = len(train)
add_cols = []

test = feather.read_dataframe('X_testv16.feather')

# Extra feature files appended column-wise, in this order. Commented-out
# entries are alternative text embeddings that are currently disabled.
extra_feature_paths = [
    #---
    #text feature select
    #---
    #"feature/quora_fasttext.feather",
    #"feature/quora_fasttext_meta.feather",

    #"feature/quora_fasttext_selftrain.feather",
    #"feature/quora_fasttext_selftrain_meta.feather",

    "feature/googlenews_mag_light.feather",
    # NOTE(review): "ligh_tmeta" may be a typo for "light_meta" — confirm the
    # file name on disk before changing it.
    "feature/googlenews_mag_ligh_tmeta.feather",

    #---
    #image feature select
    #---
    "feature/inception_resnet.feather",
]
for feature_path in extra_feature_paths:
    train, test, add_cols = merge(train, test, feature_path, add_cols)

In [9]:
# Show the names of all columns appended by merge() so far.
# NOTE(review): the saved output below lists quora_fasttext* columns, while the
# loading cell currently has the quora merges commented out — the output
# looks stale (execution count 9 vs 26); re-run to refresh.
add_cols

['quora_fasttext1',
 'quora_fasttext2',
 'quora_fasttext3',
 'quora_fasttext4',
 'quora_fasttext5',
 'quora_fasttext6',
 'quora_fasttext7',
 'quora_fasttext8',
 'quora_fasttext9',
 'quora_fasttext10',
 'quora_fasttext11',
 'quora_fasttext12',
 'quora_fasttext13',
 'quora_fasttext14',
 'quora_fasttext15',
 'quora_fasttext16',
 'quora_fasttext17',
 'quora_fasttext18',
 'quora_fasttext19',
 'quora_fasttext20',
 'quora_fasttext21',
 'quora_fasttext22',
 'quora_fasttext23',
 'quora_fasttext24',
 'quora_fasttext25',
 'quora_fasttext26',
 'quora_fasttext27',
 'quora_fasttext28',
 'quora_fasttext29',
 'quora_fasttext30',
 'quora_fasttext31',
 'quora_fasttext32',
 'quora_fasttext33',
 'quora_fasttext34',
 'quora_fasttext35',
 'quora_fasttext36',
 'quora_fasttext37',
 'quora_fasttext38',
 'quora_fasttext39',
 'quora_fasttext40',
 'quora_fasttext41',
 'quora_fasttext42',
 'quora_fasttext43',
 'quora_fasttext44',
 'quora_fasttext45',
 'quora_fasttext46',
 'quora_fasttext47',
 'quora_fasttext48',
 

In [14]:
# Preview the first rows of the test frame after the feature files were merged in.
test.head()

Unnamed: 0,Type_main_breed,Dewormed,Vaccinated,Sterilized,Color1,Breed1,Color3,Health,Gender,Type,...,inception_resnet_374,inception_resnet_375,inception_resnet_376,inception_resnet_377,inception_resnet_378,inception_resnet_379,inception_resnet_380,inception_resnet_381,inception_resnet_382,inception_resnet_383
0,1,0,0,0,0,2,0,0,0,1,...,0.097559,0.882323,0.433614,0.531246,0.394046,0.351997,0.384253,0.446535,0.247519,0.384539
1,0,2,2,2,1,3,0,0,0,0,...,0.204406,0.603608,0.237417,0.619326,0.410897,0.4524,0.480897,0.65155,0.178983,0.754294
2,0,2,2,2,5,3,0,0,1,0,...,0.257428,0.896163,0.40379,0.658857,0.35029,0.569676,1.180959,0.806619,0.187933,0.466329
3,0,2,2,2,0,3,1,0,1,0,...,0.725568,1.015948,0.831551,0.972851,1.011253,1.22586,1.103978,1.073923,0.564953,0.824841
4,1,2,2,2,0,2,1,0,1,1,...,0.704487,0.406451,0.122368,0.601945,0.288864,0.346162,0.28942,0.162718,0.34729,0.651095


In [27]:
%%time
# --- feature selection ------------------------------------------------------
# Keep features with positive gain in a previous importance run, plus the
# merged embedding/image columns, minus a few excluded feature families.
use_cols = pd.read_csv("importance3.csv")
predictors = list(use_cols[use_cols.gain>0].feature.values) + add_cols
predictors = [c for c in predictors if "gnvec" not in c and "glove_mag" not in c and "img_" not in c]
# Order-preserving filters (instead of set arithmetic) keep the feature lists
# deterministic across kernel restarts.
categorical_features = [c for c in categorical_features if c not in remove and c in predictors]
excluded_from_numerical = set(categorical_features + [target] + remove)
numerical_features = list(dict.fromkeys(c for c in predictors if c not in excluded_from_numerical))

# Drop duplicated column names from BOTH frames so X and X_test always have
# the same columns.
train = train.loc[:, ~train.columns.duplicated()]
test = test.loc[:, ~test.columns.duplicated()]

# --- data -------------------------------------------------------------------
X = train.loc[:, predictors]
X_test = test.loc[:, predictors]
y =  feather.read_dataframe('../input/X_train.feather')["AdoptionSpeed"].values
rescuer_id = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv').loc[:, 'RescuerID'].iloc[:len(X)]
assert len(X) == len(y) == len(rescuer_id), "features, labels and groups must have the same length"

# Zero-initialised so a sample missed by the folds can never hold garbage.
y_pred = np.zeros(len(X))
y_test = np.zeros(len(X_test))

# --- cross-validation -------------------------------------------------------
#cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)
#for fold_id, (train_index, valid_index) in enumerate(cv.split(range(len(X)), y)):
#cv = GroupKFold(n_splits=n_splits)
#for fold_id, (train_index, valid_index) in enumerate(cv.split(range(len(X)), y=None, groups=rescuer_id)): 
cv = StratifiedGroupKFold(n_splits=n_splits)
for fold_id, (train_index, valid_index) in enumerate(cv.split(range(len(X)), y=y, groups=rescuer_id)): 
    # The splitter yields positions, so index positionally.
    X_train = X.iloc[train_index, :]
    X_valid = X.iloc[valid_index, :]
    y_train = y[train_index]
    y_valid = y[valid_index]

    y_pred_valid, y_pred_test = run_model(X_train, y_train, X_valid, y_valid, X_test, 
                     categorical_features, numerical_features,
                     predictors, maxvalue_dict, fold_id)
    # Convert raw predictions to rank fractions in (0, 1] so that folds are on
    # a common scale before thresholding.
    y_pred_valid = rankdata(y_pred_valid)/len(y_pred_valid)
    y_pred[valid_index] = y_pred_valid.ravel()
    
    y_pred_test = rankdata(y_pred_test)/len(y_pred_test)
    y_test += y_pred_test / n_splits

# --- thresholding and score -------------------------------------------------
optR = OptimizedRounder()
optR.fit(y_pred, y)
coefficients = optR.coefficients()
y_pred_opt = optR.predict(y_pred, coefficients)
score = get_score(y, y_pred_opt)
print(score)



Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[739]	valid's rmse: 1.01912




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[624]	valid's rmse: 1.07076




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[985]	valid's rmse: 1.04889




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[835]	valid's rmse: 1.03352




Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[579]	valid's rmse: 1.04294
0.4462189809117385
CPU times: user 1h 19min 22s, sys: 5.51 s, total: 1h 19min 27s
Wall time: 5min 29s


In [28]:
# Persist the out-of-fold and averaged test predictions.
prediction_files = {
    "npy/y_oof_inception_gn_light_meta.npy": y_pred,
    "npy/y_test_inception_gn_light_meta.npy": y_test,
}
for out_path, values in prediction_files.items():
    np.save(out_path, values)

In [12]:
# Preview the test matrix restricted to the selected predictor columns.
X_test.head()

Unnamed: 0,ratio_median_Age_groupby_Type_Breed1_Breed2,ratio_median_Age_groupby_Type_Breed1,BreedName_main_breed,crop_y_max,diff_var_Sterilized_groupby_RescuerID_State,annots_score_sum_median,Age_mul_Quantity,diff_mean_Fee_groupby_Type_Breed1_Breed2,ratio_count_Quantity_groupby_RescuerID,median_Age_groupby_RescuerID_State,...,inception_resnet_374,inception_resnet_375,inception_resnet_376,inception_resnet_377,inception_resnet_378,inception_resnet_379,inception_resnet_380,inception_resnet_381,inception_resnet_382,inception_resnet_383
0,0.666667,0.666667,2,399.0,0.766793,7.749418,2,-139.449874,0.006849,12.0,...,0.097559,0.882323,0.433614,0.531246,0.394046,0.351997,0.384253,0.446535,0.247519,0.384539
1,8.0,8.0,3,240.0,-1.233207,7.752999,24,8.703365,0.006849,12.0,...,0.204406,0.603608,0.237417,0.619326,0.410897,0.4524,0.480897,0.65155,0.178983,0.754294
2,6.666667,6.666667,3,299.0,-1.233207,7.853079,20,-141.296635,0.006849,12.0,...,0.257428,0.896163,0.40379,0.658857,0.35029,0.569676,1.180959,0.806619,0.187933,0.466329
3,1.0,1.666667,3,424.0,-1.233207,7.407107,5,-60.0,0.006849,12.0,...,0.725568,1.015948,0.831551,0.972851,1.011253,1.22586,1.103978,1.073923,0.564953,0.824841
4,2.0,2.0,2,479.0,-1.233207,5.592001,6,-139.449874,0.006849,12.0,...,0.704487,0.406451,0.122368,0.601945,0.288864,0.346162,0.28942,0.162718,0.34729,0.651095
