In [1]:
import feather
import scipy as sp
import numpy as np
import pandas as pd
import lightgbm as lgb

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']
    
def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [8]:
X_train = feather.read_dataframe('../input/X_train_450.feather')
X_test = feather.read_dataframe("../input/X_test_450.feather")
X_train.drop("AdoptionSpeed", axis=1, inplace=True)

y = np.array([0 for i in range(len(X_train))] + [1 for i in range(len(X_test))])

X = X_train.append(X_test).reset_index(drop=True)

all_data_state = feather.read_dataframe("feature/state_info.feather").drop("PetID", axis=1)[['Population', 'Area', 'Pop_density', 'Urban_pop', 'Bumiputra',
       'Chinese', 'Indian']]
X = pd.concat((X, all_data_state), axis=1)

#all_state_agg = feather.read_dataframe("feature/state_agg.feather").drop("PetID", axis=1)
#X_train = pd.concat((X_train, all_state_agg[:n_train]), axis=1)

#X_train["rescuer_target_enc"] = np.load("feature/rescuer_target_enc.npy")

In [9]:
drop_col = ["NMF_img_{}".format(i) for i in range(5)] + ["SVD_img_{}".format(i) for i in range(5)]
X.drop(drop_col, axis=1, inplace=True)
X

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,dominant_pixel_frac,dominant_score,label_score,Population,Area,Pop_density,Urban_pop,Bumiputra,Chinese,Indian
0,2,3,299,0,1,1,7,0,1,1,...,0.393910,0.302789,0.990786,5411324,8104,670,91.4,57.1,28.6,13.5
1,2,1,265,0,1,1,2,0,2,2,...,0.261856,0.348178,0.981269,1627172,243,6891,100.0,45.9,43.2,10.3
2,1,1,307,0,1,2,7,0,2,2,...,0.055064,0.333318,0.960457,5411324,8104,670,91.4,57.1,28.6,13.5
3,1,4,307,0,2,1,2,0,2,1,...,0.127818,0.136823,0.978698,1627172,243,6891,100.0,45.9,43.2,10.3
4,1,1,307,0,1,1,0,0,2,1,...,0.126334,0.256168,0.984346,5411324,8104,670,91.4,57.1,28.6,13.5
5,2,3,266,0,2,5,6,0,2,1,...,0.078319,0.439436,0.994143,5411324,8104,670,91.4,57.1,28.6,13.5
6,2,12,264,264,1,1,0,0,2,3,...,0.130420,0.350461,0.994640,5411324,8104,670,91.4,57.1,28.6,13.5
7,1,0,307,0,2,1,2,7,2,1,...,0.030610,0.350973,0.965609,5411324,8104,670,91.4,57.1,28.6,13.5
8,2,2,265,0,2,6,0,0,2,2,...,0.098508,0.168647,0.991365,5411324,8104,670,91.4,57.1,28.6,13.5
9,2,12,265,0,2,1,7,0,2,2,...,0.013537,0.206679,0.993973,5411324,8104,670,91.4,57.1,28.6,13.5


In [10]:
params = {'application': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'num_leaves': 70,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.85,
          'feature_fraction': 0.8,
          'min_split_gain': 0.02,
          'min_child_samples': 150,
          'min_child_weight': 0.02,
          'lambda_l2': 0.0475,
          'verbosity': -1,
          "seed": 0}

# Additional parameters:
early_stop = 500
verbose_eval = 10000
num_rounds = 10000
n_splits = 5

In [11]:
from sklearn.model_selection import StratifiedKFold


kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)
oof_train = np.zeros((X.shape[0]))
feature_importance_df = pd.DataFrame()

i = 0
for train_index, valid_index in kfold.split(X, y):
    train_index = np.random.permutation(train_index)
    valid_index = np.random.permutation(valid_index)
    
    X_tr, y_tr = X.iloc[train_index, :], y[train_index]
    X_val, y_val = X.iloc[valid_index, :], y[valid_index]
    
    d_train = lgb.Dataset(X_tr, label=y_tr)
    d_valid = lgb.Dataset(X_val, label=y_val)
    watchlist = [d_train, d_valid]
    
    print('training LGB:')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    oof_train[valid_index] = val_pred
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df['feature'] = X.columns.values
    fold_importance_df['importance'] = model.feature_importance(importance_type="gain")
    fold_importance_df['fold'] = i
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    i += 1

training LGB:
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[4926]	training's auc: 1	valid_1's auc: 0.871597
training LGB:
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[5501]	training's auc: 1	valid_1's auc: 0.864491
training LGB:
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[4570]	training's auc: 1	valid_1's auc: 0.857367
training LGB:
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[1276]	training's auc: 0.994732	valid_1's auc: 0.836678
training LGB:
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[5288]	training's auc: 1	valid_1's auc: 0.864016


In [12]:
imports = feature_importance_df.groupby('feature')['feature', 'importance'].mean().reset_index()
imports.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
90,agg_RescuerID_FurLength_SUM,69179.515579
98,agg_RescuerID_MaturitySize_SUM,41130.040751
94,agg_RescuerID_Health_SUM,39191.503139
86,agg_RescuerID_Fee_SUM,33048.599962
96,agg_RescuerID_MaturitySize_MEAN,28202.032740
88,agg_RescuerID_FurLength_MEAN,22782.156257
58,RescuerID_COUNT,21836.044230
84,agg_RescuerID_Fee_MEAN,13567.118334
92,agg_RescuerID_Health_MEAN,10684.740149
83,agg_RescuerID_Fee_MAX,9734.232137
