In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training listings.
train_data = pd.read_json("../input/train.json")
# Load the test listings from their own file. Reading train.json a second time
# would make `test_data` a plain copy of the training set.
# NOTE(review): assumes the test split is stored next to train.json as
# test.json -- confirm the file name in ../input.
test_data = pd.read_json("../input/test.json")

In [None]:
# Derive the label columns from 'interest_level':
#   target              -> ordinal code: 'low' = 0, 'medium' = 1, anything else = 2
#   low / medium / high -> one 0/1 indicator column per level
interest = train_data['interest_level']
train_data['target'] = interest.apply(
    lambda level: 0 if level == 'low' else (1 if level == 'medium' else 2)
)
for level_name in ('low', 'medium', 'high'):
    # `name=level_name` freezes the current level inside the lambda.
    train_data[level_name] = interest.apply(
        lambda level, name=level_name: 1 if level == name else 0
    )

In [None]:
# Basic encoding of 'manager_id'
from sklearn import preprocessing

# `test_data` is later joined on 'manager_id' against a mapping built from the
# encoded training ids, so both frames must use the same integer codes.
# Fitting on the ids of both frames also guarantees that ids appearing only in
# the test frame can be transformed.
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train_data['manager_id'].values) + list(test_data['manager_id'].values))
train_data['manager_id'] = lbl.transform(list(train_data['manager_id'].values))
test_data['manager_id'] = lbl.transform(list(test_data['manager_id'].values))

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 

In [None]:
# functions to compute interest level fraction and manager skill
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

class manager_skill(BaseEstimator, TransformerMixin):
    """Add per-manager interest-level fractions as features.

    ``fit`` computes, for every ``manager_id``, the share of its rows that fall
    in each interest level and stores them as ``low_frac``, ``medium_frac`` and
    ``high_frac``. Managers with fewer than ``threshold`` rows receive the
    average fractions of the managers that do reach the threshold.
    ``transform`` left-joins these fractions onto a frame by ``manager_id``;
    managers not seen during ``fit`` receive the same average.

    Parameters
    ----------
    threshold : int, default 5
        Minimum number of rows a manager needs for its own fractions to be
        used.

    Attributes
    ----------
    mapping_ : pandas.DataFrame
        Fractions indexed by ``manager_id`` (set by ``fit``).
    mean_skill_ : pandas.Series
        Fallback fractions for sparse or unseen managers (set by ``fit``).
    """

    # Interest levels in the order of the output columns below.
    _LEVELS = ['low', 'medium', 'high']
    _FRAC_COLS = ['low_frac', 'medium_frac', 'high_frac']

    def __init__(self, threshold = 5):

        self.threshold = threshold

    def _reset(self):
        """Clear the state learned by a previous ``fit`` call, if any."""

        if hasattr(self, 'mapping_'):

            self.mapping_ = {}
            self.mean_skill_ = 0.0

    def fit(self, X, y):
        """Learn the per-manager fractions.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``manager_id`` column.
        y : pandas.Series
            Interest level of each row (``'low'``, ``'medium'`` or ``'high'``),
            sharing its index with ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` contains a label other than the three interest levels.
        """

        self._reset()

        dummies = pd.get_dummies(y)
        unexpected = [label for label in dummies.columns if label not in self._LEVELS]
        if unexpected:
            raise ValueError("unexpected interest levels in y: %r" % (unexpected,))

        # Pick the indicator columns BY NAME. get_dummies orders its columns by
        # sorted label ('high', 'low', 'medium'), so renaming them positionally
        # would attach the wrong level to every *_frac column. A level missing
        # from y becomes an all-zero column.
        dummies = dummies.reindex(columns=self._LEVELS, fill_value=0).astype(float)
        dummies.columns = self._FRAC_COLS

        by_manager = pd.concat([X['manager_id'], dummies], axis=1).groupby('manager_id')
        fractions = by_manager[self._FRAC_COLS].mean()
        # Row count per manager, independent of which other columns X has.
        counts = by_manager.size()

        reliable = counts >= self.threshold
        if reliable.any():
            mean = fractions.loc[reliable].mean()
        else:
            # No manager reaches the threshold: fall back to the overall share
            # of each level instead of propagating NaN.
            mean = dummies.mean()

        # Assign column by column so each scalar is broadcast over the rows of
        # the sparse managers.
        for col in self._FRAC_COLS:
            fractions.loc[~reliable, col] = mean[col]

        self.mapping_ = fractions[self._FRAC_COLS]
        self.mean_skill_ = mean

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the three fraction columns joined on.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``manager_id`` column.

        Returns
        -------
        pandas.DataFrame
            ``X`` plus ``low_frac``, ``medium_frac`` and ``high_frac``.
        """

        # Drop fraction columns left over from an earlier transform; otherwise
        # the merge would produce suffixed duplicates (*_x / *_y) and the
        # column selection below would fail.
        stale = [col for col in self._FRAC_COLS if col in X.columns]
        X = X.drop(stale, axis=1)

        X = pd.merge(left = X, right = self.mapping_, how = 'left', left_on = 'manager_id', right_index = True)
        # Managers absent from the mapping come out of the left join as NaN.
        X[self._FRAC_COLS] = X[self._FRAC_COLS].fillna(self.mean_skill_)

        return X

In [None]:
# use stratified cv to find the optimal parameters
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold
import itertools as itertools
from sklearn.metrics import log_loss

In [None]:
def get_skf_indexes(df, target, kfold=4):
    """Compute stratified k-fold row positions for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    target : label
        Column of ``df`` whose values are used for stratification.
    kfold : int, default 4
        Number of folds.

    Returns
    -------
    list
        ``[train_folds, test_folds]``: two lists of ``kfold`` integer arrays
        each, where ``train_folds[i]`` and ``test_folds[i]`` are the positional
        row indexes of fold ``i``.
    """
    y = df[target].values
    # The splitter only needs the number of samples from X, so a zero
    # placeholder avoids converting the whole frame with `df.values`.
    placeholder = np.zeros(len(y))
    skf = StratifiedKFold(n_splits=kfold)
    train_folds, test_folds = [], []
    for train_index, test_index in skf.split(placeholder, y):
        train_folds.append(train_index)
        test_folds.append(test_index)
    return [train_folds, test_folds]

In [None]:
def get_lr_perf(df_train, df_test, feature='__to_check', target='target', n_quantile=20):
    """Score a single feature with logistic regression, raw and bucketed.

    Two models are fitted on ``df_train`` and scored by log-loss on both
    frames: one on the raw feature values, one on the feature cut into
    percentile buckets and one-hot encoded.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the ``feature`` and ``target`` columns.
    feature : label, default '__to_check'
        Column to evaluate.
    target : label, default 'target'
        Column holding the class labels.
    n_quantile : int, default 20
        Step, in percentile points, between bucket edges; the edges are the
        ``n_quantile``-th, ``2 * n_quantile``-th, ... percentiles (below 100)
        of the training feature.

    Returns
    -------
    dict
        Log-losses rounded to 6 decimals under the keys ``'train.num'``,
        ``'test.num'``, ``'train.cat'`` and ``'test.cat'``.
    """
    def _as_column(frame):
        # The feature as an (n, 1) array.
        return frame[feature].values.reshape(-1, 1)

    def _rounded_loss(model, inputs, labels):
        return np.round(log_loss(labels, model.predict_proba(inputs)), 6)

    x_fit, y_fit = _as_column(df_train), df_train[target].values
    x_eval, y_eval = _as_column(df_test), df_test[target].values

    model = LogisticRegression()

    # 1) The feature used as a single numeric input.
    model.fit(x_fit, y_fit)
    scores = {
        'train.num': _rounded_loss(model, x_fit, y_fit),
        'test.num': _rounded_loss(model, x_eval, y_eval),
    }

    # 2) The feature as a categorical input: bucket by training percentiles
    #    (duplicate edges collapsed), then one-hot encode the bucket ids.
    percentile_points = np.arange(n_quantile, 100, n_quantile)
    edges = np.unique(np.percentile(x_fit, percentile_points))
    binarizer = LabelBinarizer()
    onehot_fit = binarizer.fit_transform(np.digitize(x_fit, edges))
    onehot_eval = binarizer.transform(np.digitize(x_eval, edges))

    model.fit(onehot_fit, y_fit)
    scores['train.cat'] = _rounded_loss(model, onehot_fit, y_fit)
    scores['test.cat'] = _rounded_loss(model, onehot_eval, y_eval)
    return scores

In [None]:
# Grid of candidate values explored by the cross-validation loop:
#   A -- weight `a` in a*high_frac + (1-a)*medium_frac
#   T -- `threshold` passed to manager_skill
A = tuple(np.arange(start=0.05, stop=1, step=0.05))
T = tuple(np.arange(start=5, stop=55, step=5))

# Stratified k-fold row positions (kfold=4; set to 2 for a quick run).
idx_train, idx_test = get_skf_indexes(df=train_data, target='target', kfold=4)

# Frame that collects one row of results per (fold, parameter combination).
Y = pd.DataFrame()

In [None]:
# Cross-validate every (a, t) combination on every fold.
# Results are gathered in a list and turned into a frame once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every iteration. Building `Y` from scratch also makes the cell safe to
# re-run without accumulating duplicate rows.
metric_cols = ['train.cat', 'train.num', 'test.cat', 'test.num']
records = []
for iper, (i_train, i_test) in enumerate(zip(idx_train, idx_test)):
    print(iper)
    df_train = train_data.iloc[i_train, :].copy()
    df_test = train_data.iloc[i_test, :].copy()
    for t in T:
        # The fitted mapping depends only on the threshold, so fit and
        # transform once per `t` and reuse the frames for every weight `a`.
        trans = manager_skill(threshold = t)
        trans.fit(df_train, df_train['interest_level'])
        df_train_transform = trans.transform(df_train)
        df_val_transform = trans.transform(df_test)
        for a in A:
            # Blend of the high and medium fractions; the column is
            # overwritten for each `a`.
            df_train_transform['__to_check'] = a*df_train_transform['high_frac'] + (1-a)*df_train_transform['medium_frac']
            df_val_transform['__to_check'] = a*df_val_transform['high_frac'] + (1-a)*df_val_transform['medium_frac']
            results = get_lr_perf(df_train_transform, df_val_transform, feature='__to_check', target='target', n_quantile=20)
            results.update({'fold': iper, 'params': {'A': a, 'T': t}})
            records.append(results)

# One row per (fold, parameter combination), ordered by fold, then t, then a.
Y = pd.DataFrame(records)
Y[metric_cols] = Y[metric_cols].astype(float)

In [None]:
# Per-fold results, lowest held-out log-loss on the raw feature ('test.num') first.
Y.sort_values('test.num')

In [None]:
# compute the average among folds of each parameter combination
metric_cols = ['train.cat', 'train.num', 'test.cat', 'test.num']

# 'params' holds dicts, which can neither be sorted (np.unique raises on
# Python 3) nor hashed (groupby). Expand them into one column per parameter.
param_frame = pd.DataFrame(list(Y['params']))
param_cols = list(param_frame.columns)
fold_scores = pd.concat(
    [param_frame, Y[metric_cols].astype(float).reset_index(drop=True)],
    axis=1,
)

# Average each metric over the folds (rows) of a combination. Averaging with
# axis=1 would instead mix the four metrics of a single fold together.
Y_average = fold_scores.groupby(param_cols)[metric_cols].mean().reset_index()

# Restore the original layout: a 'params' dict column followed by the metrics.
Y_average['params'] = Y_average[param_cols].to_dict(orient='records')
Y_average = Y_average[['params'] + metric_cols]

In [None]:
# Fold-averaged results, lowest held-out log-loss on the raw feature ('test.num') first.
Y_average.sort_values('test.num')
# a = 0.05, t = 5
# NOTE(review): presumably the best combination seen in the author's run -- confirm after re-running.

In [None]:
# Fit the manager fractions on the full training frame (default threshold),
# then add them to both the training and the test frame.
trans = manager_skill().fit(train_data, train_data['interest_level'])

train_data = trans.transform(train_data)
train_data.head()  # not rendered: a cell only displays its final expression

test_data = trans.transform(test_data)
test_data.head()