In [1]:
import numpy as np
import pandas as pd
from pandas.tools.plotting import scatter_matrix

import xgboost as xgb

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

pd.options.mode.chained_assignment = None #'warn'

In [2]:
# Read both semicolon-separated feature tables, then stack them row-wise
# (the original row labels of each table are kept).
X_train, X_test = (pd.read_csv(path, sep=';') for path in ('x_train.csv', 'x_test.csv'))
X = pd.concat([X_train, X_test], axis=0)

# Populate last_lvl_completed

In [3]:
# One single-column frame per data set, one row per input row, initialised to 0.0.
last_lvl_completed_train = pd.DataFrame({'last_lvl_completed': np.zeros(len(X_train))})
last_lvl_completed_test = pd.DataFrame({'last_lvl_completed': np.zeros(len(X_test))})

In [4]:
def get_fin_unfin_unknown_idx(data, eps=0.0000001):
    """Split the rows of ``data`` into three mutually exclusive boolean masks.

    With ``diff = maxPlayerLevel - numberOfAttemptedLevels`` and
    ``turns = averageNumOfTurnsPerCompletedLevel``:

    * ``fin``      -- ``diff == 0`` and ``turns >= eps``
    * ``not_fin``  -- ``diff == -1``, or ``diff == 0`` and ``turns < eps``
    * ``unknown``  -- every remaining row (including rows whose ``turns`` is NaN)

    The same tolerance is used on both sides of the ``turns`` test so that a
    row can never be flagged ``fin`` and ``not_fin`` at the same time.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``maxPlayerLevel``, ``numberOfAttemptedLevels``
        and ``averageNumOfTurnsPerCompletedLevel``.
    eps : float, optional
        Values of ``turns`` below this threshold are treated as zero.

    Returns
    -------
    tuple of numpy.ndarray
        ``(fin, not_fin, unknown)``, boolean arrays with one entry per row.
    """
    diff = (data['maxPlayerLevel'] - data['numberOfAttemptedLevels']).values
    turns = data['averageNumOfTurnsPerCompletedLevel']
    levels_match = diff == 0

    not_fin = (diff == -1) | (levels_match & (turns < eps).values)
    fin = levels_match & (turns >= eps).values
    unknown = ~(fin | not_fin)
    return fin, not_fin, unknown

In [5]:
fin_all, not_fin_all, unknown_all = get_fin_unfin_unknown_idx(X)

# Target vector: 1.0 for rows flagged `fin`, 0.0 otherwise, restricted to the
# rows that are not flagged `unknown`.
y = np.where(fin_all, 1.0, 0.0)[~unknown_all]

# Rows with a label form the training subset; `unknown` rows are set aside.
X_sub_train = X[~unknown_all]
X_sub_test = X[unknown_all]

In [6]:
# Columns used as model inputs.
fit_columns = [
    'maxPlayerLevel',
    'attemptsOnTheHighestLevel',
    'doReturnOnLowerLevels',
    'fractionOfUsefullBoosters',
]

In [8]:
# Keyword arguments for the XGBoost classifier.
xgb_1_params = dict(
    # tree construction / regularisation
    colsample_bytree=0.75,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=2,
    n_estimators=1121,
    reg_alpha=0.005,
    reg_lambda=1.0,
    subsample=0.8,
    # runtime settings
    nthread=4,
    seed=2707,
    silent=1,
)

In [11]:
# Fit the classifier on the labelled rows.
model = xgb.XGBClassifier(**xgb_1_params)
model.fit(X_sub_train[fit_columns], y)

# Fill the flag for each data set: 1 where the rule flags the row as `fin`,
# the classifier's prediction where the rule returns `unknown`; every other
# row keeps its initial 0.
for features, flags in ((X_train, last_lvl_completed_train), (X_test, last_lvl_completed_test)):
    fin, not_fin, unknown = get_fin_unfin_unknown_idx(features)
    flags.loc[fin, 'last_lvl_completed'] = 1
    flags.loc[unknown, 'last_lvl_completed'] = model.predict(features.loc[unknown, fit_columns])

# Try to reconstruct lost data

In [12]:
# completed_levels = numberOfAttemptedLevels - 1 + last_lvl_completed
# (`assign` returns a new frame, so the raw tables are left untouched).
X_train_cleaned = X_train.assign(
    completed_levels=X_train['numberOfAttemptedLevels'] - 1 + last_lvl_completed_train['last_lvl_completed']
)
X_test_cleaned = X_test.assign(
    completed_levels=X_test['numberOfAttemptedLevels'] - 1 + last_lvl_completed_test['last_lvl_completed']
)

# start_level = maxPlayerLevel - completed_levels
X_train_cleaned = X_train_cleaned.assign(
    start_level=X_train_cleaned['maxPlayerLevel'] - X_train_cleaned['completed_levels']
)
X_test_cleaned = X_test_cleaned.assign(
    start_level=X_test_cleaned['maxPlayerLevel'] - X_test_cleaned['completed_levels']
)

# Rows with start_level == 0 (from both data sets) become the training data
# for the regressors.
X_clean = pd.concat([X_train_cleaned, X_test_cleaned], axis=0)
X_clean = X_clean[X_clean['start_level'] == 0]

# Rows with a positive start_level are the ones that get updated later.
train_upd_idx = X_train_cleaned['start_level'].gt(0).values
test_upd_idx = X_test_cleaned['start_level'].gt(0).values

In [13]:
# Columns whose values are adjusted by the regressors.
columns_to_update = [
    'totalNumOfAttempts',
    'averageNumOfTurnsPerCompletedLevel',
    'numberOfBoostersUsed',
    'totalScore',
    'totalBonusScore',
    'totalStarsCount',
    'numberOfDaysActuallyPlayed',
]

# Columns used as regressor inputs.
fit_columns = [
    'maxPlayerLevel',
    'attemptsOnTheHighestLevel',
    'doReturnOnLowerLevels',
    'fractionOfUsefullBoosters',
]

# Runtime settings that are identical in every parameter set below.
_shared_reg_params = {'nthread': 4, 'seed': 2707, 'silent': 1}

reg_params_1 = dict(
    _shared_reg_params,
    colsample_bytree=0.75,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=4,
    min_child_weight=4,
    n_estimators=1138,
    reg_alpha=125,
    reg_lambda=0.01,
    subsample=0.825,
)
reg_params_2 = dict(
    _shared_reg_params,
    colsample_bytree=0.75,
    gamma=0.625,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=6,
    n_estimators=1692,
    reg_alpha=0,
    reg_lambda=7.0,
    subsample=0.8,
)
reg_params_3 = dict(
    _shared_reg_params,
    colsample_bytree=0.5,
    gamma=0.55,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    n_estimators=1301,
    reg_alpha=12.9,
    reg_lambda=1,
    subsample=0.82,
)
reg_params_4 = dict(
    _shared_reg_params,
    colsample_bytree=0.75,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=2,
    n_estimators=1017,
    reg_alpha=484,
    reg_lambda=1,
    subsample=0.77,
)
reg_params_5 = dict(
    _shared_reg_params,
    colsample_bytree=0.75,
    gamma=0.0,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=2,
    n_estimators=659,
    reg_alpha=0.005,
    reg_lambda=1,
    subsample=0.8,
)

# Parameter set used for each updated column; the three score/star columns
# share one set.
reg_params_dict = {
    'totalNumOfAttempts': reg_params_1,
    'averageNumOfTurnsPerCompletedLevel': reg_params_2,
    'numberOfBoostersUsed': reg_params_3,
    'totalScore': reg_params_4,
    'totalBonusScore': reg_params_4,
    'totalStarsCount': reg_params_4,
    'numberOfDaysActuallyPlayed': reg_params_5,
}

In [14]:
for col in columns_to_update:
    # One regressor per column, trained on the rows of X_clean
    # (the rows whose start_level is 0).
    model = xgb.XGBRegressor(**reg_params_dict[col])
    model.fit(X_clean[fit_columns], X_clean[col].values.ravel())

    for data, upd_idx in [(X_train_cleaned, train_upd_idx), (X_test_cleaned, test_upd_idx)]:
        if not upd_idx.any():
            # No row of this data set needs updating; do not call the model
            # on an empty frame.
            continue

        # Explicit copy: the frame is modified on the next line and must not
        # be a view of `data`.
        predict_data = data.loc[upd_idx, fit_columns].copy()
        # Ask the model for the value at `start_level` instead of at the
        # observed `maxPlayerLevel`.
        predict_data['maxPlayerLevel'] = data.loc[upd_idx, 'start_level']
        # Negative predictions are replaced by 0 (fix for bad models).
        prediction = np.maximum(model.predict(predict_data), 0)

        # The predictions are floating point. Cast the target column up front
        # so the partial assignment below never has to upcast an integer
        # column implicitly.
        data[col] = data[col].astype(float)
        observed = data.loc[upd_idx, col]

        if col == 'averageNumOfTurnsPerCompletedLevel':
            # Weighted mean: the observed average weighted by completed_levels
            # and the predicted average weighted by start_level, divided by
            # maxPlayerLevel.
            data.loc[upd_idx, col] = (observed * data.loc[upd_idx, 'completed_levels']
                                      + prediction * data.loc[upd_idx, 'start_level']) \
                                      / data.loc[upd_idx, 'maxPlayerLevel']
        else:
            # All other columns: add the predicted amount to the observed one.
            data.loc[upd_idx, col] = observed + prediction

In [17]:
# Remove the model-input columns and `numberOfAttemptedLevels`, then prefix
# every remaining column name with 'upd_'.
X_train_cleaned = X_train_cleaned.drop(fit_columns + ['numberOfAttemptedLevels'], axis=1).add_prefix('upd_')
X_test_cleaned = X_test_cleaned.drop(fit_columns + ['numberOfAttemptedLevels'], axis=1).add_prefix('upd_')

In [18]:

# Append the completion flag and the 'upd_'-prefixed columns to each table
# column-wise.
X_train = pd.concat([X_train, last_lvl_completed_train, X_train_cleaned], axis=1)
X_test = pd.concat([X_test, last_lvl_completed_test, X_test_cleaned], axis=1)

# Write both tables as semicolon-separated CSV files without the row index.
for frame, path in ((X_train, 'restored_x_train.csv'), (X_test, 'restored_x_test.csv')):
    frame.to_csv(path, sep=';', index=False)