In [1]:
%matplotlib inline
import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import model_selection, ensemble
from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb





In [132]:
# RANDOM FOREST

def random_forest(x1, x2, y1, y2, n_estimators=20, max_features=5):
    rf = ensemble.RandomForestClassifier(n_estimators=n_estimators, max_features=max_features)
    rf.fit(x1, y1)
    err_train = np.mean(y1 != rf.predict(x1))
    err_test  = np.mean(y2  != rf.predict(x2))
    return err_train, err_test

# GRAD BOOST

def grad_boost(x1, x2, y1, y2, n_estimators=20):
    gbt = ensemble.GradientBoostingClassifier(n_estimators=n_estimators)
    gbt.fit(x1, y1)
    err_train = np.mean(y1 != gbt.predict(x1))
    err_test = np.mean(y2 != gbt.predict(x2))
    return err_train, err_test

# XGBOOST

def xgboost(x1, x2, y1, y2):
    model = xgb.XGBClassifier()
    model.fit(x1, y1)
    err_train = np.mean(y1 != model.predict(x1))
    err_test = np.mean(y2 != model.predict(x2))
    return err_train, err_test

class ML():
    model = None
    errors = None
    log_loss = None
    
    def __init__(self, x1, x2, y1, y2, model):
        self.x1, self.y1, self.x2, self.y2 = x1, y1, x2, y2
        self.model = model
        self.fit()
    
    def fit(self):
        self.model.fit(self.x1, self.y1)
        y1p, y2p = self.model.predict(self.x1), self.model.predict(self.x2)
        self.errors = np.mean(self.y1 != y1p), np.mean(self.y2 != y2p)
        self.log_loss = log_loss(self.y1, y1p, eps=1e-15), log_loss(self.y2, y2p, eps=1e-15)
    
    def predict(self, data):
        return self.model.predict(data)
    
    

In [91]:
xytte = pd.read_csv("data/xytte.csv")

####################### ВНИМАНИЕ #######################
# здесь трейн это которые в задаче трейн, с ответами
# тест без ответов
# но дальше в алгоритмах трейн будет разбиваться на два подсета
# которые будут у меня называться x1-y1 и x2-y2, а по смыслу как раз трейн и тест
# надо не перепутать

x_train = xytte[xytte.returned == xytte.returned].reset_index(drop=True).drop("returned", axis=1)
y_train = xytte[xytte.returned == xytte.returned].reset_index(drop=True)[["returned"]]
x_test  = xytte[xytte.returned != xytte.returned].reset_index(drop=True).drop("returned", axis=1)

In [92]:
# проверяем, что нигде не налажали
# читаем из файла оригинальные данные

x_test_ref = pd.read_csv("data/x_test.csv", sep=";", dtype=np.float32)
x_train_ref = pd.read_csv("data/x_train.csv", sep=";", dtype=np.float32)
y_train_ref = pd.read_csv("data/y_train.csv", sep=";", dtype=np.float32, header=None, names=["returned"])

# заводим функцию на сравнение таблиц

def df_equal(df1, df2):
    if list(df1.columns) != list(df2.columns): 
        print("Mismatch columns:")
        print("df1: ", list(df1.columns))
        print("df2: ", list(df2.columns))
        return False
    if list(df1.index) != list(df2.index): 
        print("Mismatch index:")
        print("df1: ", list(df1.columns)[0], ":", list(df1.columns)[-1])
        print("df2: ", list(df2.columns)[0], ":", list(df2.columns)[-1])
        return False
    
    ne_stacked = (df1 != df2).stack()
    changed = ne_stacked[ne_stacked]
    changed.index.names = ['id', 'col']

    difference_locations = np.where(df1 != df2)
    changed_from = df1.values[difference_locations]
    changed_to = df2.values[difference_locations]
    diff = pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index)
    return not diff[diff["from"] - diff["to"] > 0.000001].count()[0]

# ассертим наши таблицы

assert(df_equal(x_test[x_test_ref.columns], x_test_ref))
assert(df_equal(x_train[x_train_ref.columns], x_train_ref))
assert(df_equal(y_train, y_train_ref))

In [93]:
params_all = list(x_train.columns)

params_orig = [
'maxPlayerLevel',
'numberOfAttemptedLevels',
'attemptsOnTheHighestLevel',
'totalNumOfAttempts',
'averageNumOfTurnsPerCompletedLevel',
'numberOfBoostersUsed',
'fractionOfUsefullBoosters',
'totalScore',
'totalBonusScore',
'totalStarsCount',
'numberOfDaysActuallyPlayed',
]
     
params_mod = [
#'attemptsOnTheHighestLevel_dvd',
'attemptsOnTheHighestLevel_ln',
'attemptsPerDay',
'averageNumOfTurnsPerCompletedLevel_dvd',
'maxPlayerLevel_ln',
'numberOfAttemptedLevels_dvd',
#'numberOfBoostersUsed_dvd',
'numberOfBoostersUsed_dvd_ln',
#'numberOfBoostersUsed_ln',
'numberOfDaysActuallyPlayed_ln',
'totalBonusScore_dvd',
'totalNumOfAttempts_ln',
#'totalScore_ln',
'totalScore_ln_dvd',
#'totalStarsCount_dvd',
'totalStarsCount_dvd_ln',
#'totalStarsCount_ln',
]

params_bool = [
'allAttemptsOnTheHighestLevel',
'attLevelsMoreThanMaxLevel',
'doReturnOnLowerLevels',
'zeroTotalScore',
'zeroTurnsPerCompletedLevel'
]

params_orig_norm = [
'attemptsOnTheHighestLevel_norm',
'attemptsPerDay_norm',
#'numberOfBoostersUsed_norm',
#'maxPlayerLevel_norm',
'fractionOfUsefullBoosters_norm',
#'averageNumOfTurnsPerCompletedLevel_norm',
'numberOfDaysActuallyPlayed_norm',
'totalNumOfAttempts_norm',
'totalBonusScore_norm',
'totalScore_norm',
'totalStarsCount_norm',
]

params_mod_norm = [
'attemptsOnTheHighestLevel_dvd_norm',
#'attemptsOnTheHighestLevel_ln_norm',
'averageNumOfTurnsPerCompletedLevel_dvd_norm',
'maxPlayerLevel_ln_norm',
'numberOfAttemptedLevels_dvd_norm',
#'numberOfAttemptedLevels_norm',
'numberOfBoostersUsed_dvd_ln_norm',
#'numberOfBoostersUsed_dvd_norm',
#'numberOfBoostersUsed_ln_norm',
'numberOfDaysActuallyPlayed_ln_norm',
'totalBonusScore_dvd_norm',
'totalNumOfAttempts_ln_norm',
'totalScore_ln_dvd_norm',
#'totalScore_ln_norm',
'totalStarsCount_dvd_ln_norm',
#'totalStarsCount_dvd_norm',
'totalStarsCount_ln_norm',
]

In [127]:
#params = params_orig + ["doReturnOnLowerLevels"]
#params = params_mod + params_bool
#params = params_bool
params = params_all
#params

In [128]:
x_train_feed = x_train[params]
x_test_feed = x_test[params]
y_train_feed = list(y_train.returned)
#y_test_feed = list(y_test)

x1, x2, y1, y2 = train_test_split(x_train_feed, y_train_feed, test_size = 0.3)

assert(set(list(x_train_feed.columns)) == set(list(x_test_feed.columns)))
assert(set(list(x1.columns)) == set(list(x2.columns)))
assert(type(y1) == list)
assert(type(y2) == list)
assert(len(y1) == len(x1.index))
assert(len(y2) == len(x2.index))

#list(x1.columns), list(x2.columns) 

In [133]:
RF = ML(x1, x2, y1, y2, ensemble.RandomForestClassifier(n_estimators=400, max_features=7))
GB = ML(x1, x2, y1, y2, ensemble.GradientBoostingClassifier(n_estimators=600))
XGB = ML(x1, x2, y1, y2, xgb.XGBClassifier())


print(RF.log_loss, "\n", GB.log_loss, "\n", XGB.log_loss)

(0.14828533533008867, 5.968204828671742) 
 (4.2612866421946283, 5.9818595984148892) 
 (5.7578166911735513, 5.6859504321656482)


In [134]:
out = GB.predict(x_test_feed)

In [135]:
out

array([ 0.,  0.,  1., ...,  0.,  0.,  0.])

In [54]:
rf = ensemble.RandomForestClassifier(n_estimators=100, max_features=5)
type(ensemble.RandomForestClassifier(n_estimators=100, max_features=5))

sklearn.ensemble.forest.RandomForestClassifier