In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [77]:
train, test = pd.read_csv('data/GGG/train.csv'), pd.read_csv('data/GGG/test.csv')

In [78]:
train.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [79]:
train.type.unique()

array(['Ghoul', 'Goblin', 'Ghost'], dtype=object)

# Vis

In [80]:
# sns.pairplot(train, hue='type');

In [81]:
# sns.jointplot(x='bone_length', y='rotting_flesh', data=train);

# Eval

In [90]:
SEED = 1234
np.random.seed = SEED

In [91]:
from sklearn.preprocessing import LabelEncoder

In [92]:
nfm = ['id']
target = 'type'

In [93]:
X = train.drop(['id', 'type'], axis=1)
y = train[target]

In [94]:
answers_encoder = LabelEncoder()
y = answers_encoder.fit_transform(y)

In [95]:
X.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color
0,0.354512,0.350839,0.465761,0.781142,clear
1,0.57556,0.425868,0.531401,0.439899,green
2,0.467875,0.35433,0.811616,0.791225,black
3,0.776652,0.508723,0.636766,0.884464,black
4,0.566117,0.875862,0.418594,0.636438,green


In [96]:
def onehot_encode(df, column):
    from sklearn.preprocessing import LabelBinarizer
    
    cs = df.select_dtypes(include=['O']).columns.values
    if column not in cs:
        return (df, None)

    rest = [x for x in df.columns.values if x != column]

    lb = LabelBinarizer()
    data = lb.fit_transform(df[column])
    new_col_names = ['%s_%s' % (column, x) for x in lb.classes_]

    if len(new_col_names) != data.shape[1]:
        new_col_names = new_col_names[::-1][:data.shape[1]]

    return (pd.concat((df.drop([column], axis=1), pd.DataFrame(data=data, columns=new_col_names)), axis=1), lb)

In [97]:
X, lb = onehot_encode(X, 'color')

In [98]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.85, random_state=SEED)

&nbsp;

In [99]:
from catboost import Pool, CatBoostClassifier, cv, CatboostIpythonWidget

In [100]:
model = CatBoostClassifier(custom_loss=['Accuracy'], random_seed=SEED)

In [101]:
model.fit(X_train.values, y_train, eval_set=(X_validation.values, y_validation), verbose=False);

In [102]:
cv_data = cv(
    model.get_params(),
    Pool(X.values, label=y),
    fold_count=10
);

In [103]:
cv_data.keys()

dict_keys(["b'Logloss'_train_avg", "b'Logloss'_train_stddev", "b'Logloss'_test_avg", "b'Logloss'_test_stddev", "b'Accuracy'_train_avg", "b'Accuracy'_train_stddev", "b'Accuracy'_test_avg", "b'Accuracy'_test_stddev"])

In [104]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data["b'Accuracy'_test_avg"]),
    cv_data["b'Accuracy'_test_stddev"][np.argmax(cv_data["b'Accuracy'_test_avg"])],
    np.argmax(cv_data["b'Accuracy'_test_avg"])
))

Best validation accuracy score: 0.60±0.07 on step 168


&nbsp;

In [105]:
from hpsklearn import HyperoptEstimator

In [106]:
# Create the estimator object
estim = HyperoptEstimator()

In [107]:
# Search the space of classifiers and preprocessing steps and their
# respective hyperparameters in sklearn to fit a model to the data
estim.fit(X_train.values, y_train, valid_size=0.1)

In [108]:
# Report the accuracy of the classifier on a given set of data
score = estim.score( X_validation, y_validation )
score

0.6964285714285714

In [109]:
# Return instances of the classifier and preprocessing steps
model = estim.best_model()
model

{'ex_preprocs': (),
 'learner': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.01769098173318415, loss='deviance',
               max_depth=2, max_features=0.9808320058656615,
               max_leaf_nodes=None, min_impurity_split=1e-07,
               min_samples_leaf=25, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=18,
               presort='auto', random_state=4, subsample=1.0, verbose=0,
               warm_start=False),
 'preprocs': (PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False),)}

&nbsp;

In [110]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [111]:
clf = ExtraTreesClassifier(n_estimators=100)

In [112]:
clf.fit(X_train, y_train);

In [113]:
clf.score(X_validation, y_validation)

0.7321428571428571

&nbsp;

In [114]:
from sklearn.grid_search import GridSearchCV

In [122]:
clf = GridSearchCV(ExtraTreesClassifier(), {
    'n_estimators': [50, 80, 100, 150, 300],
    'max_depth': [5,8,15,25,None],
    'min_samples_split': [2,5,10,15,100],
    'min_samples_leaf': [1,2,5,10],
    'max_features': ['log2', 'sqrt', None]
})

In [123]:
clf.fit(X, y);

In [124]:
clf.best_score_

0.7358490566037735

In [125]:
clf.best_params_

{'max_depth': 5,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 80}

In [126]:
clf.best_estimator_

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=5, max_features='log2', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           n_estimators=80, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)