In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd

from xgboost import XGBClassifier

In [12]:
y_encoder = LabelEncoder()
x7_encoder = LabelEncoder()

def get_data(data_str: str):
    if data_str == 'training':
        training_data = pd.read_csv('TrainOnMe_orig.csv')
        training_data = training_data.drop(columns=['Unnamed: 0', 'x12'])
        training_data = training_data.dropna()

        training_data['y'] = y_encoder.fit_transform(training_data['y'])
        training_data['x7'] = x7_encoder.fit_transform(training_data['x7'])

        X = training_data.drop(columns=['y'])
        y = training_data['y']
        return X, y
    
    elif data_str == 'evaluation':
        evaluation_data = pd.read_csv('EvaluateOnMe.csv')
        evaluation_data = evaluation_data.drop(columns=['Unnamed: 0', 'x12'])
        evaluation_data['x7'] = x7_encoder.transform(evaluation_data['x7'])

        return evaluation_data
    
X, y = get_data('training')
X_eval = get_data('evaluation')

In [7]:
def run_search(X, y, pipeline, param_grid, n_splits=5, scoring='accuracy', n_iter=None, n_repeats=10, verbose=1):

    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)

    if n_iter:
        search = RandomizedSearchCV(pipeline, param_grid, n_iter=n_iter, scoring=scoring, cv=cv, verbose=verbose, n_jobs=-1, random_state=42)
    else:
        search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=cv, verbose=verbose, n_jobs=-1)

    search.fit(X, y)

    return search

In [4]:
def best_model(X, y, pipeline, param_grid, X_test=None, y_test=None, n_splits=5, scorings=['accuracy'], n_iter=None, n_repeats=10, verbose=1):
    best_models = {}
    for scoring in scorings:
        print(f"Running search for {scoring}...")
        search = run_search(X=X, y=y, pipeline=pipeline, param_grid=param_grid, n_splits=n_splits, scoring=scoring, n_iter=n_iter, n_repeats=n_repeats, verbose=verbose)
        
        print("Best model parameters:")
        print(search.best_params_)
        print(f"Cross-validation score ({scoring}): {search.best_score_:.4f}")

        if X_test is not None and y_test is not None:
            y_pred = search.best_estimator_.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            print(f"Test score ({scoring}): {score:.4f}\n")
        
        best_models[scoring] = (search.best_estimator_, search.best_params_, search.best_score_)

    return best_models

In [5]:
preprocessor = ColumnTransformer(
    transformers=[
        ('pca', PCA(), ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x8', 'x9', 'x10', 'x11', 'x13'])
        ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier())
])

param_grid = {
    'xgb__n_estimators': [200, 250, 300, 350],
    'xgb__learning_rate': [0.085, 0.08, 0.075],
    'xgb__max_depth': [5, 6, 7],
    'xgb__subsample': [0.65, 0.675, 0.7],
    'xgb__eval_metric': ['merror', 'mlogloss'],
    'xgb__sampling_method': ['uniform'],
    'xgb__grow_policy': ['lossguide'],
    'xgb__objective': ['multi:softmax'],
    'xgb__num_class': [3],
    'preprocessor__pca__n_components': ['mle']
}

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_xgb_split = best_model(X_train, y_train, pipeline, param_grid, X_test=X_test, y_test=y_test, scorings=['accuracy', 'neg_log_loss'], n_iter=None, verbose=1)

Running search for accuracy...
Fitting 50 folds for each of 216 candidates, totalling 10800 fits
Best model parameters:
{'preprocessor__pca__n_components': 'mle', 'xgb__eval_metric': 'merror', 'xgb__grow_policy': 'lossguide', 'xgb__learning_rate': 0.08, 'xgb__max_depth': 5, 'xgb__n_estimators': 350, 'xgb__num_class': 3, 'xgb__objective': 'multi:softmax', 'xgb__sampling_method': 'uniform', 'xgb__subsample': 0.65}
Cross-validation score (accuracy): 0.8775
Test score (accuracy): 0.8800

Running search for neg_log_loss...
Fitting 50 folds for each of 216 candidates, totalling 10800 fits
Best model parameters:
{'preprocessor__pca__n_components': 'mle', 'xgb__eval_metric': 'merror', 'xgb__grow_policy': 'lossguide', 'xgb__learning_rate': 0.075, 'xgb__max_depth': 5, 'xgb__n_estimators': 200, 'xgb__num_class': 3, 'xgb__objective': 'multi:softmax', 'xgb__sampling_method': 'uniform', 'xgb__subsample': 0.65}
Cross-validation score (neg_log_loss): -0.3104
Test score (neg_log_loss): 0.8800



In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

best_xgb_split_random = best_model(X_train, y_train, pipeline, param_grid, X_test=X_test, y_test=y_test, scorings=['accuracy', 'neg_log_loss'], n_iter=10, verbose=1)

Running search for accuracy...
Fitting 50 folds for each of 10 candidates, totalling 500 fits
Best model parameters:
{'xgb__subsample': 0.675, 'xgb__sampling_method': 'uniform', 'xgb__objective': 'multi:softmax', 'xgb__num_class': 3, 'xgb__n_estimators': 200, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.075, 'xgb__grow_policy': 'lossguide', 'xgb__eval_metric': 'merror', 'preprocessor__pca__n_components': 'mle'}
Cross-validation score (accuracy): 0.8831
Test score (accuracy): 0.8500

Running search for neg_log_loss...
Fitting 50 folds for each of 10 candidates, totalling 500 fits
Best model parameters:
{'xgb__subsample': 0.675, 'xgb__sampling_method': 'uniform', 'xgb__objective': 'multi:softmax', 'xgb__num_class': 3, 'xgb__n_estimators': 200, 'xgb__max_depth': 5, 'xgb__learning_rate': 0.075, 'xgb__grow_policy': 'lossguide', 'xgb__eval_metric': 'merror', 'preprocessor__pca__n_components': 'mle'}
Cross-validation score (neg_log_loss): -0.3001
Test score (neg_log_loss): 0.8500



In [10]:
best_xgb = best_model(X, y, pipeline, param_grid)

Running search for accuracy...
Fitting 50 folds for each of 216 candidates, totalling 10800 fits
Best model parameters:
{'preprocessor__pca__n_components': 'mle', 'xgb__eval_metric': 'merror', 'xgb__grow_policy': 'lossguide', 'xgb__learning_rate': 0.075, 'xgb__max_depth': 5, 'xgb__n_estimators': 250, 'xgb__num_class': 3, 'xgb__objective': 'multi:softmax', 'xgb__sampling_method': 'uniform', 'xgb__subsample': 0.65}
Cross-validation score (accuracy): 0.8840


In [20]:
best_xgb['accuracy'][0]

In [13]:
# retrieve the best model and make predictions on the evaluation data
xgb_model = best_xgb['accuracy'][0]

y_pred = xgb_model.predict(X_eval)
y_pred = y_encoder.inverse_transform(y_pred)

In [16]:
# write preditions to file
with open('y_pred.txt', 'w') as f:
    for i, item in enumerate(y_pred):
        f.write("%s\n" % item) if i < len(y_pred) - 1 else f.write("%s" % item)