In [None]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import warnings
import io
import joblib

# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this is a blanket filter, so warnings raised while fitting models
# or running the hyper-parameter searches are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Training rows whose label is one of these values are removed below.
invalid_labels = ["Astuto", "tjugo", "Boob", "Jorgg"]

# The two CSV files use different delimiters; the first column is the index in both.
train = pd.read_csv('train.csv', sep=';', index_col=0)
eval = pd.read_csv('eval.csv', sep=',', index_col=0)  # NOTE(review): shadows the built-in eval()

# Store x12 of the evaluation set with object dtype.
eval['x12'] = eval['x12'].astype('object')

# Discard incomplete training rows first, then rows carrying an excluded label.
train = train.dropna()
train = train[~train['y'].isin(invalid_labels)]

In [None]:
# Drop every training row in which at least one float column has an absolute
# z-score above ZSCORE_THREASHOLD.
ZSCORE_THREASHOLD = 4

float_columns = train.select_dtypes(include=["float"])
zscore = np.abs(stats.zscore(float_columns))

# A row is an outlier as soon as any single column exceeds the threshold.
has_outlier = (zscore > ZSCORE_THREASHOLD).any(axis=1)
is_inlier = ~has_outlier
train = train[is_inlier]

In [None]:
from sklearn.model_selection import train_test_split

# Seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42

# Separate the target column from the feature columns.
y = train["y"]
X = train.drop(columns="y")

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Column names grouped by dtype: float64 columns are handled as numerical,
# object columns as categorical.
numerical_features = X.select_dtypes(include='float64').columns
categorical_features = X.select_dtypes(include='object').columns

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

# Categorical columns: fill gaps with the literal "missing", then map each
# category to an integer code; categories unseen at fit time are encoded as -1.
categorical_imputer = SimpleImputer(strategy='constant', fill_value="missing")
categorical_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_transformer = Pipeline(steps=[
    ('imputer', categorical_imputer),
    ('encoder', categorical_encoder),
])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA

# Numerical columns: median-impute missing values, then keep 7 principal components.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components=7)),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own pipeline: numerical columns through
# numerical_transformer, categorical columns through categorical_transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:

import numpy as np
from xgboost import XGBRFClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# Encode the target variables to integers
from sklearn.preprocessing import LabelEncoder
# Integer-encoded copy of the labels; this is the target used for the XGBoost search.
le = LabelEncoder()
y_boosted = le.fit_transform(y)

# Shared preprocessing followed by an XGBoost random-forest classifier.
boosted_classifier = XGBRFClassifier(random_state=RANDOM_STATE)
pipeline_boosted = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', boosted_classifier),
])

# Value lists reused by several hyper-parameters below.
sampling_fractions = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
penalty_values = [0.1, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0]

# Candidate values per hyper-parameter; the "classifier__" prefix addresses the
# pipeline step named 'classifier'.
params_boosted = {
    'classifier__n_estimators': list(range(100, 600, 100)),
    'classifier__max_depth': list(range(3, 13)),
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'classifier__subsample': sampling_fractions,
    'classifier__colsample_bytree': sampling_fractions,
    'classifier__colsample_bylevel': sampling_fractions,
    'classifier__colsample_bynode': sampling_fractions,
    'classifier__reg_lambda': penalty_values,
    'classifier__reg_alpha': penalty_values,
    'classifier__gamma': penalty_values,
    'classifier__min_child_weight': list(range(1, 11)),
    'classifier__max_delta_step': list(range(0, 11)),
    'classifier__base_score': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__n_jobs': [2],
    'classifier__random_state': [RANDOM_STATE],
}

# 5000 sampled candidates, each scored with shuffled, stratified 10-fold CV.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
rfr_random_boosted = RandomizedSearchCV(
    pipeline_boosted,
    param_distributions=params_boosted,
    n_iter=5000,
    cv=cv,
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    return_train_score=True,
)
print("Fitting now")
rfr_random_boosted.fit(X, y_boosted)
rfr_random_boosted.best_score_


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
# Shared preprocessing followed by a scikit-learn random forest.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('forest', RandomForestClassifier(random_state=RANDOM_STATE))])

# Candidate values per hyper-parameter; the "forest__" prefix addresses the
# pipeline step named 'forest'.
params = {
 'forest__bootstrap': [True, False],
 'forest__max_depth': list(range(1, 30)) + [None],
 'forest__max_features': ['sqrt', 'log2'],
 'forest__min_samples_leaf': list(range(1, 20)),
 # An integer min_samples_split must be at least 2. The value 1 is rejected by
 # scikit-learn's parameter validation, so candidates drawing it could never be
 # fitted and only wasted search iterations.
 'forest__min_samples_split': list(range(2, 20)),
 'forest__n_estimators': list(range(100, 1000, 100))
}

# 5000 sampled candidates, each scored with shuffled, stratified 10-fold CV.
cv = StratifiedKFold(shuffle=True, random_state=RANDOM_STATE, n_splits=10)
rfr_random = RandomizedSearchCV(pipeline, param_distributions=params, n_iter = 5000, cv = cv, verbose=1, random_state=RANDOM_STATE, n_jobs=-1, return_train_score=True)
print("Fitting now")
rfr_random.fit(X, y)
rfr_random.best_score_


In [None]:
# Display the hyper-parameter combination selected by the XGBoost randomized search.
rfr_random_boosted.best_params_

In [None]:
# Display the best pipeline found by the XGBoost randomized search.
rfr_random_boosted.best_estimator_

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
def make_pipeline(k, n):
    """Build the preprocessing -> feature selection -> XGBoost random forest pipeline.

    Args:
        k: Number of features kept by ``SelectKBest`` (scored with ``f_classif``).
        n: Number of principal components kept from the numerical columns.

    Returns:
        An unfitted ``Pipeline`` with the steps ``preprocessor``, ``select``
        and ``forest``.
    """
    numerical_columns = pd.Index(
        ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x8', 'x9', 'x10', 'x11', 'x13'],
        dtype='object')
    categorical_columns = pd.Index(['x7', 'x12'], dtype='object')

    # Numerical columns: median imputation, then PCA down to n components.
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('pca', PCA(n_components=n)),
    ])
    # Categorical columns: constant imputation, then integer codes (-1 for unseen categories).
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ])
    preprocessing = ColumnTransformer(transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('cat', categorical_pipeline, categorical_columns),
    ])

    # Hyper-parameters are passed under their plain names. The "classifier__"
    # prefix is only meaningful as a Pipeline parameter key (the format of a
    # search's best_params_); given directly to the constructor, the prefixed
    # names are not real XGBRFClassifier parameters and the tuned values were
    # never applied.
    classifier = XGBRFClassifier(
        subsample=0.5,
        reg_lambda=5.0,
        reg_alpha=1.0,
        random_state=42,
        n_jobs=2,
        n_estimators=300,
        min_child_weight=1,
        max_depth=6,
        max_delta_step=8,
        learning_rate=0.3,
        gamma=0.1,
        colsample_bytree=0.9,
        colsample_bynode=0.8,
        colsample_bylevel=1.0,
        base_score=0.8,
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessing),
        ('select', SelectKBest(f_classif, k=k)),
        ('forest', classifier),
    ])
    return pipeline

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
# Fit a fresh label encoder on the training split only.
le = LabelEncoder()
y_train_boosted = le.fit_transform(y_train)

# Every (k, n) pair with 1 <= k < n <= 11, in the order n-major, k-minor.
candidate_grid = [(k, n) for n in range(1, 12) for k in range(1, n)]

# Score each pair by its mean 10-fold cross-validation score on the training split.
cv_scores = {}
for k, n in candidate_grid:
    pipeline = make_pipeline(k, n)
    scores = cross_val_score(pipeline, X_train, y_train_boosted, cv=10, n_jobs=-1)
    cv_scores[(k, n)] = scores.mean()

# Best-scoring pair; report it together with its score.
params = max(cv_scores, key=cv_scores.get)
print(params, cv_scores[params])

# Refit the winning configuration on the training split and predict the evaluation set.
best_k, best_n = params
model = make_pipeline(best_k, best_n)
model.fit(X_train, y_train_boosted)
predictions = model.predict(eval)

# Map the integer predictions back to the original labels.
predictions = le.inverse_transform(predictions)

# One predicted label per line.
with open("predictions.txt", "w") as f:
    f.writelines(str(prediction) + "\n" for prediction in predictions)


In [None]:
# Re-display the best (k, n) pair together with its mean cross-validation score.
best_pair = max(cv_scores, key=cv_scores.get)
print(best_pair, cv_scores[best_pair])