## 1. Data import

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer,confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb


# Load the labelled training set as a plain NumPy array.
# Later cells slice columns 0-53 as the features and column 54 as the class label.
data_train = pd.read_csv("../input/data_train.csv").to_numpy()
# Disabled experiment: drop feature 4 from the training array.
#data_train = np.hstack((data_train[:,0:4],data_train[:,5:55])) # drop feature 4

# Load the test set that the Kaggle cell predicts on.
data_test = pd.read_csv("../input/data_test.csv").to_numpy()

# Disabled: 80/20 hold-out split of the raw (unfiltered) training data for local experiments.
#x_train, x_test, y_train, y_test = train_test_split(data_train[:,0:54], data_train[:,54], test_size = 0.2, random_state = 30)

# Outliers

In [2]:
from sklearn.ensemble import IsolationForest

# Isolation Forest used to flag anomalous training rows; the seed is fixed so
# the same rows are flagged on every run.
model = IsolationForest(contamination="auto", random_state=30)

# Fit on the complete training array.
# NOTE(review): the whole array is passed, so column 54 (used as the class
# label further down) also takes part in the outlier detection. Idea left by
# the author: try fitting on the non-categorical variables only.
model.fit(data_train)

# predict() returns 1 for inliers and -1 for outliers. Despite its name,
# `outlier_labels` is a boolean keep-mask: True = inlier, False = outlier.
outlier_labels = model.predict(data_train) != -1

# Keep only the inliers. `data_train` is overwritten because the Kaggle cell
# trains on it; as a consequence, re-running this cell without reloading the
# data filters the already-filtered array a second time.
data_train_inliers = data_train[outlier_labels]
data_train = data_train_inliers

# Hold-out split (20% test) of the filtered data. The GridSearchCV and test
# cells read x_train / x_test / y_train / y_test, so the split has to run for
# the notebook to work from a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(
    data_train_inliers[:, 0:54],
    data_train_inliers[:, 54],
    test_size=0.2,
    random_state=30,
)

# Cost matrix and score function

In [3]:
# Misclassification cost matrix: C[i][j] is the cost charged when a sample
# whose true class is i is predicted as class j (zero on the diagonal).
C = [[ 0,   5,  1, 1, 1,  1, 1],
     [10,   0,  1, 1, 1,  1, 1],
     [20,  20,  0, 5, 5, 50, 5],
     [20,  20, 10, 0, 1, 50, 5],
     [20, 100,  5, 1, 0,  5, 5],
     [ 5,  10, 10, 5, 1,  0, 1],
     [10,   5,  1, 1, 1,  1, 0]]


def score_function(y, y_pred, cost_matrix=None):
    """Return the mean misclassification cost of a set of predictions.

    Parameters
    ----------
    y : array-like
        True class labels. Each label is truncated to ``int`` and used as a
        row index into the cost matrix.
    y_pred : array-like
        Predicted class labels, same length as ``y``. Each label is truncated
        to ``int`` and used as a column index into the cost matrix.
    cost_matrix : array-like of shape (n_classes, n_classes), optional
        Cost matrix to score against. Defaults to the module-level ``C``.

    Returns
    -------
    numpy.float64
        Average of ``cost_matrix[y[k]][y_pred[k]]`` over all samples; lower
        is better. Empty input yields ``nan`` (NumPy emits a RuntimeWarning).

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not contain the same number of labels.
    """
    costs = np.asarray(C if cost_matrix is None else cost_matrix)
    true_idx = np.asarray(y).ravel().astype(int)
    pred_idx = np.asarray(y_pred).ravel().astype(int)
    # zip() would silently drop the surplus labels; fail loudly instead.
    if true_idx.shape[0] != pred_idx.shape[0]:
        raise ValueError(
            f"y and y_pred must have the same length, got {true_idx.shape[0]} and {pred_idx.shape[0]}"
        )
    # Pairwise lookup costs[true, predicted] for every sample in one step.
    return np.mean(costs[true_idx, pred_idx])

# GridSearchCV

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the training labels as consecutive integers. The fitted encoder is
# kept in `le` so the mapping can be reused later.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

# Base XGBoost classifier whose hyper-parameters are tuned by the grid search.
clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=7,
    random_state=30,
    n_jobs=-1,
)

# Hyper-parameter grid explored exhaustively by GridSearchCV.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    n_estimators=[200],
    max_depth=[6, 30, 50],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    min_split_loss=[0, 0.1, 0.2],
)

# score_function returns a cost (lower is better), hence greater_is_better=False.
scorer = make_scorer(score_function, greater_is_better=False)

# 3-fold cross-validated grid search scored with the cost-based scorer.
grid_search = GridSearchCV(clf, param_grid, scoring=scorer, cv=3, verbose=3)

# Sample weights derived from the cost matrix
def get_sample_weights(labels, cost_matrix):
    """Weight each sample by the total misclassification cost of its class.

    Parameters
    ----------
    labels : array-like
        Class label of every sample. Each label is truncated to ``int`` and
        used as a row index into ``cost_matrix``.
    cost_matrix : array-like of shape (n_classes, n_classes)
        Cost matrix whose row ``i`` holds the costs for true class ``i``.

    Returns
    -------
    numpy.ndarray of float
        One weight per label: the sum of the cost-matrix row of that label.
    """
    # Sum each row once per class instead of once per sample.
    row_totals = np.asarray(cost_matrix, dtype=float).sum(axis=1)
    class_idx = np.asarray(labels).astype(int)
    return row_totals[class_idx]

# Per-sample weights: the cost-matrix row sum of each sample's class.
sample_weights = get_sample_weights(y_train, C)

# Run the grid search; the weights are forwarded to the classifier's fit().
grid_search.fit(x_train, y_train, sample_weight=sample_weights)

# Best parameters found
print("Mejores parámetros encontrados:")
print(grid_search.best_params_)

# Best estimator found by the search
best_model = grid_search.best_estimator_

# The model was trained on labels encoded by `le`, while y_test still holds
# the original label values. Map the predictions back to the original label
# space so both sides of the evaluation are comparable.
# NOTE(review): score_function indexes the cost matrix with the label values,
# so it assumes the original labels are 0..6 - confirm against the data.
y_pred = le.inverse_transform(np.asarray(best_model.predict(x_test)).astype(int))

# Model evaluation on the hold-out set
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Score: {score_function(y_test,y_pred)}\n')
print("\nMatriz de Confusión:")
print(conf_matrix)
print("\nInforme de Clasificación:")
print(classification_rep)

# XGBoost

## Pruebas

In [None]:
# XGBoost classifier configuration.
# 'multi:softprob' is used instead of 'multi:softmax' because this cell needs
# per-class probabilities from predict_proba(): softmax makes the booster emit
# hard labels, and predict_proba() support for it depends on the xgboost
# release. Both objectives train the same model.
clf = xgb.XGBClassifier(
                        random_state = 30,
                        n_estimators = 200,
                        learning_rate = 0.1,
                        max_depth = 30,
                        objective = 'multi:softprob',
                        num_class = 7,
                        verbosity = 2,
                        subsample = 0.8,
                        colsample_bytree= 1.0,
                        min_split_loss = 0
                        )

# Train the model
clf.fit(x_train, y_train)

# Cost-sensitive prediction on the hold-out set: column j of
# (probabilities @ C) is the expected cost of predicting class j,
# i.e. sum_i P(class i) * C[i][j]; pick the class with the lowest one.
y_proba = clf.predict_proba(x_test)
weighted_costs = np.matmul(y_proba, np.array(C))
y_pred = np.argmin(weighted_costs, axis = 1)

# Model evaluation
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Score: {score_function(y_test,y_pred)}\n')
print("\nMatriz de Confusión:")
print(conf_matrix)
print("\nInforme de Clasificación:")
print(classification_rep)

## Kaggle

In [None]:
# XGBoost classifier configuration.
# 'multi:softprob' is used instead of 'multi:softmax' because this cell needs
# per-class probabilities from predict_proba(): softmax makes the booster emit
# hard labels, and predict_proba() support for it depends on the xgboost
# release. Both objectives train the same model.
clf = xgb.XGBClassifier(
                        random_state = 30,
                        n_estimators =  200,
                        learning_rate = 0.1,
                        max_depth = 30,
                        objective = 'multi:softprob',
                        num_class = 7,
                        verbosity = 2,
                        subsample = 0.8,
                        colsample_bytree= 1.0,
                        min_split_loss = 0
                        )

# Train on the full training array: columns 0-53 are the features,
# column 54 is the class label.
clf.fit(data_train[:,0:54], data_train[:,54])

# Cost-sensitive prediction on the test set: column j of (probabilities @ C)
# is the expected cost of predicting class j; pick the cheapest class.
y_proba = clf.predict_proba(data_test)
weighted_costs = np.matmul(y_proba, np.array(C))
y_pred = np.argmin(weighted_costs, axis = 1)

# Submission file: one row per test sample, identified by its row position.
# (Named `row_ids` rather than `id` to avoid shadowing the builtin.)
row_ids = np.arange(data_test.shape[0])
y = pd.DataFrame(
     {
          "Id": row_ids,
          "Category": y_pred,
     }
)
nombre = "prueba_XGBoost_prob.csv"
y.to_csv(nombre, index=False)