**Usando XGBooster**

In [9]:
from google.colab import drive

drive.mount('/gdrive')
%cd /gdrive/MyDrive/rdc/tei

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/rdc/tei


In [53]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [56]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in train_df.columns:
    if col != 'id':
        train_df[col] = le.fit_transform(train_df[col])

for col in test_df.columns:
    if col != 'id':
        test_df[col] = le.fit_transform(test_df[col])
    if col == 'class':
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(le_name_mapping)


In [12]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
y = train_df['class']

X_sample, _, y_sample, _ = train_test_split(X, y, test_size=0.9, stratify=y, random_state=42)

print(f"Tamanho da amostra: {X_sample.shape}")



Tamanho da amostra: (311694, 21)


**Testando hiperparametros com GRID**  
Com amostra diminuída para acelerar

In [14]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import make_scorer, matthews_corrcoef

param_grid = {
    'max_depth': [4],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [200],
    'subsample': [0.7],
    'colsample_bytree': [0.8, 1.0]
}

model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, random_state=42)

mcc_scorer = make_scorer(matthews_corrcoef)

grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring=mcc_scorer,
                           cv=10,
                           n_jobs=-1,
                           verbose=1)

grid_search.fit(X_sample, y_sample)

print(f'Melhores parâmetros encontrados: {grid_search.best_params_}')
print(f'Melhor MCC score: {grid_search.best_score_}')


Fitting 10 folds for each of 4 candidates, totalling 40 fits


Parameters: { "use_label_encoder" } are not used.



Melhores parâmetros encontrados: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.7}
Melhor MCC score: 0.9646033786022412


Melhores parâmetros encontrados: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.7}  
Melhor MCC score: 0.9646033786022412

In [55]:
columns_to_remove = [
	'cap-surface', 'gill-attachment', 'gill-spacing',
	'stem-root', 'stem-surface', 'veil-type',
	'veil-color', 'ring-type', 'spore-print-color'
]

train_df.drop(columns=columns_to_remove, inplace=True)
test_df.drop(columns=columns_to_remove, inplace=True)

In [36]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
X = X.drop('id', axis=1)
y = train_df['class']

X_sample, x_val, y_sample, y_val = train_test_split(X, y, test_size=0.6, stratify=y, random_state=42)

print(f"Tamanho da amostra: {X_sample.shape}")


Tamanho da amostra: (1246778, 11)


**Testando os parametros achados com as melhores colunas**

In [38]:
model = xgb.XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=4, n_estimators=200, n_jobs=-1, random_state=42)
model.fit(X_sample, y_sample)

In [39]:
from sklearn.metrics import matthews_corrcoef

y_pred = model.predict(x_val)
mcc = matthews_corrcoef(y_val, y_pred)
print(f"MCC: {mcc}")


MCC: 0.8716760883374526


Score Kaggle: 0.29715  
MCC: 0.8716760883374526


In [40]:
x_test = test_df.drop('id', axis=1)

In [41]:
test_predictions = model.predict(x_test)

test_df['class'] = test_predictions
test_df['class'].replace({0: 'e', 1: 'p'}, inplace=True)
test_df[["id","class"]].to_csv("xgb-c1.csv", index=False)

In [42]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
X = X.drop('id', axis=1)
y = train_df['class']

X_sample, x_val, y_sample, y_val = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

print(f"Tamanho da amostra: {X_sample.shape}")

Tamanho da amostra: (2181861, 11)


In [43]:
model = xgb.XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=4, n_estimators=200, n_jobs=-1, random_state=42)
model.fit(X_sample, y_sample)

In [44]:
from sklearn.metrics import matthews_corrcoef

y_pred = model.predict(x_val)
mcc = matthews_corrcoef(y_val, y_pred)
print(f"MCC: {mcc}")

MCC: 0.8687080610129355


Score Kaggle: 0.30023  
MCC: 0.8687080610129355

In [48]:
x_test = test_df.drop('id', axis=1).drop('class', axis=1)
test_predictions = model.predict(x_test)

test_df['class'] = test_predictions
test_df['class'].replace({0: 'e', 1: 'p'}, inplace=True)
test_df[["id","class"]].to_csv("xgb-c2.csv", index=False)

In [50]:
from sklearn.model_selection import train_test_split

X = train_df.drop('class', axis=1)
X = X.drop('id', axis=1)
y = train_df['class']

In [51]:
model = xgb.XGBClassifier(colsample_bytree=1.0, learning_rate=0.1, max_depth=4, n_estimators=200, n_jobs=-1, random_state=42)
model.fit(X_sample, y_sample)

Score Kaggle: 0.30023

In [52]:
x_test = test_df.drop('id', axis=1).drop('class', axis=1)
test_predictions = model.predict(x_test)

test_df['class'] = test_predictions
test_df['class'].replace({0: 'e', 1: 'p'}, inplace=True)
test_df[["id","class"]].to_csv("xgb-c3.csv", index=False)