**Testando subconjuntos de colunas**

In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive so the CSV files used below can be read from it.
drive.mount('/gdrive')
# Switch the working directory so the relative paths "train.csv"/"test.csv" resolve.
%cd /gdrive/MyDrive/rdc/tei

Mounted at /gdrive
/gdrive/MyDrive/rdc/tei


In [None]:
import pandas as pd

# Read both competition files from the current working directory.
df_test, df_train = map(pd.read_csv, ("test.csv", "train.csv"))



In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# errors='ignore' keeps this cell re-runnable once 'id' has already been dropped.
df_train = df_train.drop(columns='id', errors='ignore')

# Label-encode the features for the DecisionTree.
# The encoder is fitted on the union of train and test values for each shared
# column so that a given category gets the SAME integer code in both frames.
# Fitting separately per frame (as before) gives inconsistent codes whenever
# the two frames do not contain exactly the same set of values.
encoded_in_test = set()
for col in df_train.columns:
    if col == 'class':
        continue  # the target stays as-is for the DecisionTree
    if col in df_test.columns:
        le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
        df_test[col] = le.transform(df_test[col])
        encoded_in_test.add(col)
    else:
        le.fit(df_train[col])
    df_train[col] = le.transform(df_train[col])

# Test-only columns (other than the untouched 'id') are still encoded,
# as the original cell did.
for col in df_test.columns:
    if col != 'id' and col not in encoded_in_test:
        df_test[col] = le.fit_transform(df_test[col])

**Info Gain dos atributos**

In [None]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif

# Every column except the target is a candidate feature. The column names are
# read straight from df_train; no random sample of rows is needed for that.
columns = list(df_train.drop(columns=['class']).columns)

x = df_train[columns]
y = df_train['class']
# random_state makes the estimate reproducible across runs.
# NOTE(review): the label-encoded categorical columns are scored with the
# default discrete_features='auto' -- confirm that is the intended treatment.
info_gain = mutual_info_classif(x, y, random_state=42)
print(info_gain)

[0.04284346 0.10168189 0.07868184 0.13650698 0.18825469 0.07951056
 0.14768791 0.08753585 0.04124173 0.0682369  0.253293   0.21090194
 0.14606272 0.30176821 0.24480002 0.14700359 0.21633622 0.29802586
 0.17935283 0.09190185]


In [None]:
#deixar as colunas pré salvas para não ser necessário executar
# info_gain = [0.04284346, 0.10168189, 0.07868184, 0.13650698, 0.18825469, 0.07951056, 0.14768791, 0.08753585, 0.04124173, 0.0682369, 0.253293, 0.21090194, 0.14606272, 0.30176821, 0.24480002, 0.14700359, 0.21633622, 0.29802586, 0.17935283, 0.09190185]
# columns =  ['cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']


In [None]:
# Pair each feature name with its information-gain score in a two-column frame.
df_info_gain = pd.DataFrame({'Feature': columns}).assign(
    **{'Information Gain': info_gain}
)

print(df_info_gain)

In [None]:
# Highest information gain first.
df_info_gain_sorted = df_info_gain.sort_values('Information Gain', ascending=False)

In [None]:
# Render the ranked table as Markdown text and print it under a heading.
markdown_table = df_info_gain_sorted.to_markdown(index=False)
print("\nTabela em Markdown:", markdown_table, sep="\n")


Tabela em Markdown:
| Feature              |   Information Gain |
|:---------------------|-------------------:|
| veil-type            |          0.301768  |
| spore-print-color    |          0.298026  |
| stem-root            |          0.253293  |
| veil-color           |          0.2448    |
| ring-type            |          0.216336  |
| stem-surface         |          0.210902  |
| does-bruise-or-bleed |          0.188255  |
| habitat              |          0.179353  |
| gill-spacing         |          0.147688  |
| has-ring             |          0.147004  |
| stem-color           |          0.146063  |
| cap-color            |          0.136507  |
| cap-shape            |          0.101682  |
| season               |          0.0919018 |
| gill-color           |          0.0875359 |
| gill-attachment      |          0.0795106 |
| cap-surface          |          0.0786818 |
| stem-width           |          0.0682369 |
| cap-diameter         |          0.0428435 |
| stem-height          |          0.0412417 |


Tabela em Markdown:
| Feature              |   Information Gain |  
|:---------------------|-------------------:|  
| veil-type            |          0.301768  |    
| spore-print-color    |          0.298026  |  
| stem-root            |          0.253293  |  
| veil-color           |          0.2448    |  
| ring-type            |          0.216336  |  
| stem-surface         |          0.210902  |  
| does-bruise-or-bleed |          0.188255  |  
| habitat              |          0.179353  |  
| gill-spacing         |          0.147688  |  
| has-ring             |          0.147004  |  
| stem-color           |          0.146063  |  
| cap-color            |          0.136507  |  
| cap-shape            |          0.101682  |  
| season               |          0.0919018 |  
| gill-color           |          0.0875359 |  
| gill-attachment      |          0.0795106 |  
| cap-surface          |          0.0786818 |  
| stem-width           |          0.0682369 |  
| cap-diameter         |          0.0428435 |  
| stem-height          |          0.0412417 |  

**Trabalhando com colunas com alto info gain**

In [None]:
# Keep only the features whose information gain exceeds 0.15.
high_gain_mask = df_info_gain['Information Gain'].gt(0.15)
colunas_selecionadas1 = df_info_gain.loc[high_gain_mask, 'Feature'].tolist()
colunas_selecionadas1

['does-bruise-or-bleed',
 'stem-root',
 'stem-surface',
 'veil-type',
 'veil-color',
 'ring-type',
 'spore-print-color',
 'habitat']

In [None]:
# Training subset: the selected features followed by the target column.
n1_df_train = df_train[colunas_selecionadas1 + ['class']].copy()
# Test subset: the same features followed by the row identifier.
n2_df_test = df_test[colunas_selecionadas1 + ['id']].copy()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% for validation.
y = n1_df_train['class']
X = n1_df_train.drop(columns='class')
n1_X_train, n1_X_val, n1_y_train, n1_y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow tree with a fixed seed, fitted on the n1 training split.
tree_params = {"max_depth": 4, "min_samples_leaf": 4, "random_state": 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(n1_X_train, n1_y_train)

In [None]:
# Usando a árvore com melhor desempenho

In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef

# Score the fitted tree on the held-out split with Matthews correlation.
n1_y_pred = model.predict(n1_X_val)
mcc_score = matthews_corrcoef(y_true=n1_y_val, y_pred=n1_y_pred)
print(f'MCC Score: {mcc_score}')

MCC Score: 0.3002554459093672


MCC Score: 0.3002554459093672  
Score Kaggle: 0.14619

In [None]:
# Predict from the selected feature columns explicitly. Dropping only 'id'
# breaks on a second run of this cell, because the 'class' column added below
# would then be fed to the model as an extra feature.
test_predictions = model.predict(n2_df_test[colunas_selecionadas1])

n2_df_test['class'] = test_predictions
n2_df_test[["id","class"]].to_csv("n1-col.csv", index=False)

**Tirando colunas com muito null**

In [None]:
import pandas as pd

# Re-read the raw files so this section starts from unencoded data.
df_test, df_train = map(pd.read_csv, ("test.csv", "train.csv"))


In [None]:
import numpy as np

# Row and column positions of every missing cell in the training frame.
missing_mask = pd.isna(df_train)
np.where(missing_mask)

(array([      0,       0,       0, ..., 3116944, 3116944, 3116944]),
 array([12, 13, 15, ..., 15, 16, 19]))

In [None]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

Unnamed: 0,0
id,0
class,0
cap-diameter,4
cap-shape,40
cap-surface,671023
cap-color,12
does-bruise-or-bleed,8
gill-attachment,523936
gill-spacing,1258435
gill-color,57


In [None]:
# Columns dropped from both frames in this experiment.
columns_to_remove = [
    'cap-surface', 'gill-attachment', 'gill-spacing',
    'stem-root', 'stem-surface', 'veil-type',
    'veil-color', 'ring-type', 'spore-print-color',
]

df_train = df_train.drop(columns=columns_to_remove)
df_test = df_test.drop(columns=columns_to_remove)
# Remaining missing-value counts per column.
df_train.isna().sum()

Unnamed: 0,0
id,0
class,0
cap-diameter,4
cap-shape,40
cap-color,12
does-bruise-or-bleed,8
gill-color,57
stem-height,0
stem-width,0
stem-color,38


In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Label-encode the features for the DecisionTree.
# For every feature present in both frames the encoder is fitted on the union
# of train and test values, so a category maps to the same integer code in
# both. Fitting per frame (as before) gives train and test different codes
# whenever their sets of values differ.
encoded_in_test = set()
for col in df_train.columns:
    if col == 'class':
        continue  # the target is left untouched here
    if col != 'id' and col in df_test.columns:
        le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
        df_test[col] = le.transform(df_test[col])
        encoded_in_test.add(col)
    else:
        # 'id' (and any train-only column) is encoded on train alone, as before.
        le.fit(df_train[col])
    df_train[col] = le.transform(df_train[col])

# Test-only columns are still encoded; test 'id' is preserved for the submission.
for col in df_test.columns:
    if col != 'id' and col not in encoded_in_test:
        df_test[col] = le.fit_transform(df_test[col])

In [None]:
# Target first, then every remaining column except the identifier as features.
n2_y_train = df_train["class"]
n2_X_train = df_train.drop(["id", "class"], axis=1)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same tree settings as the n1 experiment, fitted on the full n2 training data.
tree_params = {"max_depth": 4, "min_samples_leaf": 4, "random_state": 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(n2_X_train, n2_y_train)

Score Kaggle: 0.28894

In [None]:
test_predictions = model.predict(df_test.drop(columns='id'))

# Build the submission from the current df_test rather than from n2_df_test.
# n2_df_test is a leftover of the n1 section, so relying on it made this cell
# depend on stale state from earlier cells.
n2_submission = pd.DataFrame({"id": df_test["id"], "class": test_predictions})
n2_submission.to_csv("n2-col.csv", index=False)

In [None]:
import pandas as pd

# Start again from the raw files for the correlation analysis.
df_test, df_train = map(pd.read_csv, ("test.csv", "train.csv"))


In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Encode every training column (target included) so that .corr() can use them.
# Features shared with the test frame are fitted on the union of train and
# test values so that both frames use the same integer codes.
encoded_in_test = set()
for col in df_train.columns:
    if col != 'id' and col in df_test.columns:
        le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
        df_test[col] = le.transform(df_test[col])
        encoded_in_test.add(col)
    else:
        le.fit(df_train[col])
    df_train[col] = le.transform(df_train[col])
    if col == 'class':
        # Report the label -> code mapping of the target. This used to sit in
        # the test loop, where it only runs if the test frame has a 'class'
        # column.
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(le_name_mapping)

# Test-only columns are still encoded; test 'id' is left untouched.
for col in df_test.columns:
    if col != 'id' and col not in encoded_in_test:
        df_test[col] = le.fit_transform(df_test[col])

**Observando Correlação**

In [None]:
import pandas as pd

# Pairwise correlation matrix of the encoded training frame.
correlacao = df_train.corr()

# Correlation of each column with the target, ascending.
class_corr = correlacao['class']
print(class_corr.sort_values())

stem-width             -0.170778
cap-diameter           -0.167451
spore-print-color      -0.133569
stem-surface           -0.119287
cap-shape              -0.090214
veil-color             -0.085226
stem-color             -0.070817
season                 -0.063554
gill-color             -0.060206
veil-type              -0.050324
stem-height            -0.049232
cap-surface            -0.047879
does-bruise-or-bleed   -0.037557
habitat                -0.027231
gill-attachment        -0.008883
id                     -0.000136
stem-root               0.013776
gill-spacing            0.026922
cap-color               0.043370
ring-type               0.046832
has-ring                0.049625
class                   1.000000
Name: class, dtype: float64


**Testes de colunas com o XGBoost**

In [None]:
import pandas as pd

# Fresh copies of the raw files for the XGBoost experiments.
df_test, df_train = map(pd.read_csv, ("test.csv", "train.csv"))

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Encode every training column; the code comment in the original cell notes
# that XGBoost needs the 'class' column as 0/1.
# Features shared with the test frame are fitted on the union of train and
# test values, so a category receives the same integer code in both frames.
# Fitting per frame (as before) lets the codes diverge between train and test.
encoded_in_test = set()
for col in df_train.columns:
    if col != 'id' and col in df_test.columns:
        le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
        df_test[col] = le.transform(df_test[col])
        encoded_in_test.add(col)
    else:
        le.fit(df_train[col])
    df_train[col] = le.transform(df_train[col])
    if col == 'class':
        # Show which original label became which integer.
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(le_name_mapping)

# Test-only columns are still encoded; test 'id' is kept for the submission.
for col in df_test.columns:
    if col != 'id' and col not in encoded_in_test:
        df_test[col] = le.fit_transform(df_test[col])

{'e': 0, 'p': 1}


In [None]:
# Display the column index of the current training frame.
df_train.columns

Index(['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-spacing', 'gill-color', 'stem-height',
       'stem-width', 'stem-surface', 'stem-color', 'veil-type', 'veil-color',
       'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season'],
      dtype='object')

In [None]:
# Columns excluded from the n3 feature set.
variables_to_remove1 = ['id', 'gill-attachment', 'stem-root']
n3_x_test = df_test.drop(variables_to_remove1, axis=1)

n3_y_train = df_train['class']
# The training features additionally exclude the target itself.
variables_to_remove1 += ['class']
n3_X_train = df_train.drop(variables_to_remove1, axis=1)

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, matthews_corrcoef
import pandas as pd

# Binary XGBoost classifier. `use_label_encoder` is no longer passed: the
# captured fit output reports that the parameter is not used.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)

# Scorer wrapping the Matthews correlation coefficient.
mcc_scorer = make_scorer(matthews_corrcoef)

X = n3_X_train
y = n3_y_train

# 10-fold cross-validation scored with MCC.
scores = cross_val_score(model, X, y, cv=10, scoring=mcc_scorer)


In [None]:
# Per-fold MCC values followed by their mean.
print(
    f'MCC Scores com o conjunto n3: {scores}',
    f'Média MCC com o conjunto n3: {scores.mean()}',
    sep='\n',
)

MCC Scores com o conjunto n3: [0.98089696 0.97982983 0.98019793 0.97968939 0.9810082  0.98037313
 0.9796805  0.97997378 0.98066621 0.9802295 ]
Média MCC com o conjunto n3: 0.9802545436559678


In [None]:
# Fit the classifier on the full n3 training data (X, y as set in the cross-validation cell).
model.fit(X, y)

Parameters: { "use_label_encoder" } are not used.



Score Kaggle: 0.22471  
MCC Scores com o conjunto n3: [0.98089696 0.97982983 0.98019793 0.97968939 0.9810082  0.98037313
 0.9796805  0.97997378 0.98066621 0.9802295 ]  
Média MCC com o conjunto n3: 0.9802545436559678

In [None]:
test_predictions = model.predict(n3_x_test)

# Map the 0/1 predictions back to the original labels and assign the result in
# a single step. The previous `df_test['class'].replace(..., inplace=True)` is
# a chained in-place call on a column selection, which does not update df_test
# when pandas copy-on-write is active.
class_labels = {0: 'e', 1: 'p'}
df_test['class'] = pd.Series(test_predictions, index=df_test.index).map(class_labels)
df_test[["id","class"]].to_csv("n3_col.csv", index=False)

In [None]:
import pandas as pd

# Reload the raw files before building the n4 feature set.
df_test, df_train = map(pd.read_csv, ("test.csv", "train.csv"))

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Encode every training column; the code comment in the original cell notes
# that XGBoost needs the 'class' column as 0/1.
# Features shared with the test frame are fitted on the union of train and
# test values, so a category receives the same integer code in both frames.
# Fitting per frame (as before) lets the codes diverge between train and test.
encoded_in_test = set()
for col in df_train.columns:
    if col != 'id' and col in df_test.columns:
        le.fit(pd.concat([df_train[col], df_test[col]], ignore_index=True))
        df_test[col] = le.transform(df_test[col])
        encoded_in_test.add(col)
    else:
        le.fit(df_train[col])
    df_train[col] = le.transform(df_train[col])
    if col == 'class':
        # Show which original label became which integer.
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(le_name_mapping)

# Test-only columns are still encoded; test 'id' is kept for the submission.
for col in df_test.columns:
    if col != 'id' and col not in encoded_in_test:
        df_test[col] = le.fit_transform(df_test[col])

{'e': 0, 'p': 1}


In [None]:
#tirando as mesmas variaveis com muito null do conjunto n2
columns_to_remove = [
	'cap-surface', 'gill-attachment', 'gill-spacing',
	'stem-root', 'stem-surface', 'veil-type',
	'veil-color', 'ring-type', 'spore-print-color', 'id'
]
n4_x_test = df_test.drop(columns=columns_to_remove)
n4_y_train = df_train['class']
columns_to_remove.append('class')
n4_X_train = df_train.drop(columns=columns_to_remove)


**Modelo combinando as colunas que não têm muitos valores nulos com o XGBoost**

In [None]:
# Refit `model` on the n4 feature set. `model` is not created in this cell;
# it is whatever estimator an earlier cell left bound to that name
# (presumably the XGBClassifier from the n3 section -- confirm run order).
model.fit(n4_X_train, n4_y_train)

Parameters: { "use_label_encoder" } are not used.



Score Kaggle: 0.32130