In [62]:
import pandas as pd
import zipfile
import os
import kagglehub
import plotly.express as px
from scipy.stats import chi2_contingency
from IPython.display import display
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression


In [2]:
# Download the "uciml/mushroom-classification" dataset through kagglehub and
# keep the returned local directory in `path`.
path = kagglehub.dataset_download("uciml/mushroom-classification")
print("Path to dataset files:", path)

# List the directory contents to confirm which files were downloaded.
files = os.listdir(path)
print("Files in dataset:", files)


Path to dataset files: /kaggle/input/mushroom-classification
Files in dataset: ['mushrooms.csv']


In [60]:
# Scratch check: shows the CSV path built from `path`; the value is only displayed, not stored.
f'{path}/mushrooms.csv'

'/kaggle/input/mushroom-classification/mushrooms.csv'

In [3]:
# Load mushrooms.csv from the downloaded dataset directory into `df`.
df = pd.read_csv(f'{path}/mushrooms.csv')

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
# Show the column names: the target `class` followed by the feature columns.
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [6]:
# Count the missing values in every column and show them as a one-column table.
nulls_df = df.isnull().sum().to_frame(name='cantidad_datos_nulos')
display(nulls_df)

Unnamed: 0,cantidad_datos_nulos
class,0
cap-shape,0
cap-surface,0
cap-color,0
bruises,0
odor,0
gill-attachment,0
gill-spacing,0
gill-size,0
gill-color,0


Observamos que no hay valores vacíos en ninguna de las columnas, por lo tanto, no es necesario realizar imputación de datos.

In [7]:
# Frequency of each target class, shaped as a two-column frame for plotting.
df_counts = (
    df['class']
    .value_counts()
    .reset_index()
    .set_axis(['clases', 'count'], axis=1)
)

# One bar per class, coloured by class, to check the class balance visually.
fig = px.bar(
    df_counts,
    x='clases',
    y='count',
    color='clases',
    title='Distribución de clases',
    color_discrete_sequence=['#FFA15A', '#19D3F3'],
)
fig.show()


Observamos que las clases de la variable objetivo están balanceadas, por lo tanto, no es necesario aplicar técnicas de balanceo.

In [8]:
# Count how often each cap-shape value occurs.
# The output columns are named explicitly ("cap-shape", "count") instead of
# relying on the names value_counts().reset_index() happens to produce, which
# differ between pandas versions; the plot below looks both columns up by name.
df_feature_1_distrib = (
    df["cap-shape"]
    .value_counts()
    .rename_axis("cap-shape")
    .reset_index(name="count")
)

# Bar chart of the cap-shape frequencies (last expression, so it is rendered).
px.bar(
    df_feature_1_distrib,
    x="cap-shape",
    y="count",
    color_discrete_sequence=['#FFA15A'],
    title='cap-shape conteo',
)


In [9]:
# Number of mushrooms for every (cap-color, class) combination.
df_cap_color_vs_target = (
    df.groupby(["cap-color", "class"])
    .size()
    .reset_index(name="count")
)
# Backward-compatible alias: this frame was originally (mis)named after
# cap-surface even though it is grouped by cap-color.
df_cap_surface_vs_target = df_cap_color_vs_target

# Colour the bars by class and place them side by side; without `color` the
# two classes of each cap-color are drawn in the same colour and the chart
# cannot show the relationship with the target that its title announces.
px.bar(
    df_cap_color_vs_target,
    x="cap-color",
    y="count",
    color="class",
    barmode="group",
    color_discrete_sequence=['#FFA15A', '#19D3F3'],
    title='cap-color vs clases',
)

In [10]:
# Chi-squared test of independence between every column and the target.
# Note: df.columns includes "class" itself, so the target is also tested
# against itself and shows up in the resulting table.
pvalues = [
    chi2_contingency(pd.crosstab(df[column], df["class"]))[1]  # element 1 is the p-value
    for column in df.columns
]
dx = {'variable': list(df.columns), 'pvalor': pvalues}

# Rank the variables by ascending p-value; reset_index(drop=False) keeps each
# variable's original position as an "index" column.
df_chi2 = (
    pd.DataFrame(dx)
    .sort_values(by='pvalor')
    .reset_index(drop=False)
)
display(df_chi2)

Unnamed: 0,index,variable,pvalor
0,0,class,0.0
1,4,bruises,0.0
2,5,odor,0.0
3,14,stalk-color-above-ring,0.0
4,12,stalk-surface-above-ring,0.0
5,13,stalk-surface-below-ring,0.0
6,9,gill-color,0.0
7,8,gill-size,0.0
8,15,stalk-color-below-ring,0.0
9,21,population,0.0


In [11]:

alpha = 0.05  # significance level

# Flag with 1 the variables whose p-value is below the significance level, 0 otherwise.
df_chi2["aceptacion"] = (df_chi2["pvalor"] < alpha).astype(int)

# Names of the variables that passed the test.
list_of_accepted_vars = df_chi2.loc[df_chi2["aceptacion"] == 1, "variable"]
display(list_of_accepted_vars)

Unnamed: 0,variable
0,class
1,bruises
2,odor
3,stalk-color-above-ring
4,stalk-surface-above-ring
5,stalk-surface-below-ring
6,gill-color
7,gill-size
8,stalk-color-below-ring
9,population


In [12]:
# Keep only the columns accepted by the chi-squared filter and preview the result.
df_with_selected_vars = df.loc[:, list_of_accepted_vars.tolist()]
df_with_selected_vars.head()

Unnamed: 0,class,bruises,odor,stalk-color-above-ring,stalk-surface-above-ring,stalk-surface-below-ring,gill-color,gill-size,stalk-color-below-ring,population,...,spore-print-color,stalk-root,gill-spacing,cap-shape,ring-number,cap-color,cap-surface,veil-color,gill-attachment,stalk-shape
0,p,t,p,w,s,s,k,n,w,s,...,k,e,c,x,o,n,s,w,f,e
1,e,t,a,w,s,s,k,b,w,n,...,n,c,c,x,o,y,s,w,f,e
2,e,t,l,w,s,s,n,b,w,n,...,n,c,c,b,o,w,s,w,f,e
3,p,t,p,w,s,s,n,n,w,s,...,k,e,c,x,o,w,y,w,f,e
4,e,f,n,w,s,s,k,b,w,a,...,n,e,w,x,o,g,s,w,f,t


Ahora es necesario realizar una transformación sobre los datos, ya que se requiere convertir las variables de tipo cadena a un formato adecuado para entrenar correctamente los modelos de clasificación.

In [41]:
# Separate the target column from the feature columns.
y = df_with_selected_vars["class"]
X = df_with_selected_vars.drop(columns="class")

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [44]:
# One-hot encode the categorical features. The encoder is fitted on the
# training split only and then applied to both splits.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(X_train)
encoded_feature_names = encoder.get_feature_names_out(X_train.columns)

X_train_encoded = encoder.transform(X_train)
X_test_encoded = encoder.transform(X_test)

# Labelled DataFrame versions of the encoded arrays, keeping the original row index.
X_train_encoded_df = pd.DataFrame(
    X_train_encoded, columns=encoded_feature_names, index=X_train.index
)
X_test_encoded_df = pd.DataFrame(
    X_test_encoded, columns=encoded_feature_names, index=X_test.index
)

# Encode the target labels as integers; the mapping is learned from the training labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [48]:
# Candidate classifiers: an SVM with default settings and a 100-tree random forest.
svm_model = SVC()
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold cross-validation on the training split (SVM first, then random forest).
svm_scores, rf_scores = (
    cross_val_score(model, X_train_encoded, y_train_encoded, cv=5)
    for model in (svm_model, rf_model)
)

print("SVM accuracy promedio:", svm_scores.mean())
print("Random Forest accuracy promedio:", rf_scores.mean())

SVM accuracy promedio: 1.0
Random Forest accuracy promedio: 1.0


In [52]:
# Train both classifiers on the whole training split; fit() updates each
# estimator in place.
for estimator in (rf_model, svm_model):
    estimator.fit(X_train_encoded, y_train_encoded)

# NOTE(review): "smv_model" looks like a typo of "svm_model"; the name is kept
# because the prediction cell reads it. It refers to the same fitted estimator.
smv_model = svm_model
display(rf_model, svm_model)

In [53]:
# Predict the held-out test split with both fitted models.
y_pred_rf = rf_model.predict(X_test_encoded)
# Use svm_model directly instead of the misspelled alias "smv_model": the
# alias is the very same estimator (it is bound to the result of
# svm_model.fit(...)), so the predictions are unchanged and this cell no
# longer depends on a typo'd name.
y_pred_svm = svm_model.predict(X_test_encoded)

In [54]:
# Per-class precision / recall / f1 on the test split, returned as nested
# dicts (output_dict=True) so they can be turned into DataFrames.
classification_report_rf = classification_report(
    y_true=y_test_encoded,
    y_pred=y_pred_rf,
    output_dict=True,
)
classification_report_svm = classification_report(
    y_true=y_test_encoded,
    y_pred=y_pred_svm,
    output_dict=True,
)

In [55]:
# Show precision, recall and f1-score for the two encoded classes ('0', '1')
# of the random forest. The metric rows are selected by label rather than by
# position so the table does not depend on the row order of the report frame.
display(
    pd.DataFrame(classification_report_rf).loc[
        ['precision', 'recall', 'f1-score'], ['0', '1']
    ]
)

Unnamed: 0,0,1
precision,1.0,1.0
recall,1.0,1.0
f1-score,1.0,1.0


In [56]:
# Show precision, recall and f1-score for the two encoded classes ('0', '1')
# of the SVM. The metric rows are selected by label rather than by position so
# the table does not depend on the row order of the report frame.
display(
    pd.DataFrame(classification_report_svm).loc[
        ['precision', 'recall', 'f1-score'], ['0', '1']
    ]
)

Unnamed: 0,0,1
precision,1.0,1.0
recall,1.0,1.0
f1-score,1.0,1.0


In [58]:
# Para Random Forest
cm_rf = confusion_matrix(y_test_encoded, y_pred_rf)
cm_rf_df = pd.DataFrame(cm_rf, index=['Real 0', 'Real 1'], columns=['Predicha 0', 'Predicha 1'])

# Convertir a formato largo
cm_rf_long = cm_rf_df.reset_index().melt(id_vars='index')
cm_rf_long.columns = ['Real', 'Predicha', 'Valor']



In [59]:
# Heatmap of the random-forest confusion matrix with the counts printed in each cell.
fig = px.imshow(
    cm_rf,
    x=['Predicha 0', 'Predicha 1'],
    y=['Real 0', 'Real 1'],
    labels=dict(x="Predicción", y="Real", color="Cantidad"),
    color_continuous_scale="Blues",
    text_auto=True,
).update_layout(title="Matriz de Confusión - Random Forest")
fig.show()


In [38]:
# SVM: confusion matrix on the test split.
# Compare against y_test_encoded (integer labels), not y_test: y_test holds the
# original string labels while y_pred_svm holds the encoded integers, so the
# two cannot be matched label-for-label. This mirrors the random-forest cell.
cm_svm = confusion_matrix(y_test_encoded, y_pred_svm)

# Heatmap of the SVM confusion matrix with the counts printed in each cell.
fig = px.imshow(
    cm_svm,
    labels=dict(x="Predicción", y="Real", color="Cantidad"),
    x=['Predicha 0', 'Predicha 1'],
    y=['Real 0', 'Real 1'],
    text_auto=True,
    color_continuous_scale="Greens"
)
fig.update_layout(title="Matriz de Confusión - SVM")
fig.show()


Los datos disponibles parecen ser insuficientes para realizar una predicción confiable sobre la variable class. Aunque los resultados iniciales puedan parecer muy prometedores —con métricas como precisión, recall y f1-score todas iguales a 1.0—, este rendimiento perfecto no refleja necesariamente una capacidad real del modelo para generalizar a nuevos datos. De hecho, podría ser un indicio de sobreajuste (overfitting). Este fenómeno ocurre cuando el modelo no solo aprende los patrones reales del conjunto de entrenamiento, sino también el ruido y las particularidades específicas de esos datos. En este caso, tanto el modelo de Random Forest como el SVM obtuvieron resultados perfectos en las métricas de evaluación y mostraron matrices de confusión sin errores, lo que significa que no cometieron ninguna clasificación incorrecta. Sin embargo, esto es altamente inusual en aplicaciones del mundo real y sugiere que el modelo podría haber memorizado los datos, en lugar de aprender patrones generalizables. No obstante, como las métricas perfectas se obtuvieron tanto en la validación cruzada como en el conjunto de prueba (datos no vistos durante el entrenamiento), el sobreajuste por sí solo no basta para explicarlas: también es posible que algunas variables separen las clases casi por completo, por lo que conviene verificarlo antes de sacar una conclusión definitiva. Este tipo de desempeño irreal suele deberse a un conjunto de datos muy pequeño, muy limpio, o a una posible filtración de información entre el entrenamiento y la prueba.