In [None]:
# If using Colab
!pip install kds
!pip install statsmodels
!pip install ucimlrepo


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
from matplotlib import style
from matplotlib.legend_handler import HandlerBase
import seaborn as sb
import kds


In [None]:
from ucimlrepo import fetch_ucirepo

# NOTE(review): `fetch_ucirepo` is imported but not called in the visible cells;
# the data is read from a Colab-style absolute path instead — confirm the file
# is uploaded there before running.
df = pd.read_csv("/content/secondary_data.csv", sep=';')

# Peek at ten random rows to sanity-check the parse.
df.sample(10)


In [None]:
# Rename columns: swap hyphens for underscores in every column name.
columns_dict = {original: original.replace('-', '_') for original in df.columns}
df = df.rename(columns=columns_dict)
df.columns


In [None]:
# Print the column names, dtypes and non-null counts of the renamed frame.
df.info()


In [None]:
# Visualise where values are missing: one heatmap row per record, one column per feature.
missing_mask = df.isna()
sb.heatmap(missing_mask, cbar=True, yticklabels=False)


In [None]:
# Remove features that are not carried forward into the analysis.
# NOTE(review): presumably these are the columns the null heatmap shows as mostly empty — confirm.
discarded_columns = [
    'gill_spacing',
    'stem_surface',
    'stem_root',
    'spore_print_color',
    'veil_type',
    'veil_color',
]
df.drop(columns=discarded_columns, inplace=True)

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the observed labels only, so missing values never receive
# an integer code of their own (no hard-coded "NaN class" index to undo later).
cap_surface = df['cap_surface']
label_encoder.fit(cap_surface[cap_surface.notna()])

# label -> integer code lookup; a later cell inverts it to decode the column.
cap_surface_data = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))

# Mapping through the lookup leaves missing entries as NaN and assigns the
# result directly, avoiding a chained in-place replace on a column selection.
df['cap_surface_encoded'] = cap_surface.map(cap_surface_data)


In [None]:
# Proportional Imputation
from sklearn.utils import resample
def proportional_imputation(column, random_state=None):
    """Fill missing entries by sampling from the column's observed values.

    Replacement values are drawn with replacement from the non-missing
    entries, so each observed value is picked in proportion to how often it
    occurs in the column.

    Parameters
    ----------
    column : pandas.Series
        Series that may contain missing values. It is not modified.
    random_state : int, numpy Generator/RandomState or None, optional
        Seed forwarded to ``Series.sample`` for reproducible draws.

    Returns
    -------
    pandas.Series
        A copy of ``column`` (same index) with every missing entry filled.

    Raises
    ------
    ValueError
        If values are missing but there are no observed values to draw from.
    """
    missing = column.isnull()
    n_missing = int(missing.sum())

    # Work on a copy so the caller's Series keeps its NaNs.
    imputed = column.copy()
    if n_missing == 0:
        return imputed

    observed = column[~missing]
    if observed.empty:
        raise ValueError("cannot impute: column has no observed values to sample from")

    draws = observed.sample(n=n_missing, replace=True, random_state=random_state)
    # Assign the raw values: a Series would be aligned on its (observed-row)
    # index labels instead of being placed positionally into the missing rows.
    imputed[missing] = draws.to_numpy()
    return imputed

# Impute the encoded cap-surface codes and store the result as a new column.
encoded_cap_surface = df['cap_surface_encoded']
df['cap_surface_imputed'] = proportional_imputation(encoded_cap_surface)


In [None]:
# Distribution plots
def plot_cat(dataset, feature, set_color=None):
    """Draw a seaborn count plot of ``feature`` from ``dataset``.

    ``set_color`` is passed to ``countplot`` as ``color``. No ``ax`` is given,
    so seaborn picks the target axes itself. Nothing is returned.
    """
    sb.countplot(x=feature, data=dataset, color=set_color)

# Side-by-side count plots: encoded column (left) vs. imputed column (right).
plt.figure(figsize=(25, 5))
plt.suptitle('Effects of Imputation on Cap_surface')

panels = (
    ('cap_surface_encoded', None),
    ('cap_surface_imputed', 'lightseagreen'),
)
for position, (feature_name, panel_color) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plot_cat(df, feature_name, set_color=panel_color)

plt.show()


In [None]:
# Decode the imputed integer codes back to their category labels.
# The inverse lookup gets its own name instead of overwriting `cap_surface_data`,
# and `replace` leaves values that are not codes untouched, so running this
# cell more than once no longer flips the mapping and re-encodes the column.
# (This cell used to appear twice in a row, which did exactly that.)
cap_surface_labels = {code: label for label, code in cap_surface_data.items()}
df['cap_surface_imputed'] = df['cap_surface_imputed'].replace(cap_surface_labels)


In [None]:
# Integer-encode the target and the two yes/no style features.
# NOTE(review): `df_imputed` is not created in any cell visible here — confirm
# the cell that builds it runs before this one.
for source_column in ('class', 'does_bruise_or_bleed', 'has_ring'):
    # The shared encoder is refit per column, exactly as separate calls would do.
    df_imputed[f'{source_column}_encoded'] = label_encoder.fit_transform(df_imputed[source_column])


In [None]:
# Replace rare categories: collapse the listed levels of each feature into 'Other'.
# NOTE(review): `ring_type_imputed` is not created in the visible cells — confirm upstream.
levels_to_merge = {
    'habitat': ['m', 'h', 'p', 'w', 'u'],
    'stem_color': ['u', 'b', 'l', 'r', 'p', 'e', 'k', 'g', 'o', 'f'],
    'gill_color': ['u', 'b', 'r', 'g', 'e', 'o', 'k', 'f'],
    'cap_color': ['o', 'r', 'p', 'g', 'u', 'b', 'l', 'k'],
    'cap_shape': ['p', 'c', 'o'],
    'cap_surface_imputed': ['g', 'e', 'l', 'd', 'w', 'i', 'k'],
    'ring_type_imputed': ['g', 'p', 'e', 'l', 'm', 'r', 'z'],
}
for feature_name, rare_levels in levels_to_merge.items():
    df_imputed[feature_name] = df_imputed[feature_name].replace(rare_levels, 'Other')


In [None]:
from scipy.stats import zscore

# NOTE(review): `df_cleaned` is not created in any cell visible here — confirm upstream.
# Add a z-score column for each numeric measurement.
measurement_columns = ['cap_diameter', 'stem_height', 'stem_width']
for measurement in measurement_columns:
    df_cleaned[f'z_{measurement}'] = zscore(df_cleaned[measurement])

# A row is an outlier when any of its z-scores exceeds 2.5 in absolute value.
z_score_columns = [f'z_{measurement}' for measurement in measurement_columns]
exceeds_threshold = df_cleaned[z_score_columns].abs() > 2.5
outliers = df_cleaned[exceeds_threshold.any(axis=1)]


In [None]:
# Remove the flagged rows, renumber the index, then discard the helper z-score columns.
outlier_indices = outliers.index
df_cleaned = (
    df_cleaned
    .drop(outlier_indices)
    .reset_index(drop=True)
    .drop(columns=['z_cap_diameter', 'z_stem_height', 'z_stem_width'])
)


In [None]:
# One-hot encode the cleaned frame as 0/1 integers; the first level of each
# encoded feature is dropped.
df_dummies = pd.get_dummies(data=df_cleaned, drop_first=True, dtype=int)


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every predictor (all columns except the target).
X = df_dummies.drop(columns=["class_encoded"])
design_matrix = X.values
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, column_position)
        for column_position in range(X.shape[1])
    ],
})
print(vif_data)


In [None]:
# Candidate columns to remove; only those actually present in the frame are dropped.
columns_to_drop = [
    'cap_diameter',
    'stem_height',
    'stem_width',
    'cap_shape_x',
    'habitat_d',
    'ring_type_imputed_f',
]
existing_columns_to_drop = [name for name in columns_to_drop if name in df_dummies.columns]
df_dummies.drop(columns=existing_columns_to_drop, inplace=True)


In [None]:
import statsmodels.api as sm

# Fit a statsmodels logit of the encoded class on every remaining column
# (the design matrix is used as-is) and show the coefficient summary.
logit_target = df_dummies['class_encoded']
logit_predictors = df_dummies.drop(columns=['class_encoded'])
logit_model = sm.Logit(logit_target, logit_predictors)
result = logit_model.fit()
result.summary2()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Predictors are every column except the encoded target.
forest_predictors = df_dummies.drop(columns=['class_encoded'])

# A fixed seed (the same value the train/test split uses) keeps the importance
# ranking stable across re-runs; without it the forest differs on every run.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
forest.fit(forest_predictors, df_dummies['class_encoded'])

# One row per predictor with its impurity-based importance.
Importance_Table = pd.DataFrame({
    'Variables Independientes': list(forest_predictors.columns),
    'Importancia': list(forest.feature_importances_),
})

Variables_Importance = Importance_Table.sort_values('Importancia', ascending=False)
print(Variables_Importance)


In [None]:
# Target plus the hand-picked predictors kept for the final models.
# NOTE(review): the `gill_attachment_imputed_*` columns come from a feature that
# is not created in the visible cells — confirm upstream.
selected_columns = [
    'class_encoded',
    'stem_color_w', 'stem_color_y',
    'cap_shape_b', 'cap_shape_f',
    'gill_color_n', 'gill_color_w',
    'has_ring_encoded',
    'cap_color_e', 'cap_color_n',
    'does_bruise_or_bleed_encoded',
    'habitat_g', 'habitat_l',
    'gill_attachment_imputed_e', 'gill_attachment_imputed_p',
    'cap_surface_imputed_s', 'cap_surface_imputed_h',
    'season_w',
]
df_reg = df_dummies[selected_columns]


In [None]:
from sklearn.model_selection import train_test_split

# Separate predictors from the target, then hold out 30% of the rows for testing.
X = df_reg.drop(columns=['class_encoded'])
y = df_reg['class_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Fit a default logistic regression on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Class predictions for both splits, used by the metric cells below.
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)


In [None]:
def calc_metrics(y_train, y_pred_train, y_test, y_pred_test):
    """Print confusion matrix, accuracy, precision and recall for both splits.

    For each metric the train result is printed first, then the test result,
    each preceded by a one-line heading. Nothing is returned.
    """
    metric_table = (
        ('Matriz de confusión', confusion_matrix),
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
    )
    split_table = (
        ('Train', y_train, y_pred_train),
        ('Test', y_test, y_pred_test),
    )
    for heading, metric_fn in metric_table:
        for split_label, truth, predicted in split_table:
            print(f'{heading}: {split_label}')
            print(metric_fn(truth, predicted))

# Print train/test metrics for the predictions currently held in
# `y_pred_train` / `y_pred_test`.
calc_metrics(y_train, y_pred_train, y_test, y_pred_test)


In [None]:
# Matthews correlation coefficient on the test split.
# For binary labels, scikit-learn's confusion matrix flattens to
# (tn, fp, fn, tp), so the counts are unpacked in that order. They are cast to
# float so the four-way product below cannot overflow integer arithmetic.
TN, FP, FN, TP = (float(count) for count in confusion_matrix(y_test, y_pred_test).ravel())
d = np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
# A zero denominator means a row or column of the matrix is empty; report 0
# (the convention scikit-learn's matthews_corrcoef uses) instead of dividing by zero.
MCC = ((TP*TN)-(FP*FN))/d if d else 0.0


In [None]:
# Start the model scoreboard with the logistic-regression row.
# The frame is built directly from the first row so all three columns
# (including 'MCC') exist with proper dtypes, rather than concatenating onto an
# empty frame that declared only two of them.
f1score = f1_score(y_test, y_pred_test)
new_row = pd.DataFrame({'Modelo': ['LogisticReg'], 'F1Score': [f1score], 'MCC': [MCC]})
ind = new_row.copy()


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited Gini tree. The seed (same value the train/test split uses)
# makes the fitted tree, and therefore the metrics below, reproducible.
dt_cl = DecisionTreeClassifier(criterion='gini', max_depth=12, random_state=42)
dt_cl = dt_cl.fit(X_train, y_train)

# These overwrite the logistic-regression predictions; later cells read the
# tree's predictions from the same names.
y_pred_train = dt_cl.predict(X_train)
y_pred_test = dt_cl.predict(X_test)

calc_metrics(y_train, y_pred_train, y_test, y_pred_test)


In [None]:
# Matthews correlation coefficient for the decision tree on the test split.
# For binary labels, scikit-learn's confusion matrix flattens to
# (tn, fp, fn, tp), so the counts are unpacked in that order. They are cast to
# float so the four-way product below cannot overflow integer arithmetic.
TN, FP, FN, TP = (float(count) for count in confusion_matrix(y_test, y_pred_test).ravel())
d = np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
# A zero denominator means a row or column of the matrix is empty; report 0
# instead of dividing by zero.
MCC = ((TP*TN)-(FP*FN))/d if d else 0.0

# Append the tree's scores to the model scoreboard.
f1score = f1_score(y_test, y_pred_test)
new_row = pd.DataFrame({'Modelo': ['DecTreeClass'], 'F1Score': [f1score], 'MCC': [MCC]})
ind = pd.concat([ind, new_row], ignore_index=True)


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probability of the positive class (second column) for each model.
lr_prob = lr.predict_proba(X_test)[:, 1]
dt_prob = dt_cl.predict_proba(X_test)[:, 1]

# Constant-score baseline: every test row gets a score of 0.
r_prob = [0] * len(y_test)

# Area under the ROC curve for the baseline and both models.
r_auc, lr_auc, dt_auc = (
    roc_auc_score(y_test, scores) for scores in (r_prob, lr_prob, dt_prob)
)


In [None]:
# Report the three AUROC values to three decimals.
print(f'Random (chance) Prediction: AUROC = {r_auc:.3f}')
print(f'Logistic Regression: AUROC = {lr_auc:.3f}')
print(f'Decision Tree: AUROC = {dt_auc:.3f}')


In [None]:
# ROC curve points for the baseline and both models.
r_fpr, r_tpr, _ = roc_curve(y_test, r_prob)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_prob)
dt_fpr, dt_tpr, _ = roc_curve(y_test, dt_prob)

# Draw the three curves in the same order, each labelled with its AUROC.
roc_series = (
    (r_fpr, r_tpr, 'Random (AUROC = %.3f)' % r_auc),
    (lr_fpr, lr_tpr, 'Logistic (AUROC = %.3f)' % lr_auc),
    (dt_fpr, dt_tpr, 'Decision Tree (AUROC = %.3f)' % dt_auc),
)
for false_positive_rate, true_positive_rate, curve_label in roc_series:
    plt.plot(false_positive_rate, true_positive_rate, linestyle='--', label=curve_label)

plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
