<a href="https://colab.research.google.com/github/michelDol/mineriaDatos/blob/main/Clasificacion_de_Tumores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Paso 1**
**Importar librerias**

In [None]:
!pip install catboost

In [108]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix


### **Paso 2**
**Importar base de datos y preparar datos**

In [111]:
from google.colab import files
uploaded = files.upload()

df = pd.read_excel("brain_tumor_dataset.xlsx")
print("Dimensiones (filas, columnas):", df.shape)

df.head()
df.info()


Saving brain_tumor_dataset.xlsx to brain_tumor_dataset.xlsx
Dimensiones (filas, columnas): (20000, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Patient_ID           20000 non-null  int64  
 1   Age                  20000 non-null  int64  
 2   Gender               20000 non-null  object 
 3   Tumor_Type           20000 non-null  object 
 4   Tumor_Size           20000 non-null  float64
 5   Location             20000 non-null  object 
 6   Histology            20000 non-null  object 
 7   Stage                20000 non-null  object 
 8   Symptom_1            20000 non-null  object 
 9   Symptom_2            20000 non-null  object 
 10  Symptom_3            20000 non-null  object 
 11  Radiation_Treatment  20000 non-null  object 
 12  Surgery_Performed    20000 non-null  object 
 13  Chemotherapy         20000 non-nu

In [112]:
df["Size_Age_Ratio"] = df["Tumor_Size"] / (df["Age"] + 1)
df["Aggressiveness"] = df["Tumor_Size"] * df["Tumor_Growth_Rate"]
df["Growth_Age"] = df["Tumor_Growth_Rate"] * df["Age"]
df["Inverse_Survival"] = 1 / (df["Survival_Rate"] + 0.0001)

In [113]:
y = df["MRI_Result"]
X = df.drop(columns=["MRI_Result"])

# Convert target variable to numerical labels
y = y.map({"Negative": 0, "Positive": 1})

In [114]:


X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.5,
    stratify=y,
    random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42
)
print(len(X_train), len(X_val), len(X_test))

10000 5000 5000


In [116]:
num_cols = X.select_dtypes(include=["int64","float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

### **Paso 3**
**Crear preprocesamiento**

In [117]:


numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

### **Paso 5**
**Modelos a evaluar**

In [120]:

models = {
    "LogisticRegression": LogisticRegression(max_iter=3000),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=200
    ),
    "ExtraTrees": ExtraTreesClassifier(
        random_state=42,
        n_estimators=300
    ),
    "GradientBoosting": GradientBoostingClassifier(
        random_state=42
    ),
    "AdaBoost": AdaBoostClassifier(
        random_state=42
    ),
    "SVM (RBF)": SVC(
        kernel="rbf",
        probability=True,
        random_state=42
    ),
    "kNN": KNeighborsClassifier(
        n_neighbors=5
    ),
    "GaussianNB": GaussianNB(),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(100,50),
        max_iter=1000,
        random_state=42
    ),
    "XGBoost": XGBClassifier(
      eval_metric="logloss", random_state=42),
    "LightGBM": LGBMClassifier(
        random_state=42
    ),
    "CatBoost": CatBoostClassifier(
        verbose=0,
        random_state=42
    )

}


In [121]:
results = []

for name, model in models.items():

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    y_val_pred = pipe.predict(X_val)

    acc = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred, average="weighted")

    results.append({
        "Model": name,
        "Validation Accuracy": acc,
        "Validation F1": f1
    })

results_df = pd.DataFrame(results).sort_values("Validation Accuracy", ascending=False)
results_df

[LightGBM] [Info] Number of positive: 5014, number of negative: 4986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2177
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501400 -> initscore=0.005600
[LightGBM] [Info] Start training from score 0.005600




Unnamed: 0,Model,Validation Accuracy,Validation F1
11,LightGBM,0.5102,0.510183
1,DecisionTree,0.503,0.502995
4,GradientBoosting,0.5026,0.501755
2,RandomForest,0.502,0.501971
5,AdaBoost,0.5018,0.481717
12,CatBoost,0.5014,0.501391
9,MLP,0.5002,0.499085
7,kNN,0.4986,0.498589
10,XGBoost,0.4986,0.498593
3,ExtraTrees,0.4944,0.494367


In [None]:
#scores = cross_val_score(pipe,X_train,y_train,cv=5,scoring="accuracy")
#scores = cross_val_score(pipe, X_train, y_train, cv=5)
#print(scores.mean())

### **Paso 6**
**Evaluación FINAL**

In [109]:
best_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LGBMClassifier(
        random_state=42
    ))
])

best_model.fit(X_train, y_train)

y_test_pred = best_model.predict(X_test)

cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(cm)

print("TEST ACCURACY:", accuracy_score(y_test, y_test_pred))
print("TEST F1:", f1_score(y_test, y_test_pred, average="weighted"))

[LightGBM] [Info] Number of positive: 5014, number of negative: 4986
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501400 -> initscore=0.005600
[LightGBM] [Info] Start training from score 0.005600
Confusion Matrix:
[[1233 1259]
 [1234 1274]]
TEST ACCURACY: 0.5014
TEST F1: 0.5013795560253471




In [15]:
df.head()
df.describe()
df["MRI_Result"].value_counts()

Unnamed: 0_level_0,count
MRI_Result,Unnamed: 1_level_1
Positive,10029
Negative,9971


Se dividió el dataset en tres subconjuntos estratificados (50% entrenamiento, 25% validación y 25% prueba), asegurando la proporción de clases en cada partición. Se implementó un pipeline de preprocesamiento que incluyó imputación de valores faltantes, estandarización de variables numéricas y codificación One-Hot de variables categóricas.

Tras comparar distintos algoritmos, el modelo con mejor desempeño fue ______, obteniendo una Accuracy de ___ en validación y ___ en prueba, lo cual indica una adecuada capacidad de generalización.