# **Parte 2: Preprocesamiento de Datos**

In [66]:
# Importamos las librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [45]:
# Cargamos el dataset y convertimos las columnas de tipo object a category

path = ('../data/retail_sales_dataset.csv')
df = pd.read_csv(path)

# Añadimos una nueva coluna de clasificacion
def clasificar_ventas(amount):
    if amount > 150:
        return 'Alta'
    elif amount > 50:
        return 'Media'
    else:
        return 'Baja'

df['Clasificación'] = df['Total Amount'].apply(clasificar_ventas)

In [46]:
# Convertir los tipos de datos
df['Date'] = pd.to_datetime(df['Date'])  
df['Customer ID'] = df['Customer ID'].astype('string') 
df['Gender'] = df['Gender'].astype('category')  
df['Product Category'] = df['Product Category'].astype('category')
df['Clasificación'] = df['Clasificación'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Transaction ID    1000 non-null   int64         
 1   Date              1000 non-null   datetime64[ns]
 2   Customer ID       1000 non-null   string        
 3   Gender            1000 non-null   category      
 4   Age               1000 non-null   int64         
 5   Product Category  1000 non-null   category      
 6   Quantity          1000 non-null   int64         
 7   Price per Unit    1000 non-null   int64         
 8   Total Amount      1000 non-null   int64         
 9   Clasificación     1000 non-null   category      
dtypes: category(3), datetime64[ns](1), int64(5), string(1)
memory usage: 58.1 KB


In [47]:
# Preprocesamiento de los datos 

numerical_transformer = Pipeline(steps=[ 
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing  categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Handle unseen categories
])

In [50]:
# Preprocesamiento de los datos, usando ColumnTransformer

numerical_cols = ['Age', 'Quantity', 'Price per Unit', 'Total Amount']
categorical_cols = ['Gender', 'Product Category', 'Clasificación']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [51]:
# Aplicamos el preprocesamiento a los datos con el Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

In [52]:
# Aplicamos el preprocesamiento a los datos
X_transformed = pipeline.fit_transform(df)
X_transformed

array([[-0.54056476,  0.42926498, -0.68512265, ...,  0.        ,
         0.        ,  1.        ],
       [-1.12559156, -0.45399629,  1.68846418, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.62948884, -1.33725757, -0.7906154 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.34497661,  1.31252625, -0.81698859, ...,  0.        ,
         0.        ,  1.        ],
       [-0.39430806,  0.42926498, -0.68512265, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.41010379,  1.31252625, -0.7906154 , ...,  0.        ,
         0.        ,  1.        ]])

# **Parte 3: Benchmarking de Técnicas de Machine Learning**

**Selección de Modelos:**

Entrenar y evaluar múltiples modelos de machine learning (por ejemplo, Regresión Lineal, KNN, Árbol de Decisión, Random Forest, XGBoost y LGBM).
Utilizar validación cruzada para evaluar el rendimiento de los modelos.

## **Parte 4: Análisis de Métricas**

**Informe de Clasificación:**

* Generar un informe de clasificación para los modelos evaluados.

* Incluir la matriz de confusión para una comprensión detallada de los errores de clasificación.

* Curva ROC y AUC:*

Crear y visualizar la curva ROC para los modelos de clasificación binaria.
Calcular el AUC para evaluar la capacidad del modelo para distinguir entre clases.

In [53]:
# Dividimos los datos en X y y
y = df['Clasificación']

# Dividimos los datos en train y test
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [59]:
# Modelos de clasificación
models = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': GradientBoostingClassifier(),
    'LightGBM': LGBMClassifier()
}

In [61]:
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')  # Adjust scoring metric as needed
    print(f"Model: {name}")
    print(f"Mean Accuracy: {scores.mean():.2f}")
    print(f"Standard Deviation: {scores.std():.2f}")

Model: KNN
Mean Accuracy: 1.00
Standard Deviation: 0.00
Model: Decision Tree
Mean Accuracy: 1.00
Standard Deviation: 0.00
Model: Random Forest
Mean Accuracy: 1.00
Standard Deviation: 0.00
Model: XGBoost
Mean Accuracy: 1.00
Standard Deviation: 0.00
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 12
[LightGBM] [Info] Start training from score -0.808979
[LightGBM] [Info] Start training from score -1.471036
[LightGBM] [Info] Start training from score -1.123930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 12
[LightGBM] [Info] St

**Comparación de Modelos:**

* Comparar los modelos utilizando métricas de rendimiento como exactitud, precisión, recall, F1-Score y ROC-AUC.

* Seleccionar el mejor modelo basado en las métricas obtenidas.

In [None]:
# Función para evaluar y visualizar el modelo

def evaluate_and_visualize_model(model, name, X_train, y_train, X_test, y_test):


    
    model.fit(X_train, y_train)

    
    y_pred = model.predict(X_test)

    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")
    print()

    
    conf_matrix = confusion_matrix(y_test, y_pred)

    
    print("Confusion Matrix:\n", conf_matrix)

    if len(np.unique(y_test)) == 2:
    

        plt.figure()  # Create a new figure for each model
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive 1  Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic for {}'.format(name))
        plt.legend(loc="lower right")
        plt.show()
# Manejo de multiclass
    else:
    # Add code for multiclass ROC curve/AUC or other visualizations
        pass

# Evaluar y visualizar los modelos
for name, model in models.items():
    evaluate_and_visualize_model(model, name, X_train, y_train, X_test, y_test)

Model: KNN
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00

Confusion Matrix:
 [[101   0   0]
 [  0  34   0]
 [  0   0  65]]
Model: Decision Tree
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00

Confusion Matrix:
 [[101   0   0]
 [  0  34   0]
 [  0   0  65]]
Model: Random Forest
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00

Confusion Matrix:
 [[101   0   0]
 [  0  34   0]
 [  0   0  65]]
Model: XGBoost
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00

Confusion Matrix:
 [[101   0   0]
 [  0  34   0]
 [  0   0  65]]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 12
[LightGBM] [Info] Start training from score -0.806876
[LightGBM] [Info] Start training from score -1.475126
[LightGBM] [Info] Start training from s

In [None]:
# Entrenar y evaluar los modelos
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')  # Adjust average as needed
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")
    print()
        
# Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # ROC Curve and AUC
    num_classes = len(np.unique(y_test))

    if len(np.unique(y_test)) == 2:
    # ... (calculate fpr, tpr, roc_auc)

plt.figure()  # Create a new figure for each model
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive 1  Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic for {}'.format(name))
plt.legend(loc="lower right")
plt.show()

else:  # Multiclass classification
        # Handle multiclass ROC curves appropriately (e.g., one-vs-rest)
        # You can use techniques like micro-averaging or macro-averaging
        # or plot individual ROC curves for each class.
        print("ROC curve visualization for multiclass is not straightforward.")
        print("Consider using micro-averaging, macro-averaging, or individual curves.")

IndentationError: expected an indented block after 'if' statement on line 29 (681601611.py, line 32)