In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

from src.modelado import dividir_variables
from src.modelado import dividir_datos
from src.modelado import pipeline_preprocesamiento
from src.modelado import evaluar_modelo
from src.modelado import entrenar_randomforest
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier



In [3]:
df = pd.read_csv("../data/tattoo_studio_clean.csv")
df.head()


Unnamed: 0,Age,Gender,Day_of_Week,Session_Type,Tattoo_Size,Tattoo_Style,Artist_Name,Session_Hours,Final_Rate,Customer_Satisfaction
0,46,Female,Friday,Pre-booked,Sleeve/Back,Minimalist,Artist B,9,47647,3
1,32,Male,Saturday,Walk-in,Small,Custom,Artist C,1,42498,2
2,25,Male,Friday,Pre-booked,Sleeve/Back,Custom,Artist A,10,19778,1
3,38,Female,Friday,Pre-booked,Medium,Script,Artist D,4,5478,4
4,36,Male,Wednesday,Pre-booked,Sleeve/Back,Tribal,Artist B,10,32471,1


In [4]:
X, y = dividir_variables(df, "Customer_Satisfaction")
X.head(), y.head()


(   Age  Gender Day_of_Week Session_Type  Tattoo_Size Tattoo_Style Artist_Name  \
 0   46  Female      Friday   Pre-booked  Sleeve/Back   Minimalist    Artist B   
 1   32    Male    Saturday      Walk-in        Small       Custom    Artist C   
 2   25    Male      Friday   Pre-booked  Sleeve/Back       Custom    Artist A   
 3   38  Female      Friday   Pre-booked       Medium       Script    Artist D   
 4   36    Male   Wednesday   Pre-booked  Sleeve/Back       Tribal    Artist B   
 
    Session_Hours  Final_Rate  
 0              9       47647  
 1              1       42498  
 2             10       19778  
 3              4        5478  
 4             10       32471  ,
 0    3
 1    2
 2    1
 3    4
 4    1
 Name: Customer_Satisfaction, dtype: int64)

In [5]:
X_train, X_test, y_train, y_test = dividir_datos(X, y)

In [6]:
X_train_num = pd.get_dummies(X_train, drop_first=True)
X_test_num = pd.get_dummies(X_test, drop_first=True)
X_test_num = X_test_num.reindex(columns=X_train_num.columns, fill_value=0)

In [7]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_num, y_train)

y_train_res.value_counts()

Customer_Satisfaction
5    351
1    351
4    351
2    351
3    351
Name: count, dtype: int64

In [8]:
preprocesamiento = pipeline_preprocesamiento(X)

In [9]:
modelo_rf = entrenar_randomforest(preprocesamiento, X_train, y_train)


In [10]:
acc, reporte, matriz = evaluar_modelo(modelo_rf, X_test, y_test)
print("Accuracy:", acc)
print(reporte)
print(matriz)

Accuracy: 0.195
              precision    recall  f1-score   support

           1       0.23      0.35      0.28        74
           2       0.24      0.27      0.26        91
           3       0.16      0.14      0.15        80
           4       0.16      0.12      0.13        86
           5       0.12      0.09      0.10        69

    accuracy                           0.20       400
   macro avg       0.18      0.19      0.18       400
weighted avg       0.18      0.20      0.19       400

[[26 18 12 13  5]
 [24 25 15 15 12]
 [17 24 11 15 13]
 [32 16 14 10 14]
 [16 20 16 11  6]]
