In [1]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np

# sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the processed feature table produced by the earlier pipeline stage.
FEATURES_CSV = '../data/processed/features_for_model.csv'
dataset = pd.read_csv(FEATURES_CSV)

Selección de target y features

In [3]:
# The target is the 'HighSatisfaction' column; every other column is a feature.
y = dataset['HighSatisfaction']
X = dataset.drop(columns=['HighSatisfaction'])

Split de Train y Test

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffle
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=2025,
)

Configuramos y calculamos el standard scaler

In [5]:
# Fit the standardizer on the training split only, so the test split does not
# contribute to the statistics the scaler learns.
std_scaler = StandardScaler()
std_scaler.fit(X=X_train)

Guardamos el scaler configurado (con datos de train) como artefacto del modelo.

In [6]:
# Persist the scaler fitted on the training split as a model artifact.
scaler_path = Path('../artifacts/std_scaler.pkl')
# open() does not create missing directories, so make sure the target exists.
scaler_path.parent.mkdir(parents=True, exist_ok=True)
with scaler_path.open('wb') as f:
    pickle.dump(std_scaler, f)

Aplicamos el scaler a train y test para crear el modelo de predicción

In [7]:
# Apply the already-fitted scaler to both splits (train first, then test).
X_train_std, X_test_std = (
    std_scaler.transform(split) for split in (X_train, X_test)
)

Definir modelos e hiperparámetros

In [None]:
# Model 1: Random Forest, version 1.
# Hyperparameters tuned across versions: n_estimators, max_depth, random_state.
modelo_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=2025,
).fit(X_train_std, y_train)
y_preds_rf = modelo_rf.predict(X_test_std)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_preds_rf)

In [25]:
# Random Forest, version 2: more trees, depth capped at 10.
modelo_rf_2 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
).fit(X_train_std, y_train)
y_preds_rf_2 = modelo_rf_2.predict(X_test_std)
accuracy_rf_2 = accuracy_score(y_true=y_test, y_pred=y_preds_rf_2)

In [26]:
# Random Forest, version 3: fewer trees, depth capped at 5.
modelo_rf_3 = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    random_state=0,
).fit(X_train_std, y_train)
y_preds_rf_3 = modelo_rf_3.predict(X_test_std)
accuracy_rf_3 = accuracy_score(y_true=y_test, y_pred=y_preds_rf_3)

In [27]:
# Random Forest, version 4: 150 trees, depth capped at 15.
modelo_rf_4 = RandomForestClassifier(
    n_estimators=150,
    max_depth=15,
    random_state=1,
).fit(X_train_std, y_train)
y_preds_rf_4 = modelo_rf_4.predict(X_test_std)
accuracy_rf_4 = accuracy_score(y_true=y_test, y_pred=y_preds_rf_4)

In [9]:
# Model 2: Logistic Regression, version 1.
# Hyperparameters tuned across versions: C, solver, random_state.
modelo_rl = LogisticRegression(
    C=1.0,
    solver='liblinear',
    random_state=2025,
).fit(X_train_std, y_train)
y_preds_rl = modelo_rl.predict(X_test_std)
accuracy_rl = accuracy_score(y_true=y_test, y_pred=y_preds_rl)

In [28]:
# Logistic Regression, version 2: smaller C with the lbfgs solver and a
# raised iteration cap.
modelo_rl_2 = LogisticRegression(
    C=0.1,
    solver='lbfgs',
    random_state=42,
    max_iter=200,
).fit(X_train_std, y_train)
y_preds_rl_2 = modelo_rl_2.predict(X_test_std)
accuracy_rl_2 = accuracy_score(y_true=y_test, y_pred=y_preds_rl_2)


In [29]:
# Logistic Regression, version 3: larger C with the newton-cg solver.
modelo_rl_3 = LogisticRegression(
    C=10,
    solver='newton-cg',
    random_state=0,
).fit(X_train_std, y_train)
y_preds_rl_3 = modelo_rl_3.predict(X_test_std)
accuracy_rl_3 = accuracy_score(y_true=y_test, y_pred=y_preds_rl_3)

In [30]:
# Logistic Regression, version 4: saga solver with a 300-iteration cap.
modelo_rl_4 = LogisticRegression(
    C=0.5,
    solver='saga',
    random_state=2,
    max_iter=300,
).fit(X_train_std, y_train)
y_preds_rl_4 = modelo_rl_4.predict(X_test_std)
accuracy_rl_4 = accuracy_score(y_true=y_test, y_pred=y_preds_rl_4)

In [31]:
# Model 3: Support Vector Classifier, version 1.
# Hyperparameters tuned across versions: C, kernel, random_state.
# NOTE(review): scikit-learn documents SVC's random_state as ignored when
# probability=False (not enabled here) — confirm whether it can be dropped.
modelo_svc = SVC(
    C=1.0,
    kernel='rbf',
    random_state=2025,
).fit(X_train_std, y_train)
y_preds_svc = modelo_svc.predict(X_test_std)
accuracy_svc = accuracy_score(y_true=y_test, y_pred=y_preds_svc)

In [16]:
# SVC, version 2: linear kernel with a smaller C.
modelo_svc_2 = SVC(
    C=0.5,
    kernel='linear',
    random_state=42,
).fit(X_train_std, y_train)
y_preds_svc_2 = modelo_svc_2.predict(X_test_std)
accuracy_svc_2 = accuracy_score(y_true=y_test, y_pred=y_preds_svc_2)

In [17]:
# SVC, version 3: degree-3 polynomial kernel with a larger C.
modelo_svc_3 = SVC(
    C=2.0,
    kernel='poly',
    degree=3,
    random_state=0,
).fit(X_train_std, y_train)
y_preds_svc_3 = modelo_svc_3.predict(X_test_std)
accuracy_svc_3 = accuracy_score(y_true=y_test, y_pred=y_preds_svc_3)

In [18]:
# SVC, version 4: sigmoid kernel.
modelo_svc_4 = SVC(
    C=1.5,
    kernel='sigmoid',
    random_state=1,
).fit(X_train_std, y_train)
y_preds_svc_4 = modelo_svc_4.predict(X_test_std)
accuracy_svc_4 = accuracy_score(y_true=y_test, y_pred=y_preds_svc_4)

In [11]:
# Model 4: K-Nearest Neighbors, version 1.
# Hyperparameters tuned across versions: n_neighbors, weights.
modelo_knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
).fit(X_train_std, y_train)
y_preds_knn = modelo_knn.predict(X_test_std)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_preds_knn)

In [19]:
# KNN, version 2: 10 neighbors, distance-weighted votes.
modelo_knn_2 = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
).fit(X_train_std, y_train)
y_preds_knn_2 = modelo_knn_2.predict(X_test_std)
accuracy_knn_2 = accuracy_score(y_true=y_test, y_pred=y_preds_knn_2)

In [20]:
# KNN, version 3: 3 neighbors, uniform votes.
modelo_knn_3 = KNeighborsClassifier(
    n_neighbors=3,
    weights='uniform',
).fit(X_train_std, y_train)
y_preds_knn_3 = modelo_knn_3.predict(X_test_std)
accuracy_knn_3 = accuracy_score(y_true=y_test, y_pred=y_preds_knn_3)

In [21]:
# KNN, version 4: 7 neighbors, distance-weighted votes.
modelo_knn_4 = KNeighborsClassifier(
    n_neighbors=7,
    weights='distance',
).fit(X_train_std, y_train)
y_preds_knn_4 = modelo_knn_4.predict(X_test_std)
accuracy_knn_4 = accuracy_score(y_true=y_test, y_pred=y_preds_knn_4)

In [12]:
# Model 5: Decision Tree, version 1.
# Hyperparameters tuned across versions: max_depth, min_samples_split,
# random_state.
modelo_dt = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=2025,
).fit(X_train_std, y_train)
y_preds_dt = modelo_dt.predict(X_test_std)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_preds_dt)

In [22]:
# Decision Tree, version 2: depth capped at 10, at least 5 samples to split.
modelo_dt_2 = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    random_state=42,
).fit(X_train_std, y_train)
y_preds_dt_2 = modelo_dt_2.predict(X_test_std)
accuracy_dt_2 = accuracy_score(y_true=y_test, y_pred=y_preds_dt_2)


In [23]:
# Decision Tree, version 3: depth capped at 5, at least 10 samples to split.
modelo_dt_3 = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=10,
    random_state=0,
).fit(X_train_std, y_train)
y_preds_dt_3 = modelo_dt_3.predict(X_test_std)
accuracy_dt_3 = accuracy_score(y_true=y_test, y_pred=y_preds_dt_3)

In [24]:
# Decision Tree, version 4: depth capped at 15, at least 4 samples to split.
modelo_dt_4 = DecisionTreeClassifier(
    max_depth=15,
    min_samples_split=4,
    random_state=1,
).fit(X_train_std, y_train)
y_preds_dt_4 = modelo_dt_4.predict(X_test_std)
accuracy_dt_4 = accuracy_score(y_true=y_test, y_pred=y_preds_dt_4)

In [32]:
# Collect the test accuracy of every model variant for comparison.
# Each family lists its versions in order (v1..v4); the comprehension expands
# them into keys of the form '<Family>_v<N>', preserving family/version order.
accuracies_by_family = {
    'RandomForest': (accuracy_rf, accuracy_rf_2, accuracy_rf_3, accuracy_rf_4),
    'LogisticRegression': (accuracy_rl, accuracy_rl_2, accuracy_rl_3, accuracy_rl_4),
    'SVC': (accuracy_svc, accuracy_svc_2, accuracy_svc_3, accuracy_svc_4),
    'KNeighbors': (accuracy_knn, accuracy_knn_2, accuracy_knn_3, accuracy_knn_4),
    'DecisionTree': (accuracy_dt, accuracy_dt_2, accuracy_dt_3, accuracy_dt_4),
}
resultados = {
    f'{family}_v{version}': score
    for family, scores in accuracies_by_family.items()
    for version, score in enumerate(scores, start=1)
}

In [33]:
# Report each model's test accuracy, one per line, rounded to four decimals.
print("Resultados de precisión por modelo:")
for modelo, accuracy in resultados.items():
    print("{}: {:.4f}".format(modelo, accuracy))

Resultados de precisión por modelo:
RandomForest_v1: 0.9505
RandomForest_v2: 0.9615
RandomForest_v3: 0.8352
RandomForest_v4: 0.9615
LogisticRegression_v1: 0.7711
LogisticRegression_v2: 0.7692
LogisticRegression_v3: 0.7711
LogisticRegression_v4: 0.7711
SVC_v1: 0.8425
SVC_v2: 0.7637
SVC_v3: 0.8626
SVC_v4: 0.6795
KNeighbors_v1: 0.8077
KNeighbors_v2: 0.8663
KNeighbors_v3: 0.8498
KNeighbors_v4: 0.8590
DecisionTree_v1: 0.8956
DecisionTree_v2: 0.8773
DecisionTree_v3: 0.7381
DecisionTree_v4: 0.8846


Guardamos el modelo para producción

In [34]:
# Persist the model chosen for production: Random Forest v2, which reached the
# top test accuracy in the printed comparison (0.9615, tied with v4).
# NOTE(review): the file name says "v1" while the object saved is modelo_rf_2 —
# presumably "v1" is the artifact version; confirm before renaming, since
# downstream loaders may depend on this exact path.
# NOTE(review): the winner was selected on the same test split used to report
# accuracy, so that figure is likely optimistic; consider a validation split.
model_path = Path('../models/random_forest_v1.pkl')
# open() does not create missing directories, so make sure the target exists.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(modelo_rf_2, f)