In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [81]:
dataset = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-visualization/final_dataset.csv')
dataset1 = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-cleaning/final_dataset_first_cleaning.csv')
dataset2= pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-cleaning/final_dataset_cleaned_kbest.csv')
dataset3 = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-cleaning/final_dataset_cleaned_random_forest_feature_selector.csv')

dataset4 = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-cleaning/final_dataset_first_cleaning_denormalized.csv')
dataset5= pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-cleaning/final_dataset_cleaned_kbest_denormalized.csv')
dataset6 = pd.read_csv('/Users/mmarchetta/Desktop/Tesis-2024/data-cleaning/final_dataset_cleaned_random_forest_feature_selector_denormalized.csv')

dataset.drop(['Open_time'], axis=1, inplace=True)
dataset.drop(['Sentimiento'], axis=1, inplace=True)
dataset.drop(['Sentimiento_coin'], axis=1, inplace=True)
dataset.drop(['Sentimiento_referentes'], axis=1, inplace=True)

dataset1.drop(['Open_time'], axis=1, inplace=True)
dataset2.drop(['Open_time'], axis=1, inplace=True)
dataset3.drop(['Open_time'], axis=1, inplace=True)
dataset4.drop(['Open_time'], axis=1, inplace=True)
dataset5.drop(['Open_time'], axis=1, inplace=True)
dataset6.drop(['Open_time'], axis=1, inplace=True)


In [82]:
def basic_logistic_regression(dataset):
    # Dividir los datos en conjunto de entrenamiento y conjunto de prueba
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=["Tendencia"]), 
                                                        dataset["Tendencia"], 
                                                        test_size=0.2, 
                                                        random_state=42)

    # Entrenar un modelo final de Regresión Logística utilizando las características seleccionadas
    final_model = LogisticRegression()
    final_model.fit(X_train, y_train)

    # Evaluar el modelo final
    accuracy = final_model.score(X_test, y_test)
    
    y_pred = final_model.predict(X_test)
    
    # Calcular el F1-score
    f1score = f1_score(y_test, y_pred, average='weighted')
    
    roc_auc = roc_auc_score(y_test, final_model.predict_proba(X_test), multi_class='ovr')

    return accuracy, f1score, roc_auc, final_model


In [83]:
def basic_gradient_boosting(dataset):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=["Tendencia"]), 
                                                        dataset["Tendencia"], 
                                                        test_size=0.2, 
                                                        random_state=42)

    # Crear el clasificador GradientBoostingClassifier
    gb_model = GradientBoostingClassifier(
        n_estimators=1000,  # Usar 1000 estimadores
        learning_rate=0.1,  # Tasa de aprendizaje
        max_depth=5,  # Profundidad máxima de cada árbol
        min_samples_split=2,  # Número mínimo de muestras requeridas para dividir un nodo interno
        min_samples_leaf=1,  # Número mínimo de muestras requeridas para estar en una hoja
        subsample=0.8,  # Fracción de muestras a utilizar para ajustar los estimadores base
        max_features='sqrt',  # Número máximo de características a considerar al dividir nodos: raíz cuadrada del número de características
        random_state=42
    )

    # Entrenar el modelo
    gb_model.fit(X_train, y_train)

    # Evaluar el modelo
    accuracy = gb_model.score(X_test, y_test)
    
    y_pred = gb_model.predict(X_test)
    
    # Calcular el F1-score
    f1score = f1_score(y_test, y_pred, average='weighted')
    
    roc_auc = roc_auc_score(y_test, gb_model.predict_proba(X_test), multi_class='ovr')

    return accuracy, f1score, roc_auc, gb_model


In [84]:
def basic_svm(dataset):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=["Tendencia"]), 
                                                        dataset["Tendencia"], 
                                                        test_size=0.2, 
                                                        random_state=42)

    # Codificar las etiquetas de destino numéricamente
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    svc_model = SVC(
        kernel='rbf',  # Kernel radial
        C=10.0,  # Parámetro de regularización
        gamma='scale',  # Coeficiente de kernel para 'rbf'
        probability=True,  # Habilitar el cálculo de probabilidades
        random_state=42
    )

    # Entrenar el modelo
    svc_model.fit(X_train, y_train_encoded)

    # Predecir las probabilidades de clase
    y_prob = svc_model.predict_proba(X_test)

    # Calcular el F1-score
    f1score = f1_score(y_test_encoded, y_prob.argmax(axis=1), average='weighted')

    # Calcular el ROC AUC
    roc_auc = roc_auc_score(y_test_encoded, y_prob, multi_class='ovr')

    # Calcular la exactitud
    accuracy = svc_model.score(X_test, y_test_encoded)

    return accuracy, f1score, roc_auc, svc_model

In [85]:
def basic_MLP(dataset):
    X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=["Tendencia"]), 
                                                        dataset["Tendencia"], 
                                                        test_size=0.2, 
                                                        random_state=42)

    mlp_model = MLPClassifier(
        hidden_layer_sizes=(100, 50),  # Dos capas ocultas con 100 y 50 neuronas respectivamente
        activation='relu',  # Función de activación ReLU
        solver='adam',  # Optimizador Adam
        alpha=0.0001,  # Tasa de regularización L2
        learning_rate='adaptive',  # Tasa de aprendizaje adaptativa
        max_iter=1000,  # Número máximo de iteraciones
        random_state=42
    )

    # Entrenar el modelo
    mlp_model.fit(X_train, y_train)

    # Evaluar el modelo
    accuracy = mlp_model.score(X_test, y_test)
    
    y_pred = mlp_model.predict(X_test)
    
    # Calcular el F1-score
    f1score = f1_score(y_test, y_pred, average='weighted')

    roc_auc = roc_auc_score(y_test, mlp_model.predict_proba(X_test), multi_class='ovr')

    return accuracy, f1score, roc_auc, mlp_model

In [86]:
def predict_constant_class(y_true, constant_class):
    y_pred_constant = [constant_class] * len(y_true)
    
    # Calcular el accuracy entre las etiquetas verdaderas y las predicciones constantes
    accuracy = accuracy_score(y_true, y_pred_constant)
    
    return accuracy

In [87]:
performance_rl_0, f1_score_rl_0, roc_auc_rl_0, modelo_rl_0 = basic_logistic_regression(dataset)
performance_rl_1, f1_score_rl_1, roc_auc_rl_1, modelo_rl_1 = basic_logistic_regression(dataset1)
performance_rl_2, f1_score_rl_2, roc_auc_rl_2, modelo_rl_2 = basic_logistic_regression(dataset2)
performance_rl_3, f1_score_rl_3, roc_auc_rl_3, modelo_rl_3 = basic_logistic_regression(dataset3)
performance_rl_4, f1_score_rl_4, roc_auc_rl_4, modelo_rl_4 = basic_logistic_regression(dataset4)
performance_rl_5, f1_score_rl_5, roc_auc_rl_5, modelo_rl_5 = basic_logistic_regression(dataset5)
performance_rl_6, f1_score_rl_6, roc_auc_rl_6, modelo_rl_6 = basic_logistic_regression(dataset6)

# Imprimir las métricas
print("Métricas de los modelos:")
print(f"Regresión Logística 0: Accuracy={performance_rl_0}, F1-Score={f1_score_rl_0}, ROC AUC={roc_auc_rl_0}")
print(f"Regresión Logística 1: Accuracy={performance_rl_1}, F1-Score={f1_score_rl_1}, ROC AUC={roc_auc_rl_1}")
print(f"Regresión Logística 2: Accuracy={performance_rl_2}, F1-Score={f1_score_rl_2}, ROC AUC={roc_auc_rl_2}")
print(f"Regresión Logística 3: Accuracy={performance_rl_3}, F1-Score={f1_score_rl_3}, ROC AUC={roc_auc_rl_3}")
print(f"Regresión Logística 4: Accuracy={performance_rl_4}, F1-Score={f1_score_rl_4}, ROC AUC={roc_auc_rl_4}")
print(f"Regresión Logística 5: Accuracy={performance_rl_5}, F1-Score={f1_score_rl_5}, ROC AUC={roc_auc_rl_5}")
print(f"Regresión Logística 6: Accuracy={performance_rl_6}, F1-Score={f1_score_rl_6}, ROC AUC={roc_auc_rl_6}")
print("-----------------------------------")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Métricas de los modelos:
Regresión Logística 0: Accuracy=0.38285714285714284, F1-Score=0.3530942256511468, ROC AUC=0.7397754576260613
Regresión Logística 1: Accuracy=0.36, F1-Score=0.3332597575313636, ROC AUC=0.6840298250441812
Regresión Logística 2: Accuracy=0.36, F1-Score=0.3212197185530519, ROC AUC=0.6993069177700555
Regresión Logística 3: Accuracy=0.3942857142857143, F1-Score=0.3724750108856069, ROC AUC=0.7190091394129754
Regresión Logística 4: Accuracy=0.3485714285714286, F1-Score=0.2976393104372038, ROC AUC=0.7406022717118406
Regresión Logística 5: Accuracy=0.34285714285714286, F1-Score=0.2773573393178281, ROC AUC=0.7373644647108922
Regresión Logística 6: Accuracy=0.3657142857142857, F1-Score=0.2994342241401065, ROC AUC=0.7326529073789165
-----------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [88]:
performance_gb_0, f1_score_gb_0, roc_auc_gb_0, modelo_gb_0 = basic_gradient_boosting(dataset)
performance_gb_1, f1_score_gb_1, roc_auc_gb_1, modelo_gb_1 = basic_gradient_boosting(dataset1)
performance_gb_2, f1_score_gb_2, roc_auc_gb_2, modelo_gb_2 = basic_gradient_boosting(dataset2)
performance_gb_3, f1_score_gb_3, roc_auc_gb_3, modelo_gb_3 = basic_gradient_boosting(dataset3)
performance_gb_4, f1_score_gb_4, roc_auc_gb_4, modelo_gb_4 = basic_gradient_boosting(dataset4)
performance_gb_5, f1_score_gb_5, roc_auc_gb_5, modelo_gb_5 = basic_gradient_boosting(dataset5)
performance_gb_6, f1_score_gb_6, roc_auc_gb_6, modelo_gb_6 = basic_gradient_boosting(dataset6)

# Imprimir las métricas
print(f"Gradient Boosting 0: Accuracy={performance_gb_0}, F1-Score={f1_score_gb_0}, ROC AUC={roc_auc_gb_0}")
print(f"Gradient Boosting 1: Accuracy={performance_gb_1}, F1-Score={f1_score_gb_1}, ROC AUC={roc_auc_gb_1}")
print(f"Gradient Boosting 2: Accuracy={performance_gb_2}, F1-Score={f1_score_gb_2}, ROC AUC={roc_auc_gb_2}")
print(f"Gradient Boosting 3: Accuracy={performance_gb_3}, F1-Score={f1_score_gb_3}, ROC AUC={roc_auc_gb_3}")
print(f"Gradient Boosting 4: Accuracy={performance_gb_4}, F1-Score={f1_score_gb_4}, ROC AUC={roc_auc_gb_4}")
print(f"Gradient Boosting 5: Accuracy={performance_gb_5}, F1-Score={f1_score_gb_5}, ROC AUC={roc_auc_gb_5}")
print(f"Gradient Boosting 6: Accuracy={performance_gb_6}, F1-Score={f1_score_gb_6}, ROC AUC={roc_auc_gb_6}")
print("-----------------------------------")

Gradient Boosting 0: Accuracy=0.5142857142857142, F1-Score=0.5065362726321937, ROC AUC=0.7512839611085134
Gradient Boosting 1: Accuracy=0.49714285714285716, F1-Score=0.4904066605133962, ROC AUC=0.752054750920345
Gradient Boosting 2: Accuracy=0.49142857142857144, F1-Score=0.4850891596808112, ROC AUC=0.7520520731623925
Gradient Boosting 3: Accuracy=0.4685714285714286, F1-Score=0.4596080636139763, ROC AUC=0.7467436340979873
Gradient Boosting 4: Accuracy=0.5085714285714286, F1-Score=0.5038132515442216, ROC AUC=0.7484625842303271
Gradient Boosting 5: Accuracy=0.49714285714285716, F1-Score=0.49236116754463677, ROC AUC=0.7460065574786309
Gradient Boosting 6: Accuracy=0.4742857142857143, F1-Score=0.46712770673486786, ROC AUC=0.7510182084940417
-----------------------------------


In [89]:
performance_svm_0, f1_score_svm_0, roc_auc_svm_0, modelo_svm_0 = basic_svm(dataset)
performance_svm_1, f1_score_svm_1, roc_auc_svm_1, modelo_svm_1 = basic_svm(dataset1)
performance_svm_2, f1_score_svm_2, roc_auc_svm_2, modelo_svm_2 = basic_svm(dataset2)
performance_svm_3, f1_score_svm_3, roc_auc_svm_3, modelo_svm_3 = basic_svm(dataset3)
performance_svm_4, f1_score_svm_4, roc_auc_svm_4, modelo_svm_4 = basic_svm(dataset4)
performance_svm_5, f1_score_svm_5, roc_auc_svm_5, modelo_svm_5 = basic_svm(dataset5)
performance_svm_6, f1_score_svm_6, roc_auc_svm_6, modelo_svm_6 = basic_svm(dataset6)


# Imprimir las métricas
print("Métricas de los modelos de SVM:")
print(f"SVM 0: Accuracy={performance_svm_0}, F1-Score={f1_score_svm_0}, ROC AUC={roc_auc_svm_0}")
print(f"SVM 1: Accuracy={performance_svm_1}, F1-Score={f1_score_svm_1}, ROC AUC={roc_auc_svm_1}")
print(f"SVM 2: Accuracy={performance_svm_2}, F1-Score={f1_score_svm_2}, ROC AUC={roc_auc_svm_2}")
print(f"SVM 3: Accuracy={performance_svm_3}, F1-Score={f1_score_svm_3}, ROC AUC={roc_auc_svm_3}")
print(f"SVM 4: Accuracy={performance_svm_4}, F1-Score={f1_score_svm_4}, ROC AUC={roc_auc_svm_4}")
print(f"SVM 5: Accuracy={performance_svm_5}, F1-Score={f1_score_svm_5}, ROC AUC={roc_auc_svm_5}")
print(f"SVM 6: Accuracy={performance_svm_6}, F1-Score={f1_score_svm_6}, ROC AUC={roc_auc_svm_6}")
print("-----------------------------------")

Métricas de los modelos de SVM:
SVM 0: Accuracy=0.4514285714285714, F1-Score=0.3523694103656158, ROC AUC=0.7353259370889722
SVM 1: Accuracy=0.42857142857142855, F1-Score=0.33783812453318424, ROC AUC=0.718533139449225
SVM 2: Accuracy=0.38285714285714284, F1-Score=0.2936103896103896, ROC AUC=0.7107460835696278
SVM 3: Accuracy=0.4114285714285714, F1-Score=0.37264017471395344, ROC AUC=0.6972126045830924
SVM 4: Accuracy=0.4057142857142857, F1-Score=0.3804231008514581, ROC AUC=0.7431400325013336
SVM 5: Accuracy=0.4057142857142857, F1-Score=0.3804231008514581, ROC AUC=0.7430399029170208
SVM 6: Accuracy=0.44571428571428573, F1-Score=0.426626842892732, ROC AUC=0.7499140113458804
-----------------------------------


In [90]:
performance_mlp_0, f1_score_mlp_0, roc_auc_mlp_0, modelo_mlp_0 = basic_MLP(dataset)
performance_mlp_1, f1_score_mlp_1, roc_auc_mlp_1, modelo_mlp_1 = basic_MLP(dataset1)
performance_mlp_2, f1_score_mlp_2, roc_auc_mlp_2, modelo_mlp_2 = basic_MLP(dataset2)
performance_mlp_3, f1_score_mlp_3, roc_auc_mlp_3, modelo_mlp_3 = basic_MLP(dataset3)
performance_mlp_4, f1_score_mlp_4, roc_auc_mlp_4, modelo_mlp_4 = basic_MLP(dataset4)
performance_mlp_5, f1_score_mlp_5, roc_auc_mlp_5, modelo_mlp_5 = basic_MLP(dataset5)
performance_mlp_6, f1_score_mlp_6, roc_auc_mlp_6, modelo_mlp_6 = basic_MLP(dataset6)

# Imprimir las métricas
print("Métricas de los modelos de MLP:")
print(f"MLP 0: Accuracy={performance_mlp_0}, F1-Score={f1_score_mlp_0}, ROC AUC={roc_auc_mlp_0}")
print(f"MLP 1: Accuracy={performance_mlp_1}, F1-Score={f1_score_mlp_1}, ROC AUC={roc_auc_mlp_1}")
print(f"MLP 2: Accuracy={performance_mlp_2}, F1-Score={f1_score_mlp_2}, ROC AUC={roc_auc_mlp_2}")
print(f"MLP 3: Accuracy={performance_mlp_3}, F1-Score={f1_score_mlp_3}, ROC AUC={roc_auc_mlp_3}")
print(f"MLP 4: Accuracy={performance_mlp_4}, F1-Score={f1_score_mlp_4}, ROC AUC={roc_auc_mlp_4}")
print(f"MLP 5: Accuracy={performance_mlp_5}, F1-Score={f1_score_mlp_5}, ROC AUC={roc_auc_mlp_5}")
print(f"MLP 6: Accuracy={performance_mlp_6}, F1-Score={f1_score_mlp_6}, ROC AUC={roc_auc_mlp_6}")
print("-----------------------------------")

Métricas de los modelos de MLP:
MLP 0: Accuracy=0.3657142857142857, F1-Score=0.26423821879556186, ROC AUC=0.562858082947901
MLP 1: Accuracy=0.37142857142857144, F1-Score=0.3278009827562566, ROC AUC=0.6000805756833493
MLP 2: Accuracy=0.29714285714285715, F1-Score=0.23696456501514657, ROC AUC=0.5529468094622568
MLP 3: Accuracy=0.32571428571428573, F1-Score=0.24032568633323353, ROC AUC=0.5484151886993803
MLP 4: Accuracy=0.30857142857142855, F1-Score=0.30629948122392914, ROC AUC=0.54077235110191
MLP 5: Accuracy=0.2571428571428571, F1-Score=0.13996464519873883, ROC AUC=0.5037746925329882
MLP 6: Accuracy=0.42857142857142855, F1-Score=0.363694230138594, ROC AUC=0.6155791562931395
-----------------------------------


In [94]:
print("Exactitud clase constante - Alcista fuerte:", predict_constant_class(dataset['Tendencia'], "Alcista fuerte"))
print("Exactitud clase constante - Alcista leve:", predict_constant_class(dataset['Tendencia'], "Alcista leve"))
# print("Exactitud clase constante - Lateral:", predict_constant_class(dataset['Tendencia'], "Lateral"))
print("Exactitud clase constante - Bajista leve:", predict_constant_class(dataset['Tendencia'], "Bajista leve"))
print("Exactitud clase constante - Bajista fuerte:", predict_constant_class(dataset['Tendencia'], "Bajista fuerte"))

Exactitud clase constante - Alcista fuerte: 0.2629161882893226
Exactitud clase constante - Alcista leve: 0.2468427095292767
Exactitud clase constante - Bajista leve: 0.21814006888633755
Exactitud clase constante - Bajista fuerte: 0.27210103329506313


Para seleccionar el mejor dataset basandome en las métricas F1 y ROC, es necesario identificar aquellos conjuntos de datos que proporcionen un rendimiento consistentemente alto en ambas métricas para todos los modelos. Observando los resultados, parece que el conjunto de datos que muestra un mejor desempeño es el que corresponde al índice 0 en cada modelo (dataset completo sin normalizar).

Los resultados del conjunto de datos 0 para cada modelo son los siguientes:

Para la regresión logística:

F1-Score: 0.353
ROC AUC: 0.740
Para Gradient Boosting:

F1-Score: 0.507
ROC AUC: 0.751
Para SVM:

F1-Score: 0.352
ROC AUC: 0.735
Para MLP:

F1-Score: 0.264
ROC AUC: 0.563

Aunque los resultados de MLP no son tan altos como los de los otros modelos, el conjunto de datos 0 sigue siendo el mejor en términos de F1-Score y ROC AUC en comparación con los otros conjuntos de datos.

Por lo tanto, el dataset 0 sería el más recomendable para utilizar en futuros experimentos, ya que parece proporcionar un rendimiento general sólido en todos los modelos basado en las métricas F1-Score y ROC AUC. La siguiente opcion a este, seria el dataset 4, ambos podrian ser evaluados en los siguientes experimentos.

Tomare estos valores como baseline a ser superado por futuros modelos. A continuacion evaluare cual seria la performance de estos modelos en conjuntos de validacion.

In [92]:
class_distribution = dataset['Tendencia'].value_counts()

# Imprimir la distribución de clases
print(class_distribution)

Tendencia
Bajista fuerte    237
Alcista fuerte    229
Alcista leve      215
Bajista leve      190
Name: count, dtype: int64
