In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample

# Replace 'data.csv' with the filename of your CSV file
csv_filename = 'entrenamiento.csv'

# Load CSV file into a DataFrame
data = pd.read_csv(csv_filename)

print("DataFrame loaded successfully.")

# Preprocess the data
data = data.drop(columns=['date'])
data['prev_decision'] = data['prev_decision'].astype('category')
X = data.drop(columns=['next_decision'])
y = data['next_decision']

# Undersample the categories to have the same number of observations
class_neg1 = data[data.next_decision == -1]
class_0 = data[data.next_decision == 0]
class_1 = data[data.next_decision == 1]

min_count = min(len(class_neg1), len(class_0), len(class_1))

class_neg1_undersampled = resample(class_neg1, replace=False, n_samples=min_count, random_state=42)
class_0_undersampled = resample(class_0, replace=False, n_samples=min_count, random_state=42)
class_1_undersampled = resample(class_1, replace=False, n_samples=min_count, random_state=42)

data_undersampled = pd.concat([class_neg1_undersampled, class_0_undersampled, class_1_undersampled])

X_undersampled = data_undersampled.drop(columns=['next_decision'])
y_undersampled = data_undersampled['next_decision']

# Split the undersampled data into training and testing sets
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(X_undersampled, y_undersampled, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_us_scaled = scaler.fit_transform(X_train_us)
X_test_us_scaled = scaler.transform(X_test_us)

# Initialize models
log_reg = LogisticRegression(random_state=42, max_iter=1000)
svc = SVC(random_state=42)
grad_boost = GradientBoostingClassifier(random_state=42)
mlp = MLPClassifier(random_state=42, max_iter=1000)

# Train and evaluate Logistic Regression
log_reg.fit(X_train_us_scaled, y_train_us)
y_pred_log_reg = log_reg.predict(X_test_us_scaled)
print("Logistic Regression")
print(classification_report(y_test_us, y_pred_log_reg))
print(confusion_matrix(y_test_us, y_pred_log_reg))

# Train and evaluate Support Vector Machine (SVM)
svc.fit(X_train_us_scaled, y_train_us)
y_pred_svc = svc.predict(X_test_us_scaled)
print("Support Vector Machine")
print(classification_report(y_test_us, y_pred_svc))
print(confusion_matrix(y_test_us, y_pred_svc))

# Train and evaluate Gradient Boosting Classifier
grad_boost.fit(X_train_us_scaled, y_train_us)
y_pred_grad_boost = grad_boost.predict(X_test_us_scaled)
print("Gradient Boosting")
print(classification_report(y_test_us, y_pred_grad_boost))
print(confusion_matrix(y_test_us, y_pred_grad_boost))

# Train and evaluate Neural Network
mlp.fit(X_train_us_scaled, y_train_us)
y_pred_mlp = mlp.predict(X_test_us_scaled)
print("Neural Network")
print(classification_report(y_test_us, y_pred_mlp))
print(confusion_matrix(y_test_us, y_pred_mlp))

DataFrame loaded successfully.
Logistic Regression
              precision    recall  f1-score   support

          -1       0.78      0.64      0.70        11
           0       0.54      0.54      0.54        13
           1       0.50      0.62      0.56         8

    accuracy                           0.59        32
   macro avg       0.61      0.60      0.60        32
weighted avg       0.61      0.59      0.60        32

[[7 3 1]
 [2 7 4]
 [0 3 5]]
Support Vector Machine
              precision    recall  f1-score   support

          -1       0.75      0.55      0.63        11
           0       0.56      0.69      0.62        13
           1       0.75      0.75      0.75         8

    accuracy                           0.66        32
   macro avg       0.69      0.66      0.67        32
weighted avg       0.67      0.66      0.66        32

[[6 5 0]
 [2 9 2]
 [0 2 6]]
Gradient Boosting
              precision    recall  f1-score   support

          -1       0.64      0.64  

In [2]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

# Initialize the SVM model
svc = SVC(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit GridSearchCV to the training data
grid_search.fit(X_train_us_scaled, y_train_us)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)



# Train the SVM model with the best parameters
best_svc = grid_search.best_estimator_
best_svc.fit(X_train_us_scaled, y_train_us)

# Make predictions on the test set
y_pred_best_svc = best_svc.predict(X_test_us_scaled)

# Evaluate the model's performance
classification_rep_best_svc = classification_report(y_test_us, y_pred_best_svc)
confusion_mat_best_svc = confusion_matrix(y_test_us, y_pred_best_svc)

print("Confusion Matrix for Best SVM:")
print(confusion_mat_best_svc)
print("\nClassification Report for Best SVM:")
print(classification_rep_best_svc)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.6363333333333333
Confusion Matrix for Best SVM:
[[4 5 2]
 [3 7 3]
 [0 3 5]]

Classification Report for Best SVM:
              precision    recall  f1-score   support

          -1       0.57      0.36      0.44        11
           0       0.47      0.54      0.50        13
           1       0.50      0.62      0.56         8

    accuracy                           0.50        32
   macro avg       0.51      0.51      0.50        32
weighted avg       0.51      0.50      0.49        32



In [3]:

import numpy as np

# Feature importance for SVM with linear kernel
if best_svc.kernel == 'linear':
    feature_importance = np.abs(best_svc.coef_[0])
    feature_importance = feature_importance / np.sum(feature_importance)
    
    feature_importance_df = pd.DataFrame({
        'feature': X_train_us.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=False)
    
    print("Feature Importance for SVM with Linear Kernel:")
    print(feature_importance_df)
else:
    print("Feature importance is only available for SVM with a linear kernel.")





import matplotlib.pyplot as plt

# Ensure to have the correct SVM model with a linear kernel
if best_svc.kernel == 'linear':
    # Calculate feature importance
    feature_importance = np.abs(best_svc.coef_[0])
    feature_importance = feature_importance / np.sum(feature_importance)
    
    feature_importance_df = pd.DataFrame({
        'feature': X_train_us.columns,
        'importance': feature_importance
    }).sort_values(by='importance', ascending=False)
    
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['feature'], feature_importance_df['importance'], color='skyblue')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importance for SVM with Linear Kernel')
    plt.gca().invert_yaxis()
    plt.show()
else:
    print("Feature importance is only available for SVM with a linear kernel.")

Feature importance is only available for SVM with a linear kernel.
Feature importance is only available for SVM with a linear kernel.
