In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
import time
from datetime import datetime
from scipy.stats import uniform, randint

# Cargar la base de datos proporcionada
file_path = 'fifa_eda_stats.csv'
data = pd.read_csv(file_path)

# Seleccionar características relevantes y eliminar columnas innecesarias
features = ['Age', 'Potential', 'Value', 'Wage', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 
            'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']

# Crear una copia del DataFrame original para trabajar
df = data[features + ['Overall']].copy()

# Manejar valores nulos
df = df.dropna()

# Convertir 'Value' y 'Wage' a valores numéricos (eliminar símbolos y convertir a float)
df['Value'] = df['Value'].str.replace('€', '').str.replace('M', 'e6').str.replace('K', 'e3').astype(float)
df['Wage'] = df['Wage'].str.replace('€', '').str.replace('K', 'e3').astype(float)

# Convertir 'Overall' a una variable categórica
df['Overall'] = pd.cut(df['Overall'], bins=[0, 60, 75, 85, 100], labels=['Bajo', 'Medio', 'Alto', 'Muy Alto'])

# Separar características y etiquetas
X = df.drop('Overall', axis=1)
y = df['Overall']

# Dividir datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir los modelos y sus respectivos hiper-parámetros para RandomizedSearchCV
models = {
    'Naive Bayes': (GaussianNB(), {}),
    'LDA': (LinearDiscriminantAnalysis(), {'solver': ['svd', 'lsqr', 'eigen']}),
    'Regresión Logística': (LogisticRegression(), {'C': uniform(0.1, 10), 'solver': ['lbfgs', 'liblinear']}),
    'SVM': (SVC(), {'C': uniform(0.1, 10), 'kernel': ['linear', 'rbf']}),
    'Árboles de Decisión': (DecisionTreeClassifier(), {'max_depth': randint(10, 50)}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': randint(10, 100), 'max_depth': randint(10, 50)}),
    'Análisis de Discriminante Lineal': (LinearDiscriminantAnalysis(), {'solver': ['svd', 'lsqr', 'eigen']}),
    'Análisis de Discriminante Cuadrático': (QuadraticDiscriminantAnalysis(), {}),
    'AdaBoost': (AdaBoostClassifier(), {'n_estimators': randint(50, 200)}),
    'Gradient Boosting': (GradientBoostingClassifier(), {'n_estimators': randint(50, 200), 'learning_rate': uniform(0.01, 0.2)}),
    'XGBoost': (xgb.XGBClassifier(), {'n_estimators': randint(50, 200), 'learning_rate': uniform(0.01, 0.2)}),
    'LGBM': (lgb.LGBMClassifier(), {'n_estimators': randint(50, 200), 'learning_rate': uniform(0.01, 0.2)})
}

# Función para entrenar y evaluar los modelos
def train_and_evaluate(model, params, X_train, y_train, X_test, y_test):
    grid_search = RandomizedSearchCV(model, params, scoring='accuracy', cv=5, n_iter=10, n_jobs=-1, random_state=42)
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    end_time = time.time()
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    return {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'accuracy': accuracy,
        'f1_score': f1,
        'training_time': end_time - start_time,
        'training_end': datetime.now()
    }


In [4]:

# Entrenar y evaluar los modelos
results = []

for model_name, (model, params) in models.items():
    result = train_and_evaluate(model, params, X_train, y_train, X_test, y_test)
    results.append({
        'Model': model_name,
        'Best Params': result['best_params'],
        'Accuracy': result['accuracy'],
        'F1 Score': result['f1_score'],
        'Training Time (s)': result['training_time'],
        'Training End': result['training_end']
    })

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [None]:

# Exportar los resultados a un archivo Excel
results_df.to_excel('model_results.xlsx', index=False)
