In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report

# Load transformed data
X_train = pd.read_csv('data/training_data_transformed.csv')
y_train = pd.read_csv('data/y_train_transformed.csv').values.ravel()
X_test = pd.read_csv('data/test_data_transformed.csv')

# Drop non-numeric columns
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train = X_train.drop(columns=[col])
        X_test = X_test.drop(columns=[col])

# Models to compare
models = {
    'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss', random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{name} CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = y_pred

# Convert predictions back to original labels
from sklearn.preprocessing import OrdinalEncoder
target_order = ['None_Existent', 'Low', 'Medium', 'High', 'Very_High']
oe_target = OrdinalEncoder(categories=[target_order])
oe_target.fit(np.array(target_order).reshape(-1, 1))

# Show metrics for each model (using train set for demonstration)
for name, model in models.items():
    y_pred_train = model.predict(X_train)
    y_pred_train_labels = oe_target.inverse_transform(y_pred_train.reshape(-1, 1))
    y_train_labels = oe_target.inverse_transform(y_train.reshape(-1, 1))
    print(f"\n{name} Classification Report (Train):")
    print(classification_report(y_train_labels, y_pred_train_labels))

# Save submission for best model (example: XGBoost)
y_pred_labels = oe_target.inverse_transform(results['XGBoost'].reshape(-1, 1))
submission = pd.DataFrame({
    'RowId': np.arange(1, len(y_pred_labels) + 1),
    'AVERAGE_SPEED_DIFF': pd.Series(y_pred_labels.ravel()).replace('None_Existent', 'None')
})
submission.to_csv('submission_varios.csv', index=False)
print('✅ Submission file with ID saved as submission.csv')


Training XGBoost...
XGBoost CV Accuracy: 0.7782 ± 0.0153

Training RandomForest...
RandomForest CV Accuracy: 0.7848 ± 0.0106

Training LogisticRegression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 

LogisticRegression CV Accuracy: 0.7616 ± 0.0119


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



XGBoost Classification Report (Train):
               precision    recall  f1-score   support

         High       1.00      1.00      1.00      1063
          Low       0.98      0.98      0.98      1419
       Medium       0.99      0.99      0.99      1651
None_Existent       0.99      1.00      0.99      2200
    Very_High       1.00      1.00      1.00       479

     accuracy                           0.99      6812
    macro avg       0.99      0.99      0.99      6812
 weighted avg       0.99      0.99      0.99      6812


RandomForest Classification Report (Train):
               precision    recall  f1-score   support

         High       1.00      1.00      1.00      1063
          Low       1.00      1.00      1.00      1419
       Medium       1.00      1.00      1.00      1651
None_Existent       1.00      1.00      1.00      2200
    Very_High       1.00      1.00      1.00       479

     accuracy                           1.00      6812
    macro avg       1.00      