In [3]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [9]:
!pip install lightgbm




In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier

In [11]:
train = pd.read_csv("train.csv")
y = train["RENDIMIENTO_GLOBAL"]
X = train.drop(columns=["RENDIMIENTO_GLOBAL"])

In [12]:
# Columnas numéricas y categóricas
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include="object").columns.tolist()

In [13]:
# Imputación de valores faltantes
for col in num_cols:
    X[col] = X[col].fillna(X[col].median())
for col in cat_cols:
    X[col] = X[col].fillna("Desconocido")

In [14]:
# Preprocesador y modelo
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols)
    ],
    remainder="passthrough"
)

model = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=50,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model)
])

In [15]:
# Evaluación interna
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
y_val_pred = pipeline.predict(X_val)

print("Accuracy en validación interna:", accuracy_score(y_val, y_val_pred))
print("\nReporte de Clasificación:")
print(classification_report(y_val, y_val_pred))
print("\nMatriz de Confusión:")
print(confusion_matrix(y_val, y_val_pred))



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1493
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




Accuracy en validación interna: 0.4352346570397112

Reporte de Clasificación:
              precision    recall  f1-score   support

        alto       0.56      0.63      0.59     35165
        bajo       0.46      0.58      0.51     34573
  medio-alto       0.33      0.27      0.30     34259
  medio-bajo       0.33      0.26      0.29     34503

    accuracy                           0.44    138500
   macro avg       0.42      0.43      0.42    138500
weighted avg       0.42      0.44      0.42    138500


Matriz de Confusión:
[[22128  2999  6572  3466]
 [ 2663 19900  4947  7063]
 [ 9883  7578  9268  7530]
 [ 5073 12735  7711  8984]]


In [16]:
# Entrenar con todo el conjunto
pipeline.fit(X, y)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1495
[LightGBM] [Info] Number of data points in the train set: 692500, number of used features: 20
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [17]:
# Predecir sobre test.csv
test_df = pd.read_csv("test.csv")

In [18]:
# Imputar valores faltantes
test_df_copy = test_df.copy()
for col in num_cols:
    if col in test_df_copy.columns:
        test_df_copy[col] = test_df_copy[col].fillna(X[col].median())
for col in cat_cols:
    if col in test_df_copy.columns:
        test_df_copy[col] = test_df_copy[col].fillna("Desconocido")


In [19]:
# Predicciones
y_pred_test = pipeline.predict(test_df_copy)



In [20]:
# Generar submission
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "RENDIMIENTO_GLOBAL": y_pred_test
})
submission.to_csv("submission.csv", index=False)
print("\nArchivo submission.csv generado correctamente ✅")


Archivo submission.csv generado correctamente ✅


In [21]:
from google.colab import files
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>