In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
from ipynb.fs.full.EDA import readCsv,readJson
from sklearn.preprocessing import LabelEncoder



In [3]:
config = readJson('config.json')
dfTrain = readCsv(config['files']['train'])

In [8]:
label_encoder = LabelEncoder()        

def balance_dataframe(df: pd.DataFrame, target_column: pd.Series) -> pd.DataFrame:
    
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    print(f"Distribución de clases antes de SMOTE: {Counter(y)}")
    
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    print(f"Distribución de clases después de SMOTE: {Counter(y_res)}")
    
    # Crear un nuevo DataFrame con los datos balanceados
    df_resampled = pd.DataFrame(X_res, columns=X.columns)
    df_resampled[target_column] = y_res
    
    return df_resampled


def extract_labels(df: pd.DataFrame) -> list:
    labels = list(df.columns)
    labels.remove("id")
    labels.remove("Target")
    return labels


def targetEncoded(df: pd.DataFrame, flag: bool = True) -> pd.DataFrame:
    if flag:
        df.Target = label_encoder.fit_transform(df.Target)
        return df
    df.Target = label_encoder.inverse_transform(df.Target)
    return df
    

def normData_log(df: pd.DataFrame, target_column: pd.Series) -> pd.DataFrame:
    df_copy = df.drop(target_column, axis=1)
    df_copy.applymap(lambda x: np.log(x))
    df_copy['Target'] = df['Target']
    return df

In [6]:
df_balanced = balance_dataframe(dfTrain, 'Target')
df_balanced.describe()


Distribución de clases antes de SMOTE: Counter({'Graduate': 36282, 'Dropout': 25296, 'Enrolled': 14940})
Distribución de clases después de SMOTE: Counter({'Graduate': 36282, 'Dropout': 36282, 'Enrolled': 36282})


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,...,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0
mean,1.099765,16.720899,1.546185,8997.920778,0.913511,3.768894,131.826417,1.204619,19.76068,23.384718,...,0.051844,0.114464,5.812184,7.336338,3.625204,9.408418,0.059368,11.453249,1.228818,-0.089518
std,0.411798,16.821391,1.125842,1789.100121,0.281086,8.824782,10.646506,3.193309,15.363442,14.889223,...,0.377808,0.835382,1.555046,3.491444,2.638557,5.443059,0.432651,2.590367,1.346339,2.166773
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,1.0,1.0,9119.0,1.0,1.0,125.0,1.0,1.0,4.0,...,0.0,0.0,5.0,6.0,1.0,10.0,0.0,9.4,0.3,-1.7
50%,1.0,17.0,1.0,9238.0,1.0,1.0,132.972356,1.0,19.0,19.0,...,0.0,0.0,6.0,8.0,4.0,11.833333,0.0,11.1,1.309228,0.32
75%,1.0,39.0,2.0,9670.0,1.0,1.0,138.0,1.0,37.0,37.0,...,0.0,0.0,6.0,9.0,6.0,13.0,0.0,12.7,2.6,1.74
max,6.0,53.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,12.0,19.0,23.0,33.0,20.0,18.0,12.0,16.2,3.7,3.51


In [52]:
df_encoded = targetEncoded(df_balanced)
df_encoded.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,2
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,0
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,0
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,1
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,2


In [13]:
df_encoded

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,9238,1,1,126.000000,1,1,19,...,0,6,7,6,12.428571,0,11.100000,0.600000,2.020000,2
1,1,17,1,9238,1,1,125.000000,1,19,19,...,0,6,9,0,0.000000,0,11.100000,0.600000,2.020000,0
2,1,17,2,9254,1,1,137.000000,1,3,19,...,0,6,0,0,0.000000,0,16.200000,0.300000,-0.920000,0
3,1,1,3,9500,1,1,131.000000,1,19,3,...,0,8,11,7,12.820000,0,11.100000,0.600000,2.020000,1
4,1,1,2,9500,1,1,132.000000,1,19,37,...,0,7,12,6,12.933333,0,7.600000,2.600000,0.320000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108841,1,44,1,9003,1,39,140.000000,1,19,19,...,0,6,10,3,10.868087,0,14.405081,0.394469,0.360061,1
108842,1,17,1,9670,1,1,135.191407,1,19,38,...,0,6,9,4,13.193620,0,9.011654,1.371289,3.455130,1
108843,1,17,1,9119,1,1,129.856896,1,19,19,...,0,5,11,2,13.028621,0,10.745621,1.400000,1.790659,1
108844,1,44,1,9085,1,39,140.000000,1,38,19,...,0,5,9,4,13.002474,0,10.480919,0.898588,0.632334,1


In [53]:
dfTrain

Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,1,17,1,9254,1,1,121.0,1,19,1,...,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate
76514,1,1,6,9254,1,1,125.0,1,1,38,...,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate
76515,5,17,1,9085,1,1,138.0,1,37,37,...,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled
76516,1,1,3,9070,1,1,136.0,1,38,37,...,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout


In [15]:
X = df_encoded.drop('Target', axis = 1)
Y = df_encoded['Target']

In [31]:
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.2, random_state=27)

def objective(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000),
        criterion=trial.suggest_categorical("criterion", ['gini', 'entropy', 'log_loss']),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 100),
        max_depth=trial.suggest_int("max_depth", 1, 100),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 100),
        random_state=27
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=27)
study = optuna.create_study(study_name="random_forest", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=20)

# resultados
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


Number of finished trials:  20
Best trial:
  Value:  0.842443729903537
  Params: 
    n_estimators: 858
    criterion: entropy
    min_samples_leaf: 2
    max_depth: 45
    min_samples_split: 17


In [19]:
%%time
def objective(trial):
    model = XGBClassifier(
        max_depth=trial.suggest_int('max_depth', 10, 200),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        n_estimators=trial.suggest_int('n_estimators', 2000, 3000),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        gamma=trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        subsample=trial.suggest_float('subsample', 0.01, 1.0, log=True),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=27
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)
optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=27)
study = optuna.create_study(study_name="xgb", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=20)
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
print()


Number of finished trials:  20
Best trial:
  Value:  0.8722094625631603
  Params: 
    max_depth: 121
    learning_rate: 0.08203293467326497
    n_estimators: 2211
    min_child_weight: 3
    gamma: 0.003987312272961856
    subsample: 0.49282530328223684
    colsample_bytree: 0.7965747379936637
    reg_alpha: 0.01904510715600365
    reg_lambda: 3.822142832293114e-06

CPU times: total: 46min 48s
Wall time: 14min 6s


In [21]:
%%time
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 100, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100),
        depth=trial.suggest_int("depth", 7, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        verbose=False,
        random_state=27
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
print()

Number of finished trials:  100
Best trial:
  Value:  0.8666972898484152
  Params: 
    iterations: 975
    learning_rate: 0.0999006047176625
    colsample_bylevel: 0.16142480216394106
    min_data_in_leaf: 45
    depth: 10
    l2_leaf_reg: 8.272554073347706e-05

CPU times: total: 14min 4s
Wall time: 1h 9min 39s


In [23]:
def objective(trial):
    model = LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000),
        max_depth=trial.suggest_int("max_depth", 1, 100),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        objective='multiclass',
        verbosity=-1,
        boosting_type=trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        num_leaves=trial.suggest_int('num_leaves', 2, 256),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
        random_state=27
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name="lgbm", direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=10)

# Imprimir resultados
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")
print()

Number of finished trials:  10
Best trial:
  Value:  0.8577400091869545
  Params: 
    n_estimators: 266
    max_depth: 97
    learning_rate: 0.035503048581283086
    boosting_type: gbdt
    num_leaves: 154
    min_child_samples: 93



In [24]:
#Base models
base_models = [
    ('catboost', CatBoostClassifier(
        iterations=975,
        learning_rate=0.0999006047176625,
        colsample_bylevel=0.16142480216394106,
        min_data_in_leaf=45,
        depth=10,
        l2_leaf_reg=8.272554073347706,
        random_state=42,
    )),
    ('xgboost', XGBClassifier(
        max_depth=121,
        learning_rate=0.08203293467326497,
        n_estimators=2211,
        min_child_weight=3,
        gamma=0.003987312272961856,
        subsample=0.49282530328223684,
        colsample_bytree=0.7965747379936637,
        reg_alpha=0.01904510715600365,
        reg_lambda=3.822142832293114e-06,
        random_state=42,
    )),
    ('lgbm', LGBMClassifier(
        n_estimators=266,
        max_depth=97,
        learning_rate=0.035503048581283086,
        boosting_type='gbdt',
        num_leaves=154,
        min_child_samples=93,
        random_state=42,
    )),
    ('rand_forest', RandomForestClassifier(
        n_estimators=858,
        criterion='entropy',
        min_samples_leaf=2,
        max_depth=45,
        min_samples_split=17,
        random_state=42,
    )),
]

In [35]:
meta_estimator = CatBoostClassifier(
    iterations=970,
    learning_rate=0.08509951726315906,
    colsample_bylevel=0.7265414512159318,
    min_data_in_leaf=36,
    depth=8,
    l2_leaf_reg=36.529811296436115,
    random_state=27
)

In [36]:
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_estimator,
    cv=5  
)



In [41]:
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_estimator)
stacking_model.fit(X_train, y_train)

0:	learn: 1.0008086	total: 54.7ms	remaining: 53.3s
1:	learn: 0.9219767	total: 97.8ms	remaining: 47.6s
2:	learn: 0.8610243	total: 143ms	remaining: 46.3s
3:	learn: 0.8110431	total: 186ms	remaining: 45.2s
4:	learn: 0.7687974	total: 233ms	remaining: 45.3s
5:	learn: 0.7332326	total: 284ms	remaining: 45.8s
6:	learn: 0.7025181	total: 332ms	remaining: 45.9s
7:	learn: 0.6743721	total: 378ms	remaining: 45.7s
8:	learn: 0.6559551	total: 392ms	remaining: 42s
9:	learn: 0.6441776	total: 398ms	remaining: 38.4s
10:	learn: 0.6255824	total: 447ms	remaining: 39.2s
11:	learn: 0.6098658	total: 499ms	remaining: 40.1s
12:	learn: 0.5939577	total: 550ms	remaining: 40.7s
13:	learn: 0.5803697	total: 599ms	remaining: 41.1s
14:	learn: 0.5669796	total: 653ms	remaining: 41.8s
15:	learn: 0.5580039	total: 701ms	remaining: 42s
16:	learn: 0.5484976	total: 755ms	remaining: 42.5s
17:	learn: 0.5397395	total: 810ms	remaining: 43.1s
18:	learn: 0.5328069	total: 861ms	remaining: 43.3s
19:	learn: 0.5258258	total: 920ms	remaining

In [42]:
y_pred_val = stacking_model.predict(X_test)

accuracy_val = accuracy_score(y_test, y_pred_val)
print(f"Validation Accuracy Score: {accuracy_val:.8f}")

Validation Accuracy Score: 0.96734038


  y = column_or_1d(y, warn=True)


In [43]:
config = readJson('config.json')
dfTest = readCsv(config['files']['test'])

In [48]:
preds = stacking_model.predict(dfTest)
preds

  y = column_or_1d(y, warn=True)


array([0, 2, 2, ..., 0, 0, 0], dtype=int64)

In [58]:
mapeo = {0: "Dropout", 1: "Enrolled", 2: "Graduate"}

In [59]:
predsCat = [mapeo[num] for num in preds]

In [62]:
sample_submission = readCsv(config['files']['sample_submission'])
sample_submission['Target'] = predsCat

In [64]:
sample_submission.to_csv('csv/submission1.csv')