In [3]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
from ipynb.fs.full.EDA import readCsv,readJson
from sklearn.preprocessing import LabelEncoder



In [4]:
config = readJson('config.json')
dfTrain = readCsv(config['files']['train'])

In [13]:

def balance_dataframe(df: pd.DataFrame, target_column: pd.Series) -> pd.DataFrame:
    
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    print(f"Distribución de clases antes de SMOTE: {Counter(y)}")
    
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    
    
    print(f"Distribución de clases después de SMOTE: {Counter(y_res)}")
    
    # Crear un nuevo DataFrame con los datos balanceados
    df_resampled = pd.DataFrame(X_res, columns=X.columns)
    df_resampled[target_column] = y_res
    
    return df_resampled


def extract_labels(df: pd.DataFrame) -> list:
    labels = list(df.columns)
    labels.remove("id")
    labels.remove("Target")
    return labels


def targetEncoded(df: pd.DataFrame) -> pd.DataFrame:
    label_encoder = LabelEncoder()
    df.Target = label_encoder.fit_transform(df.Target)
    return df


def normData_log(df: pd.DataFrame, target_column: pd.Series) -> pd.DataFrame:
    df_copy = df.drop(target_column, axis=1)
    df_copy.applymap(lambda x: np.log(x))
    df_copy['Target'] = df['Target']
    return df

In [7]:
df_balanced = balance_dataframe(dfTrain, 'Target')
df_balanced.describe()


Distribución de clases antes de SMOTE: Counter({'Graduate': 36282, 'Dropout': 25296, 'Enrolled': 14940})
Distribución de clases después de SMOTE: Counter({'Graduate': 36282, 'Dropout': 36282, 'Enrolled': 36282})


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
count,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,...,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0,108846.0
mean,1.099765,16.720899,1.546185,8997.920778,0.913511,3.768894,131.826417,1.204619,19.76068,23.384718,...,0.051844,0.114464,5.812184,7.336338,3.625204,9.408418,0.059368,11.453249,1.228818,-0.089518
std,0.411798,16.821391,1.125842,1789.100121,0.281086,8.824782,10.646506,3.193309,15.363442,14.889223,...,0.377808,0.835382,1.555046,3.491444,2.638557,5.443059,0.432651,2.590367,1.346339,2.166773
min,1.0,1.0,0.0,33.0,0.0,1.0,95.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06
25%,1.0,1.0,1.0,9119.0,1.0,1.0,125.0,1.0,1.0,4.0,...,0.0,0.0,5.0,6.0,1.0,10.0,0.0,9.4,0.3,-1.7
50%,1.0,17.0,1.0,9238.0,1.0,1.0,132.972356,1.0,19.0,19.0,...,0.0,0.0,6.0,8.0,4.0,11.833333,0.0,11.1,1.309228,0.32
75%,1.0,39.0,2.0,9670.0,1.0,1.0,138.0,1.0,37.0,37.0,...,0.0,0.0,6.0,9.0,6.0,13.0,0.0,12.7,2.6,1.74
max,6.0,53.0,9.0,9991.0,1.0,43.0,190.0,109.0,44.0,44.0,...,12.0,19.0,23.0,33.0,20.0,18.0,12.0,16.2,3.7,3.51


In [10]:
df_encoded = targetEncoded(df_balanced)
df_encoded.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,2
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,0
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,0
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,1
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,2


In [14]:
df_preprocessing = normData_log(df_encoded, 'Target')

  df_copy.applymap(lambda x: np.log(x))
  df_copy.applymap(lambda x: np.log(x))
  df_copy.applymap(lambda x: np.log(x))


In [16]:
df_preprocessing.to_csv('csv/df_preprocessing.csv')