In [1]:
import pandas as pd

import plotly.express as px

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer


from sklearn.ensemble import RandomForestClassifier

In [2]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw string only resolved on Windows.
data = pd.read_csv("../data/intermediate/oversampled_dataset.csv")

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74310 entries, 0 to 74309
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      74310 non-null  int64 
 1   Age             74310 non-null  int64 
 2   WorkClass       74310 non-null  object
 3   fnlwgt          74310 non-null  int64 
 4   Edu_num         74310 non-null  int64 
 5   Marital_Status  74310 non-null  object
 6   Occupation      74310 non-null  object
 7   Relationship    74310 non-null  object
 8   Race            74310 non-null  object
 9   Sex             74310 non-null  object
 10  Capital_Gain    74310 non-null  int64 
 11  Capital_Loss    74310 non-null  int64 
 12  hpweek          74310 non-null  int64 
 13  Native_Country  74310 non-null  object
 14  Income          74310 non-null  int64 
dtypes: int64(8), object(7)
memory usage: 8.5+ MB


In [4]:
# Separate the "Income" target from the remaining feature columns.
y = data["Income"]
X = data.drop(columns=["Income"])

In [5]:
# Hold out 15% of the rows for testing. The fixed seed keeps the split, and
# therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

In [6]:
# Columns left out of both feature lists; the ColumnTransformer below drops
# anything that is not listed (its default remainder is "drop").
columns_to_exclude = ["fnlwgt", "Unnamed: 0"]

# Negative lookahead: keep every column whose name does not start with one of
# the excluded names. The group is non-capturing because pandas'
# `str.contains` emits a UserWarning for patterns with capturing groups.
keep_pattern = f'^(?!(?:{"|".join(columns_to_exclude)}))'

numeric_features = make_column_selector(
    dtype_exclude="object", pattern=keep_pattern
)(X)

categorical_features = make_column_selector(
    dtype_include="object", pattern=keep_pattern
)(X)

# Numeric columns pass through unchanged. `FunctionTransformer()` without a
# function is the identity and, unlike a lambda, can be pickled.
numeric_transformer = Pipeline(steps=[("imputer", FunctionTransformer())])

categorical_transformer = Pipeline(
    steps=[
        # Categories not seen during fit are encoded as all-zero rows instead
        # of raising when another dataset is transformed later on.
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

Preprocess_Pipeline = Pipeline([("Preprocessor", preprocessor)])

  cols = cols[cols.str.contains(self.pattern, regex=True)]


In [7]:
# Fit the preprocessing on the full feature frame X (train and test rows
# together), so the one-hot encoder learns the categories of both splits.
Preprocess_Pipeline.fit(X)

In [8]:
# Replace both splits with their encoded versions. The names are overwritten,
# so this cell is meant to run once per kernel session.
X_train, X_test = (
    Preprocess_Pipeline.transform(split) for split in (X_train, X_test)
)

In [9]:
# Random forest with trees capped at depth 30; the seed makes training repeatable.
clf = RandomForestClassifier(random_state=0, max_depth=30)
clf.fit(X_train, y_train.to_numpy())

In [10]:
# Mean accuracy of the fitted forest on the training split.
clf.score(X_train,y_train)

0.9616389341861533

In [11]:
# Mean accuracy of the fitted forest on the held-out test split.
clf.score(X_test,y_test)

0.8761101641697318

In [None]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw string only resolved on Windows.
over=pd.read_csv("../data/intermediate/oversampled_dataset.csv")

In [None]:
# Split the re-loaded oversampled frame into target and encoded features.
newY = over["Income"]
newX = Preprocess_Pipeline.transform(over.drop(columns=["Income"]))

In [None]:
# Mean accuracy of the fitted forest on the features/target prepared in newX/newY.
clf.score(newX,newY)

In [12]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw string only resolved on Windows.
original=pd.read_csv("../data/intermediate/grouped_dataset.csv")

# Separate the target, then encode the features with the already-fitted
# preprocessing pipeline.
newX=original.drop(columns=['Income'])
newY=original['Income']
newX=Preprocess_Pipeline.transform(newX)

# Mean accuracy of the fitted forest on the grouped (non-resampled) dataset.
clf.score(newX,newY)

0.9291798042668196

In [None]:
# Preview the first labels only; listing every value would flood the cell
# output with tens of thousands of integers.
newY.head(20).tolist()

In [13]:
from sklearn.metrics import classification_report

# Predicted labels for the feature matrix currently held in newX.
y_pred=clf.predict(newX)



In [14]:

# Display names for the two Income labels, in sorted label order.
target_names = ['class 0', 'class 1']
# Per-class precision, recall and F1 of y_pred against the true labels in newY.
print(classification_report(newY.values, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.98      0.93      0.95     37155
     class 1       0.80      0.93      0.86     11687

    accuracy                           0.93     48842
   macro avg       0.89      0.93      0.91     48842
weighted avg       0.94      0.93      0.93     48842



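Conclusión:
El modelo entrenado con oversampling obtiene una exactitud de 0.93 en el dataset original (frente a 0.88 en el conjunto de prueba). Nota: el dataset con oversampling parece derivarse del original (74310 filas = 2 × 37155, el tamaño de la clase 0), por lo que buena parte de estas filas pudo verse durante el entrenamiento y esta cifra probablemente sea optimista.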

# Load Data

In [14]:
def process_data(data, test_size=0.15, random_state=None):
    """Split a dataset and build the (unfitted) preprocessing pipeline for it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and an "Income" target column.
    test_size : float, default=0.15
        Fraction of rows held out for testing; forwarded to `train_test_split`.
    random_state : int or None, default=None
        Seed forwarded to `train_test_split`. Pass an integer to make the
        split reproducible; None keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, Preprocess_Pipeline)``. The
        splits are returned untransformed and the pipeline is returned
        unfitted, so the caller decides which rows it is fitted on.
    """
    X = data.drop(columns='Income')
    y = data['Income']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Columns left out of both feature lists; the ColumnTransformer below
    # drops anything that is not listed (its default remainder is "drop").
    columns_to_exclude = ["fnlwgt", "Unnamed: 0"]

    # Negative lookahead: keep columns whose name does not start with an
    # excluded name. The group is non-capturing because pandas' `str.contains`
    # emits a UserWarning for patterns with capturing groups.
    keep_pattern = f'^(?!(?:{"|".join(columns_to_exclude)}))'

    numeric_features = make_column_selector(
        dtype_exclude="object", pattern=keep_pattern
    )(X)

    categorical_features = make_column_selector(
        dtype_include="object", pattern=keep_pattern
    )(X)

    # Numeric columns pass through unchanged. `FunctionTransformer()` without
    # a function is the identity and, unlike a lambda, can be pickled.
    numeric_transformer = Pipeline(steps=[("imputer", FunctionTransformer())])

    categorical_transformer = Pipeline(
        steps=[
            # The pipeline is returned unfitted and is typically fitted on the
            # training split only, so categories that appear only in other
            # data must not make `transform` raise.
            ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    Preprocess_Pipeline = Pipeline([("Preprocessor", preprocessor)])

    return X_train, X_test, y_train, y_test, Preprocess_Pipeline

def train_model(model, X_train, X_test, y_train, y_test, preprocess_pipeline=None):
    """Fit preprocessing plus `model` on the training split and print both scores.

    Parameters
    ----------
    model : estimator
        Final estimator placed after the preprocessing step.
    X_train, X_test : pandas.DataFrame
        Untransformed feature splits.
    y_train, y_test : array-like
        Targets matching the feature splits.
    preprocess_pipeline : transformer, optional
        Preprocessing step to place before the model. When omitted, the
        notebook-level `Preprocess_Pipeline` variable is used, which keeps
        existing five-argument calls working.

    Notes
    -----
    `Pipeline` does not copy its steps, so the given preprocessing pipeline
    and `model` are fitted in place.
    """
    if preprocess_pipeline is None:
        # Backward-compatible fallback to the notebook-level variable.
        preprocess_pipeline = Preprocess_Pipeline

    # `set_output` is a method, not a constructor argument: passing it to
    # `Pipeline(...)` raises TypeError.
    train_pipeline = Pipeline(
        [("Preprocessor", preprocess_pipeline), ("Model", model)]
    ).set_output(transform="pandas")

    train_pipeline.fit(X_train, y_train)
    train_score = train_pipeline.score(X_train, y_train)
    test_score = train_pipeline.score(X_test, y_test)

    print(f"Train Score: {train_score}")
    print(f"Test Score: {test_score}")

In [12]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw strings only resolved on Windows.
oversampled_data=pd.read_csv("../data/intermediate/oversampled_dataset.csv")
undersampled_data=pd.read_csv("../data/intermediate/undersampled_dataset.csv")
original_data=pd.read_csv("../data/intermediate/grouped_dataset.csv")


# Random Forest

Entrenamiento con Oversampled
Prueba con:
* Undersampled
* Original

In [8]:
random_forest=RandomForestClassifier(max_depth=30, random_state=0)

# Train on the oversampled frame loaded in this section rather than on the
# `data` variable left over from the first cells of the notebook.
X_train, X_test, y_train, y_test, Preprocess_Pipeline=process_data(oversampled_data)

# `set_output` is a method, not a constructor argument: passing it to
# `Pipeline(...)` raises TypeError.
train_pipeline=Pipeline([("Preprocessor",Preprocess_Pipeline),
                         ("Model",random_forest)]).set_output(transform="pandas")

In [9]:
# Fit the preprocessing step and the random forest on the training split only.
train_pipeline.fit(X_train,y_train)

In [11]:
# Mean accuracy of the fitted pipeline on each split.
train_score = train_pipeline.score(X_train, y_train)
test_score = train_pipeline.score(X_test, y_test)

for label, value in (("Train", train_score), ("Test", test_score)):
    print(f"{label} Score: {value}")

Train Score: 0.961892247043364
Test Score: 0.8736879877994079
