In [1]:
import pandas as pd

import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer


from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import pickle as pkl
import joblib

In [2]:
def load_data(data,target):
    X=data.drop(columns=[target])
    y=data[target]
    return X,y

In [3]:
oversampled_data=pd.read_csv(r"..\data\intermediate\oversampled_dataset.csv")
undersampled_data=pd.read_csv(r"..\data\intermediate\undersampled_dataset.csv")
original_data=pd.read_csv(r"..\data\intermediate\grouped_dataset.csv")

In [4]:
X,y=load_data(oversampled_data,'Income')

In [6]:
X['Native_Country'].value_counts()

United-States    70020
OtherCountry      4290
Name: Native_Country, dtype: int64

In [4]:
# joblib and lambda cannot save pipeline with lambda, so define a fucntion:
def pass_cols(x):
    return x

# Select model
random_forest=RandomForestClassifier(max_depth=50, random_state=0)

#Load Data
X,y=load_data(oversampled_data,'Income')

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15)

# Select columns
columns_to_exclude = ["fnlwgt", "Unnamed: 0","Race"]

numeric_features = make_column_selector(
    dtype_exclude="object", pattern=f'^(?!({"|".join(columns_to_exclude)}))'
)(X)

categorical_features = make_column_selector(
    dtype_include="object", pattern=f'^(?!({"|".join(columns_to_exclude)}))'
)(X)

# Preprocess
numeric_transformer = Pipeline(steps=[("imputer", FunctionTransformer(pass_cols))])

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(sparse_output=False)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Build Pipeline
Preprocess_Pipeline = Pipeline([("Preprocessor", preprocessor)])

train_pipeline=Pipeline([("Preprocessor",Preprocess_Pipeline),
                        ("Model",random_forest)],
                        verbose=True)


  cols = cols[cols.str.contains(self.pattern, regex=True)]


In [5]:
train_pipeline

In [6]:
pd.DataFrame.from_dict(train_pipeline.named_steps['Model'].get_params(),
                        orient='index',
                        columns=['Value']).to_csv(r"..\reports\dataframe_results\random_forest\rf_config.csv")

## Train (Original Data)

In [7]:
# Train
train_pipeline.fit(X_train,y_train)

# Predict
target_names = ['No Fraude', 'Fraude']
y_pred=train_pipeline.predict(X_train)

# Clasification report
report=classification_report(y_train, y_pred, target_names=target_names,digits=2,output_dict=True)

# Plot
report.update({"accuracy": {"precision": None, "recall": None, "f1-score": report["accuracy"], "support": report['macro avg']['support']}})
report=pd.DataFrame.from_dict(report).round(2)
report=report.fillna('')

report.to_csv(r"..\reports\dataframe_results\random_forest\cr_train.csv")

fig = ff.create_table(report.T, index=True)
fig.update_layout(
    title_text="Classification report (Train)",
    margin = {'t':50, 'b':10,'l':10,'r':10},
    height=300,
    width=550,
)
fig.show()

[Pipeline] ...... (step 1 of 2) Processing Preprocessor, total=   0.1s
[Pipeline] ............. (step 2 of 2) Processing Model, total=   6.6s


## Test

In [8]:
# Predict
y_pred=train_pipeline.predict(X_test)

# Classification report
report=classification_report(y_test, y_pred, target_names=target_names,digits=2,output_dict=True)

# Plot
report.update({"accuracy": {"precision": None, "recall": None, "f1-score": report["accuracy"], "support": report['macro avg']['support']}})
report=pd.DataFrame.from_dict(report).round(2)
report=report.fillna('')
report.to_csv(r"..\reports\dataframe_results\random_forest\cr_test.csv")
fig = ff.create_table(report.T, index=True)
fig.update_layout(
    title_text="Classification report (Test)",
    margin = {'t':50, 'b':10,'l':10,'r':10},
    height=300,
    width=550,
)
fig.show()

In [9]:
joblib.dump(random_forest, r"..\models\random_forest\random_forest.joblib") 

['..\\models\\random_forest\\random_forest.joblib']

In [10]:
joblib.dump(train_pipeline, r"..\models\random_forest\rf_pipeline.joblib")

['..\\models\\random_forest\\rf_pipeline.joblib']

In [None]:
# Confusion Matrix

## Test (UnderSample Dataset)

In [None]:
#Load Data
X,y=load_data(undersampled_data,'Income')

y_pred=train_pipeline.predict(X)

report=classification_report(y, y_pred, target_names=target_names,digits=2,output_dict=True)
report.update({"accuracy": {"precision": None, "recall": None, "f1-score": report["accuracy"], "support": report['macro avg']['support']}})
report=pd.DataFrame.from_dict(report).round(2)
report=report.fillna('')

fig = ff.create_table(report.T, index=True)
fig.update_layout(
    title_text="Classification report (UnderSample Dataset)",
    margin = {'t':50, 'b':10,'l':10,'r':10},
    height=300,
    width=550,
)
fig.show()

## Test Oversampled dataset

In [None]:
#Load Data
X,y=load_data(original_data,'Income')

y_pred=train_pipeline.predict(X)

report=classification_report(y, y_pred, target_names=target_names,digits=2,output_dict=True)
report.update({"accuracy": {"precision": None, "recall": None, "f1-score": report["accuracy"], "support": report['macro avg']['support']}})
report=pd.DataFrame.from_dict(report).round(2)
report=report.fillna('')

fig = ff.create_table(report.T, index=True)
fig.update_layout(
    title_text="Classification report (OverSample Dataset)",
    margin = {'t':50, 'b':10,'l':10,'r':10},
    height=300,
    width=550,
)
fig.show()