In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from imblearn.over_sampling import SMOTE

<h1 style="background-color:powderblue;">General Helper Classes</h1>

In [None]:
class CleaningHelper():
    """Collection of small data-cleaning and reporting utilities.

    The only instance state is ``version``, a caller-supplied tag that is
    echoed back by ``__str__``. The remaining methods operate purely on the
    arguments they receive.
    """

    # Raw ``education`` label -> reduced / normalized category. The three
    # ``basic.*`` levels collapse into a single ``Basic`` bucket; the other
    # labels are only renamed.
    _EDUCATION_MAP = {
        'basic.4y': 'Basic',
        'basic.6y': 'Basic',
        'basic.9y': 'Basic',
        'unknown': 'Unknown',
        'university.degree': 'University_Degree',
        'high.school': 'High_School',
        'professional.course': 'Professional_Course',
        'illiterate': 'Illiterate',
    }

    def __init__(self, version):
        """Store ``version`` and silence pandas' chained-assignment warning.

        NOTE: the pandas option set here is process-wide, not scoped to this
        instance. It is kept because code outside this class may rely on the
        warning being disabled.
        """
        pd.options.mode.chained_assignment = None  # default='warn'
        self.version = version

    def __str__(self):
        """Return a human-readable label that includes the helper version."""
        return f"Cleaning helper version {self.version}"

    def get_nulls_data(self, df_):
        """Summarize missing values per column of ``df_``.

        Args:
            df_: DataFrame to inspect.

        Returns:
            A DataFrame indexed by the column names of ``df_`` and sorted by
            null count in descending order, with two columns:
            ``Total`` (number of null values) and ``PercNotNull``
            (percentage of non-null values, rounded to two decimals).
        """
        null_counts = df_.isnull().sum().sort_values(ascending=False)
        # Null percentage is rounded first and then subtracted from 100, so
        # the result is the share of values that are present.
        not_null_pct = 100 - (null_counts / len(df_) * 100).round(2)
        return pd.DataFrame({"Total": null_counts, "PercNotNull": not_null_pct})

    def coll_cat_reduction(self, df_):
        """Feature engineering: reduce and normalize ``education`` categories.

        Args:
            df_: DataFrame containing an ``education`` column.

        Returns:
            A copy of ``df_`` whose ``education`` values have been mapped
            through ``_EDUCATION_MAP``. Values not present in the mapping
            are left unchanged. The input frame is not modified.

        Raises:
            KeyError: if ``df_`` has no ``education`` column.
        """
        df_res = df_.copy()
        # Assign the whole column back instead of using chained indexing
        # (``df.education[mask] = value``): chained writes target a temporary
        # object under pandas Copy-on-Write and would silently be lost.
        df_res["education"] = df_res["education"].replace(self._EDUCATION_MAP)
        return df_res

    def custom_classification_report(self, y_true_, y_pred_):
        """Return sklearn's classification report as a sorted DataFrame.

        Args:
            y_true_: Ground-truth labels.
            y_pred_: Predicted labels.

        Returns:
            A DataFrame with one row per entry of the dict produced by
            ``metrics.classification_report(..., output_dict=True)``, sorted
            by ``f1-score`` in descending order. The entry name is exposed
            in the ``metric_`` column and the index is a fresh RangeIndex.
        """
        report = metrics.classification_report(
            y_true=y_true_, y_pred=y_pred_, output_dict=True
        )
        return (
            pd.DataFrame(report)
            .transpose()
            .sort_values(by=['f1-score'], ascending=False)
            .reset_index()
            .rename(columns={'index': 'metric_'})
        )