# Pre-processing - Notebook

In this Jupyter notebook, we preprocess the dataset to help reduce overfitting in the models trained on it. To that end, we apply some common preprocessing techniques, such as dropping columns and merging rare category values into broader groups.

## Imports

In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Data loading

In [2]:
# Load the raw export of each bank; the three files are read in A, B, C order.
df_bank_a, df_bank_b, df_bank_c = map(
    pd.read_csv,
    ('../data/BankA.csv', '../data/BankB.csv', '../data/BankC.csv'),
)

In [3]:
# Stack the three banks into one frame. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; otherwise each bank's own row labels would be
# repeated and label-based lookups on df_all would be ambiguous.
df_all = pd.concat([df_bank_a, df_bank_b, df_bank_c], ignore_index=True)

## Functions

In [4]:
# Column -> {group label: raw values merged into that label}.
# Within one column every raw value belongs to exactly one group, so the order
# in which the groups are applied cannot change the result.
_VALUE_GROUPS = {
    'workclass': {
        'Not-working': ['Never-worked', 'Without-pay'],
        'unknown': ['?', '*'],
    },
    'marital-status': {
        'Married': ['Married-civ-spouse', 'Married-AF-spouse'],
    },
    'occupation': {
        'high': ['Exec-managerial', 'Prof-specialty'],
        'medium': ['Armed-Forces', 'Protective-serv', 'Tech-support', 'Sales',
                   'Craft-repair', 'Transport-moving'],
        'low': ['Adm-clerical', 'Machine-op-inspct', 'Farming-fishing',
                'Handlers-cleaners', 'Other-service', 'Priv-house-serv'],
        'unknown': ['?', '*'],
    },
    'relationship': {
        'Parent': ['Husband', 'Wife'],
    },
    'native-country': {
        # Countries that also have a Central-/South-America entry are listed
        # only there; putting them here as well would make those groups
        # unreachable.
        # NOTE(review): 'Nicaragua' has no Central-America entry in the
        # original grouping, so it stays here -- confirm this is intended.
        'North-America': ['United-States', 'Puerto-Rico', 'Canada',
                          'Outlying-US(Guam-USVI-etc)', 'Cuba', 'Jamaica',
                          'Mexico', 'Nicaragua'],
        'Europe': ['Germany', 'England', 'Italy', 'Poland', 'Portugal',
                   'Ireland', 'France', 'Yugoslavia', 'Scotland', 'Greece',
                   'Hungary', 'Holand-Netherlands'],
        'Asia': ['Philippines', 'India', 'China', 'Japan', 'Vietnam',
                 'Taiwan', 'Iran', 'Thailand', 'Hong', 'Cambodia', 'Laos'],
        # NOTE(review): the raw value 'South' is ambiguous; it is kept in this
        # group as before -- confirm against the data dictionary.
        'South-America': ['South', 'Columbia', 'Ecuador', 'Peru'],
        'Central-America': ['Trinadad&Tobago', 'Honduras', 'Haiti',
                            'Guatemala', 'El-Salvador', 'Dominican-Republic'],
        'Unknown': ['?', '*'],
    },
    'education': {
        'school': ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th',
                   '10th', '11th', '12th'],
        'higher': ['Assoc-voc', 'Assoc-acdm', 'Prof-school', 'Some-college'],
    },
}

# Columns removed from the output. The two capital columns are dropped only
# after 'capital-diff' has been derived from them.
_DROPPED_COLUMNS = ['fnlwgt', 'educational-num', 'gender', 'race',
                    'capital-gain', 'capital-loss']


def _is_text_column(column: pd.Series) -> bool:
    """Return True for object columns and for pandas string-dtype columns."""
    return (pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column))


def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, coarsened copy of a raw bank DataFrame.

    Steps, in order:
      1. Strip surrounding whitespace from every text column.
      2. Merge raw category values into broader groups for the columns
         listed in ``_VALUE_GROUPS``; values not listed are left unchanged.
      3. Add ``capital-diff`` = ``capital-gain`` - ``capital-loss``.
      4. Drop the columns listed in ``_DROPPED_COLUMNS``.
      5. Encode ``income`` as 0 (``'<=50K'``) or 1 (``'>50K'``); any other
         label becomes NaN.

    The input frame is not modified.

    Raises:
        KeyError: if a required column is missing from ``df``.
    """
    # DataFrame.apply builds a new frame, so everything below works on a copy.
    df = df.apply(lambda col: col.str.strip() if _is_text_column(col) else col)

    for column, groups in _VALUE_GROUPS.items():
        merged = df[column]
        for label, raw_values in groups.items():
            merged = merged.replace(raw_values, label)
        df[column] = merged

    # capital-diff
    df['capital-diff'] = df['capital-gain'] - df['capital-loss']

    df = df.drop(columns=_DROPPED_COLUMNS)

    # replace income by 0 and 1
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

    return df

## Pre-Processing

In [5]:
# Clean each bank's frame and the combined frame with the same function,
# rebinding every name to its preprocessed copy.
df_bank_a, df_bank_b, df_bank_c, df_all = (
    pre_processing(frame)
    for frame in (df_bank_a, df_bank_b, df_bank_c, df_all)
)

In [None]:
# Write each preprocessed frame next to the raw data, without the index column.
for frame, csv_path in (
    (df_bank_a, '../data/BankA_preprocessed.csv'),
    (df_bank_b, '../data/BankB_preprocessed.csv'),
    (df_bank_c, '../data/BankC_preprocessed.csv'),
    (df_all, '../data/BankABC_preprocessed.csv'),
):
    frame.to_csv(csv_path, index=False)