In [2]:
import pandas as pd

In [216]:
def process_date(date):
    """Split a YYMMDD date into its parts and a dash-separated full date.

    Parameters
    ----------
    date : int or str
        Date encoded as YYMMDD, e.g. ``960101``. The value is left-padded
        with zeros to six digits so that dates whose leading zeros were lost
        when parsed as integers (e.g. ``10203`` for ``010203``) are still
        sliced at the right positions.

    Returns
    -------
    tuple of str
        ``(year, month, day, full_date)`` where ``year`` is the two-digit
        year plus 1900, ``month`` and ``day`` are the raw two-character
        slices and ``full_date`` is ``"<year>-<month>-<day>"``.
    """
    # Restore any leading zeros dropped by integer parsing before slicing.
    digits = str(date).zfill(6)

    year = str(int(digits[:2]) + 1900)
    month = digits[2:4]
    day = digits[4:]

    full_date = '-'.join((year, month, day))

    return year, month, day, full_date

In [217]:
## Load datasets
# Every raw table is a ';'-separated CSV; the files are read in the order
# listed and bound to the matching name on the left-hand side.
(
    account,
    card_dev,
    card_comp,
    client,
    disp,
    district,
    loan_dev,
    loan_comp,
    trans_dev,
    trans_comp,
) = (
    pd.read_csv(raw_path, sep=";", low_memory=False)
    for raw_path in (
        "dados/raw/account.csv",
        "dados/raw/card_dev.csv",
        "dados/raw/card_comp.csv",
        "dados/raw/client.csv",
        "dados/raw/disp.csv",
        "dados/raw/district.csv",
        "dados/raw/loan_dev.csv",
        "dados/raw/loan_comp.csv",
        "dados/raw/trans_dev.csv",
        "dados/raw/trans_comp.csv",
    )
)

In [219]:
def process_account(df: pd.DataFrame):
    """Expand the account creation date and export the table as CSV.

    Works on ``df`` in place: the ``date`` column is replaced by
    ``acc_creation_year``, ``acc_creation_month``, ``acc_creation_day`` and
    ``acc_creation_date`` (all produced by ``process_date``), the rows are
    ordered by ``account_id`` and the result is written to
    ``dados/pre-processed/account.csv``. Returns nothing.
    """
    # Break every raw date into its four derived values
    years, months, days, full_dates = zip(*df['date'].map(process_date))
    df['acc_creation_year'] = years
    df['acc_creation_month'] = months
    df['acc_creation_day'] = days
    df['acc_creation_date'] = full_dates

    # The raw date is no longer needed
    df.drop(columns=["date"], inplace=True)

    # Order rows by account id
    df.sort_values("account_id", inplace=True)

    df.to_csv("dados/pre-processed/account.csv", index=False)

In [220]:
def process_card(df: pd.DataFrame, type: str):
    """Expand the card issue date and export the table as CSV.

    Works on ``df`` in place: the ``issued`` column is replaced by
    ``card_issued_year``, ``card_issued_month``, ``card_issued_day`` and
    ``card_issued_date`` (all produced by ``process_date``) and the rows are
    ordered by ``card_id``.

    ``type`` is the suffix of the output file name: the result is written to
    ``dados/pre-processed/card_<type>.csv``. Returns nothing.
    """
    # Break every issue date into its four derived values
    years, months, days, full_dates = zip(*df['issued'].map(process_date))
    df['card_issued_year'] = years
    df['card_issued_month'] = months
    df['card_issued_day'] = days
    df['card_issued_date'] = full_dates

    # The raw issue date is no longer needed
    df.drop(columns=["issued"], inplace=True)

    # Order rows by card id
    df.sort_values("card_id", inplace=True)

    output_path = "dados/pre-processed/card_" + type + ".csv"
    df.to_csv(output_path, index=False)

In [221]:
def process_client(df: pd.DataFrame):
    """Decode the client birth number into birth date and sex, then export.

    Works on ``df`` in place:

    * ``birth_number`` is split by ``process_date`` into ``birthdate_year``,
      ``birthdate_month``, ``birthdate_day`` and ``birthdate`` and then
      dropped;
    * ``birthdate_month`` is cast to ``int``;
    * a month above 12 marks the client as ``'f'`` (otherwise ``'m'``); for
      those rows 50 is subtracted from the month and ``birthdate`` is rebuilt;
    * rows are ordered by ``client_id`` and written to
      ``dados/pre-processed/client.csv``.

    Returns nothing.
    """
    # Split the birth number into its date parts
    (
        df['birthdate_year'],
        df['birthdate_month'],
        df['birthdate_day'],
        df['birthdate'],
    ) = zip(*df['birth_number'].map(process_date))

    # Cast month as int so it can be compared and shifted
    df['birthdate_month'] = df['birthdate_month'].astype(int)

    # Drop the raw birth number column
    df.drop(["birth_number"], axis=1, inplace=True)

    # A month greater than 12 identifies a female client
    is_female = df['birthdate_month'] > 12
    df["sex"] = is_female.map({True: 'f', False: 'm'})

    # Bring the shifted month back into the 1..12 range
    df.loc[is_female, 'birthdate_month'] = df['birthdate_month'] - 50

    # Rebuild the full date for the corrected rows. The month is zero-padded
    # so these dates have the same YYYY-MM-DD shape as the untouched rows
    # (a plain int-to-str cast would yield e.g. "1975-9-25").
    df.loc[is_female, 'birthdate'] = (
        df['birthdate_year']
        + '-'
        + df['birthdate_month'].astype(str).str.zfill(2)
        + '-'
        + df['birthdate_day']
    )

    # Sort by client id
    df.sort_values(by=['client_id'], inplace=True)

    df.to_csv("dados/pre-processed/client.csv", index=False)

In [218]:
def process_district(df: pd.DataFrame):
    """Rename and clean the district table in place, then export it as CSV.

    Steps, all applied to ``df`` itself:

    * the 16 columns are renamed positionally;
    * ``city`` keeps only the text before the first ``'-'``;
    * ``region_zone`` receives the first word of a region containing a
      space, or the string ``'NULL'`` when the region has no space;
    * ``region`` keeps the second word of such regions, otherwise it is
      left unchanged;
    * every cell equal to ``'?'`` becomes a missing value;
    * the result is written to ``dados/pre-processed/district.csv``.

    Returns nothing. Raises if ``df`` does not have exactly 16 columns.
    """
    def process_city(city):
        # If the city name contains '-' keep only the part before it
        # NOTE(review): any whitespace before the '-' is kept as-is.
        if '-' in city:
            return city.split('-')[0]
        return city

    def get_zone(region):
        # First word of a two-part region, 'NULL' marker otherwise
        if ' ' in region:
            return region.split(' ')[0]
        return 'NULL'

    def process_region(region):
        # Second word of a two-part region, whole name otherwise
        if ' ' in region:
            return region.split(' ')[1]
        return region

    ## Rename columns
    df.columns = [
        'id', 'city', 'region', 'num_inhab', 'num_municip_inhab_0_499',
        'num_municip_inhab_500_1999', 'num_municip_inhab_2000_9999',
        'num_municip_inhab_10000_', 'num_cities', 'perc_urban_inhab',
        'avg_salary', 'perc_unemploy_95', 'perc_unemploy_96',
        'enterp_per_1000', 'num_crimes_95', 'num_crimes_96',
    ]

    df["city"] = df["city"].map(process_city)
    # The zone must be extracted before 'region' is overwritten below
    df["region_zone"] = df["region"].map(get_zone)
    df["region"] = df["region"].map(process_region)

    ## Replace cells with ? with a missing value.
    # The dict form is used on purpose: with a scalar `to_replace`, pandas
    # versions before 1.4 ignore an explicit `value=None` and forward-fill
    # from the previous row instead of inserting a null.
    df.replace({'?': None}, inplace=True)

    df.to_csv("dados/pre-processed/district.csv", index=False)


In [222]:
def process_dispostion(df: pd.DataFrame):
    """Normalise the disposition table in place and export it as CSV.

    The ``type`` column is lower-cased, the rows are ordered by ``disp_id``
    and the result is written to ``dados/pre-processed/disp.csv``.
    Returns nothing.
    """
    # Lower-case the disposition type
    lowered_type = df["type"].str.lower()
    df["type"] = lowered_type

    # Order rows by disposition id
    df.sort_values("disp_id", inplace=True)

    df.to_csv("dados/pre-processed/disp.csv", index=False)

In [223]:
def process_loan(df: pd.DataFrame, type: str):
    """Expand the loan date and export the table as CSV.

    Works on ``df`` in place: the ``date`` column is replaced by
    ``loan_year``, ``loan_month``, ``loan_day`` and ``loan_date`` (all
    produced by ``process_date``) and the rows are ordered by ``loan_id``.

    ``type`` is the suffix of the output file name: the result is written to
    ``dados/pre-processed/loan_<type>.csv``. Returns nothing.
    """
    # Break every loan date into its four derived values
    years, months, days, full_dates = zip(*df['date'].map(process_date))
    df['loan_year'] = years
    df['loan_month'] = months
    df['loan_day'] = days
    df['loan_date'] = full_dates

    # The raw date is no longer needed
    df.drop(columns=["date"], inplace=True)

    # Order rows by loan id
    df.sort_values("loan_id", inplace=True)

    output_path = "dados/pre-processed/loan_" + type + ".csv"
    df.to_csv(output_path, index=False)

In [224]:
def process_trans(df: pd.DataFrame, type: str):
    """Expand the transaction date, normalise ``operation`` and export as CSV.

    Works on ``df`` in place:

    * ``date`` is replaced by ``trans_year``, ``trans_month``, ``trans_day``
      and ``trans_date`` (all produced by ``process_date``);
    * ``operation`` is collapsed to ``'cash'``, ``'another bank'`` or
      ``'credit card'``; missing or empty operations become the string
      ``'NULL'``;
    * rows are ordered by ``trans_id``.

    ``type`` is the suffix of the output file name: the result is written to
    ``dados/pre-processed/trans_<type>.csv``. Returns nothing.

    Raises
    ------
    KeyError
        If ``operation`` holds a non-empty value that is not a key of the
        mapping below.
    """
    dict_map = {
        'credit in cash':'cash',
        'withdrawal in cash':'cash',
        'collection from another bank':'another bank',
        'remittance to another bank':'another bank',
        'credit card withdrawal':'credit card',
    }

    # Process date
    (
        df['trans_year'],
        df['trans_month'],
        df['trans_day'],
        df['trans_date'],
    ) = zip(*df['date'].map(process_date))

    # Drop date column
    df.drop(["date"], axis=1, inplace=True)

    # Replace NaN with an empty string, then translate: empty -> 'NULL',
    # anything else through the dict. The filled column is assigned back
    # rather than filled with `inplace=True` on `df['operation']`: that
    # chained in-place call acts on a temporary under pandas Copy-on-Write,
    # leaving the NaNs in `df` and making the lookup below fail.
    df["operation"] = df["operation"].fillna('').map(
        lambda op: 'NULL' if op == '' else dict_map[op]
    )

    # Sort by transaction id
    df.sort_values(by=['trans_id'], inplace=True)

    df.to_csv("dados/pre-processed/trans_"+ type + ".csv", index=False)


In [225]:
def pre_process_data():
    """Run the pre-processing step of every table, one after another.

    Uses the module-level DataFrames loaded from the raw CSV files. Each
    step modifies its frame in place and writes its own output file, so
    this function returns nothing.
    """
    # (step, positional arguments) pairs, executed in the order listed
    pipeline = (
        (process_account, (account,)),
        (process_district, (district,)),
        (process_card, (card_dev, 'dev')),
        (process_card, (card_comp, 'comp')),
        (process_client, (client,)),
        (process_dispostion, (disp,)),
        (process_loan, (loan_dev, "dev")),
        (process_loan, (loan_comp, "comp")),
        (process_trans, (trans_dev, "dev")),
        (process_trans, (trans_comp, 'comp')),
    )
    for step, step_args in pipeline:
        step(*step_args)

In [226]:
# Run the full pipeline. Each step mutates its DataFrame in place (several
# drop the column they parse), so re-running this cell requires reloading
# the raw datasets first.
pre_process_data()