In [1]:
import pandas as pd
from datetime import datetime
import math

DATE_FORMAT = "%Y-%m-%d"

## Account

In [None]:
## Load the pre-processed account table and both loan splits (dev + competition)
account = pd.read_csv("dados/pre-processed/account.csv")
loan_dev = pd.read_csv("dados/pre-processed/loan_dev.csv")
loan_comp = pd.read_csv("dados/pre-processed/loan_comp.csv")

## Stack both splits and keep only the two columns the account features need
loan = pd.concat([loan_dev, loan_comp])[["account_id", "loan_date"]]

In [None]:
## Drop unnecessary columns: the year/month/day breakdown of the creation date
columns_to_drop = ["acc_creation_year", "acc_creation_month", "acc_creation_day"]
account = account.drop(columns=columns_to_drop)

In [None]:
## Replace categorical data with numerical codes.
## The result is assigned back to the column: calling replace(..., inplace=True) on a
## selected column is a chained in-place operation, which is deprecated and no longer
## updates `account` once pandas Copy-on-Write is active.
frequency_codes = {
    "monthly issuance": 0,
    "weekly issuance": 1,
    "issuance after transaction": 2,
}
account["frequency"] = account["frequency"].replace(frequency_codes)

In [None]:
## Left join on the account id: every account is kept, with or without a loan
df = account.merge(loan, how="left", on="account_id")

In [None]:
## Accounts with no loan get a fixed placeholder loan_date ("today" when this was written).
## This is not an issue since we will only train with accounts that asked for a loan.
## Only loan_date is filled: a frame-wide fillna would also write the date string into
## any other column that happens to contain a missing value.
NO_LOAN_DATE = "2022-11-21"
df["loan_date"] = df["loan_date"].fillna(NO_LOAN_DATE)

In [None]:
## Parse both date columns so they can be subtracted from each other.
## The parsed loan date is written back to "loan_date" itself (not to a new
## "loan_data" column): the age computation and the later drop both read "loan_date".
df["loan_date"] = pd.to_datetime(df["loan_date"], format=DATE_FORMAT)
df["acc_creation_date"] = pd.to_datetime(df["acc_creation_date"], format=DATE_FORMAT)

In [None]:
## Account age when the loan was requested, in whole 30-day months (rounded down)
age_in_days = (df["loan_date"] - df["acc_creation_date"]).dt.days
df["acc_age_at_loan"] = age_in_days // 30

In [None]:
## loan_date was only needed to compute acc_age_at_loan
df = df.drop(columns=["loan_date"])

In [None]:
## Persist the cleaned account table (without the pandas index)
df.to_csv(path_or_buf="dados/cleaned/account.csv", index=False)

## Card

In [None]:
## Read competition and development card dataframes
card_comp = pd.read_csv("dados/pre-processed/card_comp.csv")
card_dev = pd.read_csv("dados/pre-processed/card_dev.csv")

## Concat them
card = pd.concat([card_comp, card_dev])

## Disposition table without its own "type" column, so that after the merge
## "type" refers to the card type only
disp_df = pd.read_csv("dados/pre-processed/disp.csv").drop(columns="type")

## Merge card with disposition (how=left so we keep every disposition, not only those with cards)
card_disp = pd.merge(disp_df, card, on="disp_id", how="left")

## Readability
card_disp = card_disp.rename(columns={"type": "type_card"})

## Dispositions without a card get the explicit category "other".
## Assigned back instead of fillna(inplace=True) on the selected column, which is a
## chained in-place operation and stops updating the frame under pandas Copy-on-Write.
card_disp["type_card"] = card_disp["type_card"].fillna("other")

## One-hot encode the card type (one indicator column per category)
card_disp = pd.get_dummies(card_disp, columns=["type_card"])

## Flag telling whether a disposition row carries a card
def has_card(row):
    """Return 1 when the row's "card_id" is present and 0 when it is missing."""
    card_missing = pd.isna(row["card_id"])
    if card_missing:
        return 0
    return 1

## New indicator column, evaluated row by row with has_card
card_disp["has_card"] = card_disp.apply(has_card, axis=1)

## Drop the columns that are not exported
leftover_columns = ["card_issued_date", "card_id", "account_id", "client_id"]
card_disp = card_disp.drop(columns=leftover_columns)

## Save
card_disp.to_csv("dados/cleaned/card.csv", index=False)

card_disp.head()

## Client

In [6]:
## Read useful dataframes
client = pd.read_csv("dados/pre-processed/client.csv")

## Sex from categorical to numerical (m -> 0, f -> 1).
## Assigned back instead of replace(inplace=True) on the selected column, which is a
## chained in-place operation and stops updating the frame under pandas Copy-on-Write.
client["sex"] = client["sex"].replace({"m": 0, "f": 1})

def get_age(row, collected_date="2000-01-01"):
    """Return the age in completed years of a client on a reference date.

    Parameters
    ----------
    row : mapping or pandas Series
        Must provide "birthdate" as a string in DATE_FORMAT.
    collected_date : str, optional
        Reference date in DATE_FORMAT. Defaults to "2000-01-01", the value that
        used to be hard-coded here.

    Returns
    -------
    int
        Year difference, minus one when the birthday has not yet occurred in
        the reference year.
    """
    reference = datetime.strptime(collected_date, DATE_FORMAT)
    born = datetime.strptime(row["birthdate"], DATE_FORMAT)
    # True (== 1) while the birthday is still ahead in the reference year
    birthday_pending = (reference.month, reference.day) < (born.month, born.day)
    return reference.year - born.year - birthday_pending

## Age of every client, derived from its birthdate
client["age"] = client.apply(get_age, axis=1)

## The raw birthdate and its year/month/day breakdown are replaced by "age"
columns_to_drop = ["birthdate", "birthdate_year", "birthdate_month", "birthdate_day"]
client = client.drop(columns=columns_to_drop)

## To csv
client.to_csv("dados/cleaned/client.csv", index=False)
client.head()

Unnamed: 0,client_id,district_id,sex,age
0,1,18,1,29
1,2,1,0,54
2,3,1,1,59
3,4,5,0,43
4,5,5,1,39


## Disposition

In [None]:
disp = pd.read_csv("dados/pre-processed/disp.csv")

## Number of dispositions attached to each account, counted once up front
## instead of re-running value_counts() for every row
dispositions_per_account = disp["account_id"].value_counts()

def is_account_shared(account_id):
    """Return 1 if more than one disposition references `account_id`, else 0.

    Raises KeyError when `account_id` does not appear in the disposition table.
    """
    return 1 if dispositions_per_account[account_id] > 1 else 0

## Check if an account is shared (computed on all dispositions, before filtering owners)
disp["is_account_shared"] = (
    disp["account_id"].map(dispositions_per_account).gt(1).astype(int)
)

## Keep only account owners, then drop the now-constant type column.
## A new frame is built instead of calling drop(inplace=True) on the filtered
## slice, which triggers pandas' SettingWithCopyWarning.
disp = disp[disp["type"] == "owner"].drop(columns="type")

## To CSV
disp.to_csv("dados/cleaned/disp.csv", index=False)

disp.head()

## District

In [None]:
## Read df
dist = pd.read_csv("dados/pre-processed/district.csv")

## Express both crime counts as crimes per 1000 inhabitants (2 decimals)
for crimes_col in ("num_crimes_95", "num_crimes_96"):
    dist[crimes_col] = (dist[crimes_col] / dist["num_inhab"] * 1000).round(2)

In [None]:
## Which columns contain at least one missing value?
dist.isnull().any(axis=0)

In [None]:
## We have missing values in both perc_unemploy_95 and num_crimes_95.
## We can deal with it by using the data available for the other year (1996).
## The filled series are assigned back: fillna(inplace=True) on a selected column is a
## chained in-place operation and stops updating `dist` under pandas Copy-on-Write.
dist["perc_unemploy_95"] = dist["perc_unemploy_95"].fillna(dist["perc_unemploy_96"])
dist["num_crimes_95"] = dist["num_crimes_95"].fillna(dist["num_crimes_96"])

## Deal with missing Prague zoning
dist.loc[dist["region"] == "Prague", "region_zone"] = "Prague"

dist.isna().any()

In [None]:
## Drop the municipality-size breakdown and the city count
columns_to_drop = [
    "num_municip_inhab_0_499",
    "num_municip_inhab_500_1999",
    "num_municip_inhab_2000_9999",
    "num_municip_inhab_10000_",
    "num_cities",
]
dist = dist.drop(columns=columns_to_drop)

In [None]:
## Preview the first five districts
dist.head(n=5)

In [None]:
## Pearson correlation between the two unemployment years
unemployment_years = ["perc_unemploy_95", "perc_unemploy_96"]
dist[unemployment_years].corr(method="pearson")

In [None]:
## Pearson correlation between the two crime-rate years
crime_years = ["num_crimes_95", "num_crimes_96"]
dist[crime_years].corr(method="pearson")

In [None]:
## Since the 95/96 variables are correlated, keep only their average value
crimes_95, crimes_96 = dist["num_crimes_95"], dist["num_crimes_96"]
unemploy_95, unemploy_96 = dist["perc_unemploy_95"], dist["perc_unemploy_96"]

dist["num_crimes"] = ((crimes_95 + crimes_96) / 2).round(2)
dist["unemployment"] = ((unemploy_95 + unemploy_96) / 2).round(2)

## Year-over-year variation, both expressed in % of the 1995 value.
## unemployment_delta is multiplied by 100 like crimes_delta, so the two deltas
## share the same unit (it used to be stored as a fraction despite the "%" label).
dist["unemployment_delta"] = ((unemploy_96 - unemploy_95) / unemploy_95 * 100).round(2)
dist["crimes_delta"] = ((crimes_96 - crimes_95) / crimes_95 * 100).round(2)


In [None]:
## Drop the per-year columns
columns_to_drop = ["perc_unemploy_95", "perc_unemploy_96", "num_crimes_95", "num_crimes_96"]
dist = dist.drop(columns=columns_to_drop)

In [None]:
## Preview only the first rows instead of displaying the whole frame
dist.head()

In [None]:
## Encode region and region_zone as integer codes.
## Assigned back instead of replace(inplace=True) on the selected column, which is a
## chained in-place operation and stops updating `dist` under pandas Copy-on-Write.
region_codes = {"Prague": 0, "Bohemia": 1, "Moravia": 2}
region_zone_codes = {"north": 0, "west": 1, "south": 2, "east": 3, "central": 4, "Prague": 5}
dist["region"] = dist["region"].replace(region_codes)
dist["region_zone"] = dist["region_zone"].replace(region_zone_codes)

In [None]:
## Preview only the first rows instead of displaying the whole frame
dist.head()

In [None]:
## Persist the cleaned district table (without the pandas index)
dist.to_csv(path_or_buf="dados/cleaned/district.csv", index=False)

## Loan

In [85]:
## Read both loan splits
loan_dev = pd.read_csv("dados/pre-processed/loan_dev.csv")
loan_comp = pd.read_csv("dados/pre-processed/loan_comp.csv")

## Drop the year/month/day breakdown of the loan date
columns_to_drop = ["loan_year", "loan_month", "loan_day"]
loan_dev = loan_dev.drop(columns=columns_to_drop)
loan_comp = loan_comp.drop(columns=columns_to_drop)

## Save both splits
loan_dev.to_csv("dados/cleaned/loan_dev.csv", index=False)
loan_comp.to_csv("dados/cleaned/loan_comp.csv", index=False)

## Transaction

In [None]:
## Load training (dev) and testing (comp) datasets
date_part_columns = ["trans_day", "trans_year", "trans_month"]
loan_columns = ["duration", "amount", "status", "account_id", "loan_date"]

trans_dev = pd.read_csv("dados/pre-processed/trans_dev.csv").drop(columns=date_part_columns)
loan_dev = pd.read_csv("dados/pre-processed/loan_dev.csv", usecols=loan_columns)

trans_comp = pd.read_csv("dados/pre-processed/trans_comp.csv").drop(columns=date_part_columns)
loan_comp = pd.read_csv("dados/pre-processed/loan_comp.csv", usecols=loan_columns)

In [None]:
def clean_data(df):
    """Normalise missing/blank categorical values of a transaction frame and add flags.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "operation", "k_symbol" and "bank".

    Returns
    -------
    pandas.DataFrame
        The same frame, with:
        - "operation": NaN replaced by "other"
        - "k_symbol": "" and " " replaced by "other", NaN replaced by "none"
        - "bank": "" replaced by "other"
        - "has_symbol": 0 when k_symbol is "none", else 1
        - "is_sanction": 1 when k_symbol is "sanction interest if negative balance", else 0
    """
    ## Every result is assigned back to its column: fillna/replace(inplace=True) on a
    ## selected column is a chained in-place operation, which is deprecated and stops
    ## updating `df` once pandas Copy-on-Write is active.

    ## Consider nan operations as others
    df["operation"] = df["operation"].fillna("other")

    ## Empty / blank symbols are "other" symbols; nan symbols are "none"
    df["k_symbol"] = df["k_symbol"].replace(["", " "], "other").fillna("none")

    ## Consider empty bank as other bank
    df["bank"] = df["bank"].replace("", "other")

    ## Column that says whether a payment has a characterization
    df["has_symbol"] = df["k_symbol"].ne("none").astype(int)

    ## Column that says whether a payment is of type: sanction
    df["is_sanction"] = df["k_symbol"].eq("sanction interest if negative balance").astype(int)

    return df

In [None]:
def is_account_shared(account_id: int) -> int:
    """Return 1 if more than one row of the global `disp` references `account_id`, else 0.

    An id that does not appear in `disp` yields 0 instead of raising KeyError.

    NOTE(review): this re-defines the function from the Disposition section and is not
    called in this section. By this point `disp` has been narrowed to owner rows there,
    so the count would only see owners - confirm before using it here.
    """
    # Count matching rows directly rather than building value_counts() on every call
    dispositions_on_account = (disp["account_id"] == account_id).sum()
    return 1 if dispositions_on_account > 1 else 0
    

In [None]:
def create_features(trans, loan, months=12):
    """Aggregate transactions into one feature row per account that has a loan.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions with at least "account_id", "trans_date" (DATE_FORMAT string),
        "balance", "account" and "is_sanction".
    loan : pandas.DataFrame
        Loans with at least "account_id" and "loan_date" (DATE_FORMAT string).
    months : int, optional
        Look-back window before the loan, in 30-day months (default 12).

    Returns
    -------
    pandas.DataFrame
        Columns: account_id, min_balance, avg_balance, max_balance, sanctions
        (all computed on transactions strictly before the loan date and inside the
        look-back window) and only_to_na (1 when every transaction of the account
        has a missing/zero "account", else 0). Accounts with no transaction in the
        window are absent from the result (inner merge).
    """
    merged = pd.merge(trans, loan, on="account_id", suffixes=("_trans", "_loan"))

    ############## -> Check if an account only made transactions to account with NAN values <-##############
    ## fillna is applied to a derived series (not in place on a selected column),
    ## so it keeps working under pandas Copy-on-Write.
    to_nan = merged["account"].fillna(0).eq(0).astype(int)
    only_to_na = (
        merged.assign(to_NAN=to_nan)
        .groupby(["account_id"], as_index=False)
        .agg(only_to_na=pd.NamedAgg(column="to_NAN", aggfunc="min"))
    )

    ############## -> Check balance min, avg and max in the N months preceding a loan request <-##############
    ## Convert dates to comparable format
    trans_date = pd.to_datetime(merged["trans_date"], format=DATE_FORMAT)
    loan_date = pd.to_datetime(merged["loan_date"], format=DATE_FORMAT)

    ## Keep only transactions made before the loan. The mask is actually applied
    ## below; computing the comparison without using its result filters nothing and
    ## lets post-loan balances leak into the features.
    before_loan = loan_date > trans_date

    ## Keep only transactions inside the look-back window (whole 30-day months)
    months_before_loan = (loan_date - trans_date).dt.days // 30
    in_window = before_loan & (months_before_loan < months)

    recent = merged.loc[in_window, ["account_id", "balance", "is_sanction"]]
    balance_stats = recent.groupby(["account_id"], as_index=False).agg(
        min_balance=pd.NamedAgg(column="balance", aggfunc="min"),
        avg_balance=pd.NamedAgg(column="balance", aggfunc="mean"),
        max_balance=pd.NamedAgg(column="balance", aggfunc="max"),
        sanctions=pd.NamedAgg(column="is_sanction", aggfunc="sum"),
    )

    return pd.merge(balance_stats, only_to_na, on="account_id")

In [None]:
## Clean both splits (competition first, then dev)
trans_comp, trans_dev = clean_data(trans_comp), clean_data(trans_dev)

In [None]:
## Build the per-account features for both splits (competition first, then dev)
trans_comp, trans_dev = (
    create_features(trans_comp, loan_comp),
    create_features(trans_dev, loan_dev),
)

In [None]:
## Preview only the first rows instead of displaying the whole frame
trans_comp.head()

In [None]:
## The features were already built two cells above. Calling create_features a second
## time on the aggregated frames fails with a KeyError, because they no longer carry
## the raw transaction columns ("account", "trans_date", ...). This cell therefore
## only previews the dev features.
trans_dev.head()

In [None]:
## Persist both feature tables (without the pandas index)
trans_comp.to_csv(path_or_buf="dados/cleaned/trans_comp.csv", index=False)
trans_dev.to_csv(path_or_buf="dados/cleaned/trans_dev.csv", index=False)