In [2]:
import pandas as pd

## Card

In [5]:
## Read the competition and development card dataframes
card_comp = pd.read_csv("dados/pre-processed/card_comp.csv")
card_dev = pd.read_csv("dados/pre-processed//card_dev.csv")

## Stack both card sets into a single table
card = pd.concat([card_comp, card_dev])

## Load dispositions without their own "type" column, so that the only "type"
## column left after the merge is the one coming from the card table
disp_df = pd.read_csv("dados/pre-processed/disp.csv").drop(columns="type")

## Merge card with disposition (how="left" keeps every disposition, not only those with a card)
card_disp = pd.merge(disp_df, card, on="disp_id", how="left")

## Readability: the remaining "type" column describes the card
card_disp = card_disp.rename(columns={"type": "type_card"})

## Dispositions without a card have NaN here; label them "other".
## The result is assigned back instead of calling fillna(inplace=True) on the
## selected column: that chained in-place form is deprecated and does not
## update the frame under pandas copy-on-write.
card_disp["type_card"] = card_disp["type_card"].fillna("other")

## One-hot encode the card type (categorical -> 0/1 indicator columns).
## dtype=int keeps the indicators as 0/1 on pandas versions whose default is bool.
card_disp = pd.get_dummies(card_disp, columns=["type_card"], dtype=int)

## Helper: does this row belong to someone who holds a card?
def has_card(row):
    """Return 1 if the row's ``card_id`` is present, 0 if it is missing.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must be indexable by the key ``"card_id"``.

    Returns
    -------
    int
        0 when ``pd.isna(row["card_id"])`` is true, otherwise 1.
    """
    if pd.isna(row["card_id"]):
        return 0
    return 1

## Flag rows that hold a card: 1 when card_id is present, 0 when it is NaN.
## Vectorised equivalent of applying has_card to every row, without the
## per-row Python call overhead of DataFrame.apply(axis=1).
card_disp["has_card"] = card_disp["card_id"].notna().astype(int)

## Drop the identifier/date columns that are not kept in the cleaned table
card_disp = card_disp.drop(
    columns=["card_issued_date", "card_id", "account_id", "client_id"]
)

## Save
card_disp.to_csv("dados/cleaned/card.csv", index=False)

card_disp.head()

Unnamed: 0,disp_id,card_issued_year,card_issued_month,card_issued_day,type_card_classic,type_card_gold,type_card_junior,type_card_other,has_card
0,1,,,,0,0,0,1,0
1,2,,,,0,0,0,1,0
2,3,,,,0,0,0,1,0
3,4,,,,0,0,0,1,0
4,5,,,,0,0,0,1,0


## Client

In [3]:
## Read the pre-processed client table
client = pd.read_csv("dados/pre-processed/client.csv")

## Sex from categorical to numerical: 'm' -> 0, 'f' -> 1 (other values are left as they are).
## Assigned back rather than using replace(inplace=True) on the selected column,
## which is a chained in-place call that does not update the frame under
## pandas copy-on-write.
client["sex"] = client["sex"].replace({"m": 0, "f": 1})

## To csv. index=False matches the other cleaned exports in this notebook and
## avoids writing the row index as an extra unnamed column.
## NOTE(review): confirm no downstream reader relies on that extra index column.
client.to_csv("dados/cleaned/client.csv", index=False)

client.head()

Unnamed: 0,client_id,district_id,birthdate_year,birthdate_month,birthdate_day,sex
0,1,18,1970,12,13,1
1,2,1,1945,2,4,0
2,3,1,1940,10,9,1
3,4,5,1956,12,1,0
4,5,5,1960,7,3,1


## Disposition

In [6]:
## Load dispositions, keep only account owners, then drop the "type" column
## (it only holds "owner" after the filter).
## Built as one non-mutating chain: dropping a column in place on a
## boolean-filtered slice can raise SettingWithCopyWarning.
disp = (
    pd.read_csv("dados/pre-processed/disp.csv")
    .loc[lambda frame: frame["type"] == "owner"]
    .drop(columns="type")
)

## To CSV
disp.to_csv("dados/cleaned/disp.csv", index=False)

disp.head()

Unnamed: 0,disp_id,client_id,account_id
0,1,1,1
1,2,2,2
3,4,4,3
5,6,6,4
6,7,7,5


## District

In [8]:
## Load the pre-processed district table and preview its first rows
dist = pd.read_csv("dados/pre-processed/district.csv")
dist.head()

Unnamed: 0,id,city,region,num_inhab,num_municip_inhab_0_499,num_municip_inhab_500_1999,num_municip_inhab_2000_9999,num_municip_inhab_10000_,num_cities,perc_urban_inhab,avg_salary,perc_unemploy_95,perc_unemploy_96,enterp_per_1000,num_crimes_95,num_crimes_96,region_zone
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541.0,0.29,0.43,167,85677.0,99107,
1,2,Benesov,Bohemia,88884,80,26,6,2,5,46.7,8507.0,1.67,1.85,132,2159.0,2674,central
2,3,Beroun,Bohemia,75232,55,26,4,1,5,41.7,8980.0,1.95,2.21,111,2824.0,2813,central
3,4,Kladno,Bohemia,149893,63,29,6,2,6,67.4,9753.0,4.64,5.05,109,5244.0,5892,central
4,5,Kolin,Bohemia,95616,65,30,4,1,6,51.4,9307.0,3.85,4.43,118,2616.0,3040,central


## Transaction