In [None]:
import pandas as pd

## Card

In [None]:
## Read competion and develop dataframes
card_comp = pd.read_csv("dados/pre-processed/card_comp.csv")
card_dev = pd.read_csv("dados/pre-processed//card_dev.csv")

## Concat them
card = pd.concat([card_comp, card_dev])

## Drop type column of disposition
disp_df = pd.read_csv("dados/pre-processed/disp.csv")
disp_df.drop("type", axis=1, inplace=True)

## Merge card with disposition (how=Left so we have all clients and not only those with cards)
card_disp = pd.merge(disp_df, card, on="disp_id", how="left")

## Readability
card_disp.rename(columns = {"type":"type_card"}, inplace=True)

## Replace NaN values with "Other"
card_disp["type_card"].fillna("other", inplace=True)

## Type of card from numerical to categorical
card_disp = pd.get_dummies(card_disp, columns = ['type_card'])

## Function to check whether a person has a card
def has_card(row):
    return 0 if pd.isna(row["card_id"]) else 1

## Apply function to new column
card_disp["has_card"] = card_disp.apply(lambda x: has_card(x), axis = 1)

## Drop columns
card_disp.drop(["card_issued_date", "card_id", "account_id", "client_id"], axis=1, inplace=True)

## Save
card_disp.to_csv("dados/cleaned/card.csv", index=False)

card_disp.head()

## Client

In [None]:
## Read useful dataframes
client = pd.read_csv("dados/pre-processed/client.csv")

## Sex from categorical to numerical
client['sex'].replace(['m', 'f'], [0, 1], inplace=True)

## To csv
client.to_csv("dados/cleaned/client.csv")

client.head()

## Disposition

In [None]:
disp   = pd.read_csv("dados/pre-processed/disp.csv")

## Keep only account owners
disp = disp[disp["type"] == "owner"]

## Drop type column
disp.drop("type", axis=1, inplace=True)

## To CSV
disp.to_csv("dados/cleaned/disp.csv", index=False)

disp.head()

## District

In [None]:
## Read df
dist = pd.read_csv("dados/pre-processed/district.csv")

## Standardize values
dist["num_crimes_95"] = round(dist["num_crimes_95"]/dist["num_inhab"] * 1000, 2)
dist["num_crimes_96"] = round(dist["num_crimes_96"]/dist["num_inhab"] * 1000, 2)

In [None]:
## Find missing values
dist.isna().any()

In [None]:
## We have missing values in both perc_unemploy_95 and num_crimes_95. 
## We can deal with it by using the data avaliable in other years
dist["perc_unemploy_95"].fillna(dist["perc_unemploy_96"], inplace=True)
dist["num_crimes_95"].fillna(dist["num_crimes_96"], inplace=True)

## Deal with missing Prague zoning
dist.loc[dist["region"] == "Prague", ["region_zone"]] = "Prague"

dist.isna().any()

In [None]:
## Drop some columns
columns_to_drop = [
    'num_municip_inhab_0_499',
    'num_municip_inhab_500_1999', 
    'num_municip_inhab_2000_9999',
    'num_municip_inhab_10000_', 
    'num_cities'
    ]
dist.drop(columns=columns_to_drop, axis = 1, inplace = True)

In [None]:
dist.head()

In [None]:
dist[["perc_unemploy_95", "perc_unemploy_96"]].corr()

In [None]:
dist[["num_crimes_95", "num_crimes_96"]].corr()

In [79]:
## Since those variables are correlated, let's drop both and keep only the average value
dist["num_crimes"]   = round((dist["num_crimes_95"] + dist["num_crimes_96"]) / 2, 2)
dist["unemployment"] = round((dist["perc_unemploy_95"] + dist["perc_unemploy_96"]) / 2, 2)

## Calculate the unemployment variation in %
dist["unemployment_delta"] = round((dist["perc_unemploy_96"] - dist["perc_unemploy_95"]) / dist["perc_unemploy_95"] , 2)
dist["crimes_delta"]       = round((dist["num_crimes_96"] - dist["num_crimes_95"]) / dist["num_crimes_95"] * 100, 2)


In [80]:
## Drop columns
columns_to_drop = [
    'perc_unemploy_95',
    'perc_unemploy_96',
    'num_crimes_95',
    'num_crimes_96'
]

dist.drop(columns=columns_to_drop, axis = 1, inplace = True)

In [81]:
dist.head()

Unnamed: 0,id,city,region,num_inhab,perc_urban_inhab,avg_salary,enterp_per_1000,region_zone,num_crimes,unemployment,unemployment_delta,crimes_delta
0,1,Hl.m. Praha,Prague,1204953,100.0,12541.0,167,Prague,76.68,0.36,0.48,15.68
1,2,Benesov,Bohemia,88884,46.7,8507.0,132,central,27.18,1.76,0.11,23.84
2,3,Beroun,Bohemia,75232,41.7,8980.0,111,central,37.47,2.08,0.13,-0.4
3,4,Kladno,Bohemia,149893,67.4,9753.0,109,central,37.14,4.84,0.09,12.38
4,5,Kolin,Bohemia,95616,51.4,9307.0,118,central,29.58,4.14,0.15,16.19


In [82]:
dist.to_csv("dados/cleaned/district.csv", index = False)

## Transaction