# AML Mini-Challenge - Credit Card Affinity Modelling

Dominik Filliger & Noah Leuenberger


# Task

The task can be found [here](https://spaces.technik.fhnw.ch/storage/uploads/spaces/82/exercises/20240218__AML_Trainingscenter_MiniChallenge_Kreditkarten_Aufgabenstellung-1708412668.pdf).

# Setup

In [None]:
import pandas as pd
from datetime import datetime

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Data Import & Preprocessing

## Accounts

In [None]:
# Load the account table (semicolon-separated export).
accounts = pd.read_csv("data/account.csv", sep=";")

# Translate the Czech statement-frequency labels to English and store them
# as a categorical column.
accounts['frequency'] = accounts['frequency'].map({
    "POPLATEK MESICNE": "MONTHLY ISSUANCE",
    "POPLATEK TYDNE": "WEEKLY ISSUANCE",
    "POPLATEK PO OBRATU": "ISSUANCE AFTER TRANSACTION"
}).astype('category')

# Dates are encoded as YYMMDD.
accounts['date'] = pd.to_datetime(accounts['date'], format='%y%m%d')

# Prefix the generic column names so they stay unambiguous after merging.
accounts = accounts.rename(columns={'date': 'account_created',
                                    'frequency': 'account_frequency'})

# DataFrame.info() prints its summary and returns None, so it must not be
# passed to display() (that would render a stray "None" below the table).
accounts.info()
display(accounts.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   account_id         4500 non-null   int64         
 1   district_id        4500 non-null   int64         
 2   account_frequency  4500 non-null   category      
 3   account_created    4500 non-null   datetime64[ns]
dtypes: category(1), datetime64[ns](1), int64(2)
memory usage: 110.1 KB


Unnamed: 0,account_id,district_id,account_frequency,account_created
0,576,55,MONTHLY ISSUANCE,1993-01-01
1,3818,74,MONTHLY ISSUANCE,1993-01-01
2,704,55,MONTHLY ISSUANCE,1993-01-01
3,2378,16,MONTHLY ISSUANCE,1993-01-01
4,2632,24,MONTHLY ISSUANCE,1993-01-02


None

## Clients

In [None]:
# Load the client table (semicolon-separated export); its 'birth_number'
# column is decoded into sex and birth date further down in this cell.
clients = pd.read_csv("data/client.csv", sep=";")

def parse_birth_number(birth_number):
    """Decode an encoded birth number into the client's sex and birth date.

    The number has the form YYMMDD for men and YYMM+50DD for women
    (according to https://sorry.vse.cz/~berka/challenge/PAST/index.html).

    Parameters
    ----------
    birth_number : int or str
        Encoded birth number. Values that lost leading zeros when parsed as
        integers (birth years 00-09, e.g. ``51201``) are left-padded back to
        six digits before decoding.

    Returns
    -------
    pd.Series
        Two elements: the sex (``"Female"`` or ``"Male"``) and the birth
        date as a ``datetime``.

    Raises
    ------
    ValueError
        If the decoded year/month/day do not form a valid calendar date.
    """
    # Restore leading zeros so the fixed-width slices below line up.
    birth_number_str = str(birth_number).zfill(6)

    # Extract year, month, and day from the birth number string
    year = int(birth_number_str[:2])
    month = int(birth_number_str[2:4])
    day = int(birth_number_str[4:6])

    # Female clients have 50 added to the month; undo that offset.
    if month > 50:
        sex = "Female"
        month -= 50
    else:
        sex = "Male"

    # Assuming all dates are in the 1900s. datetime() validates month and day
    # ranges (including month lengths and leap years) itself.
    try:
        birth_date = datetime(1900 + year, month, day)
    except ValueError as err:
        raise ValueError(
            f"Invalid birth number {birth_number!r}: {err}"
        ) from err

    return pd.Series([sex, birth_date])

# Decode every birth number into the two derived columns in one pass
clients[['client_sex', 'birth_date']] = clients['birth_number'].apply(parse_birth_number)

# Age is the calendar-year difference to the reference year 1999
clients['age'] = clients['birth_date'].map(lambda born: 1999 - born.year)

# The encoded birth number is fully represented by the derived columns now
clients = clients.drop('birth_number', axis=1)

clients.head()

Unnamed: 0,client_id,district_id,client_sex,birth_date,age
0,1,18,Female,1970-12-13,29
1,2,1,Male,1945-02-04,54
2,3,1,Female,1940-10-09,59
3,4,5,Male,1956-12-01,43
4,5,5,Female,1960-07-03,39


## Dispositions

In [None]:
# Load the dispositions and treat the disposition type as a categorical
dispositions = pd.read_csv("data/disp.csv", sep=";").astype({'type': 'category'})

# Keep only the rows of type "OWNER" and report how many rows were removed
before = len(dispositions)
is_owner = dispositions['type'] == "OWNER"
dispositions = dispositions.loc[is_owner]
after = len(dispositions)
print(f"Dropped {before - after} dispositions that are not of type 'OWNER'.")

# A column holding a single remaining value carries no information
if dispositions['type'].nunique() == 1:
    dispositions = dispositions.drop(columns=['type'])

dispositions.head()

Dropped 869 dispositions that are not of type 'OWNER'.


Unnamed: 0,disp_id,client_id,account_id
0,1,1,1
1,2,2,2
3,4,4,3
5,6,6,4
6,7,7,5


## Orders

In [None]:
orders = pd.read_csv("data/order.csv", sep=";")

# Czech -> English payment characterisations
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
order_symbol_translations = {
    "POJISTNE": "Insurance_Payment",
    "SIPO": "Household",
    "LEASING": "Leasing",
    "UVER": "Loan_Payment",
}
orders['k_symbol'] = orders['k_symbol'].map(order_symbol_translations)

# Both low-cardinality text columns become categoricals, then the generic
# 'amount' column gets a descriptive name
orders = (
    orders
    .astype({'k_symbol': 'category', 'bank_to': 'category'})
    .rename(columns={'amount': 'debited_amount'})
)

orders.head()

Unnamed: 0,order_id,account_id,bank_to,account_to,debited_amount,k_symbol
0,29401,1,YZ,87144583,2452.0,Household
1,29402,2,ST,89597016,3372.7,Loan_Payment
2,29403,2,QR,13943797,7266.0,Household
3,29404,3,WX,83084338,1135.0,Household
4,29405,3,CD,24485939,327.0,


## Transactions

In [None]:
# The 'bank' column (column 8) contains NaNs and must be read as string
transactions = pd.read_csv("data/trans.csv", sep=";", dtype={'bank': str})

# Dates are encoded as YYMMDD
transactions['date'] = pd.to_datetime(transactions['date'], format='%y%m%d')

# Translated type from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
# Both "VYBER" and "VYDAJ" are mapped to "Withdrawal" so that no untranslated
# Czech label remains in the column (Series.map is a single pass, so mapping
# "VYBER" to "VYDAJ" would leave the literal "VYDAJ" in the data).
transactions['type'] = transactions['type'].map({
    "VYBER": "Withdrawal",
    "PRIJEM": "Credit",
    "VYDAJ": "Withdrawal"
})

# Translated operations from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
transactions['operation'] = transactions['operation'].map({
    "VYBER KARTOU": "Credit Card Withdrawal",
    "VKLAD": "Credit in Cash",
    "PREVOD Z UCTU": "Collection from Another Bank",
    "VYBER": "Withdrawal in Cash",
    "PREVOD NA UCET": "Remittance to Another Bank"
})

# Translated characteristics from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
transactions['k_symbol'] = transactions['k_symbol'].map({
    "POJISTNE": "Insurance Payment",
    "SLUZBY": "Payment on Statement",
    "UROK": "Interest Credited",
    "SANKC. UROK": "Sanction Interest",
    "SIPO": "Household",
    "DUCHOD": "Old-age Pension",
    "UVER": "Loan Payment"
})

# Treat empty bank codes as missing values
transactions['bank'] = transactions['bank'].replace('', np.nan)

# Credits keep their sign; every other transaction becomes a negative amount
transactions['amount'] = np.where(transactions['type'] == "Credit", transactions['amount'], -transactions['amount'])
transactions = transactions.rename(columns={'type': 'transaction_type'})

# A short preview is enough; avoid dumping hundreds of rows into the output
transactions.head()

Unnamed: 0,trans_id,account_id,date,transaction_type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,1993-01-01,Credit,Credit in Cash,700.0,700.0,,,
1,171812,576,1993-01-01,Credit,Credit in Cash,900.0,900.0,,,
2,207264,704,1993-01-01,Credit,Credit in Cash,1000.0,1000.0,,,
3,1117247,3818,1993-01-01,Credit,Credit in Cash,600.0,600.0,,,
4,579373,1972,1993-01-02,Credit,Credit in Cash,400.0,400.0,,,
...,...,...,...,...,...,...,...,...,...,...
495,3629868,3007,1993-02-28,Credit,,72.7,23324.7,Interest Credited,,
496,3585453,1675,1993-02-28,Credit,,13.7,5440.7,Interest Credited,,
497,3583849,1628,1993-02-28,Credit,,20.7,9286.1,Interest Credited,,
498,3584202,1637,1993-02-28,Credit,,25.9,11977.8,Interest Credited,,


## Loans

In [None]:
loans = pd.read_csv("data/loan.csv", sep=";")

# Grant dates are encoded as YYMMDD
loans['date'] = pd.to_datetime(loans['date'], format='%y%m%d')

# Spell out the single-letter status codes
loan_status_labels = {
    "A": "Contract finished, no problems",
    "B": "Contract finished, loan not payed",
    "C": "Contract running, OK thus-far",
    "D": "Contract running, client in debt",
}
loans['status'] = loans['status'].map(loan_status_labels)

# Rename columns
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
loan_column_names = {
    'date': 'loan_granted_date',
    'amount': 'loan_amount',
    'duration': 'loan_duration',
    'payments': 'loan_monthly_payments',
    'status': 'loan_status',
}
loans = loans.rename(columns=loan_column_names)

loans.head()

Unnamed: 0,loan_id,account_id,loan_granted_date,loan_amount,loan_duration,loan_monthly_payments,loan_status
0,5314,1787,1993-07-05,96396,12,8033.0,"Contract finished, loan not payed"
1,5316,1801,1993-07-11,165960,36,4610.0,"Contract finished, no problems"
2,6863,9188,1993-07-28,127080,60,2118.0,"Contract finished, no problems"
3,5325,1843,1993-08-03,105804,36,2939.0,"Contract finished, no problems"
4,7240,11013,1993-09-06,274740,60,4579.0,"Contract finished, no problems"


## Credit Cards

In [None]:
# Load the credit card table and treat the card type as a categorical
cards = pd.read_csv("data/card.csv", sep=";")
cards['type'] = cards['type'].astype('category')

# Parse the issue timestamp and truncate it to midnight. normalize() keeps the
# column as datetime64 (unlike .dt.date, which yields Python date objects in
# an object column), so it stays comparable with the other datetime columns
# such as 'account_created' and 'loan_granted_date'.
cards['issued'] = pd.to_datetime(cards['issued'], format='%y%m%d %H:%M:%S').dt.normalize()

# Prefix the generic column names so they stay unambiguous after merging
cards = cards.rename(columns={'type': 'card_type',
                              'issued': 'card_issued'})

cards.head()

Unnamed: 0,card_id,disp_id,card_type,card_issued
0,1005,9285,classic,1993-11-07
1,104,588,classic,1994-01-19
2,747,4915,classic,1994-02-05
3,70,439,classic,1994-02-08
4,577,3687,classic,1994-02-15


## Demographic data

In [None]:
districts = pd.read_csv("data/district.csv", sep=";")

# Descriptive names for the anonymous A1..A16 columns
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
district_column_names = {
    'A1': 'district_id',
    'A2': 'district_name',
    'A3': 'region',
    'A4': 'inhabitants',
    'A5': 'small_municipalities',
    'A6': 'medium_municipalities',
    'A7': 'large_municipalities',
    'A8': 'huge_municipalities',
    'A9': 'cities',
    'A10': 'ratio_urban_inhabitants',
    'A11': 'average_salary',
    'A12': 'unemployment_rate_1995',
    'A13': 'unemployment_rate_1996',
    'A14': 'entrepreneurs_per_1000_inhabitants',
    'A15': 'crimes_committed_1995',
    'A16': 'crimes_committed_1996',
}
districts = districts.rename(columns=district_column_names)

# Force these columns to numeric; entries that cannot be parsed become NaN
numeric_columns = [
    'unemployment_rate_1995',
    'unemployment_rate_1996',
    'crimes_committed_1995',
    'crimes_committed_1996',
]
districts[numeric_columns] = districts[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Region and district name are low-cardinality labels
districts = districts.astype({'region': 'category',
                              'district_name': 'category'})

districts.head()

Unnamed: 0,district_id,district_name,region,inhabitants,small_municipalities,medium_municipalities,large_municipalities,huge_municipalities,cities,ratio_urban_inhabitants,average_salary,unemployment_rate_1995,unemployment_rate_1996,entrepreneurs_per_1000_inhabitants,crimes_committed_1995,crimes_committed_1996
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677.0,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159.0,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824.0,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244.0,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616.0,3040


## Merge Data

# Model Construction

# Feature Engineering

# Model Engineering

# Model Comparison & Selection

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7d865ccc-7b5c-4f8d-b4e6-4e008791345d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>