# AML Mini-Challenge - Credit Card Affinity Modelling

Dominik Filliger & Noah Leuenberger


# Task

The task can be found [here](https://spaces.technik.fhnw.ch/storage/uploads/spaces/82/exercises/20240218__AML_Trainingscenter_MiniChallenge_Kreditkarten_Aufgabenstellung-1708412668.pdf).

# Setup

In [None]:
import pandas as pd
from datetime import datetime

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Data Import & Preprocessing

## Accounts

In [None]:
# Load the account table and translate the Czech statement-frequency codes
# into English labels.
accounts = pd.read_csv("data/account.csv", sep=";")
accounts['frequency'] = accounts['frequency'].map({
    "POPLATEK MESICNE": "MONTHLY ISSUANCE",
    "POPLATEK TYDNE": "WEEKLY ISSUANCE",
    "POPLATEK PO OBRATU": "ISSUANCE AFTER TRANSACTION"
})
# The creation date is stored in YYMMDD form.
accounts['date'] = pd.to_datetime(accounts['date'], format='%y%m%d')
accounts['frequency'] = accounts['frequency'].astype('category')

# Prefix the generic column names so they stay unambiguous next to other tables.
accounts.rename(columns={'date': 'account_created',
                         'frequency': 'account_frequency'}, inplace=True)

# DataFrame.info() prints its summary and returns None; passing its result to
# display() would render a stray "None", so it is called on its own.
accounts.info()
display(accounts.head())

## Clients

In [None]:
# Read the raw client table (semicolon-separated).
clients = pd.read_csv("data/client.csv", delimiter=";")

def parse_birth_number(birth_number) -> pd.Series:
    """Decode a birth number into the client's sex and birth date.

    The first six characters are read as ``YYMMDD``. A month greater than 50
    marks a female client, whose real month is ``month - 50``
    (according to https://sorry.vse.cz/~berka/challenge/PAST/index.html).

    Parameters
    ----------
    birth_number : int or str
        Birth number. Integer values that lost their leading zeros
        (e.g. ``50101`` for ``050101``) are padded back to six digits.

    Returns
    -------
    pd.Series
        Two positional entries: the sex (``"Female"`` or ``"Male"``) and the
        birth date as a ``datetime``.

    Raises
    ------
    AssertionError
        If the decoded year, month or day is outside its valid range.
    ValueError
        If the digits cannot be parsed as integers, or the date does not
        exist (e.g. 29 February in a non-leap year).
    """
    # Integer-typed input drops leading zeros, which would shift every slice
    # below; restore the fixed six-digit width first.
    birth_number_str = str(birth_number).zfill(6)

    # Extract year, month, and day from the birth number string
    year = int(birth_number_str[:2])
    month = int(birth_number_str[2:4])
    day = int(birth_number_str[4:6])

    # Determine sex based on month and adjust month for female clients
    if month > 50:
        sex = "Female"
        month -= 50
    else:
        sex = "Male"

    # Validate date
    assert 1 <= month <= 12, f"invalid month in birth number {birth_number!r}"
    assert 0 <= year <= 99, f"invalid year in birth number {birth_number!r}"

    # Upper bound for the day depends on the month; February is allowed 29
    # here and the datetime constructor rejects it for non-leap years.
    if month in (4, 6, 9, 11):
        max_day = 30
    elif month == 2:
        max_day = 29
    else:
        max_day = 31
    assert 1 <= day <= max_day, f"invalid day in birth number {birth_number!r}"

    # Assuming all dates are in the 1900s
    birth_date = datetime(1900 + year, month, day)
    return pd.Series([sex, birth_date])

# Expand every birth number into the two derived columns.
clients[['client_sex', 'birth_date']] = clients['birth_number'].apply(parse_birth_number)

# Age as of the reference year 1999 (plain difference of calendar years).
clients['age'] = clients['birth_date'].map(lambda born: 1999 - born.year)

# The raw birth number has been decoded above and is not needed any more.
clients = clients.drop('birth_number', axis=1)

clients.head()

## Dispositions

In [None]:
# Read the disposition table; 'type' is kept as a categorical column.
dispositions = pd.read_csv("data/disp.csv", sep=";")
dispositions['type'] = dispositions['type'].astype('category')

# Keep only the rows of type "OWNER" and report how many rows were removed.
before = dispositions.shape[0]
dispositions = dispositions.loc[dispositions['type'] == "OWNER"]
after = dispositions.shape[0]
print(f"Dropped {before - after} dispositions that are not of type 'OWNER'.")

# With a single remaining value the column carries no information.
if dispositions['type'].nunique() == 1:
    dispositions = dispositions.drop(columns=['type'])

dispositions.head()

## Orders

In [None]:
orders = pd.read_csv("data/order.csv", sep=";")

# Czech payment characterisations mapped to English labels
# (see https://sorry.vse.cz/~berka/challenge/PAST/index.html).
# Values missing from the mapping become NaN.
orders['k_symbol'] = orders['k_symbol'].map({
    "POJISTNE": "Insurance_Payment",
    "SIPO": "Household",
    "LEASING": "Leasing",
    "UVER": "Loan_Payment"
})

# Store both label columns as categoricals and give the amount a
# table-specific name.
orders = (
    orders
    .astype({'k_symbol': 'category', 'bank_to': 'category'})
    .rename(columns={'amount': 'debited_amount'})
)

orders.head()

## Transactions

In [None]:
# column 8 is the 'bank' column which contains NaNs and must be read as string
transactions = pd.read_csv("data/trans.csv", sep=";", dtype={8: str})

# Dates are stored in YYMMDD form.
transactions['date'] = pd.to_datetime(transactions['date'], format='%y%m%d')

# Translated type from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
# "VYBER" is mapped to the same English label as "VYDAJ" so that every
# withdrawal ends up under a single category (it was previously mapped to the
# untranslated string "VYDAJ").
transactions['type'] = transactions['type'].map({
    "VYBER": "Withdrawal",
    "PRIJEM": "Credit",
    "VYDAJ": "Withdrawal"
})

# Translated operations from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
transactions['operation'] = transactions['operation'].map({
    "VYBER KARTOU": "Credit Card Withdrawal",
    "VKLAD": "Credit in Cash",
    "PREVOD Z UCTU": "Collection from Another Bank",
    "VYBER": "Withdrawal in Cash",
    "PREVOD NA UCET": "Remittance to Another Bank"
})

# Translated characteristics from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
transactions['k_symbol'] = transactions['k_symbol'].map({
    "POJISTNE": "Insurance Payment",
    "SLUZBY": "Payment on Statement",
    "UROK": "Interest Credited",
    "SANKC. UROK": "Sanction Interest",
    "SIPO": "Household",
    "DUCHOD": "Old-age Pension",
    "UVER": "Loan Payment"
})

# Treat empty bank codes as missing values.
transactions['bank'] = transactions['bank'].replace('', np.nan)

# Sign the amount: credits stay positive, everything else becomes negative.
transactions['amount'] = np.where(transactions['type'] == "Credit", transactions['amount'], -transactions['amount'])
transactions.rename(columns={'type': 'transaction_type'}, inplace=True)

transactions.head(500)

## Loans

In [None]:
loans = pd.read_csv("data/loan.csv", sep=";")

# Replace the single-letter status codes with their descriptions.
loans['status'] = loans['status'].map({
    "A": "Contract finished, no problems",
    "B": "Contract finished, loan not payed",
    "C": "Contract running, OK thus-far",
    "D": "Contract running, client in debt"
})

# The grant date is stored in YYMMDD form.
loans['date'] = pd.to_datetime(loans['date'], format='%y%m%d')

# Loan-specific column names
# (see https://sorry.vse.cz/~berka/challenge/PAST/index.html)
loans = loans.rename(columns={
    'date': 'loan_granted_date',
    'amount': 'loan_amount',
    'duration': 'loan_duration',
    'payments': 'loan_monthly_payments',
    'status': 'loan_status'
})

loans.head()

## Credit Cards

In [None]:
cards = pd.read_csv("data/card.csv", sep=";")

# Parse the issue timestamp and keep only its calendar date.
cards['issued'] = pd.to_datetime(cards['issued'], format='%y%m%d %H:%M:%S').dt.date
cards['type'] = cards['type'].astype('category')

# Card-specific column names.
cards = cards.rename(columns={'type': 'card_type', 'issued': 'card_issued'})

cards.head()

## Demographic data

In [None]:
districts = pd.read_csv("data/district.csv", sep=";")

# Descriptive names for the generic A1..A16 columns
# (see https://sorry.vse.cz/~berka/challenge/PAST/index.html)
districts = districts.rename(columns={
    'A1': 'district_id',
    'A2': 'district_name',
    'A3': 'region',
    'A4': 'inhabitants',
    'A5': 'small_municipalities',
    'A6': 'medium_municipalities',
    'A7': 'large_municipalities',
    'A8': 'huge_municipalities',
    'A9': 'cities',
    'A10': 'ratio_urban_inhabitants',
    'A11': 'average_salary',
    'A12': 'unemployment_rate_1995',
    'A13': 'unemployment_rate_1996',
    'A14': 'entrepreneurs_per_1000_inhabitants',
    'A15': 'crimes_committed_1995',
    'A16': 'crimes_committed_1996'
})

# Force the rate and crime columns to numeric; unparseable entries become NaN.
for col in ('unemployment_rate_1995', 'unemployment_rate_1996',
            'crimes_committed_1995', 'crimes_committed_1996'):
    districts[col] = pd.to_numeric(districts[col], errors='coerce')

# Region and district name are stored as categoricals.
districts['region'] = districts['region'].astype('category')
districts['district_name'] = districts['district_name'].astype('category')
districts.head(), districts.dtypes

## Merge Data

# Model Construction

# Feature Engineering

# Model Engineering

# Model Comparison & Selection