# Data Preprocessing and Migration

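This notebook loads and preprocesses data from CSV and JSON files and migrates it into a MySQL database (`finance`). It populates the `users`, `mcc_codes`, `cards`, and `transactions` tables. Each of these tables is truncated first, so any rows already in them are deleted before loading.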

In [1]:
import json
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, text

# Configuration variables
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook. The fallbacks are the previous
# hard-coded values, so an environment that sets none of these behaves as before.
MYSQL_HOST = os.environ.get('MYSQL_HOST', 'mysql')
MYSQL_PORT = os.environ.get('MYSQL_PORT', '3306')
MYSQL_DATABASE = os.environ.get('MYSQL_DATABASE', 'finance')
MYSQL_USER = os.environ.get('MYSQL_USER', 'root')
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', 'root123')  # dev-only fallback

# User and password are percent-encoded so characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters when the connection string is parsed.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(MYSQL_USER)}:{quote_plus(MYSQL_PASSWORD)}'
    f'@{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DATABASE}'
)

In [2]:
# TRUNCATE ALL TABLES
# Empties every table this notebook loads, so a re-run starts from a clean state.
truncate_statements = (
    "TRUNCATE TABLE transactions",
    "TRUNCATE TABLE cards",
    "TRUNCATE TABLE mcc_codes",
    "TRUNCATE TABLE users",
)

with engine.begin() as conn:
    # Foreign-key checks are switched off while truncating (presumably because
    # these tables reference each other -- schema not shown here; confirm).
    conn.execute(text("SET FOREIGN_KEY_CHECKS = 0"))
    try:
        for statement in truncate_statements:
            conn.execute(text(statement))
    finally:
        # Always switch the checks back on, even if one of the TRUNCATEs failed.
        conn.execute(text("SET FOREIGN_KEY_CHECKS = 1"))

print("Truncated tables: transactions, cards, mcc_codes, users")

Truncated tables: transactions, cards, mcc_codes, users


In [3]:
# POPULATE USERS TABLE
df = pd.read_csv('/opt/airflow/project_root/datasets/users_data.csv')

# Preprocess monetary columns by removing '$' and converting to numeric.
# regex=False forces '$' to be matched as a literal character on every pandas
# version; interpreted as a regex it is the end-of-string anchor and would strip
# nothing. Values that still cannot be parsed become NaN (errors='coerce').
for money_col in ('per_capita_income', 'yearly_income', 'total_debt'):
    df[money_col] = pd.to_numeric(
        df[money_col].astype(str).str.replace('$', '', regex=False),
        errors='coerce',
    )

# Rename column 'id' to 'client_id' to match the database
df = df.rename(columns={'id': 'client_id'})

# Append to the existing table; the cell's output is to_sql's return value.
df.to_sql('users', con=engine, if_exists='append', index=False, chunksize=60000)

2000

In [4]:
# POPULATE MCC_CODES TABLE

# Load JSON data. The encoding is given explicitly so the result does not depend
# on the platform's default locale encoding.
with open('/opt/airflow/project_root/datasets/mcc_codes.json', encoding='utf-8') as f:
    mcc_dict = json.load(f)

# Each key/value pair of the JSON object becomes one row: key -> 'mcc' (cast to
# int), value -> 'merchant_type'.
df_mcc = pd.DataFrame(list(mcc_dict.items()), columns=['mcc', 'merchant_type'])
df_mcc['mcc'] = df_mcc['mcc'].astype(int)

# Append to the existing table; the cell's output is to_sql's return value.
df_mcc.to_sql('mcc_codes', con=engine, if_exists='append', index=False)

109

In [5]:
# POPULATE CARDS TABLE
df = pd.read_csv('/opt/airflow/project_root/datasets/cards_data.csv')

# column rename
df = df.rename(columns={'id': 'card_id'})

# remove $, convert to numeric
# regex=False forces '$' to be matched as a literal character on every pandas
# version; unparseable values become NaN (errors='coerce').
df['credit_limit'] = pd.to_numeric(
    df['credit_limit'].astype(str).str.replace('$', '', regex=False),
    errors='coerce',
)

# convert date: parse 'MM/YYYY', snap to the first day of that month, keep only
# the date part. Values that do not match the format become NaT.
df['expires'] = (
    pd.to_datetime(df['expires'], format='%m/%Y', errors='coerce')
    .dt.to_period('M')
    .dt.to_timestamp()
    .dt.date
)

# convert date: try 'MM/YYYY' first, then fall back to 'DD/MM/YYYY'.
# Both parses must read the RAW strings. Previously the fallback re-parsed the
# already-converted column, in which every value the first format rejected was
# already NaT, so the fallback could never recover anything.
acct_open_raw = df['acct_open_date']
acct_open = pd.to_datetime(acct_open_raw, format='%m/%Y', errors='coerce')
acct_open = acct_open.fillna(
    pd.to_datetime(acct_open_raw, format='%d/%m/%Y', errors='coerce')
)
df['acct_open_date'] = acct_open.dt.to_period('M').dt.to_timestamp().dt.date

# Append to the existing table; the cell's output is to_sql's return value.
df.to_sql('cards', con=engine, if_exists='append', index=False, chunksize=60000)

6146

In [6]:
# POPULATE TRANSACTIONS TABLE
# NOTE: the whole CSV is read into memory before writing; only the database
# write is chunked (chunksize below).
df = pd.read_csv('/opt/airflow/project_root/datasets/transactions_data.csv')

# column rename
df = df.rename(columns={
    'id': 'transaction_id',
    'date': 'trans_date',
})

# remove $, convert to numeric
# regex=False forces '$' to be matched as a literal character on every pandas
# version; unparseable values become NaN (errors='coerce').
df['amount'] = pd.to_numeric(
    df['amount'].astype(str).str.replace('$', '', regex=False),
    errors='coerce',
)

# convert datetime (format is inferred; unparseable values become NaT)
df['trans_date'] = pd.to_datetime(df['trans_date'], errors='coerce')

# Append to the existing table; the cell's output is to_sql's return value.
df.to_sql('transactions', con=engine, if_exists='append', index=False, chunksize=50000)

13305915