In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# unpack the zip file
from shutil import unpack_archive
os.makedirs('./data')
unpack_archive('data.zip', '.')

# read data into pandas
account = pd.read_csv("./data/account.asc", delimiter=';')
card = pd.read_csv("./data/card.asc", delimiter=';')
client = pd.read_csv("./data/client.asc", delimiter=';')
disp = pd.read_csv("./data/disp.asc", delimiter=';')
district = pd.read_csv("./data/district.asc", delimiter=';')
loan = pd.read_csv("./data/loan.asc", delimiter=';')
orders = pd.read_csv("./data/order.asc", delimiter=';')
trans = pd.read_csv("./data/trans.asc", delimiter=';')

# remove data files
from shutil import rmtree
rmtree('./data', ignore_errors=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# info about DataFrames content
#trans_data.info() 

In [4]:
# cleanup data and define types

account["account_id"] = account["account_id"].astype(np.int32)
account["district_id"] = account["district_id"].astype(np.int32)
account["frequency"] = account["frequency"].astype('str')
account["date"] = pd.to_datetime(account["date"],format='%y%m%d')
account["date"] = account["date"].apply(lambda x: x.date())
account.set_index('account_id', inplace=True, verify_integrity=True)

card["card_id"] = card["card_id"].astype(np.int32)
card["disp_id"] = card["disp_id"].astype(np.int32)
card["type"] = card["type"].astype('str')
card["issued"] = pd.to_datetime(card["issued"],format='%y%m%d %H:%M:%S')
card.set_index('card_id', inplace=True, verify_integrity=True)

client["client_id"] = client["client_id"].astype(np.int32)
client["district_id"] = client["district_id"].astype(np.int32)
client["birth_number"] = client["birth_number"].astype('str')
client.set_index('client_id', inplace=True, verify_integrity=True)

disp["disp_id"] = disp["disp_id"].astype(np.int32)
disp["client_id"] = disp["client_id"].astype(np.int32)
disp["account_id"] = disp["account_id"].astype(np.int32)
disp["type"] = disp["type"].astype('str')
disp.set_index('disp_id', inplace=True, verify_integrity=True)

district.columns = district.columns.str.lower()
district["a12"] = pd.to_numeric(district["a12"], errors='coerce')
district["a15"] = pd.to_numeric(district["a15"], errors='coerce')
district["a1"] = district["a1"].astype(np.int32)
district["a2"] = district["a2"].astype('str')
district["a3"] = district["a3"].astype('str')
district["a4"] = district["a4"].astype(np.int32)
district["a5"] = district["a5"].astype(np.int32)
district["a6"] = district["a6"].astype(np.int32)
district["a7"] = district["a7"].astype(np.int32)
district["a8"] = district["a8"].astype(np.int32)
district["a9"] = district["a9"].astype(np.int32)
district["a10"] = district["a10"].astype(np.float32)
district["a11"] = district["a11"].astype(np.int32)
district["a12"] = district["a12"].astype(np.float32)
district["a13"] = district["a13"].astype(np.float32)
district["a14"] = district["a14"].astype(np.int32)
# use float due to ? in the data -> allows for NaN 
district["a15"] = district["a15"].astype(np.float32)
district["a16"] = district["a16"].astype(np.int32)
district.set_index('a1', inplace=True, verify_integrity=True)

loan["loan_id"] = loan["loan_id"].astype(np.int32)
loan["account_id"] = loan["account_id"].astype(np.int32)
loan["date"] = pd.to_datetime(loan["date"],format='%y%m%d')
loan["date"] = loan["date"].apply(lambda x: x.date())
loan["amount"] = loan["amount"].astype(np.float32)
loan["duration"] = loan["duration"].astype(np.int32)
loan["payments"] = loan["payments"].astype(np.float32)
loan["status"] = loan["status"].astype('category')
loan.set_index('loan_id', inplace=True, verify_integrity=True)

orders["order_id"] = orders["order_id"].astype(np.int32)
orders["account_id"] = orders["account_id"].astype(np.int32)
orders["bank_to"] = orders["bank_to"].astype('str')
orders["account_to"] = orders["account_to"].astype('str')
orders["amount"] = orders["amount"].astype(np.float32)
orders["k_symbol"] = orders["k_symbol"].astype('str')
orders.set_index('order_id', inplace=True, verify_integrity=True)

trans["trans_id"] = trans["trans_id"].astype(np.int32)
trans["account_id"] = trans["account_id"].astype(np.int32)
trans["date"] = pd.to_datetime(trans["date"],format='%y%m%d')
trans["date"] = trans["date"].apply(lambda x: x.date())
trans["type"] = trans["type"].astype('str')
trans["operation"] = trans["operation"].astype('str')
trans["amount"] = trans["amount"].astype(np.float32)
trans["balance"] = trans["balance"].astype(np.float32)
trans["k_symbol"] = trans["k_symbol"].astype('str')
trans["bank"] = trans["bank"].astype('str')
trans["account"] = trans["account"].astype('str')
trans.set_index('trans_id', inplace=True, verify_integrity=True)

In [5]:
# host is the bridge gateway of dbnet
DB_HOST = '192.168.0.1' 
DB_PORT = '5432'
DB_DBNAME = 'bank_db'
DB_USERNAME = 'bank_user' 
DB_PASSWORD = 'bank_pw' 
db_str = 'postgresql://{username}:{password}@{host}:{port}/{dbname}'.format(username=DB_USERNAME,password=DB_PASSWORD,host=DB_HOST,port=DB_PORT,dbname=DB_DBNAME)
print(db_str)
#%load_ext sql
#%sql postgresql://bank_user:bank_pw@localhost:5432/bank_db
#%sql select version()

postgresql://bank_user:bank_pw@192.168.0.1:5432/bank_db


In [6]:
# install postgres dependency
# needed to use specific commands since DataFrame.to_sql does not set primary nor foreign keys (https://github.com/pandas-dev/pandas/issues/7984)
!pip install psycopg2
import psycopg2 as pg
drop_commands = (
        """
        DROP TABLE IF EXISTS trans
        """,
        """
        DROP TABLE IF EXISTS orders
        """,
        """
        DROP TABLE IF EXISTS loan
        """,
        """
        DROP TABLE IF EXISTS card
        """,
        """
        DROP TABLE IF EXISTS disp
        """,
        """
        DROP TABLE IF EXISTS client
        """,
        """
        DROP TABLE IF EXISTS account
        """,
        """
        DROP TABLE IF EXISTS district
        """
)
create_commands = (
        """
        CREATE TABLE IF NOT EXISTS district (
            a1 INT PRIMARY KEY NOT NULL,
            a2 TEXT,
            a3 TEXT,
            a4 INT,
            a5 INT,
            a6 INT,
            a7 INT,
            a8 INT,
            a9 INT,
            a10 FLOAT,
            a11 INT,
            a12 FLOAT,
            a13 FLOAT,
            a14 INT,
            a15 INT,
            a16 INT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS account (
            account_id INT PRIMARY KEY NOT NULL,
            district_id INT NOT NULL REFERENCES district,
            date DATE,
            frequency TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS client (
            client_id INT PRIMARY KEY NOT NULL,
            birth_number TEXT,
            district_id INT NOT NULL REFERENCES district
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS disp (
            disp_id INT PRIMARY KEY NOT NULL,
            client_id INT NOT NULL REFERENCES client,
            account_id INT NOT NULL REFERENCES account,
            type TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS card (
            card_id INT PRIMARY KEY NOT NULL,
            disp_id INT NOT NULL REFERENCES disp,
            type TEXT,
            issued TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS loan (
            loan_id INT PRIMARY KEY NOT NULL,
            account_id INT NOT NULL REFERENCES account,
            date DATE,
            amount FLOAT,
            duration INT,
            payments FLOAT,
            status TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS orders (
            order_id INT PRIMARY KEY NOT NULL,
            account_id INT NOT NULL REFERENCES account,
            bank_to TEXT,
            account_to TEXT,
            amount FLOAT,
            k_symbol TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS trans (
            trans_id INT PRIMARY KEY NOT NULL,
            account_id INT NOT NULL REFERENCES account,
            date DATE,
            type TEXT,
            operation TEXT,
            amount FLOAT,
            balance FLOAT,
            k_symbol TEXT,
            bank TEXT,
            account TEXT
        )
        """
)
conn = pg.connect(db_str)
cur = conn.cursor()
# drop table one by one
for command in drop_commands:
    cur.execute(command)
# create table one by one
for command in create_commands:
    cur.execute(command)
# close communication with the PostgreSQL database server
cur.close()
# commit the changes
conn.commit()
# close the connection
conn.close()

Collecting psycopg2
[?25l  Downloading https://files.pythonhosted.org/packages/23/7e/93c325482c328619870b6cd09370f6dbe1148283daca65115cd63642e60f/psycopg2-2.8.2.tar.gz (368kB)
[K     |████████████████████████████████| 368kB 2.9MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: psycopg2
  Building wheel for psycopg2 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/3b/d0/9c/fbbaca1e768e108fdcb88a9a50ea43de141adf842741f8623f
Successfully built psycopg2
Installing collected packages: psycopg2
Successfully installed psycopg2-2.8.2


In [7]:
import pandas.io.sql as sql
import sqlalchemy

engine = sqlalchemy.create_engine(db_str)
# insert data
district.to_sql('district', engine, if_exists='append', chunksize=1000)
account.to_sql('account', engine, if_exists='append', chunksize=1000)
client.to_sql('client', engine, if_exists='append', chunksize=1000)
disp.to_sql('disp', engine, if_exists='append', chunksize=1000)
card.to_sql('card', engine, if_exists='append', chunksize=1000)
loan.to_sql('loan', engine, if_exists='append', chunksize=1000)
orders.to_sql('orders', engine, if_exists='append', chunksize=1000)
trans.to_sql('trans', engine, if_exists='append', chunksize=1000)