In [23]:
import pandas as pd
import numpy as np
import os

In [24]:
# unpack the zip file
from shutil import unpack_archive
os.makedirs('./data', exist_ok=True)
unpack_archive('data.zip', '.')

# read data into pandas
account = pd.read_csv("./data/account.asc", delimiter=';')
card = pd.read_csv("./data/card.asc", delimiter=';')
client = pd.read_csv("./data/client.asc", delimiter=';')
disp = pd.read_csv("./data/disp.asc", delimiter=';')
district = pd.read_csv("./data/district.asc", delimiter=';')
loan = pd.read_csv("./data/loan.asc", delimiter=';')
orders = pd.read_csv("./data/orders.asc", delimiter=';')
trans = pd.read_csv("./data/trans.asc", delimiter=';')

# remove data files
from shutil import rmtree
rmtree('./data', ignore_errors=True)

In [25]:
# info about DataFrames content
#trans_data.info() 

In [26]:
# cleanup data and define types

account["account_id"] = account["account_id"].astype(np.int32)
account["district_id"] = account["district_id"].astype(np.int32)
account["frequency"] = account["frequency"].astype('str')
account["date"] = pd.to_datetime(account["date"],format='%y%m%d')
account["date"] = account["date"].apply(lambda x: x.date())
account.set_index('account_id', inplace=True, verify_integrity=True)

card["card_id"] = card["card_id"].astype(np.int32)
card["disp_id"] = card["disp_id"].astype(np.int32)
card["type"] = card["type"].astype('str')
card["issued"] = pd.to_datetime(card["issued"],format='%y%m%d %H:%M:%S')
card.set_index('card_id', inplace=True, verify_integrity=True)

client["client_id"] = client["client_id"].astype(np.int32)
client["district_id"] = client["district_id"].astype(np.int32)
client["birth_number"] = client["birth_number"].astype('str')
client.set_index('client_id', inplace=True, verify_integrity=True)

disp["disp_id"] = disp["disp_id"].astype(np.int32)
disp["client_id"] = disp["client_id"].astype(np.int32)
disp["account_id"] = disp["account_id"].astype(np.int32)
disp["type"] = disp["type"].astype('str')
disp.set_index('disp_id', inplace=True, verify_integrity=True)

district.columns = district.columns.str.lower()
district["a12"] = pd.to_numeric(district["a12"], errors='coerce')
district["a15"] = pd.to_numeric(district["a15"], errors='coerce')
district["a1"] = district["a1"].astype(np.int32)
district["a2"] = district["a2"].astype('str')
district["a3"] = district["a3"].astype('str')
district["a4"] = district["a4"].astype(np.int32)
district["a5"] = district["a5"].astype(np.int32)
district["a6"] = district["a6"].astype(np.int32)
district["a7"] = district["a7"].astype(np.int32)
district["a8"] = district["a8"].astype(np.int32)
district["a9"] = district["a9"].astype(np.int32)
district["a10"] = district["a10"].astype(np.float32)
district["a11"] = district["a11"].astype(np.int32)
district["a12"] = district["a12"].astype(np.float32)
district["a13"] = district["a13"].astype(np.float32)
district["a14"] = district["a14"].astype(np.int32)
# use float due to ? in the data -> allows for NaN 
district["a15"] = district["a15"].astype(np.float32)
district["a16"] = district["a16"].astype(np.int32)
district.set_index('a1', inplace=True, verify_integrity=True)

loan["loan_id"] = loan["loan_id"].astype(np.int32)
loan["account_id"] = loan["account_id"].astype(np.int32)
loan["date"] = pd.to_datetime(loan["date"],format='%y%m%d')
loan["date"] = loan["date"].apply(lambda x: x.date())
loan["amount"] = loan["amount"].astype(np.float32)
loan["duration"] = loan["duration"].astype(np.int32)
loan["payments"] = loan["payments"].astype(np.float32)
loan["status"] = loan["status"].astype('category')
loan.set_index('loan_id', inplace=True, verify_integrity=True)

orders["order_id"] = orders["order_id"].astype(np.int32)
orders["account_id"] = orders["account_id"].astype(np.int32)
orders["bank_to"] = orders["bank_to"].astype('str')
orders["account_to"] = orders["account_to"].astype('str')
orders["amount"] = orders["amount"].astype(np.float32)
orders["k_symbol"] = orders["k_symbol"].astype('str')
orders.set_index('order_id', inplace=True, verify_integrity=True)

trans["trans_id"] = trans["trans_id"].astype(np.int32)
trans["account_id"] = trans["account_id"].astype(np.int32)
trans["date"] = pd.to_datetime(trans["date"],format='%y%m%d')
trans["date"] = trans["date"].apply(lambda x: x.date())
trans["type"] = trans["type"].astype('str')
trans["operation"] = trans["operation"].astype('str')
trans["amount"] = trans["amount"].astype(np.float32)
trans["balance"] = trans["balance"].astype(np.float32)
trans["k_symbol"] = trans["k_symbol"].astype('str')
trans["bank"] = trans["bank"].astype('str')
trans["account"] = trans["account"].astype('str')
trans.set_index('trans_id', inplace=True, verify_integrity=True)

In [27]:
# host is the bridge gateway of dbnet
DB_HOST = '192.168.0.1' 
DB_PORT = '5432'
DB_DBNAME = 'bank_db'
DB_USERNAME = 'bank_user' 
DB_PASSWORD = 'bank_pw' 
db_str = 'postgresql://{username}:{password}@{host}:{port}/{dbname}'.format(username=DB_USERNAME,password=DB_PASSWORD,host=DB_HOST,port=DB_PORT,dbname=DB_DBNAME)
print(db_str)
#%load_ext sql
#%sql postgresql://bank_user:bank_pw@localhost:5432/bank_db
#%sql select version()

postgresql://bank_user:bank_pw@192.168.0.1:5432/bank_db


In [28]:
# install postgres dependency
# needed to use specific commands since DataFrame.to_sql does not set primary nor foreign keys (https://github.com/pandas-dev/pandas/issues/7984)
!pip install psycopg2
import psycopg2 as pg
drop_commands = (
        """
        DROP TABLE IF EXISTS trans
        """,
        """
        DROP TABLE IF EXISTS orders
        """,
        """
        DROP TABLE IF EXISTS loan
        """,
        """
        DROP TABLE IF EXISTS card
        """,
        """
        DROP TABLE IF EXISTS disp
        """,
        """
        DROP TABLE IF EXISTS client
        """,
        """
        DROP TABLE IF EXISTS account
        """,
        """
        DROP TABLE IF EXISTS district
        """
)
create_commands = (
        """
        CREATE TABLE IF NOT EXISTS district (
            a1 INT PRIMARY KEY NOT NULL,
            a2 TEXT,
            a3 TEXT,
            a4 INT,
            a5 INT,
            a6 INT,
            a7 INT,
            a8 INT,
            a9 INT,
            a10 FLOAT,
            a11 INT,
            a12 FLOAT,
            a13 FLOAT,
            a14 INT,
            a15 INT,
            a16 INT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS account (
            account_id INT PRIMARY KEY NOT NULL,
            district_id INT NOT NULL REFERENCES district,
            date DATE,
            frequency TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS client (
            client_id INT PRIMARY KEY NOT NULL,
            birth_number TEXT,
            district_id INT NOT NULL REFERENCES district
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS disp (
            disp_id INT PRIMARY KEY NOT NULL,
            client_id INT NOT NULL REFERENCES client,
            account_id INT NOT NULL REFERENCES account,
            type TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS card (
            card_id INT PRIMARY KEY NOT NULL,
            disp_id INT NOT NULL REFERENCES disp,
            type TEXT,
            issued TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS loan (
            loan_id INT PRIMARY KEY NOT NULL,
            account_id INT NOT NULL REFERENCES account,
            date DATE,
            amount FLOAT,
            duration INT,
            payments FLOAT,
            status TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS orders (
            order_id INT PRIMARY KEY NOT NULL,
            account_id INT NOT NULL REFERENCES account,
            bank_to TEXT,
            account_to TEXT,
            amount FLOAT,
            k_symbol TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS trans (
            trans_id INT PRIMARY KEY NOT NULL,
            account_id INT NOT NULL REFERENCES account,
            date DATE,
            type TEXT,
            operation TEXT,
            amount FLOAT,
            balance FLOAT,
            k_symbol TEXT,
            bank TEXT,
            account TEXT
        )
        """
)
conn = pg.connect(db_str)
cur = conn.cursor()
# drop table one by one
for command in drop_commands:
    cur.execute(command)
# create table one by one
for command in create_commands:
    cur.execute(command)
# close communication with the PostgreSQL database server
cur.close()
# commit the changes
conn.commit()
# close the connection
conn.close()

adf
adf
adf
adf
adf
adf
adf
adf


In [8]:
import pandas.io.sql as sql
import sqlalchemy

engine = sqlalchemy.create_engine(db_str)
# insert data
print(district)
district.to_sql('district', engine, if_exists='append', chunksize=1000)
account.to_sql('account', engine, if_exists='append', chunksize=1000)
client.to_sql('client', engine, if_exists='append', chunksize=1000)
disp.to_sql('disp', engine, if_exists='append', chunksize=1000)
card.to_sql('card', engine, if_exists='append', chunksize=1000)
loan.to_sql('loan', engine, if_exists='append', chunksize=1000)
orders.to_sql('orders', engine, if_exists='append', chunksize=1000)
trans.to_sql('trans', engine, if_exists='append', chunksize=1000)

                     a2               a3       a4   a5  a6  a7  a8  a9  \
a1                                                                       
1           Hl.m. Praha           Prague  1204953    0   0   0   1   1   
2               Benesov  central Bohemia    88884   80  26   6   2   5   
3                Beroun  central Bohemia    75232   55  26   4   1   5   
4                Kladno  central Bohemia   149893   63  29   6   2   6   
5                 Kolin  central Bohemia    95616   65  30   4   1   6   
6            Kutna Hora  central Bohemia    77963   60  23   4   2   4   
7                Melnik  central Bohemia    94725   38  28   1   3   6   
8        Mlada Boleslav  central Bohemia   112065   95  19   7   1   8   
9               Nymburk  central Bohemia    81344   61  23   4   2   6   
10       Praha - vychod  central Bohemia    92084   55  29   4   3   5   
11        Praha - zapad  central Bohemia    75637   35  36   9   0   7   
12              Pribram  central Bohem

IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "district_pkey"
DETAIL:  Key (a1)=(1) already exists.

[SQL: INSERT INTO district (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16) VALUES (%(a1)s, %(a2)s, %(a3)s, %(a4)s, %(a5)s, %(a6)s, %(a7)s, %(a8)s, %(a9)s, %(a10)s, %(a11)s, %(a12)s, %(a13)s, %(a14)s, %(a15)s, %(a16)s)]
[parameters: ({'a1': 1, 'a2': 'Hl.m. Praha', 'a3': 'Prague', 'a4': 1204953, 'a5': 0, 'a6': 0, 'a7': 0, 'a8': 1, 'a9': 1, 'a10': 100.0, 'a11': 12541, 'a12': 0.28999999165534973, 'a13': 0.4300000071525574, 'a14': 167, 'a15': 85677.0, 'a16': 99107}, {'a1': 2, 'a2': 'Benesov', 'a3': 'central Bohemia', 'a4': 88884, 'a5': 80, 'a6': 26, 'a7': 6, 'a8': 2, 'a9': 5, 'a10': 46.70000076293945, 'a11': 8507, 'a12': 1.6699999570846558, 'a13': 1.850000023841858, 'a14': 132, 'a15': 2159.0, 'a16': 2674}, {'a1': 3, 'a2': 'Beroun', 'a3': 'central Bohemia', 'a4': 75232, 'a5': 55, 'a6': 26, 'a7': 4, 'a8': 1, 'a9': 5, 'a10': 41.70000076293945, 'a11': 8980, 'a12': 1.9500000476837158, 'a13': 2.2100000381469727, 'a14': 111, 'a15': 2824.0, 'a16': 2813}, {'a1': 4, 'a2': 'Kladno', 'a3': 'central Bohemia', 'a4': 149893, 'a5': 63, 'a6': 29, 'a7': 6, 'a8': 2, 'a9': 6, 'a10': 67.4000015258789, 'a11': 9753, 'a12': 4.639999866485596, 'a13': 5.050000190734863, 'a14': 109, 'a15': 5244.0, 'a16': 5892}, {'a1': 5, 'a2': 'Kolin', 'a3': 'central Bohemia', 'a4': 95616, 'a5': 65, 'a6': 30, 'a7': 4, 'a8': 1, 'a9': 6, 'a10': 51.400001525878906, 'a11': 9307, 'a12': 3.8499999046325684, 'a13': 4.429999828338623, 'a14': 118, 'a15': 2616.0, 'a16': 3040}, {'a1': 6, 'a2': 'Kutna Hora', 'a3': 'central Bohemia', 'a4': 77963, 'a5': 60, 'a6': 23, 'a7': 4, 'a8': 2, 'a9': 4, 'a10': 51.5, 'a11': 8546, 'a12': 2.950000047683716, 'a13': 4.019999980926514, 'a14': 126, 'a15': 2640.0, 'a16': 3120}, {'a1': 7, 'a2': 'Melnik', 'a3': 'central Bohemia', 'a4': 94725, 'a5': 38, 'a6': 28, 'a7': 1, 'a8': 3, 'a9': 6, 'a10': 63.400001525878906, 'a11': 9920, 'a12': 2.259999990463257, 'a13': 2.869999885559082, 'a14': 130, 'a15': 4289.0, 'a16': 4846}, {'a1': 8, 'a2': 'Mlada Boleslav', 'a3': 'central Bohemia', 'a4': 112065, 'a5': 95, 'a6': 19, 'a7': 7, 'a8': 1, 'a9': 8, 'a10': 69.4000015258789, 'a11': 11277, 'a12': 1.25, 'a13': 1.440000057220459, 'a14': 127, 'a15': 5179.0, 'a16': 4987}  ... displaying 10 of 77 total bound parameter sets ...  {'a1': 76, 'a2': 'Sumperk', 'a3': 'north Moravia', 'a4': 127369, 'a5': 31, 'a6': 32, 'a7': 13, 'a8': 2, 'a9': 7, 'a10': 51.20000076293945, 'a11': 8369, 'a12': 4.730000019073486, 'a13': 5.880000114440918, 'a14': 107, 'a15': 3736.0, 'a16': 2807}, {'a1': 77, 'a2': 'Vsetin', 'a3': 'north Moravia', 'a4': 148545, 'a5': 8, 'a6': 35, 'a7': 12, 'a8': 3, 'a9': 4, 'a10': 53.5, 'a11': 8909, 'a12': 4.010000228881836, 'a13': 5.559999942779541, 'a14': 113, 'a15': 3460.0, 'a16': 3590})]
(Background on this error at: http://sqlalche.me/e/gkpj)