In [1]:
%load_ext lab_black

In [2]:
from typing import List, Dict, Any, Union

In [3]:
import base64
from kubernetes import client, config, utils
import yaml
from http import HTTPStatus

config.load_incluster_config()

with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f:
    NAMESPACE = f.read()

core_v1_api = client.CoreV1Api()

In [4]:
PG_NAMESPACE = f"{NAMESPACE}"

In [5]:
!pip install psycopg[binary,pool]



In [6]:
import psycopg
from psycopg import sql

In [7]:
psycopg.__version__

'3.1.13'

# Gather dataset column names and levels into configmap

In [8]:
import pandas as pd
import numpy as np
import json

df = pd.read_csv("../german_credit_data_biased_training.csv")

for col in df.columns:
    if df[col].dtype == np.dtype("O"):
        df[col] = df[col].astype("category")

column_map: Dict[str, Any] = {}
column_map["columns"] = list(df.columns)

column_map["label_columns"] = {
    col: list(df[col].dtype.categories)
    for col in column_map["columns"]
    if type(df[col].dtype) == pd.core.dtypes.dtypes.CategoricalDtype
}
column_map["int_columns"] = [
    col for col in column_map["columns"] if df[col].dtype == np.dtype("int64")
]

column_map

{'columns': ['CheckingStatus',
  'LoanDuration',
  'CreditHistory',
  'LoanPurpose',
  'LoanAmount',
  'ExistingSavings',
  'EmploymentDuration',
  'InstallmentPercent',
  'Sex',
  'OthersOnLoan',
  'CurrentResidenceDuration',
  'OwnsProperty',
  'Age',
  'InstallmentPlans',
  'Housing',
  'ExistingCreditsCount',
  'Job',
  'Dependents',
  'Telephone',
  'ForeignWorker',
  'Risk'],
 'label_columns': {'CheckingStatus': ['0_to_200',
   'greater_200',
   'less_0',
   'no_checking'],
  'CreditHistory': ['all_credits_paid_back',
   'credits_paid_to_date',
   'no_credits',
   'outstanding_credit',
   'prior_payments_delayed'],
  'LoanPurpose': ['appliances',
   'business',
   'car_new',
   'car_used',
   'education',
   'furniture',
   'other',
   'radio_tv',
   'repairs',
   'retraining',
   'vacation'],
  'ExistingSavings': ['100_to_500',
   '500_to_1000',
   'greater_1000',
   'less_100',
   'unknown'],
  'EmploymentDuration': ['1_to_4',
   '4_to_7',
   'greater_7',
   'less_1',
   'unemp

In [9]:
COLUMNS_CONFIG_MAP_NAME = "credit-risk-columns"
cm = client.V1ConfigMap(
    metadata=client.V1ObjectMeta(name=COLUMNS_CONFIG_MAP_NAME, namespace=NAMESPACE),
    data={"columns": json.dumps(column_map, indent=2)},
)

try:
    core_v1_api.create_namespaced_config_map(namespace=NAMESPACE, body=cm)
except client.ApiException as e:
    if e.status == HTTPStatus.CONFLICT:
        core_v1_api.patch_namespaced_config_map(
            namespace=NAMESPACE, name=COLUMNS_CONFIG_MAP_NAME, body=cm
        )
    else:
        raise

# Split data into test and train datasets

In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, train_size=0.8, random_state=42, shuffle=True)

# PostgreSQL

In [11]:
def get_pg_conn(core_v1_api: client.CoreV1Api) -> psycopg.Connection:
    secret = core_v1_api.read_namespaced_secret("postgresql", PG_NAMESPACE)

    to_str = lambda b64_data: base64.b64decode(b64_data).decode("utf-8")
    host, dbname, username, password, port = (
        f"postgresql.{PG_NAMESPACE}.svc",
        to_str(secret.data["database-name"]),
        to_str(secret.data["database-user"]),
        to_str(secret.data["database-password"]),
        5432,
    )

    conn_str = f"postgresql://{username}:{password}@{host}:{port}/{dbname}?application_name=setup"
    conn = psycopg.connect(conn_str)

    return conn

## Verify pg Connection

In [12]:
import logging
import textwrap

log = logging.getLogger("SETUP")

try:
    db = get_pg_conn(core_v1_api)
except NameError:
    raise
except Exception as e:
    # no better exception to catch
    log.error(e)
    log.error(
        textwrap.dedent(
            """
           PG Connection failed, has postgresql been installed?
            """
        )
    )
    exit()

db

<psycopg.Connection [IDLE] (host=postgresql.ntl-us-ibm-com.svc user=ntl database=credit-risk) at 0x7ff76b23e610>

## DB Utils

In [13]:
def table_exists(table_name: str, db: psycopg.Connection) -> Union[dict, bool]:
    with db.cursor() as cur:
        cur.execute(
            "SELECT EXISTS (SELECT 1 FROM pg_tables WHERE tablename = %s AND schemaname = CURRENT_SCHEMA)",
            (table_name,),
        )
        return cur.fetchone()[0]


def build_check_constraints_sql(
    column_info: Dict[str, Any], conn: psycopg.Connection
) -> List[sql.SQL]:
    constraints = []
    for label_col, levels in column_info["label_columns"].items():
        sql_snippet = sql.SQL("CHECK ({0} IN ({1}))").format(
            sql.Identifier(label_col),
            sql.SQL(", ").join([sql.Literal(level) for level in levels]),
        )
        constraints.append(sql_snippet)
    return constraints


def build_create_table_sql(
    conn: psycopg.Connection,
    table_name: str,
    column_info: Dict[str, Any],
    identity_start: int = 1,
) -> sql.SQL:
    label_columns = [
        sql.SQL("{0} VARCHAR(32000) NOT NULL").format(sql.Identifier(col))
        for col in column_info["label_columns"]
    ]
    int_columns = [
        sql.SQL("{0} BIGINT NOT NULL").format(sql.Identifier(col))
        for col in column_info["int_columns"]
    ]
    constraints = build_check_constraints_sql(column_info, conn)
    id_col = [
        sql.SQL(
            '"ACCOUNT_ID" BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH {})'
        ).format(sql.Literal(identity_start))
    ]

    create_table = sql.SQL("CREATE TABLE {0} ({1})").format(
        sql.Identifier(table_name),
        sql.SQL(", ").join(label_columns + int_columns + constraints + id_col),
    )

    return create_table


def df_to_sql(
    table_name: str,
    db: psycopg.Connection,
    df: pd.DataFrame,
    columns: Dict[str, Any],
    identity_start: int = 1,
) -> None:
    try:
        if table_exists(table_name, db):
            with db.cursor() as cur:
                cur.execute(sql.SQL("DROP TABLE {}").format(sql.Identifier(table_name)))
            db.commit()

        with db.cursor() as cur:
            cur.execute(build_create_table_sql(db, table_name, columns, identity_start))
        db.commit()

        insert_sql = sql.SQL(
            "INSERT INTO {0} ({1}) VALUES("
            + (", ".join(["%s"] * len(columns["columns"])))
            + ")"
        ).format(
            sql.Identifier(table_name),
            sql.SQL(",").join([sql.Identifier(col) for col in columns["columns"]]),
        )

        with db.cursor() as cur:
            df_tuples = tuple([tuple(x) for x in df.loc[:, columns["columns"]].values])
            cur.executemany(insert_sql, df_tuples)
        db.commit()

    except Exception as e:
        db.rollback()
        raise


def insert_from_row_dicts(
    table_name: str,
    row_dicts: List[Dict[str, Union[str, int]]],
    db: psycopg.Connection,
) -> List[int]:
    stmt = None
    stmt_columns = None
    client_ids = []
    try:
        for row_dict in row_dicts:
            row_cols = list(set(row_dict.keys()))

            insert = sql.SQL(
                "INSERT INTO {0} ({1}) VALUES("
                + (", ".join(["%s"] * len(row_cols)))
                + ') RETURNING "ACCOUNT_ID"'
            ).format(
                sql.Identifier(table_name),
                sql.SQL(",").join([sql.Identifier(col) for col in row_cols]),
            )

            with db.cursor() as cur:
                cur.execute(insert, tuple([row_dict[col] for col in row_cols]))
                client_ids = cur.fetchone()[0]
        db.commit()

    except Exception as e:
        db.rollback()
        raise

    return client_ids


def get_column_names(table_name: str, db: psycopg.Connection) -> List[str]:
    with db.cursor() as cur:
        cur.execute(
            sql.SQL(
                "SELECT column_name FROM information_schema.columns WHERE table_name = {} AND table_schema = CURRENT_SCHEMA ORDER BY ORDINAL_POSITION ASC"
            ).format(sql.Literal("my_table"))
        )
    return [row[0] for row in cur.fetchall()]


def get_client_data(
    table_name: str, client_id: int, conn: psycopg.Connection
) -> Dict[str, Any]:
    # https://stackoverflow.com/questions/24006291/postgresql-return-result-set-as-json-array
    with conn.cursor() as cur:
        cur.execute(
            sql.SQL('SELECT json_agg(t) FROM {} AS t WHERE "ACCOUNT_ID" = %s').format(
                sql.Identifier(table_name)
            ),
            (client_id,),
        )
        r = cur.fetchone()
        return r[0][0] if r and r[0] else False

### Create Test and Training tables

In [14]:
df_to_sql("TRAIN", db, train, column_map)
df_to_sql("TEST", db, test, column_map, identity_start=len(train) + 1)

## Create Application table

In [15]:
application_df = test.copy().drop(test.index)
df_to_sql("CLIENT_DATA", db, application_df, column_map, identity_start=9000)

In [16]:
try:
    with db.cursor() as cur:
        cur.execute('ALTER TABLE "CLIENT_DATA" ALTER COLUMN "Risk" DROP NOT NULL')
        cur.execute(
            """
        ALTER TABLE "CLIENT_DATA" 
        ADD COLUMN "PredictedRisk" VARCHAR(32000) CHECK("PredictedRisk" IN ('No Risk', 'Risk'))
        """
        ),

        cur.execute(
            """
        ALTER TABLE "CLIENT_DATA" 
        ADD COLUMN "ExplainRisk" VARCHAR(32000),
        ADD CONSTRAINT "ExplainRisk_CHK" CHECK("PredictedRisk" <> 'Risk' OR "ExplainRisk" IS NOT NULL)
        """
        )

        cur.execute(
            """
        ALTER TABLE "CLIENT_DATA"
        ADD COLUMN "LastChangeTimestamp" TIMESTAMP(6) without time zone DEFAULT clock_timestamp() NOT NULL
        """,
        )
        db.commit()
except Exception as e:
    db.rollback()
    raise

## Insert JSON

In [17]:
input_json = """
{
    "CheckingStatus":"no_checking",
    "LoanDuration":31,
    "CreditHistory":"outstanding_credit",
    "LoanPurpose":"repairs",
    "LoanAmount":8411,
    "ExistingSavings":"500_to_1000",
    "EmploymentDuration":"4_to_7",
    "InstallmentPercent":5,
    "Sex":"male",
    "OthersOnLoan":"co-applicant",
    "CurrentResidenceDuration":5,
    "OwnsProperty":"unknown",
    "Age":46,
    "InstallmentPlans":"none",
    "Housing":"free",
    "ExistingCreditsCount":2,
    "Job":"management_self-employed",
    "Dependents":2,
    "Telephone":"yes",
    "ForeignWorker":"yes",
    "PredictedRisk":"Risk"
}
"""

In [18]:
row_dicts = [json.loads(input_json)]
row_dicts[0]["ExplainRisk"] = " AND ".join(
    ["Age > 36.00", "CurrentResidenceDuration > 3.00", "InstallmentPercent > 4.00"]
)
insert_from_row_dicts("CLIENT_DATA", row_dicts, db)

9000

### Load row as json

In [19]:
print(json.dumps(get_client_data("CLIENT_DATA", 9000, db), indent=2))

{
  "CheckingStatus": "no_checking",
  "CreditHistory": "outstanding_credit",
  "LoanPurpose": "repairs",
  "ExistingSavings": "500_to_1000",
  "EmploymentDuration": "4_to_7",
  "Sex": "male",
  "OthersOnLoan": "co-applicant",
  "OwnsProperty": "unknown",
  "InstallmentPlans": "none",
  "Housing": "free",
  "Job": "management_self-employed",
  "Telephone": "yes",
  "ForeignWorker": "yes",
  "Risk": null,
  "LoanDuration": 31,
  "LoanAmount": 8411,
  "InstallmentPercent": 5,
  "CurrentResidenceDuration": 5,
  "Age": 46,
  "ExistingCreditsCount": 2,
  "Dependents": 2,
  "ACCOUNT_ID": 9000,
  "PredictedRisk": "Risk",
  "ExplainRisk": "Age > 36.00 AND CurrentResidenceDuration > 3.00 AND InstallmentPercent > 4.00",
  "LastChangeTimestamp": "2023-12-04T23:47:10.830091"
}


In [20]:
import pandas as pd

In [21]:
with db.cursor() as cur:
    cur.execute('SELECT * FROM "CLIENT_DATA" LIMIT 1 OFFSET 0')
    df = pd.DataFrame(cur.fetchall(), columns=[desc[0] for desc in cur.description])

df

Unnamed: 0,CheckingStatus,CreditHistory,LoanPurpose,ExistingSavings,EmploymentDuration,Sex,OthersOnLoan,OwnsProperty,InstallmentPlans,Housing,...,LoanAmount,InstallmentPercent,CurrentResidenceDuration,Age,ExistingCreditsCount,Dependents,ACCOUNT_ID,PredictedRisk,ExplainRisk,LastChangeTimestamp
0,no_checking,outstanding_credit,repairs,500_to_1000,4_to_7,male,co-applicant,unknown,none,free,...,8411,5,5,46,2,2,9000,Risk,Age > 36.00 AND CurrentResidenceDuration > 3.0...,2023-12-04 23:47:10.830091


In [22]:
query = psycopg.sql.SQL(
    'SELECT "ACCOUNT_ID", "Risk", "PredictedRisk" FROM {} ORDER BY "ACCOUNT_ID" ASC LIMIT %s OFFSET %s'
).format(psycopg.sql.Identifier("CLIENT_DATA"))

with db.cursor() as cur:
    cur.execute(query, (25, 0))
    columns = [c[0] for c in cur.description]
    row = cur.fetchone()
    while row:
        print({columns[i]: row[i] for i in range(len(columns))})
        row = cur.fetchone()

{'ACCOUNT_ID': 9000, 'Risk': None, 'PredictedRisk': 'Risk'}


In [23]:
with db.cursor() as cur:
    cur.execute(
        psycopg.sql.SQL(
            'SELECT json_agg(t) FROM {} AS t WHERE "ACCOUNT_ID" = %s'
        ).format(psycopg.sql.Identifier("CLIENT_DATA")),
        (9000,),
    )
    r = cur.fetchone()

    print(type(r[0][0]))

<class 'dict'>


In [24]:
row_cols = json.loads(input_json)
insert = psycopg.sql.SQL(
    'INSERT INTO {0} ({1}) VALUES({2}) RETURNING "ACCOUNT_ID"'
).format(
    psycopg.sql.Identifier("CLIENT_DATA"),
    psycopg.sql.SQL(", ").join([psycopg.sql.Identifier(col) for col in row_cols]),
    psycopg.sql.SQL(", ").join([psycopg.sql.SQL("%s")] * len(row_cols)),
)

print(insert.as_string(db))

INSERT INTO "CLIENT_DATA" ("CheckingStatus", "LoanDuration", "CreditHistory", "LoanPurpose", "LoanAmount", "ExistingSavings", "EmploymentDuration", "InstallmentPercent", "Sex", "OthersOnLoan", "CurrentResidenceDuration", "OwnsProperty", "Age", "InstallmentPlans", "Housing", "ExistingCreditsCount", "Job", "Dependents", "Telephone", "ForeignWorker", "PredictedRisk") VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING "ACCOUNT_ID"


In [25]:
db.rollback()
query = sql.SQL(
    'SELECT EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - "LastChangeTimestamp")) / 60 < 1080 FROM "CLIENT_DATA" '
)

with db.cursor() as cur:
    cur.execute(query)
    print(cur.fetchall())

[(True,)]


# Create secret with AWS/MinIo Credentials
# Kubeflow install stores these in mlpipeline-minio-artifact

In [26]:
minio_creds = core_v1_api.read_namespaced_secret("mlpipeline-minio-artifact", NAMESPACE)
AWS_ACCESS_KEY_ID = minio_creds.data["accesskey"]
AWS_SECRET_ACCESS_KEY = minio_creds.data["secretkey"]

In [27]:
new_secret = client.V1Secret(
    metadata=client.V1ObjectMeta(
        name="minio-credentials",
        namespace=NAMESPACE,
        annotations={
            "serving.kserve.io/s3-endpoint": "minio-service.kubeflow:9000",
            "serving.kserve.io/s3-region": "us-west1",
            "serving.kserve.io/s3-useanoncredential": "false",
            "serving.kserve.io/s3-usehttps": "0",
        },
    ),
    data={
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
    },
)

In [29]:
try:
    core_v1_api.create_namespaced_secret(
        NAMESPACE,
        new_secret,
    )
except Exception as e:
    core_v1_api.patch_namespaced_secret("minio-credentials", NAMESPACE, new_secret)