# Finalising pipeline to transform data for predictions <a class="jp-toc-ignore"></a>

The aim of this notebook is to finalize the pipeline that can be used to predict the probability of default for new entries in application_train.csv.

It also provides code to generate an example dictionary that can be used to make a prediction with the deployed app via Swagger: [https://default-risk-fastapi-f4fhso7e5q-nw.a.run.app/docs](https://default-risk-fastapi-f4fhso7e5q-nw.a.run.app/docs)

In [1]:
import numpy as np
import pandas as pd
import random
from typing import List
from pandas.core.frame import DataFrame
import json

from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.creation import MathFeatures
from feature_engine.discretisation import EqualWidthDiscretiser

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import FunctionTransformer

import joblib
import pickle

import warnings

warnings.filterwarnings("ignore")

In [2]:
# ======================= Getting data =======================
def reduce_memory_usage(df: DataFrame) -> DataFrame:
    """Shrink a DataFrame's memory footprint by downcasting numeric columns.

    Columns whose dtype is float64/float32 are passed through
    ``pd.to_numeric(..., downcast="float")`` and columns whose dtype is
    int64/int32 through ``pd.to_numeric(..., downcast="integer")``.
    All other columns are left untouched.

    Input: DataFrame (modified in place)
    Output: the same DataFrame object"""

    float_dtypes = ["float64", "float32"]
    integer_dtypes = ["int64", "int32"]

    for name in df.columns:
        current = df[name].dtype
        if current in float_dtypes:
            df[name] = pd.to_numeric(df[name], downcast="float")
        elif current in integer_dtypes:
            df[name] = pd.to_numeric(df[name], downcast="integer")
    return df


def load_data(name: str) -> DataFrame:
    """Read ``<name>.csv`` into a DataFrame and downcast its numeric columns.

    Prints a short report with the memory footprint before and after
    the downcast, framed by two separator lines.

    Parameters: name (the name of csv file without .csv extension)
    Returns: DataFrame"""

    separator = "-" * 100
    bytes_per_mb = 1024**2

    print(separator)
    print(f"{name}.csv loading")

    df = pd.read_csv(f"{name}.csv")
    before_mb = df.memory_usage().sum() / bytes_per_mb

    df = reduce_memory_usage(df)
    after_mb = df.memory_usage().sum() / bytes_per_mb

    print(f"memory usage reduced from {before_mb:.1f}MB to {after_mb:.1f}MB")
    print(separator)
    return df


# ======================= Initial pipe =======================


def organization_replacer(value: object) -> object:
    """Collapse an organisation label to its top-level category.

    The first whitespace-separated token decides the result:
      * if it ends with ':' (e.g. "Trade: type 7"), the token without
        the colon is returned ("Trade");
      * if it is exactly "Business" (e.g. "Business Entity Type 3"),
        "Business" is returned;
      * otherwise the original value is returned unchanged.

    Parameters: value (a label; any non-string such as NaN or None is
                passed through untouched)
    Returns: the collapsed label, or ``value`` itself when nothing applies"""

    # Type check instead of comparing against np.nan: a membership test only
    # recognises the np.nan singleton, so other NaN objects would reach
    # .split() and raise AttributeError.
    if not isinstance(value, str):
        return value

    tokens = value.split()
    # Empty / whitespace-only strings have no first token to inspect.
    if not tokens:
        return value

    first = tokens[0]
    if first.endswith(":"):
        return first[:-1]
    if first == "Business":
        return first
    return value


def organization(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the labels in the ORGANIZATION_TYPE column.

    Every value of the column is mapped through ``organization_replacer``
    and the result is written back onto ``df``, which is then returned."""

    column = "ORGANIZATION_TYPE"
    collapsed = df[column].map(organization_replacer)
    df[column] = collapsed
    return df


def encode_education(df: pd.DataFrame) -> pd.DataFrame:
    """Replace NAME_EDUCATION_TYPE labels with ordinal ranks 0-4.

    Labels that are not in the mapping are left as they are. The column
    is overwritten on ``df`` and ``df`` is returned."""

    column = "NAME_EDUCATION_TYPE"
    # Listed from lowest to highest rank
    rank_by_label = {
        "Lower secondary": 0,
        "Secondary / secondary special": 1,
        "Incomplete higher": 2,
        "Higher education": 3,
        "Academic degree": 4,
    }
    encoded = df[column].replace(rank_by_label)
    df[column] = encoded

    return df


def gender_replacer(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the CODE_GENDER column numerically.

    "M" becomes 0, "F" becomes 1 and the placeholder "XNA" becomes NaN;
    any other value is left unchanged. The column is overwritten on
    ``df`` and ``df`` is returned."""

    # Assign the result back rather than calling replace(inplace=True) on
    # df["CODE_GENDER"]: that form is a chained in-place update, which pandas
    # warns about and which does not modify df under Copy-on-Write.
    df["CODE_GENDER"] = df["CODE_GENDER"].replace({"XNA": np.nan, "M": 0, "F": 1})
    return df


def sign_change(df: pd.DataFrame) -> pd.DataFrame:
    """Multiply the DAYS_* columns listed below by -1.

    Each listed column is overwritten on ``df`` with its element-wise
    negation; ``df`` itself is returned."""

    def _negate(value):
        return value * (-1)

    columns_to_flip = (
        "DAYS_BIRTH",
        "DAYS_LAST_PHONE_CHANGE",
        "DAYS_ID_PUBLISH",
        "DAYS_EMPLOYED",
    )
    for name in columns_to_flip:
        df[name] = df[name].apply(_negate)
    return df


# ======================= Feature pipe =======================


def devision(x: List) -> int:
    """Return ``x[0] / (x[1] + 0.001)``.

    The constant 0.001 is added to the divisor so that a second element
    equal to zero does not raise ZeroDivisionError."""

    numerator = x[0]
    divisor = x[1] + 0.001
    return numerator / divisor


def sum_dev(x: List) -> int:
    """Return ``(x[0] + x[1]) * x[2] / 2`` for a three-element sequence.

    The only division is by the constant 2."""

    pair_sum = x[0] + x[1]
    scaled = pair_sum * x[2]
    return scaled / 2


def weighted_mul(x: List) -> int:
    """Return the weighted sum ``2*x[0] + 3*x[1] + 4*x[2]``."""

    first_term = x[0] * 2
    second_term = x[1] * 3
    third_term = x[2] * 4
    return first_term + second_term + third_term


def remove_special_chars(s: str) -> str:
    """Return ``s`` with every non-alphanumeric character replaced by '_'.

    A character counts as alphanumeric when ``str.isalnum`` says so."""

    cleaned = []
    for char in s:
        cleaned.append(char if char.isalnum() else "_")
    return "".join(cleaned)


def standardize_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the columns of ``df`` by passing each column label
    through ``remove_special_chars``, and return the renamed frame."""

    renamed = df.rename(mapper=remove_special_chars, axis="columns")
    return renamed


def nn_mean(x: DataFrame, X_train_prep: DataFrame, y_train: pd.Series) -> DataFrame:
    """Adds two columns to ``x``: the mean target value of the 50 and of the
    100 nearest neighbours of each row, with neighbours taken from the
    training set.

    Neighbours are searched in the space spanned by EXT_SOURCE_1..3,
    AMT_CREDIT, AMT_ANNUITY and the derived CREDIT_ANNUITY_RATIO.

    Parameters: x (DataFrame to be transformed; the columns MEAN_50_NN and
                   MEAN_100_NN are added to it in place),
                X_train_prep (preprocessed training DataFrame the
                   NearestNeighbors models are fitted on),
                y_train (Series of target values, positionally aligned with
                   the rows of X_train_prep, used to calculate the means).
    Returns: ``x`` with the two new columns.

    NOTE(review): MEAN_100_NN used to be computed from the 50-neighbour model
    and was therefore identical to MEAN_50_NN. Any model trained on features
    produced by that version should be re-fitted on regenerated features."""

    # Getting columns of interest
    columns_of_int = [
        "EXT_SOURCE_1",
        "EXT_SOURCE_2",
        "EXT_SOURCE_3",
        "AMT_CREDIT",
        "AMT_ANNUITY",
    ]

    def _neighbour_space(frame: DataFrame) -> DataFrame:
        """Selects the columns of interest and appends CREDIT_ANNUITY_RATIO."""
        subset = frame[columns_of_int]
        # assign() builds a new frame, so nothing is written onto a slice of
        # the caller's data; 0.0001 guards against a zero annuity.
        return subset.assign(
            CREDIT_ANNUITY_RATIO=subset["AMT_CREDIT"] / (subset["AMT_ANNUITY"] + 0.0001)
        )

    # Data the neighbour models are fitted on / queried with
    df_nn = _neighbour_space(X_train_prep)
    df_get = _neighbour_space(x)

    for n_neighbors in (50, 100):
        # Each feature is built from its own model
        neighbours_model = NearestNeighbors(n_neighbors=n_neighbors).fit(df_nn)
        # kneighbors returns (distances, indices); only the indices are needed
        neighbour_indices = neighbours_model.kneighbors(df_get)[1]
        # Mean target over the neighbours of every queried row
        x[f"MEAN_{n_neighbors}_NN"] = [
            y_train.iloc[ind].mean() for ind in neighbour_indices
        ]

    return x


# ======================= Merging pipe =======================
def merging(df):
    """Left-joins ``df`` with the module-level ``merged`` table on
    SK_ID_CURR and fills every missing value in the result with 0.

    ``merged`` holds the columns generated through aggregations from
    previous applications."""

    joined = df.merge(merged, on="SK_ID_CURR", how="left")
    return joined.fillna(0)


def getting_model_columns(df):
    """Returns ``df`` restricted to the columns listed in column_names.txt.

    The file is read from the working directory and is expected to hold
    one column name per line; the columns are returned in file order."""

    # Column names come from the text file, one per line
    with open("column_names.txt", "r") as handle:
        wanted = handle.read().splitlines()

    selected = df[wanted]
    return selected

# Loading data

In [3]:
# Raw applications table, read from application_train.csv in the working directory
app = load_data("application_train")
# Pre-aggregated features; the first CSV column becomes the index.
# `merging` looks this name up at module level, so it must stay called `merged`.
merged = pd.read_csv("preprocessed_data/merged.csv", index_col=0)

----------------------------------------------------------------------------------------------------
application_train.csv loading
memory usage reduced from 286.2MB to 129.3MB
----------------------------------------------------------------------------------------------------


# Loading model

In [4]:
# Loading XGBClassifier that was trained in Modeling part.
# joblib.load unpickles the file, so only load model files from a trusted source.
model = joblib.load("model.joblib")

# Preparation of final pipeline

As this step is done only once and the final pipeline is stored, the code is commented.

In [5]:
# # Read in SK_ID_CURR of test and validation set
# with open("test_sk_id_curr.txt", "r") as f:
#     test_val_ids = []
#     for line in f:
#         sk_id_curr = int(line.strip())
#         test_val_ids.append(sk_id_curr)

# # Getting training data
# app_train = app.loc[~app["SK_ID_CURR"].isin(test_val_ids)]
# X_train = app_train.drop(["TARGET"], axis=1)
# y_train = app_train["TARGET"]

# # Loading pipelines from engineering
# with open("initial_pipe.pkl", "rb") as file:
#     initial_pipe = pickle.load(file)

# with open("preprocess_pipe.pkl", "rb") as file:
#     preprocess_pipe = pickle.load(file)

# with open("feature_pipe.pkl", "rb") as file:
#     feature_pipe = pickle.load(file)

# # Transforming features
# X_initial = initial_pipe.transform(X_train)
# X_preprocessed = preprocess_pipe.transform(X_initial)
# X_features = feature_pipe.transform(X_preprocessed)

# # Creating and fitting pipeline for merging and selecting features
# merging_pipe = Pipeline(
#     steps=[
#         # Merging with data from other sources
#         ("merging", FunctionTransformer(merging)),
#         # Getting columns that are used for model training
#         ("getting_model_columns", FunctionTransformer(getting_model_columns)),
#     ]
# )

# # Fitting with preprocessed features
# merging_pipe.fit(X_features)

# # Storing pipeline
# pickle.dump(merging_pipe, open("merging_pipe.pkl", "wb"))

# # Making a final pipeline
# final_pipeline = Pipeline(
#     [
#         ("initial", initial_pipe),
#         ("preprocess", preprocess_pipe),
#         ("feature", feature_pipe),
#         ("merge", merging_pipe),
#     ]
# )

# # Storing pipeline
# pickle.dump(final_pipeline, open("final_pipeline.pkl", "wb"))

# Loading final pipeline

In [6]:
# Load final pipeline.
# NOTE(review): the pipeline wraps plain functions (e.g. `merging`,
# `getting_model_columns`) and pickle stores functions by name — presumably the
# helper definitions above must be executed before this cell; confirm.
# Unpickling runs code from the file, so only load trusted pickles.
with open("final_pipeline.pkl", "rb") as file:
    final_pipeline = pickle.load(file)

# Testing final pipeline

To test the pipeline we will use 4 random indices from application train.

In [7]:
# Draw four random row positions (drawn independently, so repeats are possible)
n_rows = app.shape[0]
random_numbers = [random.randrange(n_rows) for _ in range(4)]

# Features only: everything in application_train.csv except the target
X = app.drop(columns=["TARGET"])

# Rows that will be scored
X_testing = X.iloc[random_numbers, :]

# Run the selected rows through the final pipeline
X_pipeline = final_pipeline.transform(X_testing)

print("Probability of defaulting: ")
# Second column of predict_proba is the probability of the positive class
default_probabilities = model.predict_proba(X_pipeline)[:, 1]
for i, proba in zip(random_numbers, default_probabilities):
    print(f"index {i}: {proba}")

Probability of defaulting: 
index 142105: 0.011837313883006573
index 8373: 0.006339050829410553
index 27306: 0.021818062290549278
index 205324: 0.023363055661320686


# Making batch predictions on new entries in application_train locally

1. Select rows of interest from the dataframe and store in variable ```X_pred```
2. Transform with final pipeline: ```X_transformed = final_pipeline.transform(X_pred)```
3. Make predictions of defaulting: ```model.predict_proba(X_transformed)```

# Generating input for API predictions

This section provides example code to generate entries for predictions with the deployed app at [https://default-risk-fastapi-f4fhso7e5q-nw.a.run.app/docs](https://default-risk-fastapi-f4fhso7e5q-nw.a.run.app/docs)

* Go to the API
* Click *predict*
* Click *Try it out*
* Generate the input with code

```python
# Choosing an example entry (row position 123)
X_api = X.iloc[[123], :]

# Transforming
X_api = final_pipeline.transform(X_api)

# Store as a list with one dictionary per row
# ('records' must be spelled out in full; pandas >= 2.0 rejects 'record')
X_api = X_api.to_dict(orient='records')
```
* Copy the dictionary and paste it into the *Try it out* request body
* Get the prediction

In [8]:
# Choosing an example entry (row position 123)
X_api = X.iloc[[123], :]

# Transforming; every column is cast to float before serialising
X_api = final_pipeline.transform(X_api).astype(float)

# Store in dictionary: to_dict returns one dict per row, keep the single row.
# 'records' must be spelled out in full; pandas >= 2.0 rejects 'record'.
X_api = X_api.to_dict(orient="records")[0]
json_X_api = json.dumps(X_api)
json_X_api

'{"CODE_GENDER": 1.0, "AMT_INCOME_TOTAL": 112500.0, "AMT_CREDIT": 535500.0, "AMT_ANNUITY": 30028.5, "AMT_GOODS_PRICE": 535500.0, "NAME_EDUCATION_TYPE": 1.0, "REGION_POPULATION_RELATIVE": 0.028663000091910362, "DAYS_BIRTH": 16759.0, "DAYS_EMPLOYED": 4560.0, "OWN_CAR_AGE": 9.0, "FLAG_EMP_PHONE": 1.0, "REGION_RATING_CLIENT": 2.0, "HOUR_APPR_PROCESS_START": 4.0, "REG_CITY_NOT_LIVE_CITY": 0.0, "REG_CITY_NOT_WORK_CITY": 0.0, "LIVE_CITY_NOT_WORK_CITY": 0.0, "EXT_SOURCE_1": 0.5057128667831421, "EXT_SOURCE_2": 0.6116366982460022, "APARTMENTS_AVG": 0.08449999988079071, "YEARS_BEGINEXPLUATATION_AVG": 0.9757000207901001, "YEARS_BUILD_AVG": 0.6668000221252441, "ELEVATORS_AVG": 0.0, "FLOORSMAX_AVG": 0.16670000553131104, "FLOORSMIN_AVG": 0.20829999446868896, "LANDAREA_AVG": 0.042100001126527786, "LIVINGAREA_AVG": 0.0640999972820282, "APARTMENTS_MODE": 0.08609999716281891, "BASEMENTAREA_MODE": 0.06750000268220901, "YEARS_BEGINEXPLUATATION_MODE": 0.9757000207901001, "YEARS_BUILD_MODE": 0.67979997396469

Getting the output with predictive probability of defaulting: 

![default](images/default_risk_test.png)

# Conclusions

The business needs are not defined in the requirements of the project. Creating a single pipeline with preprocessing, feature engineering and the predictive model is possible, but it would require more computational power and would generate additional costs on GCP. That is why the transformations are done locally, with an option to call the model with prepared data. The solution may not be ideal, but it is good enough for exploring possibilities.