In [1]:
import numpy as np
import pandas as pd
from typing import List
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.creation import MathFeatures
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.selection import (
    SmartCorrelatedSelection,
    DropHighPSIFeatures,
    SelectBySingleFeaturePerformance,
)
from feature_engine.wrappers import SklearnTransformerWrapper


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import pickle

import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
pd.options.display.max_rows = None

pd.options.display.width = None
pd.options.display.max_colwidth = None
pd.options.display.float_format = "{:,.2f}".format

In [2]:
# ======================= Getting data =======================
def reduce_memory_usage(df: DataFrame) -> DataFrame:
    """Reduced memory usage by downcasting datatype of columns.
    Input: DataFrame
    Output: DataFrame"""

    # Downcasting dataframe
    for column in df:
        if df[column].dtype in ["float64", "float32"]:
            df[column] = pd.to_numeric(df[column], downcast="float")
        if df[column].dtype in ["int64", "int32"]:
            df[column] = pd.to_numeric(df[column], downcast="integer")
    return df


def load_data(name: str) -> DataFrame:
    """Loads DataFrame from csv and reduces used memory.
    Parameters: name (the name of csv file without .csv extension)
    Returns: DataFrame"""

    print("-" * 100)
    print(f"{name}.csv loading")
    df = pd.read_csv(f"{name}.csv")
    memory = df.memory_usage().sum() / 1024**2
    df = reduce_memory_usage(df)
    print(
        f"memory usage reduced from {memory:.1f}MB to {(df.memory_usage().sum() / 1024**2):.1f}MB"
    )
    print("-" * 100)
    return df


# ======================= Initial pipe =======================

def organization_replacer(value: any) -> any:
    """Reduces the number of unique values
    where there are subcategories with ':' sign"""

    if value not in [np.nan, None]:
        x = value.split()[0]
        if x[-1] == ":":
            return x[:-1]
        elif x == "Business":
            return "Business"
    return value

def organization(df: pd.DataFrame) -> pd.DataFrame:
    """Replaces organizations and reduces their numbers
    in ORGANIZATION_TYPE column."""
    
    df["ORGANIZATION_TYPE"] = df["ORGANIZATION_TYPE"].map(organization_replacer)
    return df


def encode_education(df: pd.DataFrame) -> pd.DataFrame:
    """Assigns ordinality to NAME_EDUCATION_TYPE column"""
    
    education = {
        "Secondary / secondary special": 1,
        "Higher education": 3,
        "Incomplete higher": 2,
        "Lower secondary": 0,
        "Academic degree": 4,
    }
    df["NAME_EDUCATION_TYPE"] = df["NAME_EDUCATION_TYPE"].map(education)
    
    return  df


def gender_replacer(df: pd.DataFrame) -> pd.DataFrame:
    """Encodes CODE_GENDER column."""
    
    df["CODE_GENDER"].replace({"XNA": np.nan, "M": 0, "F": 1}, inplace=True)
    return df


def sign_change(df: pd.DataFrame) -> pd.DataFrame:
    """Changes sign of chosen columns."""
    
    for col in ["DAYS_BIRTH", "DAYS_LAST_PHONE_CHANGE",
                "DAYS_ID_PUBLISH", "DAYS_EMPLOYED"]:
        df[col] = df[col].apply(lambda x: x*(-1))
    return df


# ======================= Feature pipe =======================

def devision(x: List) -> int:
    """Devides two features from the list
    avoiding ZeroDevisionError."""

    return x[0] / (x[1] + 0.001)


def sum_dev(x: List) -> int:
    """Performs three features math operation
    from the list avoiding ZeroDevisionError."""

    return (x[0] + x[1]) * x[2] / 2


def weighted_mul(x: List) -> int:
    """Gets weighted sum of three values in a list."""
    
    return x[0] * 2 + x[1] * 3 + x[2] * 4


def remove_special_chars(s: str) -> str:
    """Replaces special characters from string with '_'."""

    return "".join(e if e.isalnum() else "_" for e in s)


def standardize_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """Removes special characters from the """
    
    return df.rename(columns=remove_special_chars)


def nn_mean(x: DataFrame, X_train_prep: DataFrame, y_train: pd.Series) -> DataFrame:
    """Adds two columns to DataFrame of mean values of target for 50 and 100
    nearest neighbors od the poin from training set.
    Parameters: x (DataFrame to be transformer),
                X_train_prep (preprocessed DataFrame to be fitted to NearestNeighbors model)
                y_train (Series of target values to be used to calculate means)."""
    
    # Getting columns of interest
    columns_of_int = [
        "EXT_SOURCE_1",
        "EXT_SOURCE_2",
        "EXT_SOURCE_3",
        "AMT_CREDIT",
        "AMT_ANNUITY",
    ]
    # Getting data for fitting
    df_nn = X_train_prep[columns_of_int]
    df_nn["CREDIT_ANNUITY_RATIO"] =  df_nn["AMT_CREDIT"] / (df_nn["AMT_ANNUITY"]+0.0001)
    # Getting data for neighbors
    df_get = x[columns_of_int]
    df_get["CREDIT_ANNUITY_RATIO"] =  df_get["AMT_CREDIT"] / (df_get["AMT_ANNUITY"]+0.0001)
    # 50 neighbors
    # Fitting model with 50 neighbors
    nn_50 = NearestNeighbors(n_neighbors=50).fit(df_nn)
    # Indices of neighbours
    train_50_neighbours = nn_50.kneighbors(df_get)[1]
    # Calculating means
    new_column_1 = [y_train.iloc[ind].mean() for ind in train_50_neighbours]
    # Adding column
    x["MEAN_50_NN"] = new_column_1

    # 100 neighbors
    nn_100 = NearestNeighbors(n_neighbors=100).fit(df_nn)
    train_100_neighbours = nn_50.kneighbors(df_get)[1]
    new_column_2 = [y_train.iloc[ind].mean() for ind in train_100_neighbours]
    x["MEAN_100_NN"] = new_column_2

    return x


# ======================= Merging pipe =======================
def merging(df):
    df = df.merge(merged, on='SK_ID_CURR', how='left')
    df = df.fillna(0)
    return df 


def getting_model_columns(df):

    # Read column names from the text file
    with open('column_names.txt', 'r') as file:
        column_names = file.read().splitlines()
    return df[column_names]

In [3]:
app = load_data("application_train")
merged = pd.read_csv('preprocessed_data/merged.csv', index_col=0)

----------------------------------------------------------------------------------------------------
application_train.csv loading
memory usage reduced from 286.2MB to 129.3MB
----------------------------------------------------------------------------------------------------


In [4]:
# Read in SK_ID_CURR of test and validation set 
with open('test_sk_id_curr.txt', 'r') as f:
    test_val_ids = []
    for line in f:
        sk_id_curr = int(line.strip())
        test_val_ids.append(sk_id_curr)

# Getting training data
app_train = app.loc[~app['SK_ID_CURR'].isin(test_val_ids)]
X_train = app_train.drop(['TARGET'], axis=1)
y_train = app_train['TARGET']

# Loading pipelines from engineering
with open('initial_pipe.pkl', 'rb') as file:
    initial_pipe = pickle.load(file)

with open('preprocess_pipe.pkl', 'rb') as file:
    preprocess_pipe = pickle.load(file)

with open('feature_pipe.pkl', 'rb') as file:
    feature_pipe = pickle.load(file)

In [5]:
X_initial = initial_pipe.transform(X_train)
X_preprocessed = preprocess_pipe.transform(X_initial)
X_features = feature_pipe.transform(X_preprocessed)

In [6]:
merging_pipe = Pipeline(steps=[
                            # Merging with data from other sources
                            ('merging', FunctionTransformer(merging)),
                            # Getting columns that are used for model training
                            ('getting_model_columns', FunctionTransformer(getting_model_columns))
])

X_merged = merging_pipe.fit_transform(X_features)

In [7]:
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

In [8]:
model.predict_proba(X_merged)

array([[0.9896145 , 0.01038554],
       [0.94888335, 0.05111665],
       [0.9904215 , 0.00957855],
       ...,
       [0.9080734 , 0.09192657],
       [0.7530327 , 0.2469673 ],
       [0.9107421 , 0.08925792]], dtype=float32)