In [9]:
import numpy as np
import pandas as pd
from typing import List
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.creation import MathFeatures
from feature_engine.discretisation import EqualWidthDiscretiser
from feature_engine.selection import (
    SmartCorrelatedSelection,
    DropHighPSIFeatures,
    SelectBySingleFeaturePerformance,
)
from feature_engine.wrappers import SklearnTransformerWrapper


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score


import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None
pd.options.display.max_rows = None

pd.options.display.width = None
pd.options.display.max_colwidth = None
pd.options.display.float_format = "{:,.2f}".format

In [None]:
app = load_data("application_train")
merged = pd.read_csv('preprocessed_data/merged.csv', index_col=0)

In [116]:
def organization_replacer(value: any) -> any:
    """Reduces the number of unique values
    where there are subcategories with ':' sign"""

    if value not in [np.nan, None]:
        x = value.split()[0]
        if x[-1] == ":":
            return x[:-1]
        elif x == "Business":
            return "Business"
    return value

def remove_special_chars(s: str) -> str:
    """Replaces special characters from string with '_'."""

    return "".join(e if e.isalnum() else "_" for e in s)


def change_sign(value: [int, float]) -> [int, float]:
    """Changes sign of negative numerical values."""

    if value < 0:
        return value * (-1)


def y_n_encode(value: str) -> int:
    """Encodes values 'Y' with 1 and 'N' with 0."""

    if value == "Y":
        return 1
    elif value == "N":
        return 0

def nn_mean(x: DataFrame, X_tain_prep: DataFrame, y_train: pd.Series) -> DataFrame:
    """Adds two columns to DataFrame of mean values of target for 50 and 100
    nearest neighbors od the poin from training set.
    Parameters: x (DataFrame to be transformer),
                X_train_prep (preprocessed DataFrame to be fitted to NearestNeighbors model)
                y_train (Series of target values to be used to calculate means)."""
    
    # Getting columns of interest
    columns_of_int = [
        "EXT_SOURCE_1",
        "EXT_SOURCE_2",
        "EXT_SOURCE_3",
        "AMT_CREDIT",
        "AMT_ANNUITY",
    ]
    # Getting data for fitting
    df_nn = X_train_prep[columns_of_int]
    df_nn["CREDIT_ANNUITY_RATIO"] =  df_nn["AMT_CREDIT"] / (df_nn["AMT_ANNUITY"]+0.0001)
    # Getting data for neighbors
    df_get = x[columns_of_int]
    df_get["CREDIT_ANNUITY_RATIO"] =  df_get["AMT_CREDIT"] / (df_get["AMT_ANNUITY"]+0.0001)
    # 50 neighbors
    # Fitting model with 50 neighbors
    nn_50 = NearestNeighbors(n_neighbors=50).fit(df_nn)
    # Indices of neighbours
    train_50_neighbours = nn_50.kneighbors(df_get)[1]
    # Calculating means
    new_column_1 = [y_train.iloc[ind].mean() for ind in train_50_neighbours]
    # Adding column
    x["MEAN_50_NN"] = new_column_1

    # 100 neighbors
    nn_100 = NearestNeighbors(n_neighbors=100).fit(df_nn)
    train_100_neighbours = nn_50.kneighbors(df_get)[1]
    new_column_2 = [y_train.iloc[ind].mean() for ind in train_100_neighbours]
    x["MEAN_100_NN"] = new_column_2

    return x


def devision(x: List) -> int:
    """Devides two features from the list
    avoiding ZeroDevisionError."""

    return x[0] / (x[1] + 0.001)


def sum_dev(x: List) -> int:
    """Performs three features math operation
    from the list avoiding ZeroDevisionError."""

    return (x[0] + x[1]) * x[2] / 2


def weighted_mul(x: List) -> int:
    """Gets weighted sum of three values in a list."""
    return x[0] * 2 + x[1] * 3 + x[2] * 4
    
# Assign ordinality to NAME_EDUCATION_TYPE
def encode_education(df: pd.DataFrame) -> pd.DataFrame:
    """Assigns ordinality to NAME_EDUCATION_TYPE column"""
    
    education = {
        "Secondary / secondary special": 1,
        "Higher education": 3,
        "Incomplete higher": 2,
        "Lower secondary": 0,
        "Academic degree": 4,
    }
    df["NAME_EDUCATION_TYPE"] = df["NAME_EDUCATION_TYPE"].map(education)
    
    return  df

def merging(df):
    df = df.merge(merged, on='SK_ID_CURR', how='left')
    df = df.fillna(0)
    return df 



def getting_model_columns(df):

    # Read column names from the text file
    with open('column_names.txt', 'r') as file:
        column_names = file.read().splitlines()
    return df[column_names]


def organization(df):
    df["ORGANIZATION_TYPE"] = df["ORGANIZATION_TYPE"].map(
    organization_replacer)
    return df


def gender_replacer(df):
    df["CODE_GENDER"].replace({"XNA": np.nan, "M": 0, "F": 1}, inplace=True)
    return df


def sign_change(df):
    for col in ["DAYS_BIRTH", "DAYS_LAST_PHONE_CHANGE", "DAYS_ID_PUBLISH", "DAYS_EMPLOYED"]:
        df[col] = df[col].map(change_sign)
    return df

# ======================= Getting data =======================
def reduce_memory_usage(df: DataFrame) -> DataFrame:
    """Reduced memory usage by downcasting datatype of columns.
    Input: DataFrame
    Output: DataFrame"""

    # Downcasting dataframe
    for column in df:
        if df[column].dtype in ["float64", "float32"]:
            df[column] = pd.to_numeric(df[column], downcast="float")
        if df[column].dtype in ["int64", "int32"]:
            df[column] = pd.to_numeric(df[column], downcast="integer")
    return df


def load_data(name: str) -> DataFrame:
    """Loads DataFrame from csv and reduces used memory.
    Parameters: name (the name of csv file without .csv extension)
    Returns: DataFrame"""

    print("-" * 100)
    print(f"{name}.csv loading")
    df = pd.read_csv(f"{name}.csv")
    memory = df.memory_usage().sum() / 1024**2
    df = reduce_memory_usage(df)
    print(
        f"memory usage reduced from {memory:.1f}MB to {(df.memory_usage().sum() / 1024**2):.1f}MB"
    )
    print("-" * 100)
    return df

In [122]:
small_app = app.iloc[:1000, :]
small_app.head(2)
X_train = small_app
y_train = X_train['TARGET']

In [124]:
initial_pipe = Pipeline(steps=[
                                ('education_type', FunctionTransformer(encode_education)),
                                ('organization', FunctionTransformer(organization)), 
                                ('gender', FunctionTransformer(gender_replacer)),
                                ('sign_change', FunctionTransformer(sign_change)),
                                ('ewd', EqualWidthDiscretiser(bins=6, variables=["HOUR_APPR_PROCESS_START"])),
])

preprocess_pipe = Pipeline(steps=[
                                # Reduce cardinality
                                ("rle", RareLabelEncoder(tol=0.04, n_categories=1, variables=categorical, missing_values="ignore")),
                                # Replace NA by the median in numerical features
                                (
                                    "continuous_var_median_imputer",
                                    MeanMedianImputer(imputation_method="median", variables=numerical),
                                ),
                                # Replace NA in discrete variables by most frequent value
                                (
                                    "discreate_var_mode_imputer",
                                    CategoricalImputer(
                                        imputation_method="frequent", ignore_format=True, variables=discrete
                                    ),
                                ),
                                # Replace NA by adding the label "Missing" in categorical variables
                                (
                                    "categorical_imputer",
                                    CategoricalImputer(fill_value="XNA", variables=categorical),
                                ),
                                # Encode categorical variables using one hot encoding
                                ("one_hot_encoder", OneHotEncoder(variables=categorical)),
])

feature_pipe = Pipeline(
    steps=[
        # Adding features
        (
            "f_1",
            MathFeatures(
                ["DAYS_EMPLOYED", "DAYS_BIRTH"], devision, ["RATIO_EMPLOYED_TO_AGE"]
            ),
        ),
        (
            "f_2",
            MathFeatures(
                ["AMT_ANNUITY", "AMT_INCOME_TOTAL"],
                devision,
                ["RATIO_ANNUITY_TO_INCOME"],
            ),
        ),
        (
            "f_3",
            MathFeatures(
                [
                    "REGION_RATING_CLIENT",
                    "REGION_RATING_CLIENT_W_CITY",
                    "AMT_INCOME_TOTAL",
                ],
                sum_dev,
                ["REGION_TO_INCOME"],
            ),
        ),
        (
            "f_4",
            MathFeatures(
                ["AMT_INCOME_TOTAL", "EXT_SOURCE_3"],
                devision,
                ["RATIO_INCOME_TO_EXT_SOURCE"],
            ),
        ),
        (
            "f_5",
            MathFeatures(
                ["AMT_CREDIT", "EXT_SOURCE_3"], devision, ["RATIO_CREDIT_TO_EXT_SOURCE"]
            ),
        ),
        (
            "f_6",
            MathFeatures(
                ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"],
                [np.sum, np.mean, np.max, np.min, np.prod, weighted_mul],
                [
                    "SUM_EXT_SOURCES",
                    "MEAN_EXT_SOURCES",
                    "MAX_EXTERNAL_SOURCES",
                    "MIN_EXT_SOURCES",
                    "PROD_EXT_SOURCES",
                    "WEIGHTED_EXT_SOURCES",
                ],
            ),
        ),
        (
            "f_7",
            MathFeatures(
                ["AMT_CREDIT", "AMT_ANNUITY"], devision, ["CREDIT_ANNUITY_RATIO"]
            ),
        ),
        (
            "f_8",
            MathFeatures(
                ["REGION_POPULATION_RELATIVE", "AMT_CREDIT"],
                np.prod,
                ["PROD_REGION_POPULATION_AMT_CREDIT"],
            ),
        ),
        (
            "f_9",
            MathFeatures(
                ["REGION_RATING_CLIENT", "AMT_INCOME_TOTAL"],
                np.prod,
                ["PROD_REGION_RATING_AMT_INCOME"],
            ),
        ),
        (
            "f_10",
            MathFeatures(
                ["AMT_INCOME_TOTAL", "CNT_CHILDREN"], devision, ["INCOME_PER_CHILD"]
            ),
        ),
        (
            "f_11",
            MathFeatures(
                ["AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"], devision, ["INCOME_PER_PERSON"]
            ),
        ),
        (
            "f_12",
            MathFeatures(
                ["AMT_ANNUITY", "AMT_INCOME_TOTAL"], devision, ["ANNUITY_INCOME_PERC"]
            ),
        ),
        (
            "f_13",
            MathFeatures(["AMT_ANNUITY", "AMT_CREDIT"], devision, ["PAYMENT_RATE"]),
        ),
        (
            "col_names",
            FunctionTransformer(lambda x: x.rename(columns=remove_special_chars)),
        ),
        ("nearest_neighbours", FunctionTransformer(nn_mean, kw_args={'X_tain_prep': X_train,
                                                                     'y_train': y_train})
        ),
    ]
)


merging_pipe = Pipeline(steps=[
                            # Merging with data from other sources
                            ('merging', FunctionTransformer(merging)),
                            # Getting columns that are used for model training
                            ('getting_model_columns', FunctionTransformer(getting_model_columns))
])

In [133]:
small_app = app.iloc[:1000, :]
small_app.head(2)
X_train = small_app
y_train = X_train['TARGET']

In [None]:
X_for_columns = initial_pipe.fit_transform(X_train)

# List of categorical variables
categorical = [var for var in X_for_columns.columns if X_for_columns[var].dtype == "O"]

# List of numerical variables
numerical = [var for var in X_for_columns.columns if X_for_columns[var].dtype != "O"]

# List of discrete variables
discrete = [var for var in numerical if len(X_for_columns[var].unique()) < 20]

# Continuous variables
numerical = [var for var in numerical if var not in discrete]


In [134]:
pipe = Pipeline(steps=[
    ('initial_pipe', initial_pipe),
    ('preprocess_pipe', preprocess_pipe),
    ('feature_pipe', feature_pipe),
    ('merging_pipe', merging_pipe)
])

X_train = pipe.fit_transform(X_train, y_train)

In [97]:
import pickle
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

In [135]:
model.predict_proba(X_train)

array([[0.702756  , 0.297244  ],
       [0.96980566, 0.03019435],
       [0.9596382 , 0.04036184],
       ...,
       [0.98946756, 0.01053243],
       [0.9121358 , 0.08786421],
       [0.91791207, 0.08208794]], dtype=float32)