Basic imports

In [1]:
import pandas as pd
import numpy as np
import warnings

from autoimpute.imputations import SingleImputer
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import (
    PolynomialFeatures,
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    RobustScaler,
    Normalizer,
)
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Load the nb_black IPython extension (presumably auto-formats cells with black — TODO confirm).
%load_ext nb_black



<IPython.core.display.Javascript object>

Importing data

In [2]:
# Read the merged train and test tables (in that order) from CSV files
# located in the current working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("merged_train.csv", "merged_test.csv")
)

<IPython.core.display.Javascript object>

### 2.3 Data cleaning

In [3]:
# Column-name lists per dtype family, both taken from the training frame.
# The first two numeric columns are left out of the numerical list
# (presumably the row identifier and TARGET — TODO confirm against the CSV).
numerical_features = (
    train_df.select_dtypes(include=np.number).columns[2:].to_list()
)
categorical_features = train_df.select_dtypes(include=object).columns.to_list()

<IPython.core.display.Javascript object>

In [4]:
# Fill NaNs in the numerical columns using SimpleImputer's "constant"
# strategy (no explicit fill_value is given, so its default is used).
# The imputer is fitted on the training frame only and then applied,
# unchanged, to both frames.
imputer = SimpleImputer(strategy="constant", missing_values=np.nan)
imputer.fit(train_df[numerical_features])

train_df[numerical_features] = imputer.transform(train_df[numerical_features])
test_df[numerical_features] = imputer.transform(test_df[numerical_features])

<IPython.core.display.Javascript object>

In [5]:
# Fill NaNs in the categorical (object) columns using SimpleImputer's
# "constant" strategy (no explicit fill_value is given, so its default is used).
# As for the numerical columns, fitting happens on the training frame only.
imputer = SimpleImputer(strategy="constant", missing_values=np.nan)
imputer.fit(train_df[categorical_features])

train_df[categorical_features] = imputer.transform(train_df[categorical_features])
test_df[categorical_features] = imputer.transform(test_df[categorical_features])

<IPython.core.display.Javascript object>

### 2-4 Feature engineering.

**2-4-1** Creating additional features. Here I have two problems:
1. The majority of the existing features have a really low correlation coefficient with the target. I don't really need lots of poor-quality features.
2. The dataset is huge. To prevent memory errors and crashes, I will do everything in small steps.

I'm planning to generate these features: 1/X, X^2, X^3, X*Y, X/Y, X+Y, X-Y. Before a feature is added, its correlation coefficient with the target will be checked.

In [6]:
def create_reversed_feature(series: pd.Series) -> pd.Series:
    """From feature A creates new feature A**(-1) and returns it as a series.

    The exponent is passed as a float so that integer-typed input is promoted
    to floating point. NumPy refuses to raise integers to a negative integer
    power and would otherwise raise ``ValueError``.

    Zeros are not special-cased: they follow floating-point rules and come
    back as ``inf``.

    Args:
        series: numeric feature values.

    Returns:
        Series holding the element-wise reciprocal of ``series``.
    """
    rev_series = np.power(series, -1.0)
    return rev_series


def create_squared_feature(series: pd.Series) -> pd.Series:
    """Raise every value of feature A to the power of two and return the result."""
    return series**2


def create_cubic_feature(series: pd.Series) -> pd.Series:
    """Raise every value of feature A to the power of three and return the result."""
    return series**3


def create_product_feature(series1: pd.Series, series2: pd.Series) -> pd.Series:
    """Multiply feature A by feature B element-wise and return the product series."""
    return series1 * series2


def create_division_feature(series1: pd.Series, series2: pd.Series) -> pd.Series:
    """Divide feature A by feature B element-wise and return the quotient series.

    A small constant is added to the denominator before dividing, so the
    result is A / (B + 1e-7) rather than the exact quotient.
    """
    epsilon = 0.0000001
    denominator = series2 + epsilon
    return series1 / denominator


def create_addition_feature(series1: pd.Series, series2: pd.Series) -> pd.Series:
    """Add feature A and feature B element-wise and return the sum series."""
    return series1 + series2


def create_subtraction_feature(series1: pd.Series, series2: pd.Series) -> pd.Series:
    """Subtract feature B from feature A element-wise and return the difference series."""
    return series1 - series2

<IPython.core.display.Javascript object>

In [7]:
# Try every one-argument transformation on every numerical feature.
# Each entry pairs the suffix of the new column name with the function that
# builds the transformed series; the tuple order is also the order in which
# accepted columns are appended to the frames.
single_series_transforms = (
    ("_reversed", create_reversed_feature),
    ("^2", create_squared_feature),
    ("^3", create_cubic_feature),
)

for feature in numerical_features:
    for suffix, transform in single_series_transforms:
        train_candidate = transform(train_df[feature])
        # A candidate is kept only when its correlation with TARGET, measured
        # on the training frame, is at least 0.05. The comparison is signed,
        # so candidates with a negative (or NaN) correlation are skipped.
        # NOTE(review): confirm that skipping strong negative correlations is intended.
        if train_df["TARGET"].corr(train_candidate) >= 0.05:
            new_column = str(feature) + suffix
            # The test frame receives the same transformation; the decision
            # is based on the training data alone.
            test_df[new_column] = transform(test_df[feature])
            train_df[new_column] = train_candidate

<IPython.core.display.Javascript object>

In [8]:
# Try every two-argument transformation on every ordered pair of distinct
# numerical features. Each entry pairs the operator symbol used in the new
# column name with the function that combines the two series; the tuple order
# is also the order in which accepted columns are appended to the frames.
pairwise_transforms = (
    ("*", create_product_feature),
    ("/", create_division_feature),
    ("+", create_addition_feature),
    ("-", create_subtraction_feature),
)

for feature1 in numerical_features:
    for feature2 in numerical_features:
        # A feature is never combined with itself.
        # NOTE(review): both (A, B) and (B, A) are visited, so the commutative
        # product and addition candidates are evaluated once per ordering.
        if feature1 == feature2:
            continue

        for symbol, combine in pairwise_transforms:
            train_candidate = combine(train_df[feature1], train_df[feature2])
            # A candidate is kept only when its correlation with TARGET,
            # measured on the training frame, is at least 0.1. The comparison
            # is signed, so negative (or NaN) correlations are skipped.
            if train_df["TARGET"].corr(train_candidate) >= 0.1:
                new_column = str(feature1) + symbol + str(feature2)
                # Same combination on the test frame; the decision is based
                # on the training data alone.
                test_df[new_column] = combine(test_df[feature1], test_df[feature2])
                train_df[new_column] = train_candidate

<IPython.core.display.Javascript object>

**2-4-2** Creating relevant features inspired by Kaggle grandmasters ([source link](https://github.com/rishabhrao1997/Home-Credit-Default-Risk/blob/main/Feature%20Engineering%20and%20Modelling.ipynb))

In [9]:
# Ratio of credit amount to annuity amount; the small constant in the
# denominator avoids division by zero.
for data in [train_df, test_df]:
    data["CREDIT_ANNUITY_RATIO"] = data["AMT_CREDIT_x"] / (
        data["AMT_ANNUITY_x"] + 0.00001
    )

# KNN feature: mean TARGET of the 500 nearest training rows, where distance
# is measured on the columns listed below.
knn_columns = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "CREDIT_ANNUITY_RATIO"]

knn = KNeighborsClassifier(500, n_jobs=-1)
train_data_for_neighbors = train_df[knn_columns]
train_target = train_df.TARGET
test_data_for_neighbors = test_df[knn_columns]

knn.fit(train_data_for_neighbors, train_target)

# For the training rows, kneighbors() is called WITHOUT passing the data:
# scikit-learn then returns the neighbours of every fitted point and does not
# count the point as its own neighbour. Passing the training data explicitly
# would make each row its own nearest neighbour and leak its own TARGET into
# the feature computed for it.
train_500_neighbors = knn.kneighbors()[1]
test_500_neighbors = knn.kneighbors(test_data_for_neighbors)[1]

# The neighbour indices are positions in the fitted training data, so the
# targets are looked up positionally in a plain NumPy array (cheaper than a
# pandas .iloc lookup per row).
train_target_values = train_target.to_numpy()

train_df["TARGET_NEIGHBORS_500_MEAN"] = [
    train_target_values[neighbor_idx].mean() for neighbor_idx in train_500_neighbors
]
test_df["TARGET_NEIGHBORS_500_MEAN"] = [
    train_target_values[neighbor_idx].mean() for neighbor_idx in test_500_neighbors
]

<IPython.core.display.Javascript object>

**2-4-3** Applying one hot encoding to categorical features.

In [10]:
# One-hot encoder for the categorical columns. With handle_unknown="ignore",
# categories not seen while fitting do not raise an error at transform time.
ohe = OneHotEncoder(handle_unknown="ignore")

<IPython.core.display.Javascript object>

In [11]:
# Encode the categorical columns and attach the encoded columns to each frame.
# The encoder is fitted on the training frame and reused, already fitted, on
# the test frame. The encoded frames carry default integer column labels and
# a default index, and are joined to the originals on the index.
train_encoded = pd.DataFrame(
    ohe.fit_transform(train_df[categorical_features]).toarray()
)
train_df = train_df.join(train_encoded)

test_encoded = pd.DataFrame(ohe.transform(test_df[categorical_features]).toarray())
test_df = test_df.join(test_encoded)

<IPython.core.display.Javascript object>

In [12]:
# Remove the original (non-encoded) categorical columns from both frames.
train_df = train_df.drop(columns=categorical_features)
test_df = test_df.drop(columns=categorical_features)

<IPython.core.display.Javascript object>

**2-4-4** Scaling features.

In [13]:
# Show the number of columns in the train and test frames side by side.
print(len(train_df.columns), len(test_df.columns))

563 562


<IPython.core.display.Javascript object>

In [16]:
# Scaling is done in small batches of columns to limit peak memory use.
# MinMaxScaler scales each column independently, so splitting the columns
# into batches does not change the scaled values.
SCALING_BATCH_SIZE = 100

# Batch boundaries are expressed as column positions in test_df and are
# derived from the actual frame width instead of a hard-coded column count.
# test_df features start at position 1; the matching train_df column sits one
# position further right, so train_df features start at position 2.
# NOTE(review): this assumes test_df has the same columns in the same order as
# train_df apart from one extra leading column in train_df (presumably TARGET
# at position 1) — confirm against the data.
n_train_columns = train_df.shape[1]
batches = list(range(1, n_train_columns - 1, SCALING_BATCH_SIZE))
batches.append(n_train_columns - 1)

for counter in range(len(batches) - 1):
    test_columns = slice(batches[counter], batches[counter + 1])
    train_columns = slice(batches[counter] + 1, batches[counter + 1] + 1)

    # A fresh scaler per batch: fitted on the training columns only, then
    # applied to the corresponding test columns.
    scaler = MinMaxScaler()
    train_df.iloc[:, train_columns] = scaler.fit_transform(
        train_df.iloc[:, train_columns]
    )
    test_df.iloc[:, test_columns] = scaler.transform(test_df.iloc[:, test_columns])

<IPython.core.display.Javascript object>

**2-4-5** Undersampling.

In [18]:
# Reduce the size of train_df and balance the target distribution by randomly
# under-sampling the majority class. The seed is fixed so that re-running the
# notebook draws the same sample.
RANDOM_STATE = 42
undersample = RandomUnderSampler(
    sampling_strategy="majority", random_state=RANDOM_STATE
)

# Fit and apply the transform: column 1 is used as the target and columns
# from position 2 onward as the features.
X_under, y_under = undersample.fit_resample(train_df.iloc[:, 2:], train_df.iloc[:, 1])

# Save to a DataFrame: TARGET first, then the features relabelled "1", "2", ...
# The number of labels is taken from the resampled data instead of being
# hard-coded, so the cell keeps working if the feature count changes.
under_train_df = pd.DataFrame()
under_train_df["TARGET"] = y_under
under_train_df[[str(x) for x in range(1, X_under.shape[1] + 1)]] = X_under

<IPython.core.display.Javascript object>

**2-4-6** Saving train and test datasets

In [None]:
# Write the full train frame, the test frame and the under-sampled train
# frame to CSV files, without the index column.
for frame, csv_name in (
    (train_df, "train1.csv"),
    (test_df, "test1.csv"),
    (under_train_df, "train_under1.csv"),
):
    frame.to_csv(csv_name, index=False)