# Feature Engineering

In [225]:
# import the libraries
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.multitest import multipletests

In [226]:
# Location of the raw PSP transaction workbook.
# Set the PSP_DATA_PATH environment variable to point at the file on another
# machine; when it is unset, the original local path is used as the fallback,
# so existing runs behave exactly as before.
path = os.environ.get(
    "PSP_DATA_PATH",
    "/Users/maximkiesel/Desktop/Master/Model Engineering/use_case_1/PSP_Jan_Feb_2019.xlsx",
)

In [227]:
# Read the Excel workbook referenced by `path` into a DataFrame
df = pd.read_excel(io=path)

In [228]:
# Break the `tmsp` datetime column into separate calendar features.
# Every new column is derived directly from `tmsp`; keyword order fixes the
# order in which the columns are appended.
df = df.assign(
    year=lambda frame: frame['tmsp'].dt.year,
    month=lambda frame: frame['tmsp'].dt.month,
    day=lambda frame: frame['tmsp'].dt.day,
    day_of_week=lambda frame: frame['tmsp'].dt.dayofweek,  # Monday=0, Sunday=6
    quarter=lambda frame: frame['tmsp'].dt.quarter,
    # weekday values 5 and 6 (Saturday, Sunday) are flagged with 1, all others 0
    is_weekend=lambda frame: frame['tmsp'].dt.weekday.ge(5).astype(int),
    hour=lambda frame: frame['tmsp'].dt.hour,
)

In [229]:
# Names of the features derived from the timestamp column
list_tmsp = [
    "year",
    "month",
    "day",
    "day_of_week",
    "quarter",
    "is_weekend",
    "hour",
]

In [230]:
# Inspect the distinct values of every column split out of the timestamp
list_column = [
    "year",
    "month",
    "day",
    "day_of_week",
    "quarter",
    "is_weekend",
    "hour",
]

for column in list_column:
    # One call emits: blank line, column name, its unique values, separator
    print("", column, df[column].unique(), "-------------", sep="\n")


year
[2019]
-------------

month
[1 2]
-------------

day
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
-------------

day_of_week
[1 2 3 4 5 6 0]
-------------

quarter
[1]
-------------

is_weekend
[0 1]
-------------

hour
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
-------------


In [231]:
# Remove the listed columns from the frame; `tmsp` has already been split
# into separate features in an earlier cell.
df = df.drop(
    columns=["year", "quarter", "Unnamed: 0", "tmsp", "country"],
)

In [232]:
# Turn the continuous `amount` into an integer-coded bin feature:
# cut it into four quantile-based bins, then encode each bin as an integer label.
# The interval bins themselves are not kept as a column in `df`.
amount_quartiles = pd.qcut(df['amount'], q=4)

le = LabelEncoder()
df['amount_bin_encoded'] = le.fit_transform(amount_quartiles)

In [233]:
# One-hot encode the two categorical features `card` and `PSP`
df = pd.get_dummies(
    data=df,
    columns=['card', 'PSP'],
)

In [234]:
# Apply the Benjamini-Hochberg FDR correction to see whether features can be dropped.
# NOTE(review): the p-values are hard-coded; where they were computed is not
# shown in this notebook — confirm their origin and which feature each belongs to.
p_values = np.array([
    0.03634995786578327,
    0.5683037749425706,
    1.144669780335135e-09,
    2.392241166548454e-40,
])

# Only the first two of the four returned values are used below.
reject, pvals_corrected, _, _ = multipletests(pvals=p_values, method='fdr_bh')

for label, values in (
    ('Original p-Werte:', p_values),
    ('After FDR p-Werte:', pvals_corrected),
    ('Rejected H0:', reject),
):
    print(label, values)

Original p-Werte: [3.63499579e-02 5.68303775e-01 1.14466978e-09 2.39224117e-40]
After FDR p-Werte: [4.84666105e-02 5.68303775e-01 2.28933956e-09 9.56896467e-40]
Rejected H0: [ True False  True  True]


In [235]:
# Persist the engineered feature table as an Excel file, without the index column
df.to_excel(excel_writer='feature_engineering.xlsx', index=False)