## Libraries and variables

In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Data directory (also reused when saving the preprocessed file) and the
# relative path of the cleaned input file inside it.
data_path = '../01_Data/'
file_name_cleaned_data = 'psp-data_cleaned.csv'
file_path_cleaned_data = f'{data_path}{file_name_cleaned_data}'

## Data load

In [21]:
# Read the cleaned data (first CSV column becomes the index) and parse the
# 'tmsp' column into datetimes so the .dt accessor can be used later on.
df = (
    pd.read_csv(file_path_cleaned_data, index_col=0)
    .assign(tmsp=lambda frame: pd.to_datetime(frame['tmsp']))
)

## Preprocessing
https://scikit-learn.org/stable/modules/preprocessing.html \
https://scikit-learn.org/stable/api/sklearn.preprocessing.html

### Data enrichment

In [22]:
# Add fee column
def calculate_fee(row):
    """Return the fee charged for a single transaction.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['PSP']``; ``row['success']`` is only read when the
        PSP is one of the known providers.

    Returns
    -------
    float or int
        The provider-specific fee for a successful (``success == 1``) or
        failed transaction, or ``0`` when the PSP is not in the fee table.
    """
    # PSP name -> (fee for a successful transaction, fee for a failed one)
    fee_table = {
        'Moneycard': (5.0, 2.0),
        'Goldcard': (10.0, 5.0),
        'UK_Card': (3.0, 1.0),
        'Simplecard': (1.0, 0.5),
    }

    fees = fee_table.get(row['PSP'])
    if fees is None:
        return 0

    fee_on_success, fee_on_failure = fees
    if row['success'] == 1:
        return fee_on_success
    return fee_on_failure
    
# axis=1 passes each row to calculate_fee; the results are stored in a new 'fee' column.
df['fee'] = df.apply(calculate_fee, axis=1)

In [23]:
# Add transaction hour (hour component of the 'tmsp' timestamp)
df['transaction_hour'] = df['tmsp'].dt.hour

In [24]:
# Add purchase index
# A transaction belongs to the same purchase as the previous row when it follows
# it by less than 60 seconds and country, amount, card and 3D_secured are equal
def assign_purchase(df):
    """Assign a running purchase number to every transaction in ``df``.

    Each row is compared with the row directly before it. A new purchase
    starts on the first row, whenever 60 seconds or more have passed since the
    previous row, or whenever any of 'country', 'amount', 'card' or
    '3D_secured' differs from the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tmsp' (datetime-like), 'country', 'amount',
        'card' and '3D_secured'. Rows are compared in their current order, so
        the frame is presumably expected to be sorted by 'tmsp' - confirm
        against the data cleaning step.

    Returns
    -------
    list of int
        One purchase number per row, starting at 1; empty for an empty frame.

    Notes
    -----
    A missing value in a key column never compares equal to the previous row
    and therefore starts a new purchase.
    """
    if len(df) == 0:
        return []

    key_columns = ['country', 'amount', 'card', '3D_secured']
    keys = df[key_columns]

    # Seconds elapsed since the previous row (NaN for the first row).
    seconds_since_previous = pd.to_datetime(df['tmsp']).diff().dt.total_seconds()
    gap_too_long = (seconds_since_previous >= 60).to_numpy()

    # True where at least one key column differs from the row above.
    key_changed = keys.ne(keys.shift()).any(axis=1).to_numpy()

    starts_new_purchase = gap_too_long | key_changed
    # The first row always opens purchase number 1.
    starts_new_purchase[0] = True

    # Counting the purchase starts seen so far yields the purchase number.
    return starts_new_purchase.cumsum().tolist()


# Store the purchase number of every transaction in a new 'purchase' column.
df['purchase'] = assign_purchase(df)

In [25]:
# Add amount buckets that are 50 Euros wide.
# The range end is max + 51 so that a bin edge at or above the largest amount
# is always generated; range() needs integers, so this assumes integer amounts.
largest_amount = max(df['amount'])
bucket_edges = range(0, largest_amount + 51, 50)
df['amount_bucket'] = pd.cut(df['amount'], bins=bucket_edges)

### Feature encoding
One-Hot-Encoding, Ordinal-Encoding, Target-Encoding

In [26]:
# One-hot encoding for the columns 'card', 'PSP' and 'country'.
# One-hot encoding was chosen because these columns have only a few distinct
# values and the values have no natural order.
categorical_columns = ['card', 'PSP', 'country']
encoder = OneHotEncoder()

encoded_data = encoder.fit_transform(df[categorical_columns])
# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so leaving
# encoded_df with a default RangeIndex would misalign the rows (and introduce
# NaN rows) whenever df's index is not exactly 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

df = pd.concat([df, encoded_df], axis=1)

### Scaling and normalization
- StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
- Normalizer (L1, L2, Max-Norm)

In [27]:
# Min-max scale 'amount' and 'transaction_hour' into new '*_sc' columns;
# the unscaled columns are kept.
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(df[["amount", "transaction_hour"]])
df[["amount_sc", "transaction_hour_sc"]] = scaled_values
df.head()

Unnamed: 0,tmsp,country,amount,success,PSP,3D_secured,card,fee,transaction_hour,purchase,...,card_Visa,PSP_Goldcard,PSP_Moneycard,PSP_Simplecard,PSP_UK_Card,country_Austria,country_Germany,country_Switzerland,amount_sc,transaction_hour_sc
0,2019-01-01 00:01:11,Germany,89,0,UK_Card,0,Visa,1.0,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.133013,0.0
1,2019-01-01 00:01:17,Germany,89,1,UK_Card,0,Visa,3.0,0,1,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.133013,0.0
2,2019-01-01 00:02:49,Germany,238,0,UK_Card,1,Diners,1.0,0,2,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.371795,0.0
3,2019-01-01 00:03:13,Germany,238,1,UK_Card,1,Diners,3.0,0,2,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.371795,0.0
4,2019-01-01 00:04:33,Austria,124,0,Simplecard,0,Diners,0.5,0,3,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.189103,0.0


## Save preprocessed data

In [28]:
# Write the preprocessed frame into the same data directory as the input.
file_path_preprocessed_data = data_path + 'psp-data_preprocessed.csv'
df.to_csv(file_path_preprocessed_data)