In [1]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

### Read the Data

In [2]:
# Load the transactions file, then discard its first column
# (presumably an index column written when the CSV was saved -- confirm).
df = pd.read_csv('data/data.csv')
df = df.drop(columns=df.columns[:1])

### Dealing with categorical values

#### Countries

In [3]:
# Disabled alternative: keep only the 3 countries with the most frauds
# high_fraud_countries = df.loc[df['FRAUD'] == 1]['COUNTRY'].value_counts()[0:3].index.to_list()
# df.loc[~df['COUNTRY'].isin(high_fraud_countries), 'COUNTRY'] = None
# df.dropna(inplace=True)

# Target encoding: every row receives the mean FRAUD value of its COUNTRY.
# NOTE(review): the mean is computed over every row of df; if a hold-out split
# is made later, confirm this does not leak the target into evaluation data.
fraud_rate_by_country = df.groupby("COUNTRY")["FRAUD"].mean()
df["countries_encoded"] = df["COUNTRY"].map(fraud_rate_by_country)

#### Ages

In [4]:
# Convert age to range of ages
def age_range(x):
    """Map a numeric age to the label of its ten-year bucket.

    Buckets are half-open on the upper bound: values below 31 map to
    '20 - 30', values below 41 to '31 - 40', and so on. Two consequences
    of the comparisons used:

    * ages under 20 are also labelled '20 - 30';
    * anything that is not below 71 -- ages above 80, and NaN (for which
      every comparison is False) -- is labelled '71 - 80'.

    Parameters
    ----------
    x : int or float
        Age to classify.

    Returns
    -------
    str
        One of '20 - 30', '31 - 40', '41 - 50', '51 - 60', '61 - 70',
        '71 - 80'.
    """
    # (exclusive upper bound, label) pairs in ascending order
    buckets = (
        (31, '20 - 30'),
        (41, '31 - 40'),
        (51, '41 - 50'),
        (61, '51 - 60'),
        (71, '61 - 70'),
    )
    for upper_bound, label in buckets:
        if x < upper_bound:
            return label
    return '71 - 80'

# Replace each numeric age with its bucket label.
# Not re-runnable: once AGE holds labels, age_range would compare str with int.
df['AGE'] = df['AGE'].map(age_range)


In [5]:
# Create the one-hot encoder instance (re-fitted by each encoding cell below)
encoder = OneHotEncoder(handle_unknown='ignore')

# One-hot encode the 'AGE' column.
# Column names come from the categories the encoder actually found, so a
# bucket that is absent from the data cannot shift the labels of the others
# (a fixed 0..5 rename would silently mislabel them).
age_matrix = encoder.fit_transform(df[['AGE']]).toarray()
age_columns = [f'age_({bucket})' for bucket in encoder.categories_[0]]

# Reuse df's index so the join lines rows up even if df is not on a 0..n-1 index.
encoder_age = pd.DataFrame(age_matrix, columns=age_columns, index=df.index)

# Dropping any previous copy of these columns keeps the cell re-runnable.
df = df.drop(columns=age_columns, errors='ignore').join(encoder_age)

#### Currency

In [6]:
# Target encoding: every row receives the mean FRAUD value of its CURRENCY.
# NOTE(review): computed over every row of df -- confirm this is acceptable
# if a hold-out split is introduced later.
fraud_rate_by_currency = df.groupby("CURRENCY")["FRAUD"].mean()
df['currencies_encoded'] = df["CURRENCY"].map(fraud_rate_by_currency)

#### State

In [7]:
# Number of transactions per STATE category
df['STATE'].value_counts()

COMPLETED    698585
REVERTED     200730
FAILED       105853
DECLINED      63193
Name: STATE, dtype: int64

In [8]:
# One-hot encode the 'STATE' column.
# Names are built from the fitted categories (lower-cased, 'state_' prefix)
# instead of a fixed positional rename, so the labels always match the data.
state_matrix = encoder.fit_transform(df[['STATE']]).toarray()
state_columns = [f'state_{str(state).lower()}' for state in encoder.categories_[0]]

# Reuse df's index so the join lines rows up even if df is not on a 0..n-1 index.
encoder_state = pd.DataFrame(state_matrix, columns=state_columns, index=df.index)

# Dropping any previous copy of these columns keeps the cell re-runnable.
df = df.drop(columns=state_columns, errors='ignore').join(encoder_state)

#### Type

In [9]:
# Number of transactions per TYPE category
df['TYPE'].value_counts()

TOPUP           602096
CARD_PAYMENT    294504
FEE              63458
EXCHANGE         52565
TRANSFER         36382
ATM              19356
Name: TYPE, dtype: int64

In [10]:
# One-hot encode the 'TYPE' column.
# Names are built from the fitted categories (lower-cased, 'type_' prefix)
# instead of a fixed positional rename, so the labels always match the data.
type_matrix = encoder.fit_transform(df[['TYPE']]).toarray()
type_columns = [f'type_{str(kind).lower()}' for kind in encoder.categories_[0]]

# Reuse df's index so the join lines rows up even if df is not on a 0..n-1 index.
encoder_type = pd.DataFrame(type_matrix, columns=type_columns, index=df.index)

# Dropping any previous copy of these columns keeps the cell re-runnable.
df = df.drop(columns=type_columns, errors='ignore').join(encoder_type)

#### Dates

In [11]:
# Parse both timestamp columns into datetime values
for date_column in ("TRANSACTION_DATE_TIME", "USER_CREATION_DATE"):
    df[date_column] = pd.to_datetime(df[date_column])


# Weekday name of each transaction, with Saturday and Sunday merged
# into a single 'Weekend' label
weekday_names = df['TRANSACTION_DATE_TIME'].dt.day_name()
is_weekend = weekday_names.isin(['Saturday', 'Sunday'])
df['transaction_day_week'] = weekday_names.mask(is_weekend, 'Weekend')

In [12]:
# Preview only: the expanded frame is displayed, not stored, so df is unchanged
categorical_columns = ['AGE', 'STATE', 'TYPE', 'transaction_day_week']
pd.get_dummies(df, columns=categorical_columns)

Unnamed: 0,ID,USER_ID,TRANSACTION_DATE_TIME,USER_CREATION_DATE,COUNTRY,AMOUNT_GBP,CURRENCY,FRAUD,countries_encoded,age_(20 - 30),...,TYPE_EXCHANGE,TYPE_FEE,TYPE_TOPUP,TYPE_TRANSFER,transaction_day_week_Friday,transaction_day_week_Monday,transaction_day_week_Thursday,transaction_day_week_Tuesday,transaction_day_week_Wednesday,transaction_day_week_Weekend
0,f659b44e-cfdf-48de-bcf3-06f47ef26e9f,fd7f3ff6-0ed6-4a85-a7b5-2f205e0ef72f,2019-04-20 18:04:03.930,2019-04-18 10:59:26.974,PL,13.12,PLN,0,0.000294,0.0,...,0,0,0,0,0,0,0,0,0,1
1,b3bdd0fc-641e-4952-a562-86db402fbe75,fd7f3ff6-0ed6-4a85-a7b5-2f205e0ef72f,2019-04-28 17:55:04.803,2019-04-18 10:59:26.974,PL,168.83,PLN,0,0.000294,0.0,...,0,0,1,0,0,0,0,0,0,1
2,2e44356b-ada5-4453-bf85-9495d01b1517,fd7f3ff6-0ed6-4a85-a7b5-2f205e0ef72f,2019-05-02 08:33:28.777,2019-04-18 10:59:26.974,PL,32.13,EUR,0,0.000294,0.0,...,0,0,0,0,0,0,1,0,0,0
3,fc9b6e20-9355-4dac-95b0-9b2e5dff3f8b,fd7f3ff6-0ed6-4a85-a7b5-2f205e0ef72f,2019-04-29 10:16:43.887,2019-04-18 10:59:26.974,PL,379.74,EUR,0,0.000294,0.0,...,1,0,0,0,0,1,0,0,0,0
4,d0716ef1-50bb-4035-88da-b10a4ee336aa,fd7f3ff6-0ed6-4a85-a7b5-2f205e0ef72f,2019-04-28 17:44:35.678,2019-04-18 10:59:26.974,PL,421.87,EUR,0,0.000294,0.0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1068356,6477a0f5-f793-47f1-bac8-56ce1bdceddc,cff40363-c2ae-4e1d-aeba-c17abd9912fa,2019-04-29 07:12:45.815,2019-04-28 20:21:04.565,GB,10.00,GBP,0,0.016328,0.0,...,0,0,1,0,0,1,0,0,0,0
1068357,0a363268-f812-43f8-a3e9-e7fc090044a8,e15eb74d-05c7-46ae-8d98-b42becc6f74b,2019-04-22 17:45:13.790,2019-04-22 17:40:39.175,IE,44.75,EUR,0,0.000978,1.0,...,0,0,1,0,0,1,0,0,0,0
1068358,6348d49d-93ec-47df-ad60-1a75c54ca716,8df4c173-2af4-4c94-8e3f-0cb9f6893657,2019-04-04 11:49:33.670,2019-04-04 09:56:59.919,PL,4.15,PLN,0,0.000294,0.0,...,0,0,1,0,0,0,1,0,0,0
1068359,6e902d37-230c-4683-a585-3bdb4475cf4f,89fec9e2-aa02-4e4f-9ee6-a99fc230817d,2019-04-05 19:02:34.649,2019-04-05 14:23:26.054,IE,8.86,EUR,0,0.000978,0.0,...,0,0,1,0,1,0,0,0,0,0


In [15]:
# Calendar year in which the user account was created
df = df.assign(user_year=df['USER_CREATION_DATE'].dt.year)

: 

In [13]:
# One-hot encode the 'transaction_day_week' column.
# The joined columns previously kept the integer names 0..5; they are now named
# after the fitted categories with a 'transaction_day_week_' prefix.
day_matrix = encoder.fit_transform(df[['transaction_day_week']]).toarray()
day_columns = [f'transaction_day_week_{day}' for day in encoder.categories_[0]]

# Reuse df's index so the join lines rows up even if df is not on a 0..n-1 index.
encoder_trans_day_week = pd.DataFrame(day_matrix, columns=day_columns, index=df.index)

# Dropping any previous copy of these columns keeps the cell re-runnable.
df = df.drop(columns=day_columns, errors='ignore').join(encoder_trans_day_week)


In [None]:
# Model inputs that exist as columns of df after the encoding cells above.
# A list (not a bare tuple) must be used to select several columns:
# df['a', 'b'] is looked up as a single tuple key and raises KeyError.
feature_columns = [
    'AMOUNT_GBP', 'user_year', 'countries_encoded', 'currencies_encoded',
    'age_(20 - 30)', 'age_(31 - 40)', 'age_(41 - 50)',
    'age_(51 - 60)', 'age_(61 - 70)', 'age_(71 - 80)',
    'state_completed', 'state_declined', 'state_failed', 'state_reverted',
    'type_atm', 'type_card_payment', 'type_exchange',
    'type_fee', 'type_topup', 'type_transfer',
]

# Day-of-week indicators are computed here from the label column, so this cell
# does not depend on how the encoded day columns were named in df.
day_dummies = pd.get_dummies(
    df['transaction_day_week'], prefix='transaction_day_week', dtype=float)

# Feature matrix and target
X = pd.concat([df[feature_columns], day_dummies], axis=1)
y = df['FRAUD']

## Imbalanced Classification

In [None]:
# Bar chart of the target distribution, each bar annotated with its share of all rows
fig, ax = plt.subplots(figsize=(12, 8))
colors = [(160/255, 217/255, 149/255), (108/255, 196/255, 161/255)]
# Pass the target by keyword: newer seaborn releases take `data` as the first
# positional parameter, so a positional Series is no longer read as `x`.
sns.countplot(x=df['FRAUD'], palette=colors, ax=ax)
ax.set(title='Counts (Target)')
ax.title.set_size(20)  # Set title font size

# Remove spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Set x-label and y-label font size
ax.set_xlabel('Fraud', fontsize=15)
ax.set_ylabel('Counts', fontsize=15)

# Add percentages to the plot
total = len(df['FRAUD'])
for bar in ax.patches:
    percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
    # Dedicated names: assigning to `y` here would overwrite the target Series
    # `y` that the resampling cell needs.
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(percentage, (label_x, label_y), ha='center', va='center', fontsize=20)

plt.show()

This class imbalance can be addressed by resampling the dataset so that each class has an equal number of elements. There are 3 possible approaches:

- Undersampling the majority class.
- Oversampling the minority class by creating a synthetic sample.
- Combining over- and under-sampling.

### Applying random under-sampling

Down-sampling (under-sampling) the majority class refers to removing samples of the majority class. The drawback of this procedure is that the overall amount of data decreases. There are various techniques for choosing which instances to remove; here we use simple random selection.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows until both classes have the same size.
# A fixed seed makes the selected subset reproducible across runs.
rus = RandomUnderSampler(random_state=42)

X_rus, y_rus = rus.fit_resample(X, y)

# Bar chart of the resampled target, each bar annotated with its share of the sample
fig, ax = plt.subplots(figsize=(12, 8))
colors = [(160/255, 217/255, 149/255), (108/255, 196/255, 161/255)]
# Pass the target by keyword: newer seaborn releases take `data` as the first
# positional parameter, so a positional Series is no longer read as `x`.
sns.countplot(x=y_rus, palette=colors, ax=ax)
ax.set(title='Counts (Target)')
ax.title.set_size(20)  # Set title font size

# Remove spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Set x-label and y-label font size
ax.set_xlabel('Fraud', fontsize=15)
ax.set_ylabel('Counts', fontsize=15)

# Add percentages to the plot
total = len(y_rus)
for bar in ax.patches:
    percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
    # Dedicated names: assigning to `y` here would overwrite the target Series
    # `y`, breaking any re-run of the resampling above.
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(percentage, (label_x, label_y), ha='center', va='center', fontsize=20)

plt.show()


### Too many countries

In [None]:
# X_rus holds only the model features, so it has no COUNTRY column.
# Look up the countries of the rows the sampler kept: `sample_indices_` are
# positions into the data passed to fit_resample, which has the same row order as df.
df['COUNTRY'].iloc[rus.sample_indices_].value_counts()