In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the identity and transaction training tables from the local data folder.
df_train_identity, df_train_transaction = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_identity.csv", "data/train_transaction.csv")
)

In [None]:
# (rows, columns) of the identity table.
df_train_identity.shape

In [None]:
# (rows, columns) of the transaction table.
df_train_transaction.shape

In [None]:
# Preview the first rows of the identity table.
df_train_identity.head()

In [None]:
# Restrict the transaction table to its first 17 columns, then preview it.
df_train_transaction = df_train_transaction.loc[:, df_train_transaction.columns[:17]]
df_train_transaction.head()

### Missing values

In [None]:
# Number of missing values in each transaction column.
df_train_transaction.isnull().sum()

In [None]:
# Share of missing values per identity attribute, in percent, sorted ascending.
data = df_train_identity.isna().sum().sort_values().div(len(df_train_identity)).mul(100)
# Only attributes that actually have missing values are plotted.
y_data = data[data > 0]
x_data = y_data.index

In [None]:
# Bar chart of the missing-value percentage for each identity attribute.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=x_data, y=y_data, color="lightblue", ax=ax)
# ax.set_title("Missing values for transaction identities", fontsize=16)
ax.set_xlabel("Attribute name", fontsize=15)
ax.set_ylabel("Percentage of missing values [%]", fontsize=15)
# Tilt the attribute names so they do not overlap.
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
fig.savefig("missing_values_for_transaction_identities.png")
plt.show()

In [None]:
# Share of missing values per transaction attribute, in percent, sorted ascending.
data = df_train_transaction.isna().sum().sort_values().div(len(df_train_transaction)).mul(100)
# Only attributes that actually have missing values are plotted.
y_data = data[data > 0]
x_data = y_data.index

# Bar chart of the missing-value percentage for each transaction attribute.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=x_data, y=y_data, color="lightblue", ax=ax)
# ax.set_title("Missing values for transaction data", fontsize=16)
ax.set_xlabel("Attribute name", fontsize=15)
ax.set_ylabel("Percentage of missing values [%]", fontsize=15)
# Tilt the attribute names so they do not overlap.
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
fig.savefig("missing_values_for_transaction_data.png")
plt.show()

## Transaction identity : creating device nodes

Keep only those attributes which have fewer than 4000 missing values; the missing values that remain are imputed further below.

In [None]:
# An identity attribute is kept only if it has fewer missing values than this.
MAX_MISSING_VALUES = 4000
# A record is dropped only when every one of these attributes is missing.
IDENTITY_KEY_ATTRIBUTES = ["id_38", "id_37", "id_36", "id_35", "id_15", "id_29", "id_28", "id_11", "id_02", "DeviceType", "id_31"]

# Count the missing values once and reuse the result for the threshold filter.
identity_null_counts = df_train_identity.isnull().sum().sort_values()
identity_attributes = identity_null_counts[identity_null_counts < MAX_MISSING_VALUES].index
device_nodes_features = df_train_identity[identity_attributes]
device_nodes_features = device_nodes_features.dropna(axis=0, how="all", subset=IDENTITY_KEY_ATTRIBUTES)

The records in which all of the attributes needed to properly compose a node's identity are missing have been dropped above. Check how many missing values remain in each attribute.

In [None]:
# Missing values remaining in each selected identity attribute.
device_nodes_features.isnull().sum()

As a last resort, fill the remaining missing values with 0.

In [None]:
# Replace every remaining missing value with 0.
# NOTE(review): in string-typed columns this stores the integer 0 next to
# strings — confirm that downstream string handling copes with mixed types.
device_nodes_features = device_nodes_features.fillna(0)

In [None]:
# Display the device-node feature table after imputation.
device_nodes_features

In [None]:
# Slice the selected identity attributes once instead of on every loop pass.
identity_subset = df_train_identity[identity_attributes]

# Print the first character of every string-typed attribute and collect the
# names of all remaining (non-object) attributes.
numeric_attributes = []
for col, values in identity_subset.items():
    if values.dtypes == "object":
        print(values.str[0])
    else:
        numeric_attributes.append(col)

In [None]:
def retrieveDeviceNode(device_nodes_features, id_column="TransactionID"):
    """Build a device-signature string for every distinct feature row.

    The signature is the string form of each numeric column (in column order)
    followed by the first character of each textual column (in column order).
    ``id_column`` is carried through to the result but is deliberately left out
    of the signature, so rows that share the same device features receive the
    same signature.

    Parameters
    ----------
    device_nodes_features : pandas.DataFrame
        Feature table that contains ``id_column``. Missing values should be
        imputed beforehand; any that remain are stringified as they are.
    id_column : str, default "TransactionID"
        Column that identifies a row and is excluded from the signature.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``0`` holding the signature and ``id_column``, indexed
        like the de-duplicated input.

    Raises
    ------
    KeyError
        If ``id_column`` is not a column of ``device_nodes_features``.
    """
    if id_column not in device_nodes_features.columns:
        raise KeyError(id_column)

    unique_devices = device_nodes_features.drop_duplicates()

    numeric_attributes = []
    string_attributes = []
    for col in unique_devices.columns:
        if col == id_column:
            # The row identifier must not leak into the device signature.
            continue
        dtype = unique_devices[col].dtype
        # Object columns and dedicated string dtypes both count as textual.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            string_attributes.append(col)
        else:
            numeric_attributes.append(col)

    # Column-wise concatenation; starting from "" also covers frames that have
    # no numeric or no textual columns at all.
    signature = pd.Series("", index=unique_devices.index, dtype="object")
    for col in numeric_attributes:
        signature = signature + unique_devices[col].astype(str)
    for col in string_attributes:
        # Cast first so non-string fill values (e.g. the integer 0) contribute
        # a character instead of NaN; [:1] yields "" for empty strings where
        # [0] would yield NaN and wipe out the whole signature.
        signature = signature + unique_devices[col].astype(str).str[:1]

    return pd.concat([pd.DataFrame(signature), unique_devices[[id_column]]], axis=1)

In [None]:
# Build the device-node table from the imputed features and report its size.
device_nodes = retrieveDeviceNode(device_nodes_features)
device_nodes.shape

In [None]:
# Inspect the TransactionDT column.
df_train_transaction["TransactionDT"]

In [None]:
# Only transactions whose TransactionDT is below this cut-off are plotted.
grouped = df_train_transaction[df_train_transaction["TransactionDT"] < 3701882].groupby("TransactionDT")
# Sum of isFraud for every distinct TransactionDT value. Computed once with the
# vectorised group sum and reused for the scatter plot and both reference lines.
fraud_counts = grouped["isFraud"].sum()

plt.figure(figsize=(15,10))
sns.scatterplot(data=fraud_counts)
plt.title("The frequencies of fraudulent transactions throughout time", fontsize=15)

plt.xlabel("Time Step")
plt.ylabel("Number of fraudulent transactions")

# Horizontal reference lines for the mean and median of the per-step sums.
plt.axhline(y=np.mean(fraud_counts), label = "Mean", color='r', linestyle='-')
plt.axhline(y=np.median(fraud_counts), label = "Median", color='black', linestyle="dotted")

plt.legend(loc ="upper right")

#plt.savefig("Fraudulent_transactions_in_time.PNG")
plt.show()

### Fraudulent transactions in each of the categorical variables

In [None]:
# Count of each isFraud value for every distinct TransactionDT.
df_train_transaction.groupby("TransactionDT")["isFraud"].value_counts()

In [None]:
# Join transactions with their identity records; only TransactionIDs present
# in both tables are kept (inner join).
entire_dataset = df_train_transaction.merge(df_train_identity, how="inner", on="TransactionID")

In [None]:
# Preview the first rows of the merged table.
entire_dataset.head()

In [None]:
# (rows, columns) of the merged table.
entire_dataset.shape

In [None]:
def getFraudFrequencies(df, attribute):
    """Count how often each ``isFraud`` value occurs per value of ``attribute``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``attribute`` and an ``isFraud`` column.
    attribute : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per (attribute value, isFraud value) pair with the columns
        ``attribute``, ``isFraud`` and ``count``.
    """
    label_counts = df.groupby(attribute)["isFraud"].value_counts()
    # Name the counts "count" before flattening so the count column cannot
    # clash with the "isFraud" index level.
    return label_counts.rename("count").reset_index()

In [None]:
# isFraud value counts for each ProductCD value.
getFraudFrequencies(entire_dataset, "ProductCD")

In [None]:
# isFraud value counts for each DeviceType value.
getFraudFrequencies(entire_dataset, "DeviceType")

In [None]:
# isFraud value counts for each DeviceInfo value, largest counts first.
getFraudFrequencies(entire_dataset, "DeviceInfo").sort_values("count", ascending=False)

In [None]:
# isFraud value counts for each R_emaildomain value, largest counts first.
getFraudFrequencies(entire_dataset, "R_emaildomain").sort_values("count", ascending=False)

In [None]:
# Box plot of transaction amounts per fraud label, with outliers hidden.
sns.set_theme(style="whitegrid")
# catplot is figure-level and always creates its own figure, so a separately
# created figure and its figsize would not apply to it. The axes-level boxplot
# draws onto the axes sized here instead.
fig, ax = plt.subplots(figsize=(8, 7))
sns.boxplot(x="isFraud", y="TransactionAmt", data=entire_dataset, showfliers=False, ax=ax)
ax.set_title("Amounts distribution for non-fraudulent and fraudulent transactions", fontsize=14)
ax.set_xlabel("\nTransaction category")
ax.set_ylabel("Transaction amount")
# Replace the 0/1 isFraud tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(["licit", "illicit"])
plt.show()

In [None]:
# Box plot of transaction amounts per fraud label, with outliers shown.
sns.set_theme(style="whitegrid")
# catplot is figure-level and always creates its own figure, so a separately
# created figure and its figsize would not apply to it. The axes-level boxplot
# draws onto the axes sized here instead.
fig, ax = plt.subplots(figsize=(8, 7))
sns.boxplot(x="isFraud", y="TransactionAmt", data=entire_dataset, showfliers=True, ax=ax)
ax.set_title("Amounts distribution for non-fraudulent and fraudulent transactions", fontsize=14)
ax.set_xlabel("\nTransaction category")
ax.set_ylabel("Transaction amount")
# Replace the 0/1 isFraud tick labels with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(["licit", "illicit"])
plt.show()