In [1]:
import pandas as pd

# Load the dataset

In [2]:
# Read the raw transaction table from disk.
# NOTE(review): a later cell drops an "Unnamed: 0" column, which presumably means this
# CSV was written together with its index; reading with index_col=0 may be cleaner — confirm.
data = pd.read_csv("fraud_detection_dataset.csv")

In [3]:
# Keep only the rows in which every card field is present; the signature
# function below reads all six of them.
card_fields = ["card1", "card2", "card3", "card4", "card5", "card6"]
data = data.dropna(subset=card_fields)

# Functions

In [4]:
def generateSignatures(dataframe, attributes, sep=""):
    """Build one account-signature string per row from the card fields.

    A signature is the concatenation, in this order, of ``card1``, ``card2``,
    ``card3`` and ``card5`` (cast to integers), the first character of
    ``card4`` and the character length of ``card6``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to read the card fields from. It is not modified.
    attributes : list of str
        Columns selected before the signatures are built. Rows with a missing
        value in any of them are skipped. The six card columns are referenced
        by name below, so they must be part of this list.
    sep : str, default ""
        Text inserted between the signature components. The default keeps the
        original, delimiter-free output. Note that without a delimiter two
        different cards can yield the same string (card1=1000, card2=555 and
        card1=10005, card2=55 both start with "1000555"); pass e.g. ``"|"``
        when signatures must be unambiguous.

    Returns
    -------
    pandas.Series
        Signature strings, indexed like the rows of ``dataframe`` that were kept.
    """
    identity_attributes = dataframe[attributes].dropna()

    numeric_columns = ["card1", "card2", "card3", "card5"]
    # The integer cast keeps a float-typed column from printing as "150.0".
    numeric_part = identity_attributes[numeric_columns].astype(int).astype(str)

    components = [numeric_part[column] for column in numeric_columns]
    components.append(identity_attributes["card4"].str[0])
    components.append(identity_attributes["card6"].str.len().astype(str))

    # Column-wise string addition replaces the row-by-row "".join aggregation:
    # same result, but vectorised and well-defined for an empty frame.
    result = components[0]
    for component in components[1:]:
        result = result + sep + component

    return result

# Get Unique Nodes

In [5]:
# Compute the signature of every transaction row and hold it as a one-column frame.
card_attributes = ["card1", "card2", "card3", "card4", "card5", "card6"]
signatures = generateSignatures(data, card_attributes).to_frame()

In [6]:
# Label the single signature column.
signatures = signatures.set_axis(["AccountName"], axis="columns")

In [8]:
# Put the signature column in front of the transaction columns; concat pairs
# the rows of both frames by index label.
data_nodes = pd.concat((signatures, data), axis="columns")

In [9]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV — confirm).
# Reassigning instead of inplace=True, together with errors="ignore", makes the
# cell safe to re-run: a second execution no longer raises KeyError once the
# column is gone.
data_nodes = data_nodes.drop(columns=["Unnamed: 0"], errors="ignore")

In [10]:
# Row and column counts of the combined frame.
data_nodes.shape

(142667, 20)

In [11]:
# Number of distinct account signatures among the rows.
data_nodes["AccountName"].nunique()

8404

In [14]:
# Save the signature-augmented table; the index is left out of the file.
node_dataset_path = "node_oriented_dataset.csv"
data_nodes.to_csv(node_dataset_path, index=False)

# Map Device Info/Transaction Amount to Unique Accounts


In [16]:
#data_nodes.drop(columns=["Unnamed: 0"], inplace=True)

In [17]:
# Show which columns are available for the per-account aggregations below.
data_nodes.columns

Index(['AccountName', 'TransactionID', 'DeviceType', 'DeviceInfo', 'isFraud',
       'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2',
       'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2',
       'P_emaildomain', 'R_emaildomain'],
      dtype='object')

In [18]:
# For each account, count how many rows carry each DeviceInfo value.
data_nodes.groupby("AccountName")["DeviceInfo"].value_counts()

AccountName       DeviceInfo      
10003555128226v5  S60 Build/MMB29M     3
10004529150162v6  Windows              3
                  iOS Device           3
                  Trident/7.0          2
                  MacOS                1
                                      ..
9995479150166v5   Trident/7.0          1
                  Windows              1
9999174150226v5   rv:11.0             15
                  Trident/7.0          1
                  Windows              1
Name: DeviceInfo, Length: 25984, dtype: int64

In [19]:
# Per-account occurrence count of each DeviceInfo value, kept in a frame whose
# only column is named "weight".
device_counts = data_nodes.groupby(["AccountName"])["DeviceInfo"].value_counts()
test_df = device_counts.to_frame(name="weight")

In [20]:
# Turn the (AccountName, DeviceInfo) index levels into ordinary columns so they
# can serve as merge keys. The guard keeps the cell re-runnable: resetting an
# already-reset frame would add a stray "index" column.
if "AccountName" in test_df.index.names:
    test_df = test_df.reset_index()

In [21]:
# Attach the per-(account, device) weight to every matching transaction row.
# NOTE(review): this is an inner join, so rows whose key pair has no entry in
# test_df (e.g. a missing DeviceInfo, which value_counts skips by default) are
# not carried into `merged` — confirm that is intended.
merge_keys = ["AccountName", "DeviceInfo"]
merged = test_df.merge(data_nodes, how="inner", on=merge_keys)

In [22]:
# Gather, per account, the list of TransactionID values found in the merged frame.
merged.groupby("AccountName")["TransactionID"].apply(list)

AccountName
10003555128226v5                          [3328484, 3337343, 3337365]
10004529150162v6    [3006707, 3051248, 3522508, 3045388, 3086681, ...
10005399150150a6                 [3076343, 3076354, 3076367, 3478374]
1000555185224m5                                             [3230924]
10006390150102m6                                            [3121106]
                                          ...                        
9992455150126m5     [3124651, 3185203, 3193849, 3271358, 3297459, ...
9993399150185a6                                             [3033166]
9994442150226v6                                             [3053803]
9995479150166v5                                    [3039376, 3223057]
9999174150226v5     [3136247, 3136255, 3136256, 3136270, 3151581, ...
Name: TransactionID, Length: 7750, dtype: object

In [23]:
# Export the account-to-device rows with their weight and transaction details.
# NOTE(review): unlike the node dataset export, index=False is not passed here,
# so the frame's index is written as an extra leading column — confirm intent.
export_columns = [
    "AccountName",
    "DeviceInfo",
    "weight",
    "TransactionID",
    "isFraud",
    "TransactionDT",
    "TransactionAmt",
    "ProductCD",
]
merged.loc[:, export_columns].to_csv("weights_account_to_device.csv")

In [24]:
# Count rows per isFraud value within each DeviceType.
merged.groupby("DeviceType")["isFraud"].value_counts()

DeviceType  isFraud
desktop     0          68719
            1           4016
mobile      0          40288
            1           4318
Name: isFraud, dtype: int64

In [25]:
# Count rows per isFraud value within each DeviceInfo value.
merged.groupby("DeviceInfo")["isFraud"].value_counts()

DeviceInfo      isFraud
0PAJ5           0          1
0PJA2           0          1
0PM92           0          2
                1          2
1016S           0          1
                          ..
verykools5004   0          2
verykools5034   0          1
verykools5035   0          1
vivo            0          5
xs-Z47b7VqTMxs  0          1
Name: isFraud, Length: 2150, dtype: int64