In [1]:
import datetime as dt
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import pointbiserialr

### Load the dataset with weights assigned to edges

In [2]:
# Read the account-to-device edge list (with edge weights) from disk.
df = pd.read_csv(filepath_or_buffer="weights_account_to_device.csv")

In [3]:
# Remove the "Unnamed: 0" column (presumably an index written out with the CSV).
# errors="ignore" keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Relabel the columns positionally; the list must contain exactly one name
# per existing column, in the order they appear in the frame.
df.columns = [
    "AccountName",
    "DeviceInfo",
    "Weight",
    "TransactionID",
    "isFraud",
    "TimeStep",
    "AmountTransferred",
    "ServiceType",
]

#### Convert the timestamp to date and time (from seconds since epoch)

In [5]:
# Interpret TimeStep as seconds since the Unix epoch and convert the whole
# column in one vectorized call.
# Unlike datetime.fromtimestamp, this does not depend on the local time zone
# of the machine running the notebook, so the resulting timestamps are
# reproducible everywhere (they are naive datetimes on the UTC clock).
df["TimeStep"] = pd.to_datetime(df["TimeStep"], unit="s")

In [6]:
# Preview the first five rows after renaming and timestamp conversion.
df.head(n=5)

Unnamed: 0,AccountName,DeviceInfo,Weight,TransactionID,isFraud,TimeStep,AmountTransferred,ServiceType
0,10003555128226v5,S60 Build/MMB29M,3,3328484,0,1970-04-08 13:23:35,39.394,C
1,10003555128226v5,S60 Build/MMB29M,3,3337343,0,1970-04-11 00:34:42,10.755,C
2,10003555128226v5,S60 Build/MMB29M,3,3337365,0,1970-04-11 00:40:15,19.093,C
3,10004529150162v6,Windows,3,3006707,0,1970-01-06 23:35:03,35.0,H
4,10004529150162v6,Windows,3,3051248,0,1970-01-17 20:58:38,25.0,H


### Construct the graph

In [8]:
# Undirected simple graph: one node per account name and per device name,
# one edge per (account, device) pair.
graphtype = nx.Graph()

# Every remaining column of df is attached to the edge as an attribute
# (edge_attr=True).
graph = nx.from_pandas_edgelist(
    df,
    "AccountName",
    "DeviceInfo",
    edge_attr=True,
    create_using=graphtype,
)

### Get degrees for accounts

The degree of an account node tells us how many distinct devices that account used. Intuitively, the more devices an account uses, the greater the chance that at least one of them belonged to criminals. The degrees can additionally be used to build customer profiles.

In [9]:
# Every distinct device identifier that appears in the edge list.
device_info = df["DeviceInfo"].unique()
# Hash-based lookup so each membership test is O(1) instead of a linear scan
# of the NumPy array for every node.
# NOTE(review): assumes DeviceInfo has no missing values -- NaN membership
# behaves differently in a set than in a NumPy array; confirm.
device_lookup = set(device_info)

# node name --- degree (retained in case later cells reference it)
degree_df = pd.DataFrame(list(nx.degree(graph)))

degrees_accounts = []
account_names = []
# Walk the (node, degree) pairs once, in the graph's node order.
for node, node_degree in nx.degree(graph):
    # A node whose name is not a device name is treated as an account.
    if node not in device_lookup:
        # append its degree
        degrees_accounts.append(node_degree)
        # append its name
        account_names.append(node)

In [10]:
# One row per account: its degree and its name.
# Building the frame from named columns avoids routing the degrees through
# the index and reset_index(), and it also yields a valid (empty) two-column
# frame when no accounts were found instead of failing on the column rename.
accounts_degrees_df = pd.DataFrame(
    {
        "degree": degrees_accounts,
        "account_name": account_names,
    }
)

In [11]:
# Count how many accounts share each degree value, then flatten the result
# into a two-column table.
degree_distribution_accounts = (
    accounts_degrees_df["degree"]
    .value_counts()
    .to_frame()
    .reset_index()
)
# First column holds the degree value, second the number of accounts with it.
degree_distribution_accounts.columns = ["degree", "counts"]

In [12]:
# Relabel the name column as "AccountName" so it matches the key column of df
# used in the merge that follows. The assignment is positional and in place.
accounts_degrees_df.columns = ["degree", "AccountName"]

In [13]:
# Attach each account's degree to the fraud label of every row of df that
# belongs to that account. df holds one row per transaction, so the result
# repeats an account's degree once per matching row.
fraud_labels = df[["AccountName", "isFraud"]]
degree_and_fraud_df = accounts_degrees_df.merge(
    fraud_labels,
    how="inner",
    on="AccountName",
)

### Compute Point Biserial Correlation

The point biserial correlation is used to measure the relationship between a binary variable, x, and a continuous variable, y. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply a determinative relationship.

In [14]:
# Point-biserial correlation between the binary fraud flag (x) and the
# account degree (y); both are passed as plain Python lists.
stat = pointbiserialr(
    x=degree_and_fraud_df["isFraud"].tolist(),
    y=degree_and_fraud_df["degree"].tolist(),
)

In [15]:
# Report the coefficient with five decimals. The p-value is shown in
# scientific notation because a fixed five-decimal format prints very small
# p-values as "0.00000", which reads as exactly zero.
print(f"Correlation: {stat.correlation:0.5f}\np-value: {stat.pvalue:.3e}")

Correlation: 0.01599
p-value: 0.00000


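As seen above, the correlation is statistically significant (the p-value is below the printed precision) but negligible in magnitude (r ≈ 0.016), so we can assume that there is practically no linear relationship between the degree of a node and fraud.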

In [19]:
# Persist the per-account degree table; index=False leaves the row index out
# of the file.
accounts_degrees_df.to_csv(
    path_or_buf="accounts_and_number_of_devices.csv",
    index=False,
)