In [1]:
import pandas as pd

# Path of the metaverse transactions CSV inside the Kaggle input directory.
METAVERSE = '/kaggle/input/metaverse-financial-transactions-dataset/metaverse_transactions_dataset.csv'

# Read the CSV, parsing the `timestamp` column as datetimes while loading.
df = pd.read_csv(METAVERSE, parse_dates=['timestamp'])

# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,timestamp,hour_of_day,sending_address,receiving_address,amount,transaction_type,location_region,ip_prefix,login_frequency,session_duration,purchase_pattern,age_group,risk_score,anomaly
0,2022-04-11 12:47:27,12,0x9d32d0bf2c00f41ce7ca01b66e174cc4dcb0c1da,0x39f82e1c09bc6d7baccc1e79e5621ff812f50572,796.949206,transfer,Europe,192.0,3,48,focused,established,18.75,low_risk
1,2022-06-14 19:12:46,19,0xd6e251c23cbf52dbd472f079147873e655d8096f,0x51e8fbe24f124e0e30a614e14401b9bbfed5384c,0.01,purchase,South America,172.0,5,61,focused,established,25.0,low_risk
2,2022-01-18 16:26:59,16,0x2e0925b922fed01f6a85d213ae2718f54b8ca305,0x52c7911879f783d590af45bda0c0ef2b8536706f,778.19739,purchase,Asia,192.168,3,74,focused,established,31.25,low_risk
3,2022-06-15 09:20:04,9,0x93efefc25fcaf31d7695f28018d7a11ece55457f,0x8ac3b7bd531b3a833032f07d4e47c7af6ea7bace,300.838358,transfer,South America,172.0,8,111,high_value,veteran,36.75,low_risk
4,2022-02-18 14:35:30,14,0xad3b8de45d63f5cce28aef9a82cf30c397c6ceb9,0x6fdc047c2391615b3facd79b4588c7e9106e49f2,775.569344,sale,Africa,172.16,6,100,high_value,veteran,62.5,moderate_risk


In [2]:
import warnings
from plotly import express
# Ignore every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Histogram of transaction amount over time, coloured by anomaly class.
amount_over_time = express.histogram(
    data_frame=df,
    x='timestamp',
    y='amount',
    color='anomaly',
)
amount_over_time

In [3]:
# Draw a fixed-seed sample of 1,000 rows and plot risk score over time,
# coloured by anomaly class.
sampled = df.sample(n=1000, random_state=2024)
express.scatter(
    data_frame=sampled,
    x='timestamp',
    y='risk_score',
    color='anomaly',
)

The anomaly classes separate into distinct bands of risk score. This is a clue that the risk score and the anomaly assessment are essentially the same thing.

In [4]:
# Distribution of risk scores, coloured by anomaly class.
risk_score_distribution = express.histogram(
    data_frame=df,
    x='risk_score',
    color='anomaly',
)
risk_score_distribution

Why the risk scores fall where they do (for example, why there are none between 71 and 83) is left as an exercise for the reader.

In [5]:
# Share of transactions in each anomaly class.
anomaly_share = express.pie(
    data_frame=df,
    names='anomaly',
    color='anomaly',
)
anomaly_share

In [6]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78600 entries, 0 to 78599
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   timestamp          78600 non-null  datetime64[ns]
 1   hour_of_day        78600 non-null  int64         
 2   sending_address    78600 non-null  object        
 3   receiving_address  78600 non-null  object        
 4   amount             78600 non-null  float64       
 5   transaction_type   78600 non-null  object        
 6   location_region    78600 non-null  object        
 7   ip_prefix          78600 non-null  float64       
 8   login_frequency    78600 non-null  int64         
 9   session_duration   78600 non-null  int64         
 10  purchase_pattern   78600 non-null  object        
 11  age_group          78600 non-null  object        
 12  risk_score         78600 non-null  float64       
 13  anomaly            78600 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(7)

Let's do the simple thing first and build a logistic regression classifier using only the numerical columns.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Numeric predictors only. `risk_score` is deliberately left out: the plots
# above suggest it carries essentially the same information as the target.
columns = ['hour_of_day', 'amount', 'login_frequency', 'session_duration']
target = 'anomaly'

# Stratified 80/20 split, so each anomaly class keeps its share in both parts.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    df[columns],
    df[target],
    test_size=0.2,
    random_state=2024,
    stratify=df[target],
)

regression = LogisticRegression(max_iter=1000, tol=1e-6)
regression.fit(X=Xr_train, y=yr_train)
print('fit complete after {} iterations.'.format(regression.n_iter_[0]))
print('accuracy: {:5.4f} '.format(regression.score(X=Xr_test, y=yr_test)))

# The target has three classes, so `coef_` holds one row of coefficients per
# class (in `classes_` order). Plot every row, labelled by class, rather than
# only the first one.
# NOTE: the features are not standardized, so coefficient magnitudes are not
# directly comparable across features.
coefficients = (
    pd.DataFrame(data=regression.coef_, columns=columns)
    .assign(**{target: regression.classes_})
    .melt(id_vars=target, var_name='feature', value_name='coefficient')
)
express.bar(
    data_frame=coefficients,
    x='feature',
    y='coefficient',
    color=target,
    barmode='group',
    title='Regression coefficients by class',
).show(validate=True)

fit complete after 744 iterations.
accuracy: 0.8302 


This is fascinating; the model says the transaction amount is unimportant, leaving us with numerical data that is all temporal behavior. (One caveat: the features are not standardized, so a small coefficient on a large-valued column such as `amount` can understate its influence.) Of course we don't yet know what the model is telling us: we know from our pie chart above that a model that says every transaction is low risk will be right nearly 81% of the time. So let's dig a little deeper into our regression results.

In [8]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out split; ratios with a
# zero denominator are reported as 0.
yr_pred = regression.predict(X=Xr_test)
report = classification_report(y_true=yr_test, y_pred=yr_pred, zero_division=0)
print(report)

               precision    recall  f1-score   support

    high_risk       0.00      0.00      0.00      1299
     low_risk       0.84      0.97      0.90     12699
moderate_risk       0.66      0.43      0.52      1722

     accuracy                           0.83     15720
    macro avg       0.50      0.47      0.47     15720
 weighted avg       0.75      0.83      0.79     15720



Well, this is bad. Our simple regression model never catches any high-risk behavior.

In [9]:
# Count the distinct values in each column.
df.nunique()

timestamp            78513
hour_of_day             24
sending_address       1161
receiving_address     1166
amount               76771
transaction_type         5
location_region          5
ip_prefix                5
login_frequency          8
session_duration       140
purchase_pattern         3
age_group                3
risk_score              31
anomaly                  3
dtype: int64