In [1]:
import pandas as pd

# Path of the metaverse transactions CSV under the Kaggle input directory.
METAVERSE = '/kaggle/input/metaverse-financial-transactions-dataset/metaverse_transactions_dataset.csv'

# Read the whole file, parsing the 'timestamp' column as datetimes on load.
df = pd.read_csv(METAVERSE, parse_dates=['timestamp'])

# Leave the preview as the cell's last expression so it is rendered as output.
df.head()

Unnamed: 0,timestamp,hour_of_day,sending_address,receiving_address,amount,transaction_type,location_region,ip_prefix,login_frequency,session_duration,purchase_pattern,age_group,risk_score,anomaly
0,2022-04-11 12:47:27,12,0x9d32d0bf2c00f41ce7ca01b66e174cc4dcb0c1da,0x39f82e1c09bc6d7baccc1e79e5621ff812f50572,796.949206,transfer,Europe,192.0,3,48,focused,established,18.75,low_risk
1,2022-06-14 19:12:46,19,0xd6e251c23cbf52dbd472f079147873e655d8096f,0x51e8fbe24f124e0e30a614e14401b9bbfed5384c,0.01,purchase,South America,172.0,5,61,focused,established,25.0,low_risk
2,2022-01-18 16:26:59,16,0x2e0925b922fed01f6a85d213ae2718f54b8ca305,0x52c7911879f783d590af45bda0c0ef2b8536706f,778.19739,purchase,Asia,192.168,3,74,focused,established,31.25,low_risk
3,2022-06-15 09:20:04,9,0x93efefc25fcaf31d7695f28018d7a11ece55457f,0x8ac3b7bd531b3a833032f07d4e47c7af6ea7bace,300.838358,transfer,South America,172.0,8,111,high_value,veteran,36.75,low_risk
4,2022-02-18 14:35:30,14,0xad3b8de45d63f5cce28aef9a82cf30c397c6ceb9,0x6fdc047c2391615b3facd79b4588c7e9106e49f2,775.569344,sale,Africa,172.16,6,100,high_value,veteran,62.5,moderate_risk


In [2]:
import warnings
from plotly import express
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings('ignore', category=FutureWarning)

# Distribution of risk_score, with one colour per value of the anomaly column.
express.histogram(
    df,
    x='risk_score',
    color='anomaly',
)

In [3]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.inspection import permutation_importance

# Feature columns fed to the classifier, and the column it predicts.
categories = ['transaction_type', 'purchase_pattern', 'age_group', 'login_frequency', 'session_duration']
target = 'risk_score'

# We could do this a column at a time with LabelEncoder, but OrdinalEncoder
# handles every column in one pass. Each distinct value in a column is mapped
# to an integer code, so the numeric columns (login_frequency, session_duration
# and the risk_score target) are treated as discrete labels here.
df_categorical = OrdinalEncoder(categories='auto').set_output(transform='pandas').fit_transform(X=df[categories + [target]])

# Hold out 20% of the rows, stratified on the encoded target so both splits
# keep the same class proportions.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(df_categorical[categories], df_categorical[target], test_size=0.2, random_state=2024,
                                                        stratify=df_categorical[target])

# force_alpha must be a real boolean: 'warn' was only a transitional default
# and newer scikit-learn releases reject it during parameter validation.
# With alpha=1.0 the flag does not change the fit (it only matters when alpha
# is below 1e-10).
categorical = CategoricalNB(alpha=1.0, force_alpha=True, fit_prior=True, class_prior=None, min_categories=None)
categorical.fit(X=Xc_train, y=yc_train)
print('accuracy: {:5.4f} '.format(categorical.score(X=Xc_test, y=yc_test)))

# Seed the permutation shuffles so the importances (and the chart) are the
# same on every run, matching the seeded train/test split above.
categorical_importance = permutation_importance(estimator=categorical, X=Xc_test, y=yc_test, random_state=2024)
express.histogram(y=categorical_importance['importances_mean'].tolist(),
                  x=categories, title='Categorical mean importance').show(validate=True)

accuracy: 0.3937 


In [4]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out split. zero_division=0
# reports 0 for classes the model never predicts.
print(
    classification_report(
        yc_test,
        categorical.predict(Xc_test),
        zero_division=0,
    )
)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       238
         1.0       0.41      1.00      0.58       641
         2.0       0.00      0.00      0.00       239
         3.0       0.00      0.00      0.00        95
         4.0       0.00      0.00      0.00       250
         5.0       0.00      0.00      0.00       749
         6.0       0.00      0.00      0.00        88
         7.0       0.41      0.75      0.53      1997
         8.0       0.00      0.00      0.00       754
         9.0       0.00      0.00      0.00       307
        10.0       0.00      0.00      0.00       426
        11.0       0.00      0.00      0.00       843
        12.0       0.28      1.00      0.44      1180
        13.0       0.00      0.00      0.00       191
        14.0       0.41      1.00      0.58       524
        15.0       0.00      0.00      0.00       306
        16.0       0.00      0.00      0.00       454
        17.0       0.00    