Bank Account Fraud Dataset Suite (NeurIPS 2022)-linear SVM

In [1]:
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv('Base.csv')

In [3]:
print(data.head(5))

   fraud_bool  income  name_email_similarity  prev_address_months_count  \
0           0     0.3               0.986506                         -1   
1           0     0.8               0.617426                         -1   
2           0     0.8               0.996707                          9   
3           0     0.6               0.475100                         11   
4           0     0.9               0.842307                         -1   

   current_address_months_count  customer_age  days_since_request  \
0                            25            40            0.006735   
1                            89            20            0.010095   
2                            14            40            0.012316   
3                            14            30            0.006991   
4                            29            40            5.742626   

   intended_balcon_amount payment_type  zip_count_4w  ...  has_other_cards  \
0              102.453711           AA          1059  ..

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [5]:
#replace -1 and pd.NA = np.nan # standard the Nan
data = data.replace(-1, np.nan)
data = data.astype(object).where(pd.notna(data), np.nan)  

In [6]:
Y = data['fraud_bool']  
X = data.drop(columns=['fraud_bool'])

In [7]:
data['fraud_bool'].value_counts()

fraud_bool
0    988971
1     11029
Name: count, dtype: int64

In [8]:
Y = data['fraud_bool'].astype(int)# int 

In [9]:
# a lot of columns, pipeline+one hot

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [11]:
#automatically identify numerical and categorical columns.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [12]:
#For all numeric columns, first fill in missing values using the mean, then scale them using standardization.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [13]:
#For all categorical columns, first fill in missing values with the most frequent category, 
#then convert the categories to one-hot encoded columns so machine learning models can use them.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [14]:
#For the numeric columns, apply this transformation.
#For the categorical columns, apply a different transformation.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [15]:
#	1.	Preprocesses your data.
	#2.	Trains a Linear SVM classifier on that data.

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(class_weight='balanced', max_iter=10000))
])

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:
# Ensure 'fraud_bool' is treated as integer values (0 or 1)
Y = data['fraud_bool'].astype(int)

In [18]:
print(Y.value_counts())  # Check how many unique values there are

fraud_bool
0    988971
1     11029
Name: count, dtype: int64


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    197891
           1       0.00      0.00      0.00      2109

    accuracy                           0.99    200000
   macro avg       0.49      0.50      0.50    200000
weighted avg       0.98      0.99      0.98    200000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
