In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML Libraries
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from transformers import pipeline
from transformers import BertTokenizer
import re
### NAIVE BAYES ######################
dfb = pd.read_csv('/content/BBASE.csv')
dfb['email_is_free'] = dfb['email_is_free'].astype('category')
dfb['phone_home_valid'] = dfb['phone_home_valid'].astype('category')
dfb['phone_mobile_valid'] = dfb['phone_mobile_valid'].astype('category')
dfb['has_other_cards'] = dfb['has_other_cards'].astype('category')
dfb['foreign_request'] = dfb['foreign_request'].astype('category')
dfb['keep_alive_session'] = dfb['keep_alive_session'].astype('category')
dfb.drop(columns=['device_fraud_count'], inplace=True)
y = dfb['fraud_bool']
ds=dfb
ds.drop(columns=['fraud_bool'],inplace=True)
categorical_columns = dfb.select_dtypes(include=['object', 'category']).columns
numerical_columns = ds.select_dtypes(include=['number']).columns

# Separation
X_categorical = dfb[categorical_columns]
X_numerical = dfb[numerical_columns]

# Encoding
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
X_categorical_encoded = encoder.fit_transform(X_categorical)

X_processed = np.hstack((X_categorical_encoded, X_numerical))

X_processed = np.nan_to_num(X_processed, nan=-1)


X = np.array(X_processed)
y = np.array(y)


from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X = np.array(X_resampled)
y = np.array(y_resampled)

# Stratified initial train-test split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, test_index = next(sss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

X_categorical_train, X_categorical_test = X_train[:, :X_categorical_encoded.shape[1]], X_test[:, :X_categorical_encoded.shape[1]]
X_numerical_train, X_numerical_test = X_train[:, X_categorical_encoded.shape[1]:], X_test[:, X_categorical_encoded.shape[1]:]

# Normalisation
scaler = MinMaxScaler()
X_numerical_train_scaled = scaler.fit_transform(X_numerical_train)
X_numerical_test_scaled = scaler.transform(X_numerical_test)

# grid search
param_grid_gnb = {'var_smoothing': [1e-9, 1e-7, 1e-5, 1e-3]}
param_grid_mnb = {'alpha': [0.1, 0.5, 1.0, 2.0], 'fit_prior': [True, False]}

gnb = GaussianNB()
grid_search_gnb = GridSearchCV(gnb, param_grid_gnb, cv=5, scoring='recall_macro')
grid_result=grid_search_gnb.fit(X_numerical_train_scaled, y_train)
print(f"Best Parameters: {grid_result.best_params_}")
print(f"Best Score: {grid_result.best_score_:.2f}")

mnb = MultinomialNB()
grid_search_mnb = GridSearchCV(mnb, param_grid_mnb, cv=5, scoring='recall_macro')
grid_result=grid_search_mnb.fit(X_categorical_train, y_train)
print(f"Best Parameters: {grid_result.best_params_}")
print(f"Best Score: {grid_result.best_score_:.2f}")
best_gnb = grid_search_gnb.best_estimator_
best_mnb = grid_search_mnb.best_estimator_

gnb_log_probs = best_gnb.predict_log_proba(X_numerical_test_scaled)
mnb_log_probs = best_mnb.predict_log_proba(X_categorical_test)

final_log_probs = gnb_log_probs + mnb_log_probs
y_pred = np.argmax(final_log_probs, axis=1)

# Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))




Best Parameters: {'var_smoothing': 1e-05}
Best Score: 0.73
Best Parameters: {'alpha': 2.0, 'fit_prior': True}
Best Score: 0.83
Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.81      0.84      0.83    221577
           1       0.84      0.80      0.82    221577

    accuracy                           0.82    443154
   macro avg       0.82      0.82      0.82    443154
weighted avg       0.82      0.82      0.82    443154

