In [None]:
import pandas as pd

# Load the dataset, using the first CSV column as the row index.
# Forward slashes keep the relative path portable: they resolve on Windows as
# well as on Linux/macOS, whereas a backslash-separated path is Windows-only.
df = pd.read_csv('../data/fraud.csv', index_col=0)

# Target labels, extracted before any columns are removed from the frame.
y = df['Class'].values

# Discard the first remaining column.
# NOTE(review): presumably a non-feature column (e.g. a timestamp) - confirm.
df = df.iloc[:, 1:]

# Feature matrix: every remaining column except the target.
X = df.drop(columns='Class').values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing with a fixed seed for reproducibility.
# stratify=y keeps the class proportions identical in both partitions, so the
# minority class is not over- or under-represented in either split by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.40,
    random_state=1,
    stratify=y,
)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#--------------------------------------------------
## ------------Logistic Regression---------------## 
#--------------------------------------------------

from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit an L2-penalised logistic regression (C = 1.0).
steps = [
    ("scaler", StandardScaler()),
    ("logReg", LogisticRegression(C=1.0, penalty="l2")),
]
LR_pipeline = Pipeline(steps=steps).fit(X_train, y_train)

#--------------------------------------------------
## ----------- K-NN Classifier ------------------## 
#--------------------------------------------------

from sklearn.neighbors import KNeighborsClassifier

# Standardise the features, then fit a 5-nearest-neighbours classifier.
steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5)),
]
knn_pipeline = Pipeline(steps=steps).fit(X_train, y_train)

#--------------------------------------------------
## ------------ SVM Classifier ------------------## 
#--------------------------------------------------

from sklearn.svm import SVC

## Linear Kernel  ---------------
# Standardise the features, then fit a class-weighted SVC with a linear kernel.
steps = [
    ("scaler", StandardScaler()),
    ("svc", SVC(class_weight="balanced", kernel="linear")),
]
svcL_pipeline = Pipeline(steps=steps).fit(X_train, y_train)

## Polynomial Kernel -----------------------
# Standardise the features, then fit a class-weighted SVC with a cubic polynomial kernel.
steps = [
    ("scaler", StandardScaler()),
    ("svc", SVC(class_weight="balanced", degree=3, kernel="poly")),
]
svcPoly_pipeline = Pipeline(steps=steps).fit(X_train, y_train)

## RBF Kernel -----------------------
# Standardise the features, then fit a class-weighted SVC with an RBF kernel
# (gamma = 'scale').
steps = [
    ("scaler", StandardScaler()),
    ("svc", SVC(class_weight="balanced", gamma="scale", kernel="rbf")),
]
svcRBF_pipeline = Pipeline(steps=steps)

# Kept as a bare expression: it is the last statement of the cell, so the
# fitted pipeline is what the notebook displays as the cell output.
svcRBF_pipeline.fit(X_train, y_train)




In [None]:
#--------------------------------------------------
## Model Evaluation ## 
#--------------------------------------------------
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.metrics import accuracy_score, recall_score


def _ranking_scores(model, features):
    """Return continuous positive-class scores for ``features``.

    ROC AUC measures how well a model *ranks* samples, so it needs a
    continuous score rather than the hard labels returned by ``predict``.
    ``decision_function`` is used when the fitted model exposes it; otherwise
    the second column of ``predict_proba`` (the greater class label) is used.
    """
    if hasattr(model, 'decision_function'):
        return model.decision_function(features)
    return model.predict_proba(features)[:, 1]


def _evaluate(model, features, labels):
    """Return ``(accuracy, recall, roc_auc)`` for one data split, rounded to 3 dp.

    Recall is the recall of the positive class.
    NOTE(review): assumes binary labels with 1 as the positive class - confirm.
    """
    predicted = model.predict(features)
    scores = _ranking_scores(model, features)
    return (
        round(float(accuracy_score(labels, predicted)), 3),
        round(float(recall_score(labels, predicted)), 3),
        round(float(roc_auc_score(labels, scores)), 3),
    )


# Fitted pipelines to compare, in the row order of the results table.
model_name = [LR_pipeline, knn_pipeline, svcL_pipeline,
              svcPoly_pipeline, svcRBF_pipeline]

# Collect one row per model and build the frame once instead of growing it
# row by row.
rows = []
for model in model_name:
    acc_tr, recall_tr, auc_tr = _evaluate(model, X_train, y_train)
    acc, recall, auc = _evaluate(model, X_test, y_test)
    rows.append([acc_tr, acc, recall_tr, recall, auc_tr, auc])

result_df = pd.DataFrame(rows, columns=['Acc_Train',
                                        'Acc_Test',
                                        'Recall_Train',
                                        'Recall_Test',
                                        'AUC_Train',
                                        'AUC_Test'])


In [None]:
def _model_label(pipeline):
    """Return a short label built from the pipeline's final step.

    Uses the final step's name and, when the estimator has a ``kernel``
    attribute, appends it so that several estimators sharing one step name
    remain distinguishable.
    """
    step_name, estimator = pipeline.steps[-1]
    kernel = getattr(estimator, 'kernel', None)
    if kernel is None:
        return step_name
    return "{} ({})".format(step_name, kernel)


# Label each row with a short string rather than the Pipeline object itself,
# whose repr is long and starts identically for every model.
result_df['name'] = [_model_label(model) for model in model_name]
result_df

In [None]:
## weighted version
# RBF-kernel SVC on standardised features with class_weight='balanced'.
# Note: this rebinds svcRBF_pipeline to a newly fitted pipeline.
steps = [('scaler', StandardScaler()),
         ('svc', SVC(kernel = 'rbf', gamma = 'scale', class_weight = 'balanced', random_state = 42))]

svcRBF_pipeline = Pipeline(steps)
svcRBF_pipeline.fit(X_train, y_train)
ypred_train = svcRBF_pipeline.predict(X_train)
ypred_test = svcRBF_pipeline.predict(X_test)

# Per-class precision/recall/F1 as dictionaries, one per data split.
report_clf_tr = classification_report(y_train, ypred_train, output_dict=True)
report_clf_test = classification_report(y_test, ypred_test, output_dict=True)

# Display both reports side by side (train | test) as the cell output.
pd.concat(
    {'train': pd.DataFrame(report_clf_tr).transpose(),
     'test': pd.DataFrame(report_clf_test).transpose()},
    axis=1,
).round(3)


In [None]:
## Unweighted version
# RBF-kernel SVC on standardised features without class weighting.
# Note: this rebinds svcRBF_pipeline to a newly fitted pipeline.
steps = [('scaler', StandardScaler()),
         ('svc', SVC(kernel = 'rbf', gamma = 'scale', random_state = 42))]

svcRBF_pipeline = Pipeline(steps)
svcRBF_pipeline.fit(X_train, y_train)
ypred_train = svcRBF_pipeline.predict(X_train)
ypred_test = svcRBF_pipeline.predict(X_test)

# Plain-text classification reports, one per data split.
report_clf_tr = classification_report(y_train, ypred_train)
report_clf_test = classification_report(y_test, ypred_test)

# Print both reports so the cell actually shows the results it computes.
print("Train\n" + report_clf_tr)
print("Test\n" + report_clf_test)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# 'Class' is the target column: rows with Class == 1 are plotted as fraud
# (the minority class) and rows with Class == 0 as non-fraud (the majority).
minority_class = df[df['Class'] == 1]
majority_class = df[df['Class'] == 0]

# Box plot of each selected feature within the minority class, one figure each.
features_to_visualize = ['V1', 'V2', 'V3']
for feature in features_to_visualize:
    sns.boxplot(x=minority_class[feature])
    plt.title("Distribution of {}".format(feature))
    plt.show()

# Flag minority-class rows that lie more than 3 standard deviations from the
# mean on any selected feature. abs() makes the test two-sided, so extreme
# negative values are flagged as well as extreme positive ones.
from scipy.stats import zscore
z_scores = zscore(minority_class[features_to_visualize])
outliers = (abs(z_scores) > 3).any(axis=1)
outlier_instances = minority_class[outliers]

# Scatter V1 against V2 for both classes and overlay the flagged outliers, so
# the figure shows what its title announces.
sns.scatterplot(x='V1', y='V2', data=majority_class, color = 'black', label='Non-fraud')
sns.scatterplot(x='V1', y='V2', data=minority_class, label = "fraud")
sns.scatterplot(x='V1', y='V2', data=outlier_instances, color='red', label='Outliers')
plt.title("Outliers in Minority Class")
plt.legend()
plt.show()
