In [43]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [44]:
# Load the raw payroll records; the path is relative to the notebook's working directory.
DATA_PATH = "payroll_fraud_dataset.csv"
data = pd.read_csv(DATA_PATH)

In [45]:
# Preview the first five rows to check column names and raw value formats.
data.head(n=5)

Unnamed: 0,Employee ID,Employee Name,Employment Status,Salary,Bonuses,Working Hours,Overtime Hours,Payroll Amount,Bank Account Number,Suspicious Activity Flag
0,7252,Matthew Ryan,Employee,70758.96,10049.9,174,69,91158.86,PLKL41326049435292,0
1,5684,Aimee Berg,Terminated,105712.97,47949.94,170,46,160562.91,ZUXT02556184442182,1
2,2731,Tina Gordon,Employee,89098.3,6699.62,141,75,107047.92,CCUV05703996306670,0
3,5742,Stephanie Morris,Employee,102808.67,782.64,144,10,105091.31,LHWY72420092996952,0
4,5521,Nicole Mccormick,Employee,119172.54,572.44,164,22,123044.98,DHEJ26289893651956,0


In [46]:
# Overtime hours as a fraction of regular working hours.
#
# A vectorised pandas division does not raise on a zero denominator (it yields
# inf/NaN), so wrapping it in try/except never caught that case; the handler
# could only hide a missing column by filling every row with the fallback.
# Zero working hours are therefore handled explicitly, and a missing column is
# allowed to fail loudly.
OVERTIME_RATIO_FALLBACK = 1e5  # ratio assigned to rows with zero working hours

working_hours = data['Working Hours']
raw_ratio = data['Overtime Hours'] / working_hours
data['overtime_ratio'] = raw_ratio.mask(working_hours == 0, OVERTIME_RATIO_FALLBACK)

# Salary plus bonuses for each row.
data['Total Compensation'] = data['Salary'] + data['Bonuses']

In [47]:
# 1 where the status is exactly 'Employee', 0 for every other status
# (vectorised comparison instead of a per-row lambda).
data['Is_Active'] = data['Employment Status'].eq('Employee').astype('int64')

# Keep the first run of digits from the account identifier as an integer.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
#   and triggers a warning on recent Python versions;
# - astype(str) first: makes the cell safe to re-run after the column has
#   already been converted to integers (.str fails on a numeric column);
# - explicit int64: the digit runs are too long for a 32-bit integer, which is
#   what plain `int` can map to on some platforms.
data['Bank Account Number'] = (
    data['Bank Account Number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype('int64')
)

In [48]:
# Preview the first five rows again to check the derived columns.
data.head(n=5)

Unnamed: 0,Employee ID,Employee Name,Employment Status,Salary,Bonuses,Working Hours,Overtime Hours,Payroll Amount,Bank Account Number,Suspicious Activity Flag,overtime_ratio,Total Compensation,Is_Active
0,7252,Matthew Ryan,Employee,70758.96,10049.9,174,69,91158.86,41326049435292,0,0.396552,80808.86,1
1,5684,Aimee Berg,Terminated,105712.97,47949.94,170,46,160562.91,2556184442182,1,0.270588,153662.91,0
2,2731,Tina Gordon,Employee,89098.3,6699.62,141,75,107047.92,5703996306670,0,0.531915,95797.92,1
3,5742,Stephanie Morris,Employee,102808.67,782.64,144,10,105091.31,72420092996952,0,0.069444,103591.31,1
4,5521,Nicole Mccormick,Employee,119172.54,572.44,164,22,123044.98,26289893651956,0,0.134146,119744.98,1


In [49]:
# Distinct employee names present in the dataset.
data.loc[:, 'Employee Name'].unique()

array(['Matthew Ryan', 'Aimee Berg', 'Tina Gordon', ..., 'Erin Adams',
       'Phillip Brown', 'Jenny Camacho'], shape=(9687,), dtype=object)

In [50]:
# Feature matrix: everything except the ID, the name, the raw status string
# and the label itself. drop() returns a new frame, so `data` is left intact.
# NOTE(review): 'Bank Account Number' stays in X as a numeric feature even
# though it looks like an identifier - confirm this is intended.
non_feature_columns = [
    "Employee ID",
    "Employee Name",
    "Employment Status",
    "Suspicious Activity Flag",
]
X = data.drop(columns=non_feature_columns)

# Target vector: the suspicious-activity label.
Y = data['Suspicious Activity Flag']

In [51]:
# Display the feature matrix that is scaled and split in the next cell.
X

Unnamed: 0,Salary,Bonuses,Working Hours,Overtime Hours,Payroll Amount,Bank Account Number,overtime_ratio,Total Compensation,Is_Active
0,70758.96,10049.90,174,69,91158.86,41326049435292,0.396552,80808.86,1
1,105712.97,47949.94,170,46,160562.91,2556184442182,0.270588,153662.91,0
2,89098.30,6699.62,141,75,107047.92,5703996306670,0.531915,95797.92,1
3,102808.67,782.64,144,10,105091.31,72420092996952,0.069444,103591.31,1
4,119172.54,572.44,164,22,123044.98,26289893651956,0.134146,119744.98,1
...,...,...,...,...,...,...,...,...,...
9995,67321.48,24771.85,144,62,101393.33,91008914582308,0.430556,92093.33,1
9996,64891.40,29968.83,150,9,96210.23,72352488071963,0.060000,94860.23,1
9997,107975.36,51299.57,156,78,170974.93,43627141722283,0.500000,159274.93,1
9998,119716.50,38732.74,143,68,168649.24,37347642737203,0.475524,158449.24,1


In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling. Fitting the scaler on the full dataset lets the test
# rows influence the mean/variance used for preprocessing (data leakage), so
# the scaler is fitted on the training rows only and then applied to the rest.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn statistics from train only
X_test = scaler.transform(X_test_raw)        # reuse the train statistics

# Scaled version of the full feature matrix, using the same train-fitted
# scaler; kept because later cells call predict() on `x_scaled`.
x_scaled = scaler.transform(X)

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed seed makes the fit repeatable.
forest_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_params).fit(X_train, Y_train)
clf  # fit() returns the estimator itself; show it as the cell output


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [54]:
# Predicted labels for the held-out test rows.
Y_pred = clf.predict(X=X_test)

In [55]:
from sklearn.metrics import accuracy_score, confusion_matrix

# accuracy_score is documented as (y_true, y_pred). The value is the same
# either way round, but passing the arguments in the documented order keeps
# this call consistent with the confusion_matrix call and avoids copying a
# swapped order into metrics where it does matter (precision, recall, ...).
acc = accuracy_score(Y_test, Y_pred)

# Accuracy on the test split, as a percentage.
print(acc*100)

96.16666666666667


In [56]:
# Confusion matrix on the test split: rows are true classes, columns are
# predicted classes.
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred))

[[2445    0]
 [ 115  440]]


In [57]:
import pickle

# Persist the fitted classifier and the fitted scaler next to the notebook,
# printing a confirmation after each file has been written and closed.
for artifact, path, message in (
    (clf, "model.pkl", "Model saved"),
    (scaler, 'scaler.pkl', "scaler saved"),
):
    with open(path, "wb") as f:
        pickle.dump(artifact, f)
    print(message)



Model saved
scaler saved


In [58]:
# Predict on every scaled row. This includes the rows the model was trained
# on, so it is a quick smoke check rather than an evaluation.
all_predictions = clf.predict(x_scaled)
print(all_predictions)

[0 1 0 ... 0 0 0]


In [59]:
# Reload the classifier from disk and predict on the same scaled rows, to
# confirm the pickled model round-trips.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open("model.pkl", mode='rb') as f:
    model = pickle.load(f)

reloaded_predictions = model.predict(x_scaled)
print(reloaded_predictions)

[0 1 0 ... 0 0 0]
