In [18]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [19]:
# Load the payroll records; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="payroll_fraud_dataset_10000.csv")

In [20]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Employee ID,Employee Name,Employment Status,Salary,Bonuses,Working Hours,Overtime Hours,Payroll Amount,Bank Account Number,Suspicious Activity Flag
0,1001,Jane Smith,Employee,3668,638,165,49,5531,ACCT11953,True
1,1002,Sophia Moore,Employee,5715,36,153,28,6451,ACCT17726,False
2,1003,David Wilson,Employee,3576,568,175,0,4144,ACCT10972,False
3,1004,Emma Davis,Terminated,6961,1434,175,19,8870,ACCT13118,True
4,1005,Chris Taylor,Employee,4340,449,157,1,4814,ACCT11592,False


In [21]:
# Overtime as a fraction of regular hours.
# pandas division never raises on a zero denominator (it yields inf, or NaN
# for 0/0), so a try/except cannot catch that case and the inf would later
# break scaling. Apply the 1e5 fallback per row instead, only where
# 'Working Hours' is 0. A missing or non-numeric column now fails loudly
# rather than silently filling the whole feature with a constant.
raw_ratio = data['Overtime Hours'] / data['Working Hours']
data['overtime_ratio'] = raw_ratio.mask(data['Working Hours'] == 0, 1e5)

# Salary plus bonuses.
data['Total Compensation'] = data['Salary'] + data['Bonuses']

In [22]:
# 1 when the status is exactly 'Employee', 0 for every other value.
data['Is_Active'] = (data['Employment Status'] == 'Employee').astype('int64')

# Keep only the first run of digits of the account identifier
# ("ACCT11953" -> 11953).
# - raw string: '\d' in a normal string literal is an invalid escape sequence
#   and triggers a warning on recent Python versions;
# - astype(str) first: makes the cell safe to re-run after the column has
#   already been converted to integers (the .str accessor fails on int data).
data['Bank Account Number'] = (
    data['Bank Account Number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

In [23]:
# Preview the first five rows, now including the engineered columns.
data.head(n=5)

Unnamed: 0,Employee ID,Employee Name,Employment Status,Salary,Bonuses,Working Hours,Overtime Hours,Payroll Amount,Bank Account Number,Suspicious Activity Flag,overtime_ratio,Total Compensation,Is_Active
0,1001,Jane Smith,Employee,3668,638,165,49,5531,11953,True,0.29697,4306,1
1,1002,Sophia Moore,Employee,5715,36,153,28,6451,17726,False,0.183007,5751,1
2,1003,David Wilson,Employee,3576,568,175,0,4144,10972,False,0.0,4144,1
3,1004,Emma Davis,Terminated,6961,1434,175,19,8870,13118,True,0.108571,8395,0
4,1005,Chris Taylor,Employee,4340,449,157,1,4814,11592,False,0.006369,4789,1


In [24]:
# Distinct employee names, in order of first appearance.
pd.unique(data['Employee Name'])

array(['Jane Smith', 'Sophia Moore', 'David Wilson', 'Emma Davis',
       'Chris Taylor', 'Jim Brown', 'Linda Walker', 'Ethan Turner',
       'Daniel King', 'Mark Johnson', 'Olivia Scott', 'Michelle Clark',
       'Ava Hall', 'Brian Adams', 'John Doe', 'Sara Lee'], dtype=object)

In [25]:
# Target: the suspicious-activity label.
Y = data['Suspicious Activity Flag']

# Features: every remaining column except the employee id, the name, the raw
# status text and the label itself.
# NOTE(review): 'Bank Account Number' stays in as a numeric feature although
# it looks like an identifier - confirm this is intended.
X = data.drop(
    labels=[
        "Employee ID",
        "Employee Name",
        "Employment Status",
        "Suspicious Activity Flag",
    ],
    axis="columns",
)

In [26]:
# Display the feature matrix as a sanity check on the selected columns.
X

Unnamed: 0,Salary,Bonuses,Working Hours,Overtime Hours,Payroll Amount,Bank Account Number,overtime_ratio,Total Compensation,Is_Active
0,3668,638,165,49,5531,11953,0.296970,4306,1
1,5715,36,153,28,6451,17726,0.183007,5751,1
2,3576,568,175,0,4144,10972,0.000000,4144,1
3,6961,1434,175,19,8870,13118,0.108571,8395,0
4,4340,449,157,1,4814,11592,0.006369,4789,1
...,...,...,...,...,...,...,...,...,...
9995,5050,892,171,32,6742,12570,0.187135,5942,1
9996,3101,964,180,7,4240,18495,0.038889,4065,1
9997,6120,1374,176,60,8994,18296,0.340909,7494,1
9998,3385,1264,155,32,5449,16451,0.206452,4649,1


In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling: fitting the scaler on all rows lets the test rows
# influence the mean/std used for preprocessing (data leakage) and makes the
# held-out score optimistic. The split itself is unchanged: 80/20 with the
# same seed.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix; later cells call predict() on it.
x_scaled = scaler.transform(X)

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, fitted on the training split.
# fit() returns the estimator itself, so it can be chained onto construction.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, Y_train)

# Show the fitted estimator as the cell output.
clf


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# Predicted labels for the held-out rows.
Y_pred = clf.predict(X=X_test)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens
# to be symmetric, but the swapped order is wrong for most other metrics and
# disagrees with the confusion_matrix call in the following cell.
acc = accuracy_score(Y_test, Y_pred)

print(acc*100)

# Accuracy alone is misleading when one class dominates: report the score of
# always predicting the most frequent class so the two can be compared.
majority_share = Y_test.value_counts(normalize=True).max()
print(f"majority-class baseline: {majority_share*100:.2f}")

85.1


In [31]:
# Confusion matrix for the held-out rows: rows are the true labels, columns
# the predicted labels.
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred))

[[  19  268]
 [  30 1683]]


In [38]:
import pickle

# Persist the fitted classifier and the fitted scaler side by side, printing
# a confirmation after each file has been written.
for path, artifact, message in (
    ("model.pkl", clf, "Model saved"),
    ("scaler.pkl", scaler, "scaler saved"),
):
    with open(path, "wb") as f:
        pickle.dump(artifact, f)

    print(message)



Model saved
scaler saved


In [36]:
# Predictions of the in-memory classifier on the scaled full feature matrix.
print(clf.predict(X=x_scaled))

[ True False False ...  True  True  True]


In [37]:
# Read the pickled classifier back from disk.
with open("model.pkl", mode="rb") as model_file:
    model = pickle.load(model_file)

# Predictions of the reloaded classifier on the scaled full feature matrix.
reloaded_predictions = model.predict(x_scaled)
print(reloaded_predictions)

[ True False False ...  True  True  True]
