In [10]:
import pandas as pd
import numpy as np

In [11]:
from sklearn.ensemble import RandomForestClassifier #to detect fraud
from sklearn.model_selection import train_test_split # Splits the data into training and validation sets.
from sklearn.preprocessing import LabelEncoder #encodes categorical data into numeric form
from sklearn.impute import SimpleImputer #Fills in missing values (NaNs) in the dataset.
from sklearn.metrics import classification_report, confusion_matrix #

In [12]:
#load train and test dataset
train_df = pd.read_csv("/home/mubasshir/Desktop/research/Our_Work/based_on_dataset_3/dataset/Train_Beneficiarydata-1542865627584.csv")
test_df = pd.read_csv("/home/mubasshir/Desktop/research/Our_Work/based_on_dataset_3/dataset/Test_Beneficiarydata-1542969243754.csv")


In [13]:
# Simulate a binary 'Fraud' label for training
np.random.seed(42)
train_df['Fraud'] = np.random.randint(0, 2, size=len(train_df))
# print(train_df)

In [14]:
drop_cols = ["BeneID", "DOB", "DOD", "Fraud"]
X = train_df.drop(columns = drop_cols) # (input variables)
Y = train_df["Fraud"] # Target vector (labels, i.e., 0 for not fraud, 1 for fraud).


In [15]:
le = LabelEncoder()
X["RenalDiseaseIndicator"] = le.fit_transform(X["RenalDiseaseIndicator"])


In [16]:
imputer = SimpleImputer(strategy="most_frequent")
X = imputer.fit_transform(X) #Fills missing values with the most frequent value in each column.

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

In [18]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) #Initializes the Random Forest with 100 trees.

In [19]:
rf_model.fit(X_train, y_train)

In [20]:
y_pred = rf_model.predict(X_val)
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

Confusion Matrix:
 [[7222 6568]
 [7281 6641]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.52      0.51     13790
           1       0.50      0.48      0.49     13922

    accuracy                           0.50     27712
   macro avg       0.50      0.50      0.50     27712
weighted avg       0.50      0.50      0.50     27712



In [21]:
X_test = test_df.drop(columns=["BeneID", "DOB", "DOD"])

In [22]:
X_test["RenalDiseaseIndicator"] = le.transform(X_test["RenalDiseaseIndicator"])

In [23]:
X_test = imputer.transform(X_test)

In [24]:
test_predictions = rf_model.predict(X_test)

In [27]:
# Attach predictions to test data
test_df["PredictedFraud"] = test_predictions

# Show summary and sample predictions
fraud_summary = test_df["PredictedFraud"].value_counts()
print("Fraud Prediction Summary:\n", fraud_summary)

# Show first 10 predictions
print("\nSample Predictions:")
print(test_df[["BeneID", "PredictedFraud"]])


Fraud Prediction Summary:
 PredictedFraud
0    32469
1    31499
Name: count, dtype: int64

Sample Predictions:
           BeneID  PredictedFraud
0       BENE11001               1
1       BENE11007               0
2       BENE11010               1
3       BENE11011               0
4       BENE11014               0
...           ...             ...
63963  BENE159187               1
63964  BENE159193               0
63965  BENE159194               1
63966  BENE159196               0
63967  BENE159197               1

[63968 rows x 2 columns]
