In [4]:
import pandas as pd

file_path = r"pth/to/dataset"
df = pd.read_excel(file_path)

print("Dataset Overview:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())

drop_columns = []
df = df.drop(columns=drop_columns)

df.ffill(inplace=True)
df = df.astype(str)
print("\nCleaned Dataset Overview:")
print(df.info())

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321 entries, 0 to 1320
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Dummy Policy No             1321 non-null   int64  
 1   ASSURED_AGE                 1321 non-null   int64  
 2   NOMINEE_RELATION            1321 non-null   object 
 3   OCCUPATION                  1321 non-null   object 
 4   POLICY SUMASSURED           1321 non-null   int64  
 5   Premium                     1321 non-null   float64
 6   PREMIUMPAYMENTMODE          1321 non-null   object 
 7   Annual Income               1321 non-null   int64  
 8   HOLDERMARITALSTATUS         1321 non-null   object 
 9   INDIV_REQUIREMENTFLAG       1321 non-null   object 
 10  Policy Term                 1321 non-null   int64  
 11  Policy Payment Term         1321 non-null   int64  
 12  CORRESPONDENCECITY          1321 non-null   object 
 13  CORRESPONDENCES

In [5]:

num_cols = ["ASSURED_AGE", "POLICY SUMASSURED", "Premium", "Annual Income", "Policy Term", "Policy Payment Term", "Bank code"]
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df["Bank code"] = df["Bank code"].fillna(-1)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321 entries, 0 to 1320
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Dummy Policy No             1321 non-null   object 
 1   ASSURED_AGE                 1321 non-null   int64  
 2   NOMINEE_RELATION            1321 non-null   object 
 3   OCCUPATION                  1321 non-null   object 
 4   POLICY SUMASSURED           1321 non-null   int64  
 5   Premium                     1321 non-null   float64
 6   PREMIUMPAYMENTMODE          1321 non-null   object 
 7   Annual Income               1321 non-null   int64  
 8   HOLDERMARITALSTATUS         1321 non-null   object 
 9   INDIV_REQUIREMENTFLAG       1321 non-null   object 
 10  Policy Term                 1321 non-null   int64  
 11  Policy Payment Term         1321 non-null   int64  
 12  CORRESPONDENCECITY          1321 non-null   object 
 13  CORRESPONDENCESTATE         1321 

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

num_cols = ["ASSURED_AGE", "POLICY SUMASSURED", "Premium", "Annual Income", "Policy Term", "Policy Payment Term", "Bank code"]
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df["Bank code"] = df["Bank code"].fillna(-1)

cat_cols = ["NOMINEE_RELATION", "OCCUPATION", "PREMIUMPAYMENTMODE", "HOLDERMARITALSTATUS", "INDIV_REQUIREMENTFLAG", "CORRESPONDENCECITY", "CORRESPONDENCESTATE", "CORRESPONDENCEPOSTCODE", "Product Type", "CHANNEL", "STATUS", "SUB_STATUS", "Fraud Category"]

label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321 entries, 0 to 1320
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Dummy Policy No             1321 non-null   object 
 1   ASSURED_AGE                 1321 non-null   int64  
 2   NOMINEE_RELATION            1321 non-null   int32  
 3   OCCUPATION                  1321 non-null   int32  
 4   POLICY SUMASSURED           1321 non-null   int64  
 5   Premium                     1321 non-null   float64
 6   PREMIUMPAYMENTMODE          1321 non-null   int32  
 7   Annual Income               1321 non-null   int64  
 8   HOLDERMARITALSTATUS         1321 non-null   int32  
 9   INDIV_REQUIREMENTFLAG       1321 non-null   int32  
 10  Policy Term                 1321 non-null   int64  
 11  Policy Payment Term         1321 non-null   int64  
 12  CORRESPONDENCECITY          1321 non-null   int32  
 13  CORRESPONDENCESTATE         1321 

In [8]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif

if 'df' not in locals():
    raise ValueError("Dataset 'df' is not defined. Load it before running this script.")

datetime_cols = df.select_dtypes(include=['object']).columns
for col in datetime_cols:
    try:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df[col] = df[col].astype('int64') // 10**9
    except Exception as e:
        print(f"Skipping column {col} due to conversion error: {e}")

num_cols = ["ASSURED_AGE", "POLICY SUMASSURED", "Premium", "Annual Income"]
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col].fillna(df[col].median(), inplace=True)

df["Bank code"].fillna(-1, inplace=True)

cat_cols = ["CORRESPONDENCECITY", "CORRESPONDENCESTATE", "CORRESPONDENCEPOSTCODE", "POLICYRISKCOMMENCEMENTDATE", "STATUS", "SUB_STATUS"]

label_encoders = {}
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=["Fraud Category"])
y = df["Fraud Category"]
selector = SelectKBest(score_func=mutual_info_classif, k=min(10, X.shape[1]))
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['ASSURED_AGE', 'POLICY SUMASSURED', 'Premium', 'Annual Income',
       'CORRESPONDENCECITY', 'CORRESPONDENCESTATE', 'CORRESPONDENCEPOSTCODE',
       'POLICYRISKCOMMENCEMENTDATE', 'STATUS', 'SUB_STATUS'],
      dtype='object')


In [9]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif

class_counts = Counter(y)
print("Class Distribution:", class_counts)
rare_classes = [cls for cls, count in class_counts.items() if count < 2]
if rare_classes:
    print(f"Rare classes detected: {rare_classes}. These will be merged or removed.")
    df = df[~df["Fraud Category"].isin(rare_classes)]

X = df.drop(columns=["Fraud Category"])
y = df["Fraud Category"]

selector = SelectKBest(score_func=mutual_info_classif, k=min(10, X.shape[1]))
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]
print("Selected Features:", selected_features)

print("Shape of X_selected:", X_selected.shape)
print("Shape of y:", y.shape)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print("New Class Labels:", dict(enumerate(label_encoder.classes_)))

smote = SMOTE(sampling_strategy='auto', k_neighbors=1, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_selected, y)

print("Resampled Class Distribution:", Counter(y_resampled))

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Class Distribution: Counter({8: 939, 2: 276, 10: 39, 6: 26, 9: 15, 1: 13, 3: 6, 7: 3, 5: 1, 4: 1, 0: 1, 11: 1})
Rare classes detected: [5, 4, 0, 11]. These will be merged or removed.
Selected Features: Index(['ASSURED_AGE', 'POLICY SUMASSURED', 'Premium', 'Annual Income',
       'CORRESPONDENCECITY', 'CORRESPONDENCESTATE', 'CORRESPONDENCEPOSTCODE',
       'POLICYRISKCOMMENCEMENTDATE', 'STATUS', 'SUB_STATUS'],
      dtype='object')
Shape of X_selected: (1317, 10)
Shape of y: (1317,)
New Class Labels: {0: 1, 1: 2, 2: 3, 3: 6, 4: 7, 5: 8, 6: 9, 7: 10}
Resampled Class Distribution: Counter({5: 939, 1: 939, 7: 939, 6: 939, 3: 939, 4: 939, 2: 939, 0: 939})
Model Accuracy: 0.9866932801064537
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       188
           1       0.97      0.94      0.96       187
           2       0.99      0.99      0.99       188
           3       1.00      1.00      1.00       188
           

In [11]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle

selected_features = ["ASSURED_AGE", "POLICY SUMASSURED", "Premium", "Annual Income"]
X = df[selected_features]
y = df["Fraud Category"]

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_nodes = X_scaled.shape[0]
num_features = X_scaled.shape[1]
node_features = torch.tensor(X_scaled, dtype=torch.float).to(device)

edge_index = []
policy_to_claims = df.groupby("Dummy Policy No").indices
for claims in policy_to_claims.values():
    if len(claims) > 1:
        for i in range(len(claims)):
            for j in range(i + 1, len(claims)):
                edge_index.append([claims[i], claims[j]])
                edge_index.append([claims[j], claims[i]])

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous().to(device)
y_tensor = torch.tensor(y_encoded, dtype=torch.long).to(device)

data = Data(x=node_features, edge_index=edge_index, y=y_tensor)

train_mask, test_mask = train_test_split(
    np.arange(num_nodes), test_size=0.2, random_state=42, stratify=y_encoded
)
train_mask = torch.tensor(train_mask, dtype=torch.long).to(device)
test_mask = torch.tensor(test_mask, dtype=torch.long).to(device)

data.train_mask = train_mask
data.test_mask = test_mask

class GNN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GNN, self).__init__()
        self.conv1 = GATConv(num_features, hidden_dim)
        self.conv2 = GATConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

num_classes = len(np.unique(y_encoded))
model = GNN(num_features=num_features, hidden_dim=128, num_classes=num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    model.eval()
    logits = model(data)
    pred = logits.argmax(dim=1)
    correct = pred[data.test_mask] == data.y[data.test_mask]
    accuracy = correct.sum().item() / data.test_mask.shape[0]
    return accuracy

epochs = 1000
for epoch in range(epochs):
    loss = train()
    if epoch == 0 or epoch == 999:
        acc = test()
        print(f"Epoch {epoch}: Loss {loss:.4f}, Test Accuracy {acc:.4f}")

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

def ensemble_predict(gnn_preds, xgb_preds):
    final_preds = []
    for i in range(len(gnn_preds)):
        if gnn_preds[i] == xgb_preds[i]:
            final_preds.append(gnn_preds[i])
        else:
            final_preds.append(xgb_preds[i])
    return np.array(final_preds)

gnn_logits = model(data)
gnn_preds = gnn_logits.argmax(dim=1).cpu().numpy()[test_mask.cpu().numpy()]
final_preds = ensemble_predict(gnn_preds, xgb_preds)
ensemble_accuracy = (final_preds == y_test).sum() / len(y_test)
print(f"Ensemble Test Accuracy: {ensemble_accuracy:.4f}")

with open(r"path\to\model\fraud_ensemble.pkl", "wb") as f:
    pickle.dump((model.state_dict(), xgb_model, scaler, label_encoder), f)

print("Models saved successfully!")

Epoch 0: Loss 2.1931, Test Accuracy 0.0114
Epoch 999: Loss 0.8832, Test Accuracy 0.7121
Ensemble Test Accuracy: 0.8636
Models saved successfully!
