In [1]:
# Install required packages
!pip install deep_translator transformers torch tensorflow seaborn matplotlib pandas numpy scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# from deep_translator import GoogleTranslator  # Commented out backtranslation for now
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Load dataset: only the text column and its label column are read.
df = pd.read_csv("/content/combined_data.csv", usecols=["Text", "Fraud level"])
# Keep rows labelled 0 or 1; isin() also drops rows whose label is NaN.
df = df[df['Fraud level'].isin([0, 1])]

# Split dataset (80% training, 20% test), stratified on the label so both
# parts keep the same class ratio; random_state fixes the shuffle.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Fraud level"])

# NOTE: the triple-quoted literal below is a bare expression statement used as
# a block comment. Nothing inside it runs (the GoogleTranslator import above
# is commented out as well).
"""
# Back-translation function (commented out for now)
def back_translate(text, src_lang="bn", mid_lang="en"):
    try:
        translated = GoogleTranslator(source=src_lang, target=mid_lang).translate(text)
        back_translated = GoogleTranslator(source=mid_lang, target=src_lang).translate(translated)
        return back_translated
    except:
        return text

# Augment training data (commented out for now)
back_translated_df = train_df.copy()
back_translated_df["Text"] = back_translated_df["Text"].apply(lambda x: x + " " + back_translate(x))
augmented_train_df = pd.concat([train_df, back_translated_df], ignore_index=True)
"""

# Use original training data without augmentation
augmented_train_df = train_df.copy()

# Reset indices so rows are numbered 0..n-1 after the shuffle/split
augmented_train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Prepare device: GPU when available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load tokenizer and model (two output classes), then move the model to the device
# NOTE(review): the disabled back-translation defaults to src_lang="bn" —
# confirm that the 'roberta-base' checkpoint suits the language of "Text".
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.to(device)

# Custom Dataset for RoBERTa
class FraudDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: indexable collection of raw texts; each item is passed through
            ``str`` before tokenization.
        labels: indexable collection of class ids aligned with ``texts``.
        tokenizer: callable invoked on a single text with
            ``add_special_tokens``, ``max_length``, ``padding``, ``truncation``
            and ``return_tensors='pt'``.
        max_length: length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        tokenized = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading batch axis; flatten to 1-D.
        sample = {
            field: tokenized[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Prepare datasets and dataloaders
max_length = 128  # token length each sample is padded/truncated to
batch_size = 16

train_dataset = FraudDataset(
    augmented_train_df['Text'].values,
    augmented_train_df['Fraud level'].values,
    tokenizer,
    max_length=max_length
)
test_dataset = FraudDataset(
    test_df['Text'].values,
    test_df['Fraud level'].values,
    tokenizer,
    max_length=max_length
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Training loop: one full pass over train_loader per epoch, tracking the
# summed batch loss and the number of correct predictions.
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    # Loss is averaged per batch, accuracy per sample.
    print(f"Epoch {epoch + 1}/{epochs} | Loss: {total_loss/len(train_loader):.4f} | Accuracy: {correct/total:.4f}")

# Evaluation: collect predictions and labels over the whole test loader
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # No labels are passed here, so only logits are used.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Confusion Matrix (rows = actual labels, columns = predicted labels)
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud (0)', 'Fraud (1)'], yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - RoBERTa')
plt.show()


Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 

FileNotFoundError: [Errno 2] No such file or directory: '/content/combined_data.csv'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix

# Load data: read the text and label columns, keep rows labelled 0 or 1
# (isin() also drops rows whose label is NaN).
df = pd.read_csv("/content/combined_data.csv", usecols=["Text", "Fraud level"])
df = df[df['Fraud level'].isin([0,1])]

# Stratified train-test split (80-20); random_state fixes the shuffle
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Fraud level'])

# GPU when available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and two-class classification model, moved to the chosen device
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
model.to(device)

class FraudDataset(Dataset):
    """Dataset that tokenizes a single text on every ``__getitem__`` call.

    Args:
        texts: indexable collection of texts (each item is converted with ``str``).
        labels: indexable collection of class ids aligned with ``texts``.
        tokenizer: callable applied to one text with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``.
        max_length: padded/truncated sequence length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx`` with the batch axis removed."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        tokenized = self.tokenizer(
            raw_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        sample = {}
        for field in ('input_ids', 'attention_mask'):
            # Drop the leading batch axis added by return_tensors='pt'.
            sample[field] = tokenized[field].squeeze(0)
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Sequence length used for padding/truncation and the loader batch size
max_length = 128
batch_size = 16

train_dataset = FraudDataset(train_df['Text'].values, train_df['Fraud level'].values, tokenizer, max_length)
test_dataset = FraudDataset(test_df['Text'].values, test_df['Fraud level'].values, tokenizer, max_length)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01, eps=1e-8)

epochs = 4

# One scheduler step is taken per batch in train_epoch, so the schedule spans
# batches-per-epoch * epochs steps; the first 10% of them are warmup.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)

def train_epoch(model, dataloader):
    """Run one optimisation pass over ``dataloader``.

    Relies on the module-level ``optimizer``, ``scheduler`` and ``device``.

    Args:
        model: model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.

    Returns:
        ``(mean batch loss, fraction of correctly classified samples)``.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0
    for batch in dataloader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        result = model(ids, attention_mask=mask, labels=targets)
        batch_loss = result.loss

        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        loss_sum += batch_loss.item()
        hits = torch.argmax(result.logits, dim=1) == targets
        n_correct += hits.sum().item()
        n_seen += targets.size(0)
    return loss_sum / len(dataloader), n_correct / n_seen

def eval_model(model, dataloader):
    """Predict every batch of ``dataloader`` without gradient tracking.

    Relies on the module-level ``device``.

    Args:
        model: model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        dataloader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.

    Returns:
        ``(accuracy, true labels, predicted labels)``; the two label
        sequences are Python lists in loader order.
    """
    model.eval()
    truths, guesses = [], []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_logits = model(ids, attention_mask=mask).logits
            batch_preds = torch.argmax(batch_logits, dim=1)

            guesses.extend(batch_preds.cpu().numpy())
            truths.extend(targets.cpu().numpy())
    return accuracy_score(truths, guesses), truths, guesses

# Train for `epochs` passes, scoring the test loader after each pass.
for epoch in range(epochs):
    train_loss, train_acc = train_epoch(model, train_loader)
    test_acc, y_true, y_pred = eval_model(model, test_loader)

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

# After the loop y_true / y_pred hold the results of the final epoch only, so
# the matrix below describes the last evaluation (rows = actual, cols = predicted).
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Not Fraud (0)', 'Fraud (1)'],
            yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from deep_translator import GoogleTranslator
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import torch
from torch.utils.data import Dataset

# Load dataset; a missing file is reported and the run is stopped with status 1.
try:
    df = pd.read_csv("/content/combined_data.csv", usecols=["Text", "Fraud level"])
except FileNotFoundError:
    print("Error: combined_data.csv not found.")
    exit(1)

# Remove rows where 'Fraud level' is NaN or not in [0, 1]
df = df[df['Fraud level'].isin([0, 1])]

# Split dataset (80% training, 20% test), stratified on the label
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Fraud level"])

# Back-translation augmentation is disabled in this cell; the commented code
# below is kept for reference only.
# Function for back translation (Bangla → English → Bangla)
# def back_translate(text, src_lang="bn", mid_lang="en"):
#     try:
#         translated = GoogleTranslator(source=src_lang, target=mid_lang).translate(text)
#         back_translated = GoogleTranslator(source=mid_lang, target=src_lang).translate(translated)
#         return back_translated
#     except Exception as e:
#         print(f"Translation error: {e}")
#         return text  # If translation fails, return original text

# Create a new DataFrame with back-translated and concatenated text
# back_translated_df = train_df.copy()
# back_translated_df["Text"] = back_translated_df["Text"].apply(lambda x: x + " " + back_translate(x))

# Concatenate original training set with back-translated set
# augmented_train_df = pd.concat([train_df, back_translated_df], ignore_index=True)
augmented_train_df = train_df.copy()  # Use original training data without back-translation

# Display results (with augmentation disabled, both training sizes are equal)
print("Original Training Set Size:", len(train_df))
print("New Augmented Training Set Size:", len(augmented_train_df))
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nTest Set (Unchanged):\n", test_df.head())
print("\nUnique Fraud Levels in Training Set:", augmented_train_df['Fraud level'].unique())
print("Unique Fraud Levels in Test Set:", test_df['Fraud level'].unique())

# Combine test and augmented training set for visualization
df_combined = pd.concat([test_df, augmented_train_df])

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution.
# The bar heights are taken from fraud_counts (0 when a label is absent)
# instead of being hard-coded, so the chart always matches the loaded data.
print("""
```chartjs
{
  "type": "bar",
  "data": {
    "labels": ["Not Fraud (0)", "Fraud (1)"],
    "datasets": [{
      "label": "Fraud Label Count",
      "data": [""" + str(fraud_counts.get(0, 0)) + """, """ + str(fraud_counts.get(1, 0)) + """],
      "backgroundColor": ["#36A2EB", "#FF9F40"],
      "borderColor": ["#36A2EB", "#FF9F40"],
      "borderWidth": 1
    }]
  },
  "options": {
    "scales": {
      "y": {
        "beginAtZero": true,
        "title": {
          "display": true,
          "text": "Count"
        }
      },
      "x": {
        "title": {
          "display": true,
          "text": "Fraud Label"
        }
      }
    },
    "plugins": {
      "title": {
        "display": true,
        "text": "Fraud Label Distribution in Test + Training Set"
      },
      "legend": {
        "display": false
      }
    }
  }
}
```
""")

# Display dataset info
print("\nDataset Info:")
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
def replace_strings(text):
    """Return ``text`` with every character from a fixed set of ranges removed.

    The removed code-point ranges are U+1F600-1F64F, U+1F300-1F5FF,
    U+1F680-1F6FF, U+1F1E0-1F1FF, U+2702-27B0, U+24C2-1F251 (emoji and
    symbol ranges), U+00C0-017F (accented Latin letters) and U+2000-206F
    (general punctuation). Everything else is left untouched.
    """
    stripped_ranges = (
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\u00C0-\u017F"
        "\u2000-\u206F"
    )
    return re.sub("[" + stripped_ranges + "]+", r'', text, flags=re.UNICODE)

def remove_punctuations(my_str):
    """Delete every character listed in a fixed blacklist from ``my_str``.

    Args:
        my_str: text to filter (a ``str``).

    Returns:
        ``my_str`` without any blacklisted character; all other characters
        keep their order.

    NOTE(review): besides punctuation, the blacklist literal also contains the
    letters ``E R O e r o`` and the ASCII and Bengali digits, so those are
    removed from the text as well. Confirm that this is intended.
    """
    # The backslash is written as "\\" so the literal holds the same characters
    # as before without relying on an invalid escape sequence.
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\\’,<>./?@#$%^&*_~‘—॥”‰⚽️✌ ￰৷￰'''
    # One translate() pass replaces the character-by-character concatenation.
    return my_str.translate(str.maketrans('', '', punctuations))

def preprocessing(text):
    """Clean ``text``: apply ``replace_strings`` first, then ``remove_punctuations``."""
    without_symbols = replace_strings(text)
    return remove_punctuations(without_symbols)

# Apply preprocessing once; str() guards against non-string cells.
augmented_train_df['Text'] = augmented_train_df['Text'].apply(lambda x: preprocessing(str(x)))
test_df['Text'] = test_df['Text'].apply(lambda x: preprocessing(str(x)))

# Reset indices so rows are numbered 0..n-1
augmented_train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Prepare training and testing data as NumPy arrays; labels are cast to int
train_texts = augmented_train_df['Text'].values
train_labels = augmented_train_df['Fraud level'].values.astype(int)
test_texts = test_df['Text'].values
test_labels = test_df['Fraud level'].values.astype(int)

# Initialize XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize data
def tokenize_data(texts, max_length=128):
    """Tokenize all ``texts`` in one call to the module-level ``tokenizer``.

    Args:
        texts: array-like exposing ``.tolist()``.
        max_length: truncation limit passed to the tokenizer.

    Returns:
        The tokenizer output, requested as PyTorch tensors with padding enabled.
    """
    options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **options)

# Tokenize each split separately (default max_length of tokenize_data).
train_encodings = tokenize_data(train_texts)
test_encodings = tokenize_data(test_texts)

# Create custom dataset class
class FraudDataset(Dataset):
    """Dataset over pre-tokenized encodings and their labels.

    Args:
        encodings: mapping from field name to an indexable batch of values;
            item ``idx`` of every field makes up one sample.
        labels: sequence of class ids, stored as a ``torch.long`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus its ``labels`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = column[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized splits together with their labels.
train_dataset = FraudDataset(train_encodings, train_labels)
test_dataset = FraudDataset(test_encodings, test_labels)

# Initialize XLM-RoBERTa model with a two-class classification head
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Custom callback to log loss and accuracy
class CustomCallback(TrainerCallback):
    """Collect loss and accuracy values from log events and plot them.

    The three lists are filled in arrival order:
      * ``train_loss`` - every logged ``loss`` value
      * ``eval_loss``  - every logged ``eval_loss`` value
      * ``eval_acc``   - every logged ``eval_accuracy`` value
    """

    def __init__(self):
        self.train_loss = []
        self.eval_loss = []
        self.eval_acc = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the tracked metrics present in ``logs``.

        ``logs`` defaults to ``None``; a missing or empty payload is ignored
        instead of raising ``AttributeError`` on ``None.get``.
        """
        if not logs:
            return
        tracked = (
            ('loss', self.train_loss),
            ('eval_loss', self.eval_loss),
            ('eval_accuracy', self.eval_acc),
        )
        for key, series in tracked:
            value = logs.get(key)
            if value is not None:
                series.append(value)

    def plot_metrics(self):
        """Draw the loss curves (left) and the accuracy curve (right).

        Each series is plotted against its own list index, so series of
        different lengths are not aligned on the x axis.
        """
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.plot(self.train_loss, label='Train Loss')
        plt.plot(self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Logging Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Callback instance that accumulates the logged metrics for plotting later.
custom_callback = CustomCallback()

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # NOTE(review): warmup is a fixed step count, not a ratio — confirm it is
    # small relative to the total number of optimizer steps for this dataset.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; the best checkpoint by the
    # "accuracy" value returned from compute_metrics is reloaded at the end.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none"  # Disable W&B logging
)

# Define compute metrics function
def compute_metrics(pred):
    """Return ``{"accuracy": ...}`` for a prediction object.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis and
    compared against ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Initialize Trainer
# NOTE(review): eval_dataset is the test set, and training_args reloads the
# best checkpoint by accuracy on it, so the "Test Accuracy" printed below is
# measured on the same data used for checkpoint selection. Confirm this is
# intended, or hold out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[custom_callback]
)

# Train model
trainer.train()

# Evaluate model (no argument: uses the eval_dataset given to the Trainer)
results = trainer.evaluate()
print("Test Accuracy:", results['eval_accuracy'])

# Plot loss and accuracy
custom_callback.plot_metrics()

# Generate predictions for confusion matrix
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Plot confusion matrix (rows = actual labels, columns = predicted labels)
cm = confusion_matrix(true_labels, pred_labels)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud (0)', 'Fraud (1)'], yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XLM-RoBERTa Classification')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from deep_translator import GoogleTranslator
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import torch
from torch.utils.data import Dataset

# Load dataset; a missing file is reported and the run is stopped with status 1.
try:
    df = pd.read_csv("/content/combined_data.csv", usecols=["Text", "Fraud level"])
except FileNotFoundError:
    print("Error: combined_data.csv not found.")
    exit(1)

# Remove rows where 'Fraud level' is NaN or not in [0, 1]
df = df[df['Fraud level'].isin([0, 1])]

# Split dataset: 70% train, 10% validation, 20% test.
# First hold out 20% for testing, then take 12.5% of the remaining 80% as
# validation (0.125 * 0.80 = 0.10 of the full dataset). Both splits are stratified.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["Fraud level"])
train_df, val_df = train_test_split(train_val_df, test_size=0.125, random_state=42, stratify=train_val_df["Fraud level"])  # 12.5% of train_val = 10% of the full dataset

# Function for back translation (Bangla → English → Bangla) without API key
def back_translate(text, src_lang="bn", mid_lang="en"):
    """Translate ``text`` from ``src_lang`` to ``mid_lang`` and back again.

    Args:
        text: text to paraphrase through the round trip.
        src_lang: language code of ``text`` and of the returned text.
        mid_lang: intermediate language code.

    Returns:
        The back-translated text, or ``text`` unchanged when any step raises
        (the error is printed, not re-raised).
    """
    try:
        pivot = GoogleTranslator(source=src_lang, target=mid_lang).translate(text)
        return GoogleTranslator(source=mid_lang, target=src_lang).translate(pivot)
    except Exception as e:
        print(f"Translation error: {e}")
        # Best effort: fall back to the untouched input.
        return text

# Create a new DataFrame with back-translated and concatenated text for training set.
# Every augmented row is "<original> <back-translated>". The column is coerced
# to str first (the preprocessing step later applies str() too), so a missing
# or non-string cell no longer raises TypeError during concatenation.
back_translated_df = train_df.copy()
back_translated_df["Text"] = back_translated_df["Text"].astype(str).apply(lambda x: x + " " + back_translate(x))

# Concatenate original training set with back-translated set
augmented_train_df = pd.concat([train_df, back_translated_df], ignore_index=True)

# Display results: split sizes, sample rows and the label values in each split
print("Original Training Set Size:", len(train_df))
print("Augmented Training Set Size:", len(augmented_train_df))
print("Validation Set Size:", len(val_df))
print("Test Set Size:", len(test_df))
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nValidation Set:\n", val_df.head())
print("\nTest Set:\n", test_df.head())
print("\nUnique Fraud Levels in Training Set:", augmented_train_df['Fraud level'].unique())
print("Unique Fraud Levels in Validation Set:", val_df['Fraud level'].unique())
print("Unique Fraud Levels in Test Set:", test_df['Fraud level'].unique())

# Combine test, validation, and augmented training set for visualization
# (augmented rows are included, so training labels are counted as augmented)
df_combined = pd.concat([test_df, val_df, augmented_train_df])

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution; the two bar values are
# spliced in from fraud_counts (0 when a label is absent).
print("""
```chartjs
{
  "type": "bar",
  "data": {
    "labels": ["Not Fraud (0)", "Fraud (1)"],
    "datasets": [{
      "label": "Fraud Label Count",
      "data": [""" + str(fraud_counts.get(0, 0)) + """, """ + str(fraud_counts.get(1, 0)) + """],
      "backgroundColor": ["#36A2EB", "#FF9F40"],
      "borderColor": ["#36A2EB", "#FF9F40"],
      "borderWidth": 1
    }]
  },
  "options": {
    "scales": {
      "y": {
        "beginAtZero": true,
        "title": {
          "display": true,
          "text": "Count"
        }
      },
      "x": {
        "title": {
          "display": true,
          "text": "Fraud Label"
        }
      }
    },
    "plugins": {
      "title": {
        "display": true,
        "text": "Fraud Label Distribution in Test + Validation + Training Set"
      },
      "legend": {
        "display": false
      }
    }
  }
}
```
""")

# Display dataset info
print("\nDataset Info:")
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
def replace_strings(text):
    """Return ``text`` with every character from a fixed set of ranges removed.

    The removed code-point ranges are U+1F600-1F64F, U+1F300-1F5FF,
    U+1F680-1F6FF, U+1F1E0-1F1FF, U+2702-27B0, U+24C2-1F251 (emoji and
    symbol ranges), U+00C0-017F (accented Latin letters) and U+2000-206F
    (general punctuation). Everything else is left untouched.
    """
    stripped_ranges = (
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\u00C0-\u017F"
        "\u2000-\u206F"
    )
    return re.sub("[" + stripped_ranges + "]+", r'', text, flags=re.UNICODE)

def remove_punctuations(my_str):
    """Delete every character listed in a fixed blacklist from ``my_str``.

    Args:
        my_str: text to filter (a ``str``).

    Returns:
        ``my_str`` without any blacklisted character; all other characters
        keep their order.

    NOTE(review): besides punctuation, the blacklist literal also contains the
    letters ``E R O e r o`` and the ASCII and Bengali digits, so those are
    removed from the text as well. Confirm that this is intended.
    """
    # The backslash is written as "\\" so the literal holds the same characters
    # as before without relying on an invalid escape sequence.
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\\’,<>./?@#$%^&*_~‘—॥”‰⚽️✌ ￰৷￰'''
    # One translate() pass replaces the character-by-character concatenation.
    return my_str.translate(str.maketrans('', '', punctuations))

def preprocessing(text):
    """Clean ``text``: apply ``replace_strings`` first, then ``remove_punctuations``."""
    without_symbols = replace_strings(text)
    return remove_punctuations(without_symbols)

# Apply preprocessing once; str() guards against non-string cells.
# These assignments overwrite the 'Text' column of the frames in place.
augmented_train_df['Text'] = augmented_train_df['Text'].apply(lambda x: preprocessing(str(x)))
val_df['Text'] = val_df['Text'].apply(lambda x: preprocessing(str(x)))
test_df['Text'] = test_df['Text'].apply(lambda x: preprocessing(str(x)))

# Reset indices so rows are numbered 0..n-1
augmented_train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Prepare training, validation, and testing data as NumPy arrays; labels cast to int
train_texts = augmented_train_df['Text'].values
train_labels = augmented_train_df['Fraud level'].values.astype(int)
val_texts = val_df['Text'].values
val_labels = val_df['Fraud level'].values.astype(int)
test_texts = test_df['Text'].values
test_labels = test_df['Fraud level'].values.astype(int)

# Initialize XLM-RoBERTa tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize data
def tokenize_data(texts, max_length=128):
    """Tokenize all ``texts`` in one call to the module-level ``tokenizer``.

    Args:
        texts: array-like exposing ``.tolist()``.
        max_length: truncation limit passed to the tokenizer.

    Returns:
        The tokenizer output, requested as PyTorch tensors with padding enabled.
    """
    options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **options)

# Tokenize each split separately (default max_length of tokenize_data).
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)
test_encodings = tokenize_data(test_texts)

# Create custom dataset class
class FraudDataset(Dataset):
    """Dataset over pre-tokenized encodings and their labels.

    Args:
        encodings: mapping from field name to an indexable batch of values;
            item ``idx`` of every field makes up one sample.
        labels: sequence of class ids, stored as a ``torch.long`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus its ``labels`` entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = column[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized splits together with their labels.
train_dataset = FraudDataset(train_encodings, train_labels)
val_dataset = FraudDataset(val_encodings, val_labels)
test_dataset = FraudDataset(test_encodings, test_labels)

# Initialize XLM-RoBERTa model with a two-class classification head
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Custom callback to log loss and accuracy
class CustomCallback(TrainerCallback):
    """Collect loss and accuracy values from log events and plot them.

    The three lists are filled in arrival order:
      * ``train_loss`` - every logged ``loss`` value
      * ``eval_loss``  - every logged ``eval_loss`` value
      * ``eval_acc``   - every logged ``eval_accuracy`` value
    """

    def __init__(self):
        self.train_loss = []
        self.eval_loss = []
        self.eval_acc = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the tracked metrics present in ``logs``.

        ``logs`` defaults to ``None``; a missing or empty payload is ignored
        instead of raising ``AttributeError`` on ``None.get``.
        """
        if not logs:
            return
        tracked = (
            ('loss', self.train_loss),
            ('eval_loss', self.eval_loss),
            ('eval_accuracy', self.eval_acc),
        )
        for key, series in tracked:
            value = logs.get(key)
            if value is not None:
                series.append(value)

    def plot_metrics(self):
        """Draw the loss curves (left) and the accuracy curve (right).

        Each series is plotted against its own list index, so series of
        different lengths are not aligned on the x axis.
        """
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.plot(self.train_loss, label='Train Loss')
        plt.plot(self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Logging Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Callback instance that accumulates the logged metrics for plotting later.
custom_callback = CustomCallback()

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # NOTE(review): warmup is a fixed step count, not a ratio — confirm it is
    # small relative to the total number of optimizer steps for this dataset.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,  # Increased to reduce log verbosity
    # Evaluate and checkpoint once per epoch; the best checkpoint by the
    # "accuracy" value returned from compute_metrics is reloaded at the end.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none"  # Disable W&B logging
)

# Define compute metrics function
def compute_metrics(pred):
    """Return ``{"accuracy": ...}`` for a prediction object.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis and
    compared against ``pred.label_ids``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_classes)}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Use validation set for evaluation
    compute_metrics=compute_metrics,
    callbacks=[custom_callback]
)

# Train model
trainer.train()

# Score the held-out test set once with predict(). Calling
# trainer.evaluate(test_dataset) here would log "eval_loss"/"eval_accuracy",
# which CustomCallback appends to the validation curves plotted below, and the
# test set would be run a second time for the confusion matrix. predict()
# reports its metrics under the "test_" prefix and also returns the raw
# predictions.
predictions = trainer.predict(test_dataset, metric_key_prefix="test")
results = predictions.metrics
print(f"Test Accuracy: {results['test_accuracy']}")

# Plot loss and accuracy (validation curves only)
custom_callback.plot_metrics()

# Class predictions and true labels for the confusion matrix on the test set
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Plot confusion matrix (rows = actual labels, columns = predicted labels)
cm = confusion_matrix(true_labels, pred_labels)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud (0)', 'Fraud (1)'], yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XLM-RoBERTa Classification')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import torch
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

# Display results.
# NOTE: train_df, augmented_train_df, val_df and test_df are not created in
# this cell; they must already exist from an earlier cell's run.
print("Original Training Set Size:", len(train_df))
print("Augmented Training Set Size:", len(augmented_train_df))
print("Validation Set Size:", len(val_df))
print("Test Set Size:", len(test_df))
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nValidation Set:\n", val_df.head())
print("\nTest Set:\n", test_df.head())
print("\nUnique Fraud Levels in Training Set:", augmented_train_df['Fraud level'].unique())
print("Unique Fraud Levels in Validation Set:", val_df['Fraud level'].unique())
print("Unique Fraud Levels in Test Set:", test_df['Fraud level'].unique())

# Combine test, validation, and augmented training set for visualization
df_combined = pd.concat([test_df, val_df, augmented_train_df])

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution; the two bar values are
# spliced in from fraud_counts (0 when a label is absent). The closing brace
# of "options" was missing, which left the emitted JSON unbalanced.
print("""
```chartjs
{
  "type": "bar",
  "data": {
    "labels": ["Not Fraud (0)", "Fraud (1)"],
    "datasets": [{
      "label": "Fraud Label Count",
      "data": [""" + str(fraud_counts.get(0, 0)) + """, """ + str(fraud_counts.get(1, 0)) + """],
      "backgroundColor": ["#36A2EB", "#FF9F40"],
      "borderColor": ["#36A2EB", "#FF9F40"],
      "borderWidth": 1
    }]
  },
  "options": {
    "scales": {
      "y": {
        "beginAtZero": true,
        "title": {
          "display": true,
          "text": "Count"
        }
      },
      "x": {
        "title": {
          "display": true,
          "text": "Fraud Label"
        }
      }
    },
    "plugins": {
      "title": {
        "display": true,
        "text": "Fraud Label Distribution in Test + Validation + Training Set"
      },
      "legend": {
        "display": false
      }
    }
  }
}
```
""")

# Display dataset info
print("\nDataset Info:")
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
# Compiled once and shared by every call.
# Besides the emoji/pictograph blocks, the character class also covers
# U+24C2-U+1F251 (a wide span containing far more than emoji),
# U+00C0-U+017F (Latin-1 Supplement / Latin Extended-A letters) and
# U+2000-U+206F (General Punctuation: typographic dashes, quotes, spaces).
_STRIP_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\u00C0-\u017F"
    "\u2000-\u206F"
    "]+",
    flags=re.UNICODE,
)


def replace_strings(text):
    """Return ``text`` with every run of characters in the strip pattern removed."""
    return _STRIP_PATTERN.sub('', text)


def preprocessing(text):
    """Clean one text sample; currently this only delegates to ``replace_strings``."""
    # NOTE(review): ASCII punctuation is kept, but typographic punctuation in
    # U+2000-U+206F is removed by the pattern along with the emoji.
    cleaned = replace_strings(text)
    return cleaned

# Clean the text column of every split, then renumber its rows from zero.
for split_df in (augmented_train_df, val_df, test_df):
    split_df['Text'] = split_df['Text'].apply(lambda raw: preprocessing(str(raw)))
    split_df.reset_index(drop=True, inplace=True)

# Texts and integer labels of each split as NumPy arrays.
train_texts, train_labels = (
    augmented_train_df['Text'].values,
    augmented_train_df['Fraud level'].values.astype(int),
)
val_texts, val_labels = (
    val_df['Text'].values,
    val_df['Fraud level'].values.astype(int),
)
test_texts, test_labels = (
    test_df['Text'].values,
    test_df['Fraud level'].values.astype(int),
)

# 'balanced' class weights from the training labels, stored as a float tensor
# on the GPU when one is available (CPU otherwise).
label_values = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=label_values, y=train_labels)
weights_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=weights_device)

# XLM-RoBERTa tokenizer used by tokenize_data below.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize data
def tokenize_data(texts, max_length=256):
    """Tokenize an array of texts with the module-level ``tokenizer``.

    Args:
        texts: object exposing ``.tolist()`` (here the ``.values`` array of a
            text column).
        max_length: truncation limit handed to the tokenizer.

    Returns:
        The tokenizer output, with truncation and padding enabled and
        PyTorch tensors requested (``return_tensors='pt'``).
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **tokenizer_kwargs)


# Encode the splits in the order train, validation, test.
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split_texts) for split_texts in (train_texts, val_texts, test_texts)
]

# Create custom dataset class
class FraudDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the encodings and convert ``labels`` to a ``torch.long`` tensor."""
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus its label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels for the Trainer.
train_dataset = FraudDataset(train_encodings, train_labels)
val_dataset = FraudDataset(val_encodings, val_labels)
test_dataset = FraudDataset(test_encodings, test_labels)

# Initialize XLM-RoBERTa model with dropout
# The dropout rate is passed to from_pretrained so it is part of the config the
# model is built from. Assigning model.config.hidden_dropout_prob after loading
# has no effect, because the dropout layers have already been created with the
# checkpoint's default rate.
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=2,
    hidden_dropout_prob=0.3,
)

# Custom trainer to apply class weights
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model to run; may be a wrapped version of the model given
                to the Trainer.
            inputs: batch dict holding ``labels`` and the model inputs.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Keep the weights on the same device as the logits so the loss does
        # not fail when training runs somewhere other than the default device.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        # Read the class count from the logits: a wrapped model
        # (e.g. nn.DataParallel on multi-GPU) has no ``.config`` attribute.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Custom callback to log loss and accuracy
class CustomCallback(TrainerCallback):
    """Record training loss, validation loss and validation accuracy from logs.

    Once a training run has ended, further logs are ignored, so later
    ``trainer.evaluate(...)`` calls (for example on the test set) do not add
    points to the curves.
    """

    def __init__(self):
        self.train_loss = []
        self.eval_loss = []
        self.eval_acc = []
        # Global step at which each train_loss / eval_loss entry was logged.
        self.train_steps = []
        self.eval_steps = []
        self._recording = True

    def on_train_begin(self, args, state, control, **kwargs):
        """Start (or resume) recording when a training run begins."""
        self._recording = True

    def on_train_end(self, args, state, control, **kwargs):
        """Stop recording so post-training evaluations are not mixed in."""
        self._recording = False

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the loss / accuracy values present in ``logs``."""
        # ``logs`` defaults to None, so guard before calling .get on it.
        if not logs or not self._recording:
            return
        step = state.global_step
        if logs.get('loss') is not None:
            self.train_loss.append(logs['loss'])
            self.train_steps.append(step)
        if logs.get('eval_loss') is not None:
            self.eval_loss.append(logs['eval_loss'])
            self.eval_steps.append(step)
        if logs.get('eval_accuracy') is not None:
            self.eval_acc.append(logs['eval_accuracy'])

    def plot_metrics(self):
        """Plot the recorded losses and the validation accuracy side by side."""
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        # Training and validation losses are logged at different frequencies,
        # so both are plotted against the global step to share one x-axis.
        plt.plot(self.train_steps, self.train_loss, label='Train Loss')
        plt.plot(self.eval_steps, self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Training Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Recorder for the loss / accuracy values logged during this run.
custom_callback = CustomCallback()

# Training configuration, collected in a dict and unpacked into TrainingArguments.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 5,             # increased epochs
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'learning_rate': 2e-5,             # standard for Transformers
    'warmup_steps': 500,
    'weight_decay': 0.1,               # increased regularization
    'logging_dir': './logs',
    'logging_steps': 50,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best validation accuracy when training finishes.
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    'report_to': "none",               # disable W&B
}
training_args = TrainingArguments(**training_config)

# Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy and per-class precision / recall / F1.

    Args:
        pred: object exposing ``predictions`` (per-class scores) and
            ``label_ids``, as passed by the Trainer.

    Returns:
        Dict with ``accuracy`` plus ``precision_<c>``, ``recall_<c>`` and
        ``f1_<c>`` for the classes ``0`` and ``1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # labels=[0, 1] keeps both classes in the report even when one of them is
    # absent from this evaluation set (otherwise report['0'] / report['1']
    # raises KeyError). zero_division=0 yields 0.0 without a warning when a
    # class is never predicted.
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    metrics = {"accuracy": accuracy_score(labels, preds)}
    for cls in ("0", "1"):
        metrics[f"precision_{cls}"] = report[cls]['precision']
        metrics[f"recall_{cls}"] = report[cls]['recall']
        metrics[f"f1_{cls}"] = report[cls]['f1-score']
    return metrics

# Initialize CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[custom_callback]
)

# Train model
trainer.train()

# Plot loss and accuracy
# Done right after training: the evaluate() call below also logs eval_* values,
# which would otherwise show up as extra points on the training curves.
custom_callback.plot_metrics()

# Evaluate on validation set
val_results = trainer.evaluate(val_dataset)
print("\nValidation Metrics:")
print(f"Accuracy: {val_results['eval_accuracy']:.4f}")
print(f"Non-Fraud (0) - Precision: {val_results['eval_precision_0']:.4f}, Recall: {val_results['eval_recall_0']:.4f}, F1: {val_results['eval_f1_0']:.4f}")
print(f"Fraud (1) - Precision: {val_results['eval_precision_1']:.4f}, Recall: {val_results['eval_recall_1']:.4f}, F1: {val_results['eval_f1_1']:.4f}")

# Evaluate on test set
# One pass over the test set: predict() returns the raw predictions (needed
# for the confusion matrix) together with the metrics. The "eval" prefix keeps
# the metric keys identical to those evaluate() produces.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print("\nTest Metrics:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Non-Fraud (0) - Precision: {test_results['eval_precision_0']:.4f}, Recall: {test_results['eval_recall_0']:.4f}, F1: {test_results['eval_f1_0']:.4f}")
print(f"Fraud (1) - Precision: {test_results['eval_precision_1']:.4f}, Recall: {test_results['eval_recall_1']:.4f}, F1: {test_results['eval_f1_1']:.4f}")

# Predicted and true labels for the confusion matrix on the test set
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Print prediction distribution
print("\nPrediction Distribution on Test Set:")
print(pd.Series(pred_labels).value_counts())

# Plot confusion matrix
# labels=[0, 1] fixes the matrix at 2x2 so it always matches the tick labels,
# even if one class is missing from both the predictions and the ground truth.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud (0)', 'Fraud (1)'], yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for XLM-RoBERTa Classification')
plt.show()

# Print classification report
# labels=[0, 1] keeps the report aligned with the two target names.
print("\nTest Set Classification Report:")
print(classification_report(true_labels, pred_labels, labels=[0, 1], target_names=['Not Fraud (0)', 'Fraud (1)']))

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import torch
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

# Display results
import json  # used below to emit the chart configuration as valid JSON

# Split sizes, a preview of each split, and the label values present in each.
print("Original Training Set Size:", len(train_df))
print("Augmented Training Set Size:", len(augmented_train_df))
print("Validation Set Size:", len(val_df))
print("Test Set Size:", len(test_df))
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nValidation Set:\n", val_df.head())
print("\nTest Set:\n", test_df.head())
print("\nUnique Fraud Levels in Training Set:", augmented_train_df['Fraud level'].unique())
print("Unique Fraud Levels in Validation Set:", val_df['Fraud level'].unique())
print("Unique Fraud Levels in Test Set:", test_df['Fraud level'].unique())

# Combine test, validation, and augmented training set for visualization
df_combined = pd.concat([test_df, val_df, augmented_train_df])

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution.
# The config is built as a dict and serialised with json.dumps so the printed
# block is always well-formed JSON; the previous hand-written template never
# closed its root object.
chart_config = {
    "type": "bar",
    "data": {
        "labels": ["Not Fraud (0)", "Fraud (1)"],
        "datasets": [{
            "label": "Fraud Label Count",
            # int(): value_counts() yields NumPy integers, which json cannot serialise
            "data": [int(fraud_counts.get(0, 0)), int(fraud_counts.get(1, 0))],
            "backgroundColor": ["#36A2EB", "#FF9F40"],
            "borderColor": ["#36A2EB", "#FF9F40"],
            "borderWidth": 1,
        }],
    },
    "options": {
        "scales": {
            "y": {"beginAtZero": True, "title": {"display": True, "text": "Count"}},
            "x": {"title": {"display": True, "text": "Fraud Label"}},
        },
        "plugins": {
            "title": {
                "display": True,
                "text": "Fraud Label Distribution in Test + Validation + Training Set",
            },
            "legend": {"display": False},
        },
    },
}
print("\n```chartjs\n" + json.dumps(chart_config, indent=2) + "\n```\n")

# Display dataset info
print("\nDataset Info:")
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that printed an extra "None" line).
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
# Compiled once and shared by every call.
# Besides the emoji/pictograph blocks, the character class also covers
# U+24C2-U+1F251 (a wide span containing far more than emoji),
# U+00C0-U+017F (Latin-1 Supplement / Latin Extended-A letters) and
# U+2000-U+206F (General Punctuation: typographic dashes, quotes, spaces).
_STRIP_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\u00C0-\u017F"
    "\u2000-\u206F"
    "]+",
    flags=re.UNICODE,
)


def replace_strings(text):
    """Return ``text`` with every run of characters in the strip pattern removed."""
    return _STRIP_PATTERN.sub('', text)


def preprocessing(text):
    """Clean one text sample; currently this only delegates to ``replace_strings``."""
    # NOTE(review): ASCII punctuation is kept, but typographic punctuation in
    # U+2000-U+206F is removed by the pattern along with the emoji.
    cleaned = replace_strings(text)
    return cleaned

# Clean the text column of every split, then renumber its rows from zero.
for split_df in (augmented_train_df, val_df, test_df):
    split_df['Text'] = split_df['Text'].apply(lambda raw: preprocessing(str(raw)))
    split_df.reset_index(drop=True, inplace=True)

# Texts and integer labels of each split as NumPy arrays.
train_texts, train_labels = (
    augmented_train_df['Text'].values,
    augmented_train_df['Fraud level'].values.astype(int),
)
val_texts, val_labels = (
    val_df['Text'].values,
    val_df['Fraud level'].values.astype(int),
)
test_texts, test_labels = (
    test_df['Text'].values,
    test_df['Fraud level'].values.astype(int),
)

# 'balanced' class weights from the training labels, stored as a float tensor
# on the GPU when one is available (CPU otherwise).
label_values = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=label_values, y=train_labels)
weights_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=weights_device)

# BERT tokenizer used by tokenize_data below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize data
def tokenize_data(texts, max_length=256):
    """Tokenize an array of texts with the module-level ``tokenizer``.

    Args:
        texts: object exposing ``.tolist()`` (here the ``.values`` array of a
            text column).
        max_length: truncation limit handed to the tokenizer.

    Returns:
        The tokenizer output, with truncation and padding enabled and
        PyTorch tensors requested (``return_tensors='pt'``).
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **tokenizer_kwargs)


# Encode the splits in the order train, validation, test.
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split_texts) for split_texts in (train_texts, val_texts, test_texts)
]

# Create custom dataset class
class FraudDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the encodings and convert ``labels`` to a ``torch.long`` tensor."""
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus its label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels for the Trainer.
train_dataset = FraudDataset(train_encodings, train_labels)
val_dataset = FraudDataset(val_encodings, val_labels)
test_dataset = FraudDataset(test_encodings, test_labels)

# Initialize BERT model with dropout
# The dropout rate is passed to from_pretrained so it is part of the config the
# model is built from. Assigning model.config.hidden_dropout_prob after loading
# has no effect, because the dropout layers have already been created with the
# checkpoint's default rate.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    hidden_dropout_prob=0.3,
)

# Custom trainer to apply class weights
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model to run; may be a wrapped version of the model given
                to the Trainer.
            inputs: batch dict holding ``labels`` and the model inputs.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Keep the weights on the same device as the logits so the loss does
        # not fail when training runs somewhere other than the default device.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        # Read the class count from the logits: a wrapped model
        # (e.g. nn.DataParallel on multi-GPU) has no ``.config`` attribute.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Custom callback to log loss and accuracy
class CustomCallback(TrainerCallback):
    """Record training loss, validation loss and validation accuracy from logs.

    Once a training run has ended, further logs are ignored, so later
    ``trainer.evaluate(...)`` calls (for example on the test set) do not add
    points to the curves.
    """

    def __init__(self):
        self.train_loss = []
        self.eval_loss = []
        self.eval_acc = []
        # Global step at which each train_loss / eval_loss entry was logged.
        self.train_steps = []
        self.eval_steps = []
        self._recording = True

    def on_train_begin(self, args, state, control, **kwargs):
        """Start (or resume) recording when a training run begins."""
        self._recording = True

    def on_train_end(self, args, state, control, **kwargs):
        """Stop recording so post-training evaluations are not mixed in."""
        self._recording = False

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the loss / accuracy values present in ``logs``."""
        # ``logs`` defaults to None, so guard before calling .get on it.
        if not logs or not self._recording:
            return
        step = state.global_step
        if logs.get('loss') is not None:
            self.train_loss.append(logs['loss'])
            self.train_steps.append(step)
        if logs.get('eval_loss') is not None:
            self.eval_loss.append(logs['eval_loss'])
            self.eval_steps.append(step)
        if logs.get('eval_accuracy') is not None:
            self.eval_acc.append(logs['eval_accuracy'])

    def plot_metrics(self):
        """Plot the recorded losses and the validation accuracy side by side."""
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        # Training and validation losses are logged at different frequencies,
        # so both are plotted against the global step to share one x-axis.
        plt.plot(self.train_steps, self.train_loss, label='Train Loss')
        plt.plot(self.eval_steps, self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Training Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Recorder for the loss / accuracy values logged during this run.
custom_callback = CustomCallback()

# Training configuration, collected in a dict and unpacked into TrainingArguments.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 5,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'learning_rate': 2e-5,
    'warmup_steps': 500,
    'weight_decay': 0.1,
    'logging_dir': './logs',
    'logging_steps': 50,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best validation accuracy when training finishes.
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    'report_to': "none",
}
training_args = TrainingArguments(**training_config)

# Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy and per-class precision / recall / F1.

    Args:
        pred: object exposing ``predictions`` (per-class scores) and
            ``label_ids``, as passed by the Trainer.

    Returns:
        Dict with ``accuracy`` plus ``precision_<c>``, ``recall_<c>`` and
        ``f1_<c>`` for the classes ``0`` and ``1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # labels=[0, 1] keeps both classes in the report even when one of them is
    # absent from this evaluation set (otherwise report['0'] / report['1']
    # raises KeyError). zero_division=0 yields 0.0 without a warning when a
    # class is never predicted.
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    metrics = {"accuracy": accuracy_score(labels, preds)}
    for cls in ("0", "1"):
        metrics[f"precision_{cls}"] = report[cls]['precision']
        metrics[f"recall_{cls}"] = report[cls]['recall']
        metrics[f"f1_{cls}"] = report[cls]['f1-score']
    return metrics

# Initialize CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[custom_callback]
)

# Train model
trainer.train()

# Plot loss and accuracy
# Done right after training: the evaluate() call below also logs eval_* values,
# which would otherwise show up as extra points on the training curves.
custom_callback.plot_metrics()

# Evaluate on validation set
val_results = trainer.evaluate(val_dataset)
print("\nValidation Metrics:")
print(f"Accuracy: {val_results['eval_accuracy']:.4f}")
print(f"Non-Fraud (0) - Precision: {val_results['eval_precision_0']:.4f}, Recall: {val_results['eval_recall_0']:.4f}, F1: {val_results['eval_f1_0']:.4f}")
print(f"Fraud (1) - Precision: {val_results['eval_precision_1']:.4f}, Recall: {val_results['eval_recall_1']:.4f}, F1: {val_results['eval_f1_1']:.4f}")

# Evaluate on test set
# One pass over the test set: predict() returns the raw predictions (needed
# for the confusion matrix) together with the metrics. The "eval" prefix keeps
# the metric keys identical to those evaluate() produces.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print("\nTest Metrics:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Non-Fraud (0) - Precision: {test_results['eval_precision_0']:.4f}, Recall: {test_results['eval_recall_0']:.4f}, F1: {test_results['eval_f1_0']:.4f}")
print(f"Fraud (1) - Precision: {test_results['eval_precision_1']:.4f}, Recall: {test_results['eval_recall_1']:.4f}, F1: {test_results['eval_f1_1']:.4f}")

# Predicted and true labels for the confusion matrix on the test set
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Print prediction distribution
print("\nPrediction Distribution on Test Set:")
print(pd.Series(pred_labels).value_counts())

# Plot confusion matrix
# labels=[0, 1] fixes the matrix at 2x2 so it always matches the tick labels,
# even if one class is missing from both the predictions and the ground truth.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud (0)', 'Fraud (1)'], yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for BERT Classification')
plt.show()

# Print classification report
# labels=[0, 1] keeps the report aligned with the two target names.
print("\nTest Set Classification Report:")
print(classification_report(true_labels, pred_labels, labels=[0, 1], target_names=['Not Fraud (0)', 'Fraud (1)']))

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import torch
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

# Display results
import json  # used below to emit the chart configuration as valid JSON

# Split sizes, a preview of each split, and the label values present in each.
print("Original Training Set Size:", len(train_df))
print("Augmented Training Set Size:", len(augmented_train_df))
print("Validation Set Size:", len(val_df))
print("Test Set Size:", len(test_df))
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nValidation Set:\n", val_df.head())
print("\nTest Set:\n", test_df.head())
print("\nUnique Fraud Levels in Training Set:", augmented_train_df['Fraud level'].unique())
print("Unique Fraud Levels in Validation Set:", val_df['Fraud level'].unique())
print("Unique Fraud Levels in Test Set:", test_df['Fraud level'].unique())

# Combine test, validation, and augmented training set for visualization
df_combined = pd.concat([test_df, val_df, augmented_train_df])

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution.
# The config is built as a dict and serialised with json.dumps so the printed
# block is always well-formed JSON; the previous hand-written template never
# closed its root object.
chart_config = {
    "type": "bar",
    "data": {
        "labels": ["Not Fraud (0)", "Fraud (1)"],
        "datasets": [{
            "label": "Fraud Label Count",
            # int(): value_counts() yields NumPy integers, which json cannot serialise
            "data": [int(fraud_counts.get(0, 0)), int(fraud_counts.get(1, 0))],
            "backgroundColor": ["#36A2EB", "#FF9F40"],
            "borderColor": ["#36A2EB", "#FF9F40"],
            "borderWidth": 1,
        }],
    },
    "options": {
        "scales": {
            "y": {"beginAtZero": True, "title": {"display": True, "text": "Count"}},
            "x": {"title": {"display": True, "text": "Fraud Label"}},
        },
        "plugins": {
            "title": {
                "display": True,
                "text": "Fraud Label Distribution in Test + Validation + Training Set",
            },
            "legend": {"display": False},
        },
    },
}
print("\n```chartjs\n" + json.dumps(chart_config, indent=2) + "\n```\n")

# Display dataset info
print("\nDataset Info:")
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that printed an extra "None" line).
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
# Compiled once and shared by every call.
# Besides the emoji/pictograph blocks, the character class also covers
# U+24C2-U+1F251 (a wide span containing far more than emoji),
# U+00C0-U+017F (Latin-1 Supplement / Latin Extended-A letters) and
# U+2000-U+206F (General Punctuation: typographic dashes, quotes, spaces).
_STRIP_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\u00C0-\u017F"
    "\u2000-\u206F"
    "]+",
    flags=re.UNICODE,
)


def replace_strings(text):
    """Return ``text`` with every run of characters in the strip pattern removed."""
    return _STRIP_PATTERN.sub('', text)


def preprocessing(text):
    """Clean one text sample; currently this only delegates to ``replace_strings``."""
    # NOTE(review): ASCII punctuation is kept, but typographic punctuation in
    # U+2000-U+206F is removed by the pattern along with the emoji.
    cleaned = replace_strings(text)
    return cleaned

# Clean the text column of every split, then renumber its rows from zero.
for split_df in (augmented_train_df, val_df, test_df):
    split_df['Text'] = split_df['Text'].apply(lambda raw: preprocessing(str(raw)))
    split_df.reset_index(drop=True, inplace=True)

# Texts and integer labels of each split as NumPy arrays.
train_texts, train_labels = (
    augmented_train_df['Text'].values,
    augmented_train_df['Fraud level'].values.astype(int),
)
val_texts, val_labels = (
    val_df['Text'].values,
    val_df['Fraud level'].values.astype(int),
)
test_texts, test_labels = (
    test_df['Text'].values,
    test_df['Fraud level'].values.astype(int),
)

# 'balanced' class weights from the training labels, stored as a float tensor
# on the GPU when one is available (CPU otherwise).
label_values = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=label_values, y=train_labels)
weights_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=weights_device)

# mBERT tokenizer used by tokenize_data below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# Tokenize data
def tokenize_data(texts, max_length=256):
    """Tokenize an array of texts with the module-level ``tokenizer``.

    Args:
        texts: object exposing ``.tolist()`` (here the ``.values`` array of a
            text column).
        max_length: truncation limit handed to the tokenizer.

    Returns:
        The tokenizer output, with truncation and padding enabled and
        PyTorch tensors requested (``return_tensors='pt'``).
    """
    tokenizer_kwargs = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **tokenizer_kwargs)


# Encode the splits in the order train, validation, test.
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split_texts) for split_texts in (train_texts, val_texts, test_texts)
]

# Create custom dataset class
class FraudDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        """Store the encodings and convert ``labels`` to a ``torch.long`` tensor."""
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus its label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels for the Trainer.
train_dataset = FraudDataset(train_encodings, train_labels)
val_dataset = FraudDataset(val_encodings, val_labels)
test_dataset = FraudDataset(test_encodings, test_labels)

# Initialize mBERT model with dropout
# The dropout rate is passed to from_pretrained so it is part of the config the
# model is built from. Assigning model.config.hidden_dropout_prob after loading
# has no effect, because the dropout layers have already been created with the
# checkpoint's default rate.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-uncased',
    num_labels=2,
    hidden_dropout_prob=0.3,
)

# Custom trainer to apply class weights
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: model to run; may be a wrapped version of the model given
                to the Trainer.
            inputs: batch dict holding ``labels`` and the model inputs.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Keep the weights on the same device as the logits so the loss does
        # not fail when training runs somewhere other than the default device.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        # Read the class count from the logits: a wrapped model
        # (e.g. nn.DataParallel on multi-GPU) has no ``.config`` attribute.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Custom callback to log loss and accuracy
class CustomCallback(TrainerCallback):
    """Record training loss, validation loss and validation accuracy from logs.

    Once a training run has ended, further logs are ignored, so later
    ``trainer.evaluate(...)`` calls (for example on the test set) do not add
    points to the curves.
    """

    def __init__(self):
        self.train_loss = []
        self.eval_loss = []
        self.eval_acc = []
        # Global step at which each train_loss / eval_loss entry was logged.
        self.train_steps = []
        self.eval_steps = []
        self._recording = True

    def on_train_begin(self, args, state, control, **kwargs):
        """Start (or resume) recording when a training run begins."""
        self._recording = True

    def on_train_end(self, args, state, control, **kwargs):
        """Stop recording so post-training evaluations are not mixed in."""
        self._recording = False

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store the loss / accuracy values present in ``logs``."""
        # ``logs`` defaults to None, so guard before calling .get on it.
        if not logs or not self._recording:
            return
        step = state.global_step
        if logs.get('loss') is not None:
            self.train_loss.append(logs['loss'])
            self.train_steps.append(step)
        if logs.get('eval_loss') is not None:
            self.eval_loss.append(logs['eval_loss'])
            self.eval_steps.append(step)
        if logs.get('eval_accuracy') is not None:
            self.eval_acc.append(logs['eval_accuracy'])

    def plot_metrics(self):
        """Plot the recorded losses and the validation accuracy side by side."""
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        # Training and validation losses are logged at different frequencies,
        # so both are plotted against the global step to share one x-axis.
        plt.plot(self.train_steps, self.train_loss, label='Train Loss')
        plt.plot(self.eval_steps, self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Training Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Recorder for the loss / accuracy values logged during this run.
custom_callback = CustomCallback()

# Training configuration, collected in a dict and unpacked into TrainingArguments.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 5,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'learning_rate': 2e-5,
    'warmup_steps': 500,
    'weight_decay': 0.1,
    'logging_dir': './logs',
    'logging_steps': 50,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best validation accuracy when training finishes.
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
    'report_to': "none",
}
training_args = TrainingArguments(**training_config)

# Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy and per-class precision / recall / F1.

    Args:
        pred: object exposing ``predictions`` (per-class scores) and
            ``label_ids``, as passed by the Trainer.

    Returns:
        Dict with ``accuracy`` plus ``precision_<c>``, ``recall_<c>`` and
        ``f1_<c>`` for the classes ``0`` and ``1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # labels=[0, 1] keeps both classes in the report even when one of them is
    # absent from this evaluation set (otherwise report['0'] / report['1']
    # raises KeyError). zero_division=0 yields 0.0 without a warning when a
    # class is never predicted.
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    metrics = {"accuracy": accuracy_score(labels, preds)}
    for cls in ("0", "1"):
        metrics[f"precision_{cls}"] = report[cls]['precision']
        metrics[f"recall_{cls}"] = report[cls]['recall']
        metrics[f"f1_{cls}"] = report[cls]['f1-score']
    return metrics

# Initialize CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[custom_callback]
)

# Train model
trainer.train()

# Plot loss and accuracy
# Done right after training: the evaluate() call below also logs eval_* values,
# which would otherwise show up as extra points on the training curves.
custom_callback.plot_metrics()

# Evaluate on validation set
val_results = trainer.evaluate(val_dataset)
print("\nValidation Metrics:")
print(f"Accuracy: {val_results['eval_accuracy']:.4f}")
print(f"Non-Fraud (0) - Precision: {val_results['eval_precision_0']:.4f}, Recall: {val_results['eval_recall_0']:.4f}, F1: {val_results['eval_f1_0']:.4f}")
print(f"Fraud (1) - Precision: {val_results['eval_precision_1']:.4f}, Recall: {val_results['eval_recall_1']:.4f}, F1: {val_results['eval_f1_1']:.4f}")

# Evaluate on test set
# One pass over the test set: predict() returns the raw predictions (needed
# for the confusion matrix) together with the metrics. The "eval" prefix keeps
# the metric keys identical to those evaluate() produces.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print("\nTest Metrics:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Non-Fraud (0) - Precision: {test_results['eval_precision_0']:.4f}, Recall: {test_results['eval_recall_0']:.4f}, F1: {test_results['eval_f1_0']:.4f}")
print(f"Fraud (1) - Precision: {test_results['eval_precision_1']:.4f}, Recall: {test_results['eval_recall_1']:.4f}, F1: {test_results['eval_f1_1']:.4f}")

# Predicted and true labels for the confusion matrix on the test set
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Print prediction distribution
print("\nPrediction Distribution on Test Set:")
print(pd.Series(pred_labels).value_counts())

# Plot confusion matrix
# labels=[0, 1] fixes the matrix at 2x2 so it always matches the tick labels,
# even if one class is missing from both the predictions and the ground truth.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1])
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Fraud (0)', 'Fraud (1)'], yticklabels=['Not Fraud (0)', 'Fraud (1)'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for mBERT Classification')
plt.show()

# Print classification report
# labels=[0, 1] keeps the report aligned with the two target names.
print("\nTest Set Classification Report:")
print(classification_report(true_labels, pred_labels, labels=[0, 1], target_names=['Not Fraud (0)', 'Fraud (1)']))

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainerCallback
import torch
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight

# Display results
import json  # used below to emit the chart configuration as valid JSON

# Split sizes, a preview of each split, and the label values present in each.
print("Original Training Set Size:", len(train_df))
print("Augmented Training Set Size:", len(augmented_train_df))
print("Validation Set Size:", len(val_df))
print("Test Set Size:", len(test_df))
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nValidation Set:\n", val_df.head())
print("\nTest Set:\n", test_df.head())
print("\nUnique Fraud Levels in Training Set:", augmented_train_df['Fraud level'].unique())
print("Unique Fraud Levels in Validation Set:", val_df['Fraud level'].unique())
print("Unique Fraud Levels in Test Set:", test_df['Fraud level'].unique())

# Combine test, validation, and augmented training set for visualization
df_combined = pd.concat([test_df, val_df, augmented_train_df])

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution.
# The config is built as a dict and serialised with json.dumps so the printed
# block is always well-formed JSON; the previous hand-written template never
# closed its root object.
chart_config = {
    "type": "bar",
    "data": {
        "labels": ["Not Fraud (0)", "Fraud (1)"],
        "datasets": [{
            "label": "Fraud Label Count",
            # int(): value_counts() yields NumPy integers, which json cannot serialise
            "data": [int(fraud_counts.get(0, 0)), int(fraud_counts.get(1, 0))],
            "backgroundColor": ["#36A2EB", "#FF9F40"],
            "borderColor": ["#36A2EB", "#FF9F40"],
            "borderWidth": 1,
        }],
    },
    "options": {
        "scales": {
            "y": {"beginAtZero": True, "title": {"display": True, "text": "Count"}},
            "x": {"title": {"display": True, "text": "Fraud Label"}},
        },
        "plugins": {
            "title": {
                "display": True,
                "text": "Fraud Label Distribution in Test + Validation + Training Set",
            },
            "legend": {"display": False},
        },
    },
}
print("\n```chartjs\n" + json.dumps(chart_config, indent=2) + "\n```\n")

# Display dataset info
print("\nDataset Info:")
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that printed an extra "None" line).
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
# Compiled once and shared by every call.
# Besides the emoji/pictograph blocks, the character class also covers
# U+24C2-U+1F251 (a wide span containing far more than emoji),
# U+00C0-U+017F (Latin-1 Supplement / Latin Extended-A letters) and
# U+2000-U+206F (General Punctuation: typographic dashes, quotes, spaces).
_STRIP_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\u00C0-\u017F"
    "\u2000-\u206F"
    "]+",
    flags=re.UNICODE,
)


def replace_strings(text):
    """Return ``text`` with every run of characters in the strip pattern removed."""
    return _STRIP_PATTERN.sub('', text)


def preprocessing(text):
    """Clean one text sample; currently this only delegates to ``replace_strings``."""
    # NOTE(review): ASCII punctuation is kept, but typographic punctuation in
    # U+2000-U+206F is removed by the pattern along with the emoji.
    cleaned = replace_strings(text)
    return cleaned

# Clean the text column of every split, then renumber its rows from zero.
for split_df in (augmented_train_df, val_df, test_df):
    split_df['Text'] = split_df['Text'].apply(lambda raw: preprocessing(str(raw)))
    split_df.reset_index(drop=True, inplace=True)

# Texts and integer labels of each split as NumPy arrays.
train_texts, train_labels = (
    augmented_train_df['Text'].values,
    augmented_train_df['Fraud level'].values.astype(int),
)
val_texts, val_labels = (
    val_df['Text'].values,
    val_df['Fraud level'].values.astype(int),
)
test_texts, test_labels = (
    test_df['Text'].values,
    test_df['Fraud level'].values.astype(int),
)

# 'balanced' class weights from the training labels, stored as a float tensor
# on the GPU when one is available (CPU otherwise).
label_values = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=label_values, y=train_labels)
weights_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float, device=weights_device)

# Bangla BERT tokenizer used by tokenize_data below.
tokenizer = ElectraTokenizer.from_pretrained('csebuetnlp/banglabert')

# Tokenize data
def tokenize_data(texts, max_length=256):
    """Tokenize an array of texts with the module-level ``tokenizer``.

    Args:
        texts: array-like exposing ``tolist()`` (the callers pass NumPy arrays).
        max_length: truncation limit forwarded to the tokenizer.

    Returns:
        The tokenizer's output for the whole array, requested as PyTorch
        tensors (``return_tensors='pt'``) with truncation and padding enabled.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    text_list = texts.tolist()
    return tokenizer(text_list, **tokenizer_options)

# Encode the three splits in order: train, validation, test.
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
]

# Create custom dataset class
class FraudDataset(Dataset):
    """Dataset that pairs tokenizer encodings with integer class labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, indexable)``
    pairs; ``labels`` is converted to a ``torch.long`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at ``idx`` and attach the matching label.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

train_dataset = FraudDataset(train_encodings, train_labels)
val_dataset = FraudDataset(val_encodings, val_labels)
test_dataset = FraudDataset(test_encodings, test_labels)

# Initialize Bangla BERT model with dropout.
# The dropout rate is passed to from_pretrained() so it becomes part of the
# config the model is built from. Assigning model.config.hidden_dropout_prob
# after loading only edits the config object: the dropout layers have already
# been constructed with the checkpoint's rate and would not pick up the change.
model = ElectraForSequenceClassification.from_pretrained(
    'csebuetnlp/banglabert',
    num_labels=2,
    hidden_dropout_prob=0.3,
)

# Custom trainer to apply class weights
class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the module-level
    ``class_weights_tensor``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run forward; the Trainer may pass a wrapped
                module here rather than the bare model.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Keep the weights on whatever device the logits ended up on.
        weights = class_weights_tensor.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        # Read num_labels from self.model: the `model` argument can be a
        # wrapper (e.g. nn.DataParallel) that does not expose `.config`.
        num_labels = self.model.config.num_labels
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Custom callback to log loss and accuracy
class CustomCallback(TrainerCallback):
    """Collect training/validation metrics logged during ``Trainer.train()``.

    Only log events emitted between ``on_train_begin`` and ``on_train_end``
    are recorded. Stand-alone ``trainer.evaluate(...)`` calls made after
    training also go through ``on_log``; without this guard their results
    would be appended to the validation curves as bogus extra points.

    Attributes are plain lists:
        train_loss / train_steps: logged training loss and the global step
            at which each value was logged.
        eval_loss / eval_steps: validation loss and its global step.
        eval_acc: validation accuracy, one entry per in-training evaluation.
    """

    def __init__(self):
        self.train_loss = []
        self.train_steps = []
        self.eval_loss = []
        self.eval_steps = []
        self.eval_acc = []
        self._training = False

    def on_train_begin(self, args, state, control, **kwargs):
        self._training = True

    def on_train_end(self, args, state, control, **kwargs):
        self._training = False

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` defaults to None, so guard before calling .get() on it.
        if not self._training or not logs:
            return
        step = state.global_step
        if logs.get('loss') is not None:
            self.train_loss.append(logs['loss'])
            self.train_steps.append(step)
        if logs.get('eval_loss') is not None:
            self.eval_loss.append(logs['eval_loss'])
            self.eval_steps.append(step)
        if logs.get('eval_accuracy') is not None:
            self.eval_acc.append(logs['eval_accuracy'])

    def plot_metrics(self):
        """Plot train/validation loss against global step, and validation accuracy."""
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        # Training loss is logged far more often than validation loss, so both
        # are drawn against the global step to share one comparable x-axis.
        plt.plot(self.train_steps, self.train_loss, label='Train Loss')
        plt.plot(self.eval_steps, self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Training Step')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()

custom_callback = CustomCallback()

# Define training arguments
training_config = dict(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.1,
    # Logging, evaluation and checkpointing cadence
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    # Best-checkpoint selection
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy plus per-class precision/recall/F1 for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the
            predicted class).

    Returns:
        Dict with ``accuracy`` and ``precision_{c}``, ``recall_{c}``,
        ``f1_{c}`` for ``c`` in ``0`` and ``1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # labels=[0, 1] guarantees that both the '0' and '1' entries exist in the
    # report even when a class is missing from this evaluation batch, and
    # zero_division=0 keeps undefined precision/recall at 0 without warnings.
    report = classification_report(
        labels, preds, labels=[0, 1], output_dict=True, zero_division=0
    )
    metrics = {"accuracy": accuracy_score(labels, preds)}
    for class_key in ("0", "1"):
        class_stats = report[class_key]
        metrics[f"precision_{class_key}"] = class_stats['precision']
        metrics[f"recall_{class_key}"] = class_stats['recall']
        metrics[f"f1_{class_key}"] = class_stats['f1-score']
    return metrics

# Initialize CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[custom_callback],
)

# Train model
trainer.train()


def _print_split_metrics(heading, results):
    """Print accuracy and per-class precision/recall/F1 from an evaluate() result dict."""
    print(heading)
    print(f"Accuracy: {results['eval_accuracy']:.4f}")
    for class_name, suffix in (("Non-Fraud (0)", "0"), ("Fraud (1)", "1")):
        precision = results['eval_precision_' + suffix]
        recall = results['eval_recall_' + suffix]
        f1 = results['eval_f1_' + suffix]
        print(f"{class_name} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


# Evaluate on validation set
val_results = trainer.evaluate(val_dataset)
_print_split_metrics("\nValidation Metrics:", val_results)

# Evaluate on test set
test_results = trainer.evaluate(test_dataset)
_print_split_metrics("\nTest Metrics:", test_results)

# Plot loss and accuracy
custom_callback.plot_metrics()

# Generate predictions for confusion matrix on test set
predictions = trainer.predict(test_dataset)
true_labels = predictions.label_ids
pred_labels = predictions.predictions.argmax(-1)

# Print prediction distribution
print("\nPrediction Distribution on Test Set:")
prediction_counts = pd.Series(pred_labels).value_counts()
print(prediction_counts)

# Plot confusion matrix (rows: actual class, columns: predicted class)
class_names = ['Not Fraud (0)', 'Fraud (1)']
cm = confusion_matrix(true_labels, pred_labels)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Bangla BERT Classification')
plt.show()

# Print classification report
print("\nTest Set Classification Report:")
print(classification_report(true_labels, pred_labels, target_names=class_names))

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
from tensorflow.keras import mixed_precision

# Enable mixed precision training: build a 'mixed_float16' policy object and
# install it as the global Keras policy.
# NOTE(review): the softmax output layer defined further down does not set its
# own dtype, so it presumably inherits this policy - confirm that a float16
# output layer is intended.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Minimal Bengali stopword list.
# Consumed by preprocessing(), which drops every whitespace-separated token
# that exactly matches one of these entries.
bengali_stopwords = [
    'এ', 'ও', 'তা', 'তাই', 'তার', 'তিনি', 'তুমি', 'তো', 'থেকে', 'দিয়ে', 'না', 'নেই', 'যে', 'যা', 'যার',
    'যিনি', 'যদি', 'যখন', 'কি', 'কিন্তু', 'কারণ', 'এবং', 'অথবা', 'হয়', 'হতে', 'হয়েছে', 'আমি', 'আমার'
]

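# Assumes df, train_df, val_df, test_df and augmented_train_df were already defined by an earlier cell.
# If they were not, remove the triple quotes around the block below to load and split the dataset
# (it also needs `from sklearn.model_selection import train_test_split`).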
"""
df = pd.read_csv("/content/combined_data.csv", usecols=["Text", "Fraud level"])
df = df[df['Fraud level'].isin([0, 1])]
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["Fraud level"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["Fraud level"])
augmented_train_df = train_df.copy()
"""

# Display results: split sizes first
for caption, frame in (
    ("Original Training Set Size:", train_df),
    ("Augmented Training Set Size:", augmented_train_df),
    ("Validation Set Size:", val_df),
    ("Test Set Size:", test_df),
):
    print(caption, len(frame))

# ...then a preview of each split
print("\nSample of Augmented Training Set:\n", augmented_train_df.head(10))
print("\nValidation Set:\n", val_df.head())
print("\nTest Set:\n", test_df.head())

# ...then the distinct label values found in each split
for caption, frame in (
    ("\nUnique Fraud Levels in Training Set:", augmented_train_df),
    ("Unique Fraud Levels in Validation Set:", val_df),
    ("Unique Fraud Levels in Test Set:", test_df),
):
    print(caption, frame['Fraud level'].unique())

# Combine test, validation, and augmented training set for visualization
all_splits = [test_df, val_df, augmented_train_df]
df_combined = pd.concat(all_splits)

# Count fraud labels
fraud_counts = df_combined['Fraud level'].value_counts()

# Chart.js configuration for fraud label distribution.
# The two counts are spliced into a hand-written JSON template; every object
# opened in the template ("data", "options", "scales", "plugins" and the root)
# must be closed so that the printed block is well-formed JSON.
print("""
```chartjs
{
  "type": "bar",
  "data": {
    "labels": ["Not Fraud (0)", "Fraud (1)"],
    "datasets": [{
      "label": "Fraud Label Count",
      "data": [""" + str(fraud_counts.get(0, 0)) + """, """ + str(fraud_counts.get(1, 0)) + """],
      "backgroundColor": ["#36A2EB", "#FF9F40"],
      "borderColor": ["#36A2EB", "#FF9F40"],
      "borderWidth": 1
    }]
  },
  "options": {
    "scales": {
      "y": {
        "beginAtZero": true,
        "title": {
          "display": true,
          "text": "Count"
        }
      },
      "x": {
        "title": {
          "display": true,
          "text": "Fraud Label"
        }
      }
    },
    "plugins": {
      "title": {
        "display": true,
        "text": "Fraud Label Distribution in Test + Validation + Training Set"
      },
      "legend": {
        "display": false
      }
    }
  }
}
```
""")

# Display dataset info
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nDataset Description:")
print(df.describe())
print("\nFirst 5 Rows of Dataset:")
print(df.head())

# Text preprocessing functions
def replace_strings(text):
    """Strip emoji/symbol ranges, then Latin letters and ASCII digits, from ``text``.

    Two passes are applied in order:
      1. every run of characters in the code-point ranges below is removed;
      2. every run matching ``[a-zA-Z0-9]+`` (case-insensitive) is removed.
    """
    symbol_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # miscellaneous symbols and pictographs
        "\U0001F680-\U0001F6FF"  # transport and map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2-\U0001F251"  # wide catch-all span (U+24C2 .. U+1F251)
        "\u00C0-\u017F"          # Latin-1 Supplement letters / Latin Extended-A
        "\u2000-\u206F"          # general punctuation
    )
    without_symbols = re.sub("[" + symbol_ranges + "]+", "", text, flags=re.UNICODE)
    return re.sub('[a-zA-Z0-9]+', "", without_symbols, flags=re.I)

def remove_punctuations(my_str):
    """Return ``my_str`` with every character listed in ``punctuations`` removed.

    Besides punctuation marks, the list also contains ASCII and Bengali
    digits and the letters ``E R O e r o``, so those are dropped as well.

    NOTE(review): the literal appears to include a whitespace character near
    its end; if that is a plain space, all spaces are removed here and word
    boundaries are lost for later ``split()`` calls - confirm.
    """
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰⚽️✌ ￰৷￰'''
    return "".join(symbol for symbol in my_str if symbol not in punctuations)

def preprocessing(text):
    """Normalise, clean and stopword-filter a single text.

    Steps, in order: cast to ``str``; try ``normalize`` (on failure the error
    is printed and the un-normalised text is used); ``replace_strings``;
    ``remove_punctuations``; drop tokens found in ``bengali_stopwords``;
    collapse whitespace and trim.

    NOTE(review): ``normalize`` is not imported in the visible import list of
    this cell - confirm it is provided by an earlier cell.
    """
    text = str(text)
    try:
        text = normalize(text)  # Unicode normalization
    except Exception as err:
        print(f"Normalization failed: {err}")
    stripped = remove_punctuations(replace_strings(text))
    # Stopword removal on whitespace-separated tokens
    kept_words = [word for word in stripped.split() if word not in bengali_stopwords]
    rejoined = ' '.join(kept_words)
    # Normalize whitespace
    return re.sub(r'\s+', ' ', rejoined).strip()

# Apply preprocessing to every split, then reset its index
for split_frame in (augmented_train_df, val_df, test_df):
    split_frame['Text'] = split_frame['Text'].apply(preprocessing)
    split_frame.reset_index(drop=True, inplace=True)

# Prepare training, validation, and testing data
train_texts, train_labels = (
    augmented_train_df['Text'].values,
    augmented_train_df['Fraud level'].values.astype(int),
)
val_texts, val_labels = (
    val_df['Text'].values,
    val_df['Fraud level'].values.astype(int),
)
test_texts, test_labels = (
    test_df['Text'].values,
    test_df['Fraud level'].values.astype(int),
)

# Compute class weights (keyed by position in the sorted unique-label array)
train_classes = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=train_classes, y=train_labels)
class_weights_dict = dict(enumerate(class_weights))

# Tokenize data
vocab_size = 25000
embedding_dim = 300
max_length = 100
oov_tok = "<OOV>"
num_classes = 2  # width of the one-hot labels; matches the 2-unit softmax output

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Return ``(sequences, padded)`` for ``texts`` using the fitted tokenizer.

    Sequences are post-padded / post-truncated to ``max_length``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return sequences, padded


train_sequences, train_padded = _encode_and_pad(train_texts)
val_sequences, val_padded = _encode_and_pad(val_texts)
test_sequences, test_padded = _encode_and_pad(test_texts)

# One-hot encode labels.
# num_classes is passed explicitly: without it to_categorical infers the width
# from the largest label present, so a split lacking class 1 would produce a
# single-column array that does not fit the model's two outputs.
train_labels_cat = to_categorical(train_labels, num_classes=num_classes)
val_labels_cat = to_categorical(val_labels, num_classes=num_classes)
test_labels_cat = to_categorical(test_labels, num_classes=num_classes)

# LSTM+CNN Model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Conv1D(256, kernel_size=3, activation="relu"))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.20))
model.add(Conv1D(512, kernel_size=3, activation="relu"))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.20))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
# The global policy is 'mixed_float16'; the output layer is pinned to float32
# so the softmax probabilities (and the loss computed from them) are not
# produced in half precision.
model.add(Dense(2, activation='softmax', dtype='float32'))

# Compile with categorical crossentropy
adam = Adam(learning_rate=0.00005, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# Build model
model.build(input_shape=(None, max_length))
model.summary()

# Custom callback to log metrics and compute F1-score
class CustomCallback(Callback):
    """Track per-epoch loss/accuracy and the fraud-class F1 on validation data.

    After each epoch the validation F1 for class 1 is computed, stored in
    ``eval_f1`` and also written into ``logs['val_f1']`` so that callbacks
    which run *after* this one (e.g. ``EarlyStopping(monitor='val_f1')``)
    can see it.

    Args:
        validation_data: ``(inputs, one_hot_labels)`` used for the F1 score.
        test_data: ``(inputs, one_hot_labels)``; stored but not used here.
    """

    def __init__(self, validation_data, test_data):
        super().__init__()
        self.val_data = validation_data
        self.test_data = test_data
        self.train_loss = []
        self.eval_loss = []
        self.eval_acc = []
        self.eval_f1 = []

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None; fall back to a dict so .get() is safe.
        logs = logs if logs is not None else {}

        val_x, val_y = self.val_data
        val_y_binary = np.argmax(val_y, axis=1)
        val_pred = self.model.predict(val_x, verbose=0)
        val_pred_binary = np.argmax(val_pred, axis=1)
        val_f1 = float(f1_score(val_y_binary, val_pred_binary, labels=[1], average='binary'))

        # Publish the score under the name EarlyStopping monitors.
        logs['val_f1'] = val_f1

        self.train_loss.append(logs.get('loss'))
        self.eval_loss.append(logs.get('val_loss'))
        self.eval_acc.append(logs.get('val_accuracy'))
        self.eval_f1.append(val_f1)
        print(f"Epoch {epoch+1}: Validation F1 (Fraud) = {val_f1:.4f}")

    def plot_metrics(self):
        """Plot loss, validation accuracy and validation F1 per epoch."""
        plt.figure(figsize=(15, 5))
        plt.subplot(1, 3, 1)
        plt.plot(self.train_loss, label='Train Loss')
        plt.plot(self.eval_loss, label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 3, 2)
        plt.plot(self.eval_acc, label='Validation Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.subplot(1, 3, 3)
        plt.plot(self.eval_f1, label='Validation F1 (Fraud)')
        plt.title('F1-Score (Fraud Class)')
        plt.xlabel('Epoch')
        plt.ylabel('F1-Score')
        plt.legend()
        plt.tight_layout()
        plt.show()

# Imported here because this cell's import list only brings in `Callback`.
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping based on F1-score
early_stopping = EarlyStopping(monitor='val_f1', mode='max', patience=3, restore_best_weights=True, verbose=1)

custom_callback = CustomCallback(validation_data=(val_padded, val_labels_cat), test_data=(test_padded, test_labels_cat))

# Train model
history = model.fit(
    train_padded,
    train_labels_cat,
    epochs=30,
    batch_size=30,
    validation_data=(val_padded, val_labels_cat),
    class_weight=class_weights_dict,
    # Order matters: custom_callback must run first so that 'val_f1' is already
    # in the epoch logs when early_stopping looks for it.
    callbacks=[custom_callback, early_stopping]
)

def _score_split(features, onehot_labels):
    """Evaluate ``model`` on one split.

    Returns ``(loss, accuracy, probabilities, predicted_ids, true_ids, report)``
    where ``report`` is the ``classification_report`` dict.
    """
    loss, acc = model.evaluate(features, onehot_labels, verbose=0)
    probabilities = model.predict(features, verbose=0)
    predicted_ids = np.argmax(probabilities, axis=1)
    true_ids = np.argmax(onehot_labels, axis=1)
    report = classification_report(true_ids, predicted_ids, output_dict=True)
    return loss, acc, probabilities, predicted_ids, true_ids, report


def _print_split_report(heading, acc, report):
    """Print accuracy and the per-class precision/recall/F1 rows of ``report``."""
    print(heading)
    print(f"Accuracy: {acc:.4f}")
    for class_name, report_key in (("Non-Fraud (0)", '0'), ("Fraud (1)", '1')):
        row = report[report_key]
        print(f"{class_name} - Precision: {row['precision']:.4f}, Recall: {row['recall']:.4f}, F1: {row['f1-score']:.4f}")


# Evaluate on validation set
(val_loss, val_acc, val_pred,
 val_pred_binary, val_true_binary, val_report) = _score_split(val_padded, val_labels_cat)
_print_split_report("\nValidation Metrics:", val_acc, val_report)

# Evaluate on test set
(test_loss, test_acc, test_pred,
 test_pred_binary, test_true_binary, test_report) = _score_split(test_padded, test_labels_cat)
_print_split_report("\nTest Metrics:", test_acc, test_report)

# Plot loss and accuracy
custom_callback.plot_metrics()

# Reuse the test-set predictions computed above for the confusion matrix
true_labels = test_true_binary
predictions = test_pred_binary

# Print prediction distribution
print("\nPrediction Distribution on Test Set:")
prediction_counts = pd.Series(predictions).value_counts()
print(prediction_counts)

# Plot confusion matrix (rows: actual class, columns: predicted class)
class_names = ['Not Fraud (0)', 'Fraud (1)']
cm = confusion_matrix(true_labels, predictions)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for LSTM+CNN Classification')
plt.show()

# Print classification report
print("\nTest Set Classification Report:")
print(classification_report(true_labels, predictions, target_names=class_names))