In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install torch torch-geometric scikit-learn pandas tqdm


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from transformers import BertTokenizer, BertModel

In [None]:
# Read the V2X message CSV and show the resulting frame as the cell output.
DATA_PATH = '/kaggle/input/spam-dataset/V2X_Data.csv'
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Integer-encode every categorical column, keeping one fitted encoder per column
# so the codes can be mapped back to the original values later.
le_source, le_dest, le_msg_type, le_priority, le_spam = (LabelEncoder() for _ in range(5))

column_encoders = [
    ('Source Vehicle', 'Source_encoded', le_source),
    ('Destination Vehicle', 'Dest_encoded', le_dest),
    ('Message Type', 'MsgType_encoded', le_msg_type),
    ('Priority', 'Priority_encoded', le_priority),
    ('Spam', 'Spam_encoded', le_spam),
]
for raw_column, encoded_column, encoder in column_encoders:
    df[encoded_column] = encoder.fit_transform(df[raw_column])

# Step 2: Load the pretrained BERT tokenizer and encoder
BERT_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
bert_model = BertModel.from_pretrained(BERT_NAME)

In [None]:
from torch_geometric.data import Dataset, Data

class CarNetworkDataset(Dataset):
    """Per-message dataset: every row of ``df`` becomes a one-node graph.

    The node's feature vector is the encoded message type and priority
    followed by the BERT [CLS] embedding of the message text; the label is
    the encoded spam flag.

    Args:
        df: DataFrame providing the 'Source_encoded', 'Dest_encoded',
            'MsgType_encoded', 'Priority_encoded', 'Spam_encoded' and
            'Message Content' columns.
        tokenizer: Tokenizer applied to the message text.
        bert_model: Model whose ``last_hidden_state`` supplies the text embedding.
    """

    def __init__(self, df, tokenizer, bert_model):
        self.df = df
        self.tokenizer = tokenizer
        self.bert_model = bert_model

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the ``Data`` object for row ``idx``.

        Returns:
            ``Data`` with ``x`` of one row (2 categorical codes + text
            embedding), a self-loop ``edge_index``, the label ``y`` and the
            encoded source/destination vehicle ids as extra attributes.

        Raises:
            IndexError: if ``idx`` is past the last row.
        """
        if idx >= len(self.df):
            raise IndexError(f"Index {idx} out of range")
        row = self.df.iloc[idx]

        # The sample contains exactly one node (index 0), so the only valid
        # edge is a self-loop. The encoded vehicle ids are NOT node indices
        # here: using them as edge endpoints would reference nodes that do
        # not exist in ``x``.
        edge_index = torch.zeros((2, 1), dtype=torch.long)

        # Keep the vehicle ids available as plain attributes instead.
        source_vehicle = torch.tensor([row['Source_encoded']], dtype=torch.long)
        dest_vehicle = torch.tensor([row['Dest_encoded']], dtype=torch.long)

        # Categorical features as a single row.
        x_graph = torch.tensor(
            [row['MsgType_encoded'], row['Priority_encoded']], dtype=torch.float
        ).unsqueeze(0)

        # Text features: [CLS] position of BERT's last hidden state (one row,
        # because a single message is encoded).
        encoded_input = self.tokenizer(
            row['Message Content'], padding=True, truncation=True, return_tensors='pt'
        )
        with torch.no_grad():
            bert_output = self.bert_model(**encoded_input)
        text_features = bert_output.last_hidden_state[:, 0, :]

        # Both tensors have one row, so they concatenate column-wise directly.
        x = torch.cat([x_graph, text_features], dim=1)

        # Target label
        y = torch.tensor([row['Spam_encoded']], dtype=torch.long)

        return Data(
            x=x,
            edge_index=edge_index,
            y=y,
            source_vehicle=source_vehicle,
            dest_vehicle=dest_vehicle,
        )


In [None]:

# Wrap the frame in the per-message graph dataset
dataset = CarNetworkDataset(df, tokenizer, bert_model)

# Step 4: Hold out 20% of the samples for testing (fixed seed)
train_dataset, test_dataset = train_test_split(dataset, random_state=42, test_size=0.2)

# Batch both splits identically; only the training split is shuffled
loader_kwargs = dict(batch_size=32)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:

class GCNModel(torch.nn.Module):
    """Two GCNConv layers with ReLU, followed by a linear layer that emits log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return per-node log-softmax scores for features ``x`` and edges ``edge_index``."""
        hidden = x
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden, edge_index))
        logits = self.fc(hidden)
        return F.log_softmax(logits, dim=1)

# Step 6: Pick the device, then build the model (sized from one sample) and its optimizer
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
feature_dim = dataset[0].x.size(1)
model = GCNModel(input_dim=feature_dim, hidden_dim=64, output_dim=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch.nn.functional as F

def train():
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss."""
    model.train()
    running_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        log_probs = model(batch.x, batch.edge_index)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        optimizer.step()
        running_loss = running_loss + batch_loss.item()
    n_batches = len(train_loader)
    return running_loss / n_batches

def evaluate(loader):
    """Score the global ``model`` on every batch of ``loader``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, confusion_matrix)`` where
        precision/recall/F1 use ``average='binary'``.
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            log_probs = model(batch.x, batch.edge_index)
            pred_chunks.append(log_probs.argmax(dim=1).cpu().numpy())
            label_chunks.append(batch.y.cpu().numpy())

    # One flat array per side
    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(label_chunks)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        score(y_true, y_pred, average='binary')
        for score in (precision_score, recall_score, f1_score)
    )
    cm = confusion_matrix(y_true, y_pred)

    return accuracy, precision, recall, f1, cm

# Score the model in its current state on both splits, then print one report.
train_metrics = evaluate(train_loader)
test_metrics = evaluate(test_loader)
train_acc, train_precision, train_recall, train_f1, train_cm = train_metrics
test_acc, test_precision, test_recall, test_f1, test_cm = test_metrics

report_lines = [
    f'Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}',
    f'Train Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}',
    f'Test Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}',
    f'Train Confusion Matrix:\n{train_cm}',
    f'Test Confusion Matrix:\n{test_cm}',
]
print('\n'.join(report_lines))


In [None]:
# Step 9: Run the training
# Per-epoch accuracy is the first value returned by evaluate(loader).
# (test(loader) returns prediction/label arrays, which cannot be formatted with ':.4f'.)
for epoch in range(200):
    loss = train()
    train_acc = evaluate(train_loader)[0]
    test_acc = evaluate(test_loader)[0]
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')


In [None]:
# Inspect the feature shapes of one sample. The first two columns of x are the
# categorical codes; the remaining columns are the BERT text features.
sample = dataset[0]
print(f"x shape: {sample.x.shape}")
print(f"text_features shape: {sample.x[:, 2:].shape}")


In [None]:
def test(loader):
    """Predict every batch of ``loader`` with the global ``model``.

    Returns:
        ``(predictions, labels)`` as two NumPy arrays.
    """
    model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            log_probs = model(batch.x, batch.edge_index)
            batch_pred = log_probs.argmax(dim=1)
            predicted += list(batch_pred.cpu().numpy())
            actual += list(batch.y.cpu().numpy())
    return np.array(predicted), np.array(actual)

In [None]:
# Step 10: Final evaluation and metrics
final_preds, final_labels = test(test_loader)
final_acc = (final_preds == final_labels).mean()
print(f'Final Test Accuracy: {final_acc:.4f}')

# Precision, recall and F1 with average='binary'
precision, recall, f1, _ = precision_recall_fscore_support(final_labels, final_preds, average='binary')
for metric_name, metric_value in (('Precision', precision), ('Recall', recall), ('F1 Score', f1)):
    print(f'{metric_name}: {metric_value:.4f}')

# Confusion matrix as an annotated heatmap
cm = confusion_matrix(final_labels, final_preds)

fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

# Bar chart of the three summary scores, each bar labelled with its value
metrics = ['Precision', 'Recall', 'F1 Score']
values = [precision, recall, f1]

fig_scores, ax_scores = plt.subplots(figsize=(10, 7))
sns.barplot(x=metrics, y=values, ax=ax_scores)
ax_scores.set_title('Precision, Recall, and F1 Score')
ax_scores.set_ylim(0, 1)
for i, v in enumerate(values):
    ax_scores.text(i, v, f'{v:.2f}', ha='center', va='bottom')
plt.show()


In [None]:

# Step 1: Load and preprocess the data
# Re-reads the CSV, so `df` is replaced by a fresh frame without the *_encoded columns.
df = pd.read_csv('/kaggle/input/spam-dataset/V2X_Data.csv')

# Encode categorical variables
# One LabelEncoder per column; each is fitted on, and applied to, its own column.
le_source = LabelEncoder()
le_dest = LabelEncoder()
le_msg_type = LabelEncoder()
le_priority = LabelEncoder()
le_spam = LabelEncoder()

df['Source_encoded'] = le_source.fit_transform(df['Source Vehicle'])
df['Dest_encoded'] = le_dest.fit_transform(df['Destination Vehicle'])
df['MsgType_encoded'] = le_msg_type.fit_transform(df['Message Type'])
df['Priority_encoded'] = le_priority.fit_transform(df['Priority'])
df['Spam_encoded'] = le_spam.fit_transform(df['Spam'])

# Step 2: Prepare BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Step 3: Create graph data
# edge_index has one column per row of df: (encoded source vehicle, encoded destination vehicle).
# NOTE(review): x below has one row per message, so Data treats messages as nodes, while the
# edge endpoints are encoded vehicle ids — confirm these ids are meant to index message rows.
edge_index = torch.tensor([df['Source_encoded'].tolist(), df['Dest_encoded'].tolist()], dtype=torch.long)
x = torch.tensor(df[['MsgType_encoded', 'Priority_encoded']].values, dtype=torch.float)

# Process text data with BERT
# The whole 'Message Content' column is tokenized and passed through BERT in a single call.
# NOTE(review): presumably fine for a small dataset; may need batching for a large one — confirm.
encoded_input = tokenizer(df['Message Content'].tolist(), padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    bert_output = bert_model(**encoded_input)
text_features = bert_output.last_hidden_state[:, 0, :]  # Use [CLS] token representation

# Combine graph features and text features
# Column-wise concatenation: the two categorical codes first, then the text features.
x = torch.cat([x, text_features], dim=1)

# Labels: the encoded 'Spam' column, one entry per row of df.
y = torch.tensor(df['Spam_encoded'].values, dtype=torch.long)

# Create PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index, y=y)

In [None]:


# Step 4: Define the Graph CNN model
class GCNModel(torch.nn.Module):
    """GCNConv -> ReLU -> GCNConv -> ReLU -> Linear, returning log-softmax scores."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the output classes."""
        first = F.relu(self.conv1(x, edge_index))
        second = F.relu(self.conv2(first, edge_index))
        logits = self.fc(second)
        return F.log_softmax(logits, dim=1)

# Step 5: Train-test split
# Split node positions 80/20 with a fixed seed, then turn each side into a boolean mask.
train_indices, test_indices = train_test_split(range(data.num_nodes), test_size=0.2, random_state=42)

train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
train_mask[train_indices] = True
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask[test_indices] = True

data.train_mask = train_mask
data.test_mask = test_mask

# Step 6: Initialize the model and optimizer
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model = GCNModel(input_dim=x.size(1), hidden_dim=64, output_dim=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
data = data.to(device)

# Step 7: Training loop
def train():
    """Take one full-graph gradient step on the training nodes; return the loss as a float."""
    model.train()
    optimizer.zero_grad()
    log_probs = model(data.x, data.edge_index)
    mask = data.train_mask
    loss = F.nll_loss(log_probs[mask], data.y[mask])
    loss.backward()
    optimizer.step()
    return loss.item()

# Step 8: Testing
def test():
    """Return the fraction of test-mask nodes whose predicted class equals the label."""
    model.eval()
    with torch.no_grad():
        pred = model(data.x, data.edge_index).argmax(dim=1)
        mask = data.test_mask
        n_correct = int((pred[mask] == data.y[mask]).sum())
        acc = n_correct / int(mask.sum())
    return acc




In [None]:
# Step 9: Run the training
# 200 full-graph steps; test accuracy is measured and printed on every 10th epoch only.
for epoch in range(200):
    loss = train()
    if epoch % 10 != 0:
        continue
    acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {acc:.4f}')

# Step 10: Final evaluation
final_acc = test()
print(f'Final Test Accuracy: {final_acc:.4f}')



In [None]:
# Run the trained model once over the whole graph.
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
# `pred` holds the predicted class for every node; `correct` holds the labels
# of the test-mask nodes only.
pred = out.argmax(dim=1)
correct = data.y[data.test_mask]


In [None]:
# Only the last expression of a cell is displayed, so show both tensors as one tuple.
pred, correct

In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
def predict(new_data):
    """Return the predicted class index for each node of ``new_data``.

    ``new_data`` must expose ``x`` and ``edge_index``; the global ``model`` is
    switched to eval mode and run without gradient tracking.
    """
    model.eval()
    with torch.no_grad():
        return model(new_data.x, new_data.edge_index).argmax(dim=1)


In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Step 1: Preprocess the data (the `df` already present in the kernel is reused; nothing is read here)

# Encode categorical variables: one freshly fitted LabelEncoder per column
le_source, le_dest, le_msg_type, le_priority, le_spam = (LabelEncoder() for _ in range(5))

for encoder, source_column, target_column in (
    (le_source, 'Source Vehicle', 'Source_encoded'),
    (le_dest, 'Destination Vehicle', 'Dest_encoded'),
    (le_msg_type, 'Message Type', 'MsgType_encoded'),
    (le_priority, 'Priority', 'Priority_encoded'),
    (le_spam, 'Spam', 'Spam_encoded'),
):
    df[target_column] = encoder.fit_transform(df[source_column])

# Step 2: Prepare BERT tokenizer and model
BERT_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
bert_model = BertModel.from_pretrained(BERT_NAME)

# Step 3: Create graph data
source_ids = df['Source_encoded'].tolist()
dest_ids = df['Dest_encoded'].tolist()
edge_index = torch.tensor([source_ids, dest_ids], dtype=torch.long)
x = torch.tensor(df[['MsgType_encoded', 'Priority_encoded']].values, dtype=torch.float)

# Text features: position 0 ([CLS]) of BERT's last hidden state for every message
messages = df['Message Content'].tolist()
encoded_input = tokenizer(messages, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    bert_output = bert_model(**encoded_input)
text_features = bert_output.last_hidden_state[:, 0, :]

# Categorical codes first, text features after
x = torch.cat([x, text_features], dim=1)

y = torch.tensor(df['Spam_encoded'].values, dtype=torch.long)

# Bundle everything into a PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index, y=y)

# Step 4: Split the data (80/20 over node positions, fixed seed) into boolean masks
num_nodes = data.num_nodes
node_indices = list(range(num_nodes))
train_indices, test_indices = train_test_split(node_indices, test_size=0.2, random_state=42)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_indices] = True
data.train_mask = train_mask

test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[test_indices] = True
data.test_mask = test_mask

# Step 5: Define the Graph CNN model
class GCNModel(torch.nn.Module):
    """Two-layer GCN with a linear classification head producing log-probabilities."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Apply both graph convolutions (each followed by ReLU), then the linear head."""
        h = self.conv1(x, edge_index)
        h = F.relu(h)
        h = self.conv2(h, edge_index)
        h = F.relu(h)
        scores = self.fc(h)
        return F.log_softmax(scores, dim=1)

# Step 6: Initialize the model and optimizer, then move the graph to the same device
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model = GCNModel(input_dim=data.x.size(1), hidden_dim=64, output_dim=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
data = data.to(device)

# Step 7: Training loop
def train():
    """Perform one optimiser step using the NLL loss on the training-mask nodes.

    Returns:
        The loss value as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    log_probs = model(data.x, data.edge_index)
    train_nodes = data.train_mask
    loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()
    return loss.item()

# Step 8: Testing
def test():
    """Evaluate the model on both masks.

    Returns:
        ``(train_acc, test_acc, test_predictions, test_labels)``; the last two
        are tensors restricted to the test mask.
    """
    model.eval()
    with torch.no_grad():
        pred = model(data.x, data.edge_index).argmax(dim=1)

        def split_accuracy(mask):
            # Share of nodes under `mask` whose prediction matches the label.
            hits = pred[mask] == data.y[mask]
            return int(hits.sum()) / int(mask.sum())

        train_acc = split_accuracy(data.train_mask)
        test_acc = split_accuracy(data.test_mask)
    return train_acc, test_acc, pred[data.test_mask], data.y[data.test_mask]

# Step 9: Run the training
# Accuracy is computed after every epoch but only reported on every 10th.
for epoch in range(200):
    loss = train()
    train_acc, test_acc, _, _ = test()
    if epoch % 10 != 0:
        continue
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

# Step 10: Final evaluation and metrics
_, _, final_preds, final_labels = test()
final_preds, final_labels = final_preds.cpu().numpy(), final_labels.cpu().numpy()

precision, recall, f1, _ = precision_recall_fscore_support(final_labels, final_preds, average='binary')
for metric_name, metric_value in (('Precision', precision), ('Recall', recall), ('F1 Score', f1)):
    print(f'{metric_name}: {metric_value:.4f}')

# Confusion matrix of the test-node predictions, drawn as an annotated heatmap
cm = confusion_matrix(final_labels, final_preds)

fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')
plt.show()

# Bar chart of precision, recall and F1, each bar annotated with its value
metrics = ['Precision', 'Recall', 'F1 Score']
values = [precision, recall, f1]

fig_scores, ax_scores = plt.subplots(figsize=(10, 7))
sns.barplot(x=metrics, y=values, ax=ax_scores)
ax_scores.set_title('Precision, Recall, and F1 Score')
ax_scores.set_ylim(0, 1)
for i, v in enumerate(values):
    ax_scores.text(i, v, f'{v:.2f}', ha='center', va='bottom')
plt.show()