In [2]:
import json
import pandas as pd


data_file = '/content/split_file_0.jsonl'

rows = []
with open(data_file, 'r') as f:
    for line in f:
        record = json.loads(line.strip())
        num = record['num']
        target = int(record['target'])
        for shingle in record['shingles']:
            rows.append({'shingle': shingle, 'target': target})

df = pd.DataFrame(rows)

output_file = 'processed_shingles.csv'
df.to_csv(output_file, index=False)


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
import matplotlib.pyplot as plt

class ShingleDataset(Dataset):
    def __init__(self, data, input_dim=130):
        self.data = data
        self.input_dim = input_dim

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        shingle_ids = [int(s) for s in item['shingle_id']]

        input_ids = torch.zeros(self.input_dim, dtype=torch.float32)
        input_ids[:len(shingle_ids)] = torch.tensor(shingle_ids[:self.input_dim], dtype=torch.float32)

        label = int(item['target'])
        return {'input_ids': input_ids, 'label': torch.tensor(label, dtype=torch.long)}

In [4]:
file_paths = ['/content/split_file_0.jsonl']
dataframes = []
for file_path in file_paths:
    data = pd.read_json(file_path, lines=True)
    reshaped_data = [{'shingle_id': row['shingles'], 'target': int(row['target'])} for _, row in data.iterrows()]
    reshaped_df = pd.DataFrame(reshaped_data)
    dataframes.append(reshaped_df)

full_data = pd.concat(dataframes, ignore_index=True)

train_data = full_data.sample(frac=0.8, random_state=42)
val_data = full_data.drop(train_data.index)

train_dataset = ShingleDataset(train_data)
val_dataset = ShingleDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [47]:
from sklearn.preprocessing import LabelEncoder

class AutoencoderClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(AutoencoderClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.Linear(embedding_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, 1)
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        encoded = torch.tanh(self.encoder(embedded))
        reconstructed = self.decoder(encoded)
        logits = self.classifier(encoded.mean(dim=1)).squeeze(1)
        return logits, reconstructed


In [24]:

all_inputs = []
for batch in train_loader:
    all_inputs.extend(batch['input_ids'].view(-1).numpy())
for batch in val_loader:
    all_inputs.extend(batch['input_ids'].view(-1).numpy())

label_encoder = LabelEncoder()
label_encoder.fit(all_inputs)




In [48]:
model = AutoencoderClassifier(vocab_size = len(label_encoder.classes_), embedding_dim=128, hidden_dim=256)


criterion_reconstruction = nn.MSELoss()
criterion_classification = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [63]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 10
train_losses = []
val_losses = []
val_accuracies = []
val_roc_auc = []
val_precisions = []
val_recalls = []

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
      inputs, targets = batch['input_ids'].to(device), batch['label'].to(device)

      inputs = inputs.requires_grad_()

      encoded_inputs = label_encoder.transform(inputs.view(-1).detach().cpu().numpy())
      encoded_inputs = torch.tensor(encoded_inputs, dtype=torch.long).view(inputs.size()).to(device)

      logits, reconstructed = model(encoded_inputs)

      reconstructed = reconstructed.requires_grad_()

      reconstructed_flat = reconstructed.view(-1, reconstructed.size(-1))
      encoded_inputs_flat = encoded_inputs.view(-1)

      targets = targets.float()

      loss_reconstruction = criterion_reconstruction(reconstructed_flat, encoded_inputs_flat.float())
      loss_classification = criterion_classification(logits, targets)

      loss = loss_reconstruction + loss_classification

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()


    train_losses.append(total_train_loss / len(train_loader))

    model.eval()
    total_val_loss = 0
    all_preds = []
    all_targets = []
    all_probs = []

    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch['input_ids'].to(device), batch['label'].to(device)

            encoded_inputs = label_encoder.transform(inputs.view(-1).cpu().numpy())
            encoded_inputs = torch.tensor(encoded_inputs, dtype=torch.long).view(inputs.size()).to(device)

            logits, reconstructed = model(encoded_inputs)

            reconstructed_flat = reconstructed.view(-1, reconstructed.size(-1))
            encoded_inputs_flat = encoded_inputs.view(-1)

            loss_reconstruction = criterion_reconstruction(reconstructed_flat, encoded_inputs_flat.float())

            targets = targets.float()

            loss_classification = criterion_classification(logits, targets)

            loss = loss_reconstruction + loss_classification

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    val_losses.append(total_val_loss / len(val_loader))

    val_accuracy = accuracy_score(all_targets, all_preds)
    val_roc_auc = roc_auc_score(all_targets, all_probs)
    val_precision = precision_score(all_targets, all_preds)
    val_recall = recall_score(all_targets, all_preds)

    val_accuracies.append(val_accuracy)
    val_roc_auc.append(val_roc_auc)
    val_precisions.append(val_precision)
    val_recalls.append(val_recall)

    print(f"Epoch {epoch+1}/{num_epochs}:")
    print(f"  Train Loss: {train_losses[-1]:.4f}")
    print(f"  Val Loss: {val_losses[-1]:.4f}")
    print(f"  Val Accuracy: {val_accuracy:.4f}")
    print(f"  Val ROC-AUC: {val_roc_auc:.4f}")
    print(f"  Val Precision: {val_precision:.4f}")
    print(f"  Val Recall: {val_recall:.4f}")


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn