In [1]:
%pip install jsonlines

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
import jsonlines
import pandas as pd
from collections import defaultdict
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import matplotlib.pyplot as plt

data_file = '/home/jupyter/datasphere/project/split_file_0 (2).jsonl'

shingle_counts = defaultdict(int)
shingle_phishing_counts = defaultdict(int)

with jsonlines.open(data_file) as reader:
    for obj in reader:
        num = obj['num']
        target = obj['target']
        for shingle in obj['shingles']:
            shingle_counts[shingle] += num
            if target == 1:
                shingle_phishing_counts[shingle] += num

shingle_data = []
for shingle, count in shingle_counts.items():
    phishing_count = shingle_phishing_counts[shingle]
    phishing_ratio = phishing_count / count if count > 0 else 0
    group = int(phishing_ratio * 5)
    shingle_data.append((shingle, phishing_ratio, group))

shingle_df = pd.DataFrame(shingle_data, columns=["shingle", "phishing_ratio", "group"])

le = LabelEncoder()
shingle_df['shingle_id'] = le.fit_transform(shingle_df['shingle'])

X = shingle_df[['shingle_id', 'phishing_ratio', 'group']]
y = shingle_df['shingle'].apply(lambda x: 1 if x in shingle_phishing_counts and shingle_phishing_counts[x] > 0 else 0)

X['group'] = X['group'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['group'] = X['group'].astype('category')


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_ids = torch.tensor(X_train['shingle_id'].values, dtype=torch.long)
y_train = torch.tensor(y_train.values, dtype=torch.float)

train_data = TensorDataset(X_train_ids, y_train)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)


X_test_ids = torch.tensor(X_test['shingle_id'].values, dtype=torch.long).view(-1)
y_test = torch.tensor(y_test.values, dtype=torch.long)

test_data = TensorDataset(X_test_ids, y_test)
test_loader = DataLoader(test_data, batch_size=32)

In [4]:
class AutoencoderClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(AutoencoderClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.Linear(embedding_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, embedding_dim)
        self.classifier = nn.Linear(hidden_dim, 1)  # Для бинарной классификации

    def forward(self, x):
        embedded = self.embedding(x)  # [batch_size, embedding_dim]
        encoded = self.encoder(embedded)  # [batch_size, hidden_dim]
        reconstructed = self.decoder(encoded)  # [batch_size, embedding_dim]
        logits = self.classifier(encoded).squeeze(1)  # [batch_size]
        return logits, reconstructed


In [5]:
for batch in train_loader:
    inputs, targets = batch
    print(f"Input shape: {inputs.shape}")
    print(f"Targets shape: {targets.shape}")
    break

Input shape: torch.Size([32])
Targets shape: torch.Size([32])


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def train_autoencoder_with_embeddings(model, train_loader, test_loader, vocab_size, embedding_dim, epochs=10, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion_reconstruction = nn.MSELoss()
    criterion_classification = nn.BCEWithLogitsLoss()

    model.train()

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for epoch in range(epochs):
        total_loss = 0
        total_reconstruction_loss = 0
        total_classification_loss = 0

        for batch in train_loader:
            inputs, targets = batch

            logits, reconstructed = model(inputs)

            loss_reconstruction = criterion_reconstruction(reconstructed, model.embedding(inputs))
            loss_classification = criterion_classification(logits.squeeze(), targets.float())
            loss = loss_reconstruction + loss_classification

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_reconstruction_loss += loss_reconstruction.item()
            total_classification_loss += loss_classification.item()

        model.eval()
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for batch in test_loader:
                inputs, targets = batch
                logits, _ = model(inputs)
                predictions = torch.sigmoid(logits).squeeze() > 0.5
                all_preds.extend(predictions.cpu().numpy())
                all_targets.extend(targets.cpu().numpy())

        accuracy = accuracy_score(all_targets, all_preds)
        precision = precision_score(all_targets, all_preds, zero_division=0)
        recall = recall_score(all_targets, all_preds, zero_division=0)
        f1 = f1_score(all_targets, all_preds, zero_division=0)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

        print(f"Epoch {epoch + 1}:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}\n")

        model.train()

    return accuracies, precisions, recalls, f1_scores


In [7]:
vocab_size = len(le.classes_)
embedding_dim = 128 
hidden_dim = 64
epochs = 10 
lr = 0.001

model = AutoencoderClassifier(vocab_size, embedding_dim, hidden_dim)

In [None]:
accuracies, precisions, recalls, f1_scores = train_autoencoder_with_embeddings(
    model, train_loader, test_loader, vocab_size, embedding_dim, epochs=10, lr=0.001
)

plt.figure(figsize=(12, 10))

plt.subplot(2, 2, 1)
plt.plot(range(1, epochs + 1), accuracies, label="Accuracy", color='blue', marker='o')
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid(True)
plt.legend()

plt.subplot(2, 2, 2)
plt.plot(range(1, epochs + 1), precisions, label="Precision", color='green', marker='o')
plt.title('Precision over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Precision')
plt.grid(True)
plt.legend()

plt.subplot(2, 2, 3)
plt.plot(range(1, epochs + 1), recalls, label="Recall", color='orange', marker='o')
plt.title('Recall over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Recall')
plt.grid(True)
plt.legend()

plt.subplot(2, 2, 4)
plt.plot(range(1, epochs + 1), f1_scores, label="F1 Score", color='red', marker='o')
plt.title('F1 Score over Epochs')
plt.xlabel('Epoch')
plt.ylabel('F1 Score')
plt.grid(True)
plt.legend()

plt.tight_layout()
plt.show()
