In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/xi-not-pos/xj_pos_filtered_without_xi_val.csv
/kaggle/input/xi-not-pos/xj_pos_filtered_without_xi_train.csv
/kaggle/input/xi-not-pos/xj_pos_filtered_without_xi_test.csv
/kaggle/input/xi-completed/xj_without_xi_completed_sentences_test.csv
/kaggle/input/xi-completed/xj_without_xi_completed_sentences_val.csv
/kaggle/input/xi-completed/xj_without_xi_completed_sentences_train.csv


In [20]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, BertModel
from transformers import AdamW, get_scheduler
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from sklearn.metrics import roc_auc_score


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [22]:
train_df = pd.read_csv('/kaggle/input/xi-completed/xj_without_xi_completed_sentences_train.csv')
val_df = pd.read_csv('/kaggle/input/xi-completed/xj_without_xi_completed_sentences_val.csv')
test_df = pd.read_csv('/kaggle/input/xi-completed/xj_without_xi_completed_sentences_test.csv')


In [23]:
X_train = train_df['xi'] + " " + train_df['xj']
X_val = val_df['xi'] + " " + val_df['xj']

# Labels
y_train = train_df['LLM_model'].factorize()[0]
y_val = val_df['LLM_model'].factorize()[0]



In [24]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [25]:
def encode_data(texts, tokenizer, max_len=128):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors="pt")


In [26]:
tokenizer

DistilBertTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [27]:
encoded_train = encode_data(X_train, tokenizer, max_len=128)
encoded_val = encode_data(X_val, tokenizer, max_len=128)


In [28]:
X_train_bert = encoded_train['input_ids']
attention_mask_train = encoded_train['attention_mask']
X_val_bert = encoded_val['input_ids']
attention_mask_val = encoded_val['attention_mask']

In [29]:
class BertDataset(Dataset):
    def __init__(self, X, attention_mask, y):
        self.X = X
        self.attention_mask = attention_mask
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.attention_mask[idx], self.y[idx]


In [30]:
train_dataset_bert = BertDataset(X_train_bert, attention_mask_train, y_train)
val_dataset_bert = BertDataset(X_val_bert, attention_mask_val, y_val)
train_loader_bert = DataLoader(train_dataset_bert, batch_size=64, shuffle=True)
val_loader_bert = DataLoader(val_dataset_bert, batch_size=64)


In [31]:
class EnhancedModel(nn.Module):
    def __init__(self, num_classes):
        super(EnhancedModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')  
        self.dropout = nn.Dropout(p=0.2)  
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 512) 
        self.fc2 = nn.Linear(512, 256) 
        self.classifier = nn.Linear(256, num_classes)  

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  
        x = self.dropout(pooled_output)
        x = F.relu(self.fc1(x))  
        x = F.relu(self.fc2(x))  
        x = self.dropout(x)
        return self.classifier(x)


In [32]:
num_classes = len(set(y_train))
bert_model = EnhancedModel(num_classes).to(device)
epochs = 20  
#1e-5
optimizer = AdamW(bert_model.parameters(), lr=1e-5, weight_decay=1e-4) 
num_training_steps = len(train_loader_bert) * epochs  
num_warmup_steps = int(0.1 * num_training_steps)  
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [33]:
def train_bert_model(model, train_loader, val_loader, optimizer, scheduler):
    #patience = 5  
    epochs_no_improve = 0
    best_val_loss = float('inf')
    
    for param in model.bert.parameters():
        param.requires_grad = False


    for epoch in range(epochs):
        model.train()
        total_loss, total_correct = 0, 0

        for X_batch, attention_mask, y_batch in tqdm(train_loader):
            X_batch, attention_mask, y_batch = X_batch.to(device), attention_mask.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch, attention_mask=attention_mask)  
            loss = F.cross_entropy(outputs, y_batch)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            total_correct += (outputs.argmax(1) == y_batch).sum().item()

        accuracy = total_correct / len(train_loader.dataset)
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}, Accuracy: {accuracy}")

        # Validate
        val_loss, val_accuracy, val_auc = validate_bert_model(model, val_loader)
        print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}, AUC-ROC: {val_auc}")

# Validation loop
def validate_bert_model(model, val_loader):
    model.eval()
    total_loss, total_correct = 0, 0
    all_targets = []
    all_probs = []

    with torch.no_grad():
        for X_batch, attention_mask, y_batch in val_loader:
            X_batch, attention_mask, y_batch = X_batch.to(device), attention_mask.to(device), y_batch.to(device)
            outputs = model(X_batch, attention_mask=attention_mask)  
            loss = F.cross_entropy(outputs, y_batch)

            total_loss += loss.item()
            total_correct += (outputs.argmax(1) == y_batch).sum().item()

            all_targets.extend(y_batch.cpu().numpy())
            all_probs.extend(F.softmax(outputs, dim=1).cpu().numpy())  

    val_accuracy = total_correct / len(val_loader.dataset)
    val_auc = roc_auc_score(all_targets, all_probs, multi_class='ovr')  
    return total_loss / len(val_loader), val_accuracy, val_auc

In [34]:
train_bert_model(bert_model, train_loader_bert, val_loader_bert, optimizer, lr_scheduler)


100%|██████████| 63/63 [00:12<00:00,  5.02it/s]


Epoch 1, Loss: 1.61082422922528, Accuracy: 0.20279720279720279
Validation Loss: 1.6110219657421112, Validation Accuracy: 0.154, AUC-ROC: 0.47336999999999996


100%|██████████| 63/63 [00:11<00:00,  5.28it/s]


Epoch 2, Loss: 1.6038480845708696, Accuracy: 0.23201798201798202
Validation Loss: 1.6156504601240158, Validation Accuracy: 0.168, AUC-ROC: 0.459585


100%|██████████| 63/63 [00:12<00:00,  5.11it/s]


Epoch 3, Loss: 1.595883221853347, Accuracy: 0.2395104895104895
Validation Loss: 1.622293084859848, Validation Accuracy: 0.182, AUC-ROC: 0.4652900000000001


100%|██████████| 63/63 [00:12<00:00,  5.01it/s]


Epoch 4, Loss: 1.5880473379104856, Accuracy: 0.25924075924075923
Validation Loss: 1.6278842091560364, Validation Accuracy: 0.16, AUC-ROC: 0.46965500000000004


100%|██████████| 63/63 [00:12<00:00,  4.85it/s]


Epoch 5, Loss: 1.5800038727502974, Accuracy: 0.27972027972027974
Validation Loss: 1.6327045559883118, Validation Accuracy: 0.148, AUC-ROC: 0.472265


100%|██████████| 63/63 [00:13<00:00,  4.76it/s]


Epoch 6, Loss: 1.5751668498629616, Accuracy: 0.27522477522477523
Validation Loss: 1.638840526342392, Validation Accuracy: 0.166, AUC-ROC: 0.47404499999999994


100%|██████████| 63/63 [00:13<00:00,  4.57it/s]


Epoch 7, Loss: 1.564544573662773, Accuracy: 0.2947052947052947
Validation Loss: 1.6460947692394257, Validation Accuracy: 0.158, AUC-ROC: 0.47353500000000004


100%|██████████| 63/63 [00:14<00:00,  4.39it/s]


Epoch 8, Loss: 1.5598812349258908, Accuracy: 0.29795204795204794
Validation Loss: 1.6525430083274841, Validation Accuracy: 0.16, AUC-ROC: 0.47462


100%|██████████| 63/63 [00:13<00:00,  4.50it/s]


Epoch 9, Loss: 1.5563439282159957, Accuracy: 0.296953046953047
Validation Loss: 1.6577782928943634, Validation Accuracy: 0.16, AUC-ROC: 0.47529000000000005


100%|██████████| 63/63 [00:13<00:00,  4.62it/s]


Epoch 10, Loss: 1.5526257204631018, Accuracy: 0.3036963036963037
Validation Loss: 1.664351761341095, Validation Accuracy: 0.17, AUC-ROC: 0.47424


100%|██████████| 63/63 [00:13<00:00,  4.65it/s]


Epoch 11, Loss: 1.5469812987342713, Accuracy: 0.3031968031968032
Validation Loss: 1.6690539568662643, Validation Accuracy: 0.164, AUC-ROC: 0.47453


100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 12, Loss: 1.5454424089855618, Accuracy: 0.3011988011988012
Validation Loss: 1.6737838685512543, Validation Accuracy: 0.162, AUC-ROC: 0.47478


100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 13, Loss: 1.540925905818031, Accuracy: 0.3051948051948052
Validation Loss: 1.6761293411254883, Validation Accuracy: 0.17, AUC-ROC: 0.47491000000000005


100%|██████████| 63/63 [00:13<00:00,  4.53it/s]


Epoch 14, Loss: 1.5386627231325423, Accuracy: 0.3079420579420579
Validation Loss: 1.6801594197750092, Validation Accuracy: 0.168, AUC-ROC: 0.47506000000000004


100%|██████████| 63/63 [00:13<00:00,  4.55it/s]


Epoch 15, Loss: 1.538270694868905, Accuracy: 0.30744255744255744
Validation Loss: 1.6826021075248718, Validation Accuracy: 0.168, AUC-ROC: 0.47503


100%|██████████| 63/63 [00:13<00:00,  4.59it/s]


Epoch 16, Loss: 1.5323031970432825, Accuracy: 0.3191808191808192
Validation Loss: 1.6850963979959488, Validation Accuracy: 0.166, AUC-ROC: 0.47496


100%|██████████| 63/63 [00:13<00:00,  4.57it/s]


Epoch 17, Loss: 1.5326975773251246, Accuracy: 0.31368631368631367
Validation Loss: 1.6870922297239304, Validation Accuracy: 0.164, AUC-ROC: 0.475115


100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 18, Loss: 1.5370151410027155, Accuracy: 0.31193806193806195
Validation Loss: 1.6878706067800522, Validation Accuracy: 0.164, AUC-ROC: 0.474995


100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 19, Loss: 1.5303357215154738, Accuracy: 0.32042957042957043
Validation Loss: 1.6887610405683517, Validation Accuracy: 0.166, AUC-ROC: 0.47536500000000004


100%|██████████| 63/63 [00:13<00:00,  4.54it/s]


Epoch 20, Loss: 1.5309685601128473, Accuracy: 0.3254245754245754
Validation Loss: 1.6888775676488876, Validation Accuracy: 0.166, AUC-ROC: 0.47534


In [35]:
def prepare_test_data(test_df, tokenizer, max_len=100):
    X_test = test_df['xi'] + " " + test_df['xj']  
    y_test = test_df['LLM_model'].factorize()[0]  

    encoded_test = encode_data(X_test, tokenizer, max_len)
    X_test_bert = encoded_test['input_ids']
    attention_mask_test = encoded_test['attention_mask']

    test_dataset_bert = BertDataset(X_test_bert, attention_mask_test, y_test)
    return DataLoader(test_dataset_bert, batch_size=64), y_test  

test_loader_bert, y_test = prepare_test_data(test_df, tokenizer, max_len=100)

def evaluate_test_data(model, test_loader):
    model.eval()
    total_loss, total_correct = 0, 0
    all_targets = []
    all_probs = []

    with torch.no_grad():
        for X_batch, attention_mask, y_batch in test_loader:
            X_batch, attention_mask, y_batch = X_batch.to(device), attention_mask.to(device), y_batch.to(device)
            outputs = model(X_batch, attention_mask=attention_mask) 
            loss = F.cross_entropy(outputs, y_batch)

            total_loss += loss.item()
            total_correct += (outputs.argmax(1) == y_batch).sum().item()
            all_targets.extend(y_batch.cpu().numpy())
            all_probs.extend(F.softmax(outputs, dim=1).cpu().numpy())

    test_accuracy = total_correct / len(test_loader.dataset)
    test_loss = total_loss / len(test_loader)
    test_auc = roc_auc_score(all_targets, all_probs, multi_class='ovr')

    return test_loss, test_accuracy, test_auc

In [36]:
test_loss, test_accuracy, test_auc = evaluate_test_data(bert_model, test_loader_bert)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}, Test AUC-ROC: {test_auc}")

Test Loss: 1.6776182353496552, Test Accuracy: 0.1377245508982036, Test AUC-ROC: 0.4966096269227919
