In [5]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import DistilBertForMaskedLM
from transformers import DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score
import warnings
from sklearn.exceptions import ConvergenceWarning
import os
import torch
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [6]:
# Locate the three pickled inputs in the current working directory.
root = os.getcwd()
train_path, test_path, unlabel_path = (
    os.path.join(root, file_name)
    for file_name in ('train.pkl', 'test_input.pkl', 'unlabel.pkl')
)

# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
train_data, test_data, unlabel_data = map(
    pd.read_pickle, (train_path, test_path, unlabel_path)
)

In [7]:
# Labeled data: the unpickled object is handed to DataFrame.from_dict unchanged.
train_df = pd.DataFrame.from_dict(data=train_data)
# Test and unlabeled inputs are each wrapped in a frame with a single 'texts' column.
test_df, unlabel_df = (
    pd.DataFrame(data=raw, columns=['texts']) for raw in (test_data, unlabel_data)
)

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [8]:
def chunk_text(text, chunk_size=10):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator) and each chunk is re-joined with single spaces. The final chunk
    may be shorter than ``chunk_size``; text without words yields ``[]``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in their original order.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

In [9]:
# Flatten every unlabeled document into 10-word chunks, one chunk per row.
chunked_texts = [
    chunk
    for document in unlabel_df['texts']
    for chunk in chunk_text(document, chunk_size=10)
]
chunked_texts_df = pd.DataFrame(data=chunked_texts, columns=['texts'])

In [10]:
# Persist the chunked corpus (row index included) for inspection outside the notebook.
chunked_texts_df.to_csv(path_or_buf='unlabel.csv', index=True)

In [7]:
# Tokenizer and both model variants start from the same 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Masked-LM variant, used for pretraining on the unlabeled chunks.
mlm_model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased').to(device)

# 3-way sequence classifier; its classification head is newly initialised at load time.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Tokenize every chunk up front: truncate to at most 128 tokens and pad to a common length.
unlabel_texts = chunked_texts_df['texts'].tolist()
unlabel_encodings = tokenizer(unlabel_texts, truncation=True, padding=True, max_length=128)

# Step 1: Pretrain a Language Model on Unlabeled Data
class UnlabeledDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized encodings that carry no labels.

    ``encodings`` is a mapping from field name to a per-sample sequence; it must
    contain an ``'input_ids'`` entry, which determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One sample per tokenized sequence.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Convert the idx-th entry of every field into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [9]:
# Wrap the tokenized chunks and let the collator mask 15% of the tokens per batch.
unlabel_dataset = UnlabeledDataset(unlabel_encodings)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Masked-LM pretraining configuration; mixed precision is enabled only when CUDA is available.
training_args = TrainingArguments(
    output_dir="./mlm_pretrain_results",
    overwrite_output_dir=True,
    logging_dir='./logs',
    num_train_epochs=100,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    dataloader_num_workers=2,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=mlm_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=unlabel_dataset,
    tokenizer=tokenizer,
)

In [None]:
print("Pretraining model on unlabeled data...")
trainer.train()

# The classifier `model` was loaded from the stock checkpoint, so without this
# step the masked-LM pretraining above would never reach it. Both model classes
# wrap the same DistilBertModel under `.distilbert`, so the adapted encoder
# weights can be copied across directly; the classification head is untouched.
model.distilbert.load_state_dict(mlm_model.distilbert.state_dict())

Pretraining model on unlabeled data...


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchunchunn[0m ([33mchunchunn-viettel[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/160100 [00:00<?, ?it/s]

In [None]:
# Step 2: Pseudolabeling
# `model` was loaded with a freshly initialised classification head (see the
# "newly initialized" warning printed when it was created), so its predictions
# are meaningless until it has seen labeled data. Fit it on the labeled training
# set first, then use it as the teacher that labels the unlabeled chunks.
PSEUDO_LABEL_THRESHOLD = 0.8   # keep only predictions at least this confident
PSEUDO_LABEL_BATCH_SIZE = 64   # number of chunks scored per forward pass
WARMUP_EPOCHS = 3              # short supervised pass before labeling; tune as needed

# Assumes train_df has 'texts' and integer-valued 'labels' columns, which the
# later combine step relies on as well.
warmup_encodings = tokenizer(train_df['texts'].tolist(), truncation=True, padding=True, max_length=128)
warmup_dataset = [
    {
        'input_ids': torch.tensor(input_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(int(label)),
    }
    for input_ids, attention_mask, label in zip(
        warmup_encodings['input_ids'],
        warmup_encodings['attention_mask'],
        train_df['labels'].tolist(),
    )
]
warmup_trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./warmup_results",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        num_train_epochs=WARMUP_EPOCHS,
        save_strategy="no",
        fp16=torch.cuda.is_available(),
    ),
    train_dataset=warmup_dataset,
    tokenizer=tokenizer,
)
print("Warm-up fine-tuning on labeled data before pseudolabeling...")
warmup_trainer.train()

# Score the unlabeled chunks in batches rather than one text per forward pass.
unlabeled_texts = chunked_texts_df['texts'].tolist()
pseudo_labels = []
model.eval()
for start in tqdm(range(0, len(unlabeled_texts), PSEUDO_LABEL_BATCH_SIZE), desc="Generating Pseudolabels"):
    batch_texts = unlabeled_texts[start:start + PSEUDO_LABEL_BATCH_SIZE]
    inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        probs = torch.softmax(model(**inputs).logits, dim=1)
    confidences, pred_labels = probs.max(dim=1)
    # None marks a low-confidence prediction that is dropped below.
    pseudo_labels.extend(
        label if confidence > PSEUDO_LABEL_THRESHOLD else None
        for confidence, label in zip(confidences.tolist(), pred_labels.tolist())
    )

# Filter out None pseudolabels
pseudo_data = (
    chunked_texts_df
    .assign(labels=pseudo_labels)
    .dropna(subset=['labels'])
    .reset_index(drop=True)
)
# The None placeholders force a float/object column; restore integer class ids.
pseudo_data['labels'] = pseudo_data['labels'].astype(int)

In [None]:
# Step 3: Combine Pseudo-labeled Data with Training Data
model.to(device)

# Stack the labeled rows and the pseudo-labeled rows under a fresh 0..n-1 index.
combined_df = pd.concat([train_df, pseudo_data], ignore_index=True)
labels = combined_df['labels'].tolist()
combined_texts = combined_df['texts'].tolist()
train_encodings = tokenizer(combined_texts, truncation=True, padding=True, max_length=128)

class CombinedDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with one label per sample.

    ``encodings`` maps field names to per-sample sequences; ``labels`` is a
    parallel sequence whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label list, not the encodings, determines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensorize the idx-th entry of every encoding field, then attach its label.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Training set for the final fine-tune: labeled rows plus confident pseudo-labeled rows.
combined_dataset = CombinedDataset(encodings=train_encodings, labels=labels)

In [None]:
# Fine-tuning configuration for the classifier.
# `evaluation_strategy="no"` was removed: the keyword is deprecated in recent
# transformers releases (renamed `eval_strategy`) and rejected by newer ones,
# while "no" is already the default, so omitting it keeps the same behaviour on
# every version.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=50,
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
    dataloader_num_workers=2,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

# No eval dataset is supplied; training runs on the combined set only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    tokenizer=tokenizer
)

print("Fine-tuning on combined labeled and pseudo-labeled data...")
trainer.train()

In [None]:
from torch.utils.data import DataLoader
print("Running inference on test data...")

# Tokenize the test texts with the same settings as the training texts. The zero
# labels are placeholders required by CombinedDataset; they are not predictions.
test_encodings = tokenizer(test_df['texts'].tolist(), truncation=True, padding=True, max_length=128)
test_dataset = CombinedDataset(test_encodings, [0] * len(test_df))
model.to(device)
model.eval()
predictions = []

test_loader = DataLoader(test_dataset, batch_size=16)
with torch.no_grad():
    for batch in test_loader:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # Highest-scoring class per row, collected back on the CPU.
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
test_df['labels'] = predictions

print("Inference completed. Predictions added to the test dataframe.")


In [None]:
# One line per test example: the text in single quotes, a space, then the predicted label.
# The encoding is set explicitly so that non-ASCII text is written identically on
# every platform instead of depending on the locale's default encoding.
with open('predictions.txt', 'w', encoding='utf-8') as f:
    # Iterate the two columns directly; this avoids building a Series per row.
    for tokens, predicted_label in zip(test_df['texts'], test_df['labels']):
        f.write(f"'{tokens}' {predicted_label}\n")

print("Predictions have been saved to predictions.txt")
