In [None]:
# Import libraries 
import transformers as ts
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.optim import AdamW
import torch.nn as nn
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import pipeline
import matplotlib.pyplot as plt

In [None]:
# Set seed
def set_seed(seed = 42):
    """
    Seed every random number generator this notebook relies on.

    Parameters:
    - seed (int): Value used to seed Python's `random`, NumPy's legacy global
                  generator, and PyTorch (CPU and all CUDA devices).
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU rather than only the current one;
    # silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

In [None]:
# modo = 'uguale' # or 'assoluto'; 'uguale' takes the same number of examples from each class
# soglia = 0.8
# do_mapping = False
# num_epochs = 10
# data = pd.DataFrame([[]], columns=['text','labels','prob'])

# Training of the distilbert model using labels from the bart zero-shot classification in 1_step.ipynb

def _select_confident_rows(data, soglia, modo, n_labels):
    """Return the ['text', 'labels'] rows of `data` kept for fine-tuning under `modo`."""
    if modo == 'uguale':
        # A fraction (1 - soglia) of the dataset, split evenly across classes:
        # for each class id keep the `per_class` rows with the highest 'prob'.
        per_class = int(data.shape[0] * (1 - soglia) / n_labels)
        chunks = [
            data.loc[data['labels'] == class_id]
                .sort_values('prob', ascending=False)
                .head(per_class)[['text', 'labels']]
            for class_id in range(n_labels)
        ]
        return pd.concat(chunks, axis=0)
    # 'assoluto': keep every row whose probability reaches the threshold.
    return data.loc[data['prob'] >= soglia, ['text', 'labels']]


def ZS_self_lrn(data, soglia = 0.9, modo = 'assoluto',
                num_epochs = 10, batch_size = 5, do_mapping = False):

    """
    Performs zero-shot self-learning on a labeled dataset using DistilBERT.

    High-confidence rows are selected from `data` and used to fine-tune
    "distilbert/distilbert-base-uncased" as a sequence classifier
    (AdamW, lr=1e-5, cross-entropy loss).

    Parameters:
    - data (DataFrame): A pandas DataFrame with at least 'text', 'labels', and 'prob' columns.
                        Unless do_mapping is True, 'labels' is expected to already hold
                        integer class ids 0..n_labels-1.
    - soglia (float): Threshold for selecting high-confidence samples.
                      In 'assoluto' mode, samples with prob >= soglia are selected.
                      In 'uguale' mode, int(len(data) * (1 - soglia) / n_labels) samples
                      are taken per class.
    - modo (str): Selection mode.
                  'assoluto' selects samples based on absolute probability threshold.
                  'uguale' selects an equal number of top samples from each class.
    - num_epochs (int): Number of training epochs
    - batch_size (int): Training batch size
    - do_mapping (bool): If True, maps the label values to integers (most frequent
                         label -> 0, next -> 1, ...), prints the mapping, and
                         overwrites data['labels'] in place.

    Returns:
    - (model, loss_l): the fine-tuned model, switched to eval mode, and the list of
                       loss values recorded at every 30th batch of each epoch.

    Raises:
    - ValueError: if `modo` is not 'assoluto' or 'uguale', if `data` has no
                  labelled rows, or if no row survives the selection.
    """

    # Validate cheap things first so bad arguments fail before any model is loaded.
    if modo not in ('assoluto', 'uguale'):
        raise ValueError(f"modo must be 'assoluto' or 'uguale', got {modo!r}")

    # Distinct label values, most frequent first (NaN labels are not counted).
    class_values = data['labels'].value_counts().index.to_list()   # pandas
    n_labels = len(class_values)
    if n_labels == 0:
        raise ValueError("data has no labelled rows to train on")

    if do_mapping:
        mapping = {label: idx for idx, label in enumerate(class_values)}
        # Documented side effect: the caller's DataFrame is modified.
        data['labels'] = data['labels'].map(mapping)   # pandas
        print(mapping)

    dataset = _select_confident_rows(data, soglia, modo, n_labels)
    if dataset.empty:
        raise ValueError(
            f"no samples selected with modo={modo!r} and soglia={soglia}; "
            "nothing to train on")

    # Seed before the model is created and before the shuffling DataLoader draws.
    set_seed()

    # preserve_index=False keeps the pandas index from becoming an extra
    # '__index_level_0__' column in the Dataset.
    dati = Dataset.from_pandas(dataset, preserve_index=False)   # pandas

    # we use distilbert and its tokenizer as discussed in the paper
    tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased", num_labels=n_labels)

    def tokenize_function(examples):
        # Every example is padded/truncated to 512 tokens.
        return tokenizer(examples["text"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")

    tokenized_text = dati.map(tokenize_function, batched=True)
    tokenized_text = tokenized_text.remove_columns("text")
    tokenized_text.set_format("torch")
    text_loader = DataLoader(tokenized_text, batch_size=batch_size, num_workers=0, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=1e-5)
    num_training_steps = num_epochs * len(text_loader)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    f_loss = nn.CrossEntropyLoss()
    model.train()

    # training loop. we track the loss function values if the user wants to check it
    loss_l = []
    with tqdm(total=num_training_steps) as progress_bar:
        for epoch in range(num_epochs):
            for step, batch in enumerate(text_loader):
                # Separate the targets from the model inputs; named `targets`
                # so the list of class values above is not shadowed.
                targets = batch.pop('labels').to(device)
                inputs = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**inputs)
                loss = f_loss(outputs.logits, targets)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                progress_bar.update(1)

                # `step` restarts at 0 every epoch, so each epoch contributes
                # its batches 0, 30, 60, ...
                if step % 30 == 0:
                    loss_l.append(loss.item())

    # The returned model can be saved and loaded again with:
    # model.save_pretrained(your_path)
    # model = AutoModelForSequenceClassification.from_pretrained(your_path, num_labels=n_labels)

    # Leave dropout disabled so the returned model is ready for inference.
    model.eval()
    return model, loss_l


In [None]:
# Additional filter to delete submissions whose text is the string [deleted],
# since it corresponds to posts already deleted at the moment of data collection

# Input file and confidence cut-off, named here so they are easy to adjust.
DATA_PATH = r'C:\Tempor\Reddit\Pos_Neg\df_pandas.csv'
HIGH_CONFIDENCE = 0.9

# load dataset and drop the 'Unnamed: 0' column
data_pandas_prova = pd.read_csv(DATA_PATH)
data_pandas_prova = data_pandas_prova.drop(columns=['Unnamed: 0'])

# delete all records whose text is the [deleted] placeholder
data_pandas_prova = data_pandas_prova[data_pandas_prova['text'] != '[deleted]']
print('Length of dataset without [deleted] users messages: ' + str(len(data_pandas_prova)))

# count how many records have high-confidence labels and check whether the
# dataset is balanced; `>=` matches the rule ZS_self_lrn applies in 'assoluto' mode
data_pandas_check = data_pandas_prova[data_pandas_prova['prob'] >= HIGH_CONFIDENCE]
print('Length of dataset containing high-confidence texts: ' + str(len(data_pandas_check)))
val_count = data_pandas_check['labels'].value_counts()
percentage = val_count * 100 / len(data_pandas_check)

# plt.figure(figsize=(8,4))
# plt.bar(val_count.index, percentage.values)
# plt.title("Sentiment Data Distribution")

# Encode the labels as integers: 'Positive' -> 0, every other value -> 1.
# The index is reset so the series is positional, like the filtered rows it encodes.
num_labels_series = (
    (data_pandas_prova['labels'] != 'Positive')
    .astype('int64')
    .reset_index(drop=True)
)
num_labels = num_labels_series.tolist()

# Replace the string 'labels' column with the integer one (it becomes the last
# column); values are assigned by position, not by index alignment.
data_pandas = (
    data_pandas_prova
    .drop(columns=['labels'])
    .assign(labels=num_labels_series.to_numpy())
)

La lunghezza del dataset completo senza [deleted] nei testi è 25528
La lunghezza del dataset con labels sicure oltre il 90% è 6061


labels
Positive    50.371226
Negative    49.628774
Name: count, dtype: float64

In [None]:
# Fine-tune with the function's default settings. The trailing semicolon stops
# Jupyter from echoing whatever the call returns.
ZS_self_lrn(data=data_pandas);