<a href="https://colab.research.google.com/github/neethanm/Knowledge-Distillation/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install indic_transliteration
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
import regex as re
import torch
import pandas as pd


from transformers import XLMRobertaForSequenceClassification, Trainer, TrainingArguments, XLMRobertaTokenizer, DistilBertTokenizer,DataCollatorWithPadding
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
def transliterate_text(text, from_lang='hk', to_lang='kannada'):
    # hk - harvard kyoto romanization scheme
    return transliterate(text, from_lang, to_lang)

# def is_kannada_script(text):
# # Regular expression to match Kannada characters and variants
#     kannada_pattern = re.compile(r'[\p{Script=Kannada}\p{Script=Kannada_Extensions}\p{Script=Kannada_Supplement}]+', re.UNICODE)
#     return bool(kannada_pattern.search(text))

from sklearn.preprocessing import LabelEncoder

# Encode the sentiment labels
def encode_sentiments(df):
    le = LabelEncoder()
    df['sentiment_encoded'] = le.fit_transform(df['Sentiment'])
    return df, le

In [5]:
df_train = pd.read_csv("gdrive/My Drive/Colab Notebooks/kannada_sentiment_train.csv")
final_train_arr = []

for index, row in df_train.iterrows():
    # Convert the row to a string and then print
    row_str = ', '.join(map(str, row))
    row_str.strip();
    arr = row_str.split(";");
    arr[0] = arr[0].strip()

    arr[1] = arr[1].strip()
    arr[0] = re.sub(r'[^\w\s]','',transliterate_text(arr[0])) # removing punctuation and transliteration
    final_train_arr.append([arr[0], arr[1]])

final_train_df = pd.DataFrame(final_train_arr, columns=['Reviews', 'Sentiment'])
final_train_df, label_encoder = encode_sentiments(final_train_df)

In [6]:
final_train_df['sentiment_encoded'] -= 1
invalid_labels = final_train_df[final_train_df['sentiment_encoded']<0]
print(invalid_labels)
final_train_df = final_train_df[final_train_df['sentiment_encoded']>=0]
final_train_df.head()

             Reviews Sentiment  sentiment_encoded
391  Fಲರೇ ಗುನ್ ಅತ್ ೧        30                 -1


Unnamed: 0,Reviews,Sentiment,sentiment_encoded
0,Bರೋ ನಿನ್ ಏಲ್ಲೋ ಹೋಗ್ಬುತ್ತೇ ಬೇದು ಬ್ರೋ,Positive,2
1,ಪೃಥ್ವಿ ಅಂಬರ್ ಏನ್ ಗುರೂ ನೀನು ನತಿಓನಲ್ ಅwಅರ್ದ್ wಇ...,Positive,2
2,ಸರ್ ಅಶ್ವತ್ಥಾಮ ಆ ವೈದ್ಯರ ಮನೆಗೆ ಬರುತ್ತಿದ್ದ ಅಂತ ಹೇ...,unknown_state,4
3,ಬೇಸ್ತ್ ಸೋನ್ಗ್ ಏವೇರ್,not-Kannada,3
4,ಷ್wಅಲ್ಪ ಜಸ್ತಿ ಬಿಲ್ದುಪ್ ಅಯಿತು ಅನ್ನಿಸುತ್ತೇ,Negative,1


In [7]:
df_test = pd.read_csv("gdrive/My Drive/Colab Notebooks/kannada_sentiment_test.csv")
final_test_arr = []

for index, row in df_test.iterrows():
    # Convert the row to a string and then print
    row_str = ', '.join(map(str, row))
    row_str.strip();
    arr = row_str.split(";");
    arr[0] = arr[0].strip()

    arr[1] = arr[1].strip()
    arr[0] = re.sub(r'[^\w\s]','',transliterate_text(arr[0])) # removing punctuation and transliteration
    final_test_arr.append([arr[0], arr[1]])

final_test_df = pd.DataFrame(final_test_arr, columns=['Reviews', 'Sentiment'])
final_test_df, label_encoder = encode_sentiments(final_test_df)

In [8]:
final_test_df.head()

Unnamed: 0,Reviews,Sentiment,sentiment_encoded
0,೧೦ ಋ ರೇಪೋರ್ತ್ ಮದಿ ಅವ್ನ,unknown_state,4
1,ಈ ಚಿತ್ರ ರಿ ರಿಲೀಸ್ ಆಗ್ಬೇಕು ಈ ಚಿತ್ರಕ್ಕೆ ಗೌರವ ...,Positive,2
2,ಂಅತ್ತೇ ಚೋಮ್ಮೇನ್ತ್ ಯರು ಕೇತ್ತದಗಿ ಮಅದ್ಬರ್ದನ್ಥ ಚೋಮ...,Negative,1
3,ನೀನು ಒಳ್ಳೆಯದೇ ಮಾತನಾಡಿದಿಯಾ ಗುರು ನಿನ್ನಗೆ ಧನ್ಯವಾದ...,Positive,2
4,ನಮ್ಮ ಕಾಮೆಂಟ್ ನೋಡಿ ಓದಿದವರ ಸಮಸ್ಯೆಗಳು ಬೇಗ ಪರಿಹಾರವ...,Mixed_feelings,0


In [9]:
def tokenize_reviews(df, tokenizer):
    return tokenizer(df['Reviews'].tolist(), padding="max_length", truncation=True, return_tensors='pt')

In [10]:
# Initialize the tokenizers
xlmr_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize reviews
train_encodings = tokenize_reviews(final_train_df, xlmr_tokenizer)
test_encodings = tokenize_reviews(final_test_df, xlmr_tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
# Debugging prints
print(final_train_df.head())
print(train_encodings['input_ids'][:5])
print(test_encodings['input_ids'][:5])

                                             Reviews      Sentiment  \
0                Bರೋ ನಿನ್ ಏಲ್ಲೋ ಹೋಗ್ಬುತ್ತೇ ಬೇದು ಬ್ರೋ       Positive   
1  ಪೃಥ್ವಿ ಅಂಬರ್ ಏನ್  ಗುರೂ ನೀನು ನತಿಓನಲ್ ಅwಅರ್ದ್ wಇ...       Positive   
2  ಸರ್ ಅಶ್ವತ್ಥಾಮ ಆ ವೈದ್ಯರ ಮನೆಗೆ ಬರುತ್ತಿದ್ದ ಅಂತ ಹೇ...  unknown_state   
3                                ಬೇಸ್ತ್ ಸೋನ್ಗ್ ಏವೇರ್    not-Kannada   
4           ಷ್wಅಲ್ಪ ಜಸ್ತಿ ಬಿಲ್ದುಪ್ ಅಯಿತು ಅನ್ನಿಸುತ್ತೇ       Negative   

   sentiment_encoded  
0                  2  
1                  2  
2                  4  
3                  3  
4                  1  
tensor([[     0,    335,  21924,  ...,      1,      1,      1],
        [     0,  16125,  61718,  ...,      1,      1,      1],
        [     0, 107614,   3349,  ...,      1,      1,      1],
        [     0,  70722,   3022,  ...,      1,      1,      1],
        [     0,      6,  33887,  ...,      1,      1,      1]])
tensor([[     0,  62626,  16100,  ...,      1,      1,      1],
        [     0,   2498,  16738,  ...,      1,    

In [12]:
# Validate token ids for DistilBert
max_vocab_size = distilbert_tokenizer.vocab_size - 1

def validate_token_ids(encodings, max_id):
    for key in encodings:
        encodings[key] = torch.where(encodings[key] > max_id, torch.tensor(max_id), encodings[key])
    return encodings

train_encodings = validate_token_ids(train_encodings, max_vocab_size)
test_encodings = validate_token_ids(test_encodings, max_vocab_size)

# Print after validation
print(train_encodings['input_ids'][:5])
print(test_encodings['input_ids'][:5])

tensor([[    0,   335, 21924,  ...,     1,     1,     1],
        [    0, 16125, 30521,  ...,     1,     1,     1],
        [    0, 30521,  3349,  ...,     1,     1,     1],
        [    0, 30521,  3022,  ...,     1,     1,     1],
        [    0,     6, 30521,  ...,     1,     1,     1]])
tensor([[    0, 30521, 16100,  ...,     1,     1,     1],
        [    0,  2498, 16738,  ...,     1,     1,     1],
        [    0,     6,  6598,  ...,     1,     1,     1],
        [    0, 30521, 30521,  ...,     1,     1,     1],
        [    0, 14765, 30521,  ...,     1,     1,     1]])


In [13]:
import torch
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, soft_targets=None):
        self.encodings = encodings
        self.labels = labels
        self.soft_targets = soft_targets

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        if self.soft_targets is not None:
            item['soft_targets'] = torch.tensor(self.soft_targets[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
train_dataset = SentimentDataset(train_encodings, final_train_df['sentiment_encoded'].tolist())
test_dataset = SentimentDataset(test_encodings, final_test_df['sentiment_encoded'].tolist())

In [15]:
print(train_dataset)
print(test_dataset)

<__main__.SentimentDataset object at 0x79e8c1a7b4c0>
<__main__.SentimentDataset object at 0x79e8c1a7bee0>


In [16]:
num_labels = len(label_encoder.classes_)
print(num_labels)
print(label_encoder.classes_)

5
['Mixed_feelings' 'Negative' 'Positive' 'not-Kannada' 'unknown_state']


In [17]:
# Initialize the teacher model
teacher_model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)
teacher_model.gradient_checkpointing_enable()
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./teacher_results',
    fp16 = True, # Enables 16-bit (half-precision) training, which can speed up training and reduce memory usage.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Initialize the Trainer
trainer = Trainer(
    model=teacher_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [19]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,1.4372,1.425511
2,1.428,1.434991
3,1.4207,1.42836


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2301, training_loss=1.428535869255215, metrics={'train_runtime': 1220.7794, 'train_samples_per_second': 15.074, 'train_steps_per_second': 1.885, 'total_flos': 4841900057622528.0, 'train_loss': 1.428535869255215, 'epoch': 3.0})

In [34]:
teacher_metrics = trainer.evaluate()
print(teacher_metrics)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 1.4283602237701416, 'eval_runtime': 9.2338, 'eval_samples_per_second': 83.064, 'eval_steps_per_second': 10.397, 'epoch': 3.0}


In [20]:
# generating soft targets
import torch
from torch.utils.data import DataLoader

# Create a DataLoader for the training dataset
train_loader = DataLoader(train_dataset, batch_size=8)

# Put the teacher model in evaluation mode
teacher_model.eval()

soft_targets = []

# Generate soft targets
for batch in train_loader:
    inputs = {key: val.to(teacher_model.device) for key, val in batch.items() if key != 'labels'}
    with torch.no_grad():
        outputs = teacher_model(**inputs)
    logits = outputs.logits
    soft_targets.append(torch.nn.functional.softmax(logits, dim=-1))

# Concatenate the soft targets
soft_targets = torch.cat(soft_targets)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [21]:
# Custom trainer for Distillation
from transformers import DistilBertForSequenceClassification, DataCollatorWithPadding, Trainer

# Define a custom loss function for knowledge distillation
def distillation_loss(student_logits, teacher_logits, true_labels, alpha=0.5, temperature=2.0):
    # Cross-entropy loss with soft targets
    soft_loss = torch.nn.functional.kl_div(
        input=torch.nn.functional.log_softmax(student_logits / temperature, dim=-1),
        target=torch.nn.functional.softmax(teacher_logits / temperature, dim=-1),
        reduction='batchmean'
    ) * (temperature ** 2)

    # Cross-entropy loss with hard targets
    hard_loss = torch.nn.functional.cross_entropy(student_logits, true_labels)

    # Combine the two losses
    return alpha * soft_loss + (1 - alpha) * hard_loss

# Custom Trainer with distillation loss
class DistillationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # Soft targets should be included in the inputs
        teacher_logits = inputs.pop("soft_targets", None)

        # Model output
        outputs = model(**inputs)
        student_logits = outputs.logits

        # Compute distillation loss
        if teacher_logits is not None:
            loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
            loss = loss_fct(
                torch.nn.functional.log_softmax(student_logits, dim=-1),
                torch.nn.functional.softmax(teacher_logits, dim=-1)
            )
        else:
            loss = torch.nn.functional.cross_entropy(student_logits, labels)

        return (loss, outputs) if return_outputs else loss


In [26]:
# using optuna for hyperparameter tuning
!pip install optuna
import optuna
from transformers import TrainingArguments
print(soft_targets)

# Define the objective function for Optuna
def objective(trial):
    training_args = TrainingArguments(
        output_dir='./student_results',
        num_train_epochs=trial.suggest_int('num_train_epochs', 2, 4),
        per_device_train_batch_size=trial.suggest_categorical('per_device_train_batch_size', [8, 16, 32]),
        learning_rate=trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
        weight_decay=trial.suggest_float('weight_decay', 0.0, 0.3),
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir='./logs',
        logging_steps=10,
        disable_tqdm=False,
    )

    # Initialize the student model
    student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    # Initialize the student trainer
    student_trainer = DistillationTrainer(
        model=student_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=DataCollatorWithPadding(tokenizer=xlmr_tokenizer),
    )

    # Train the student model
    student_trainer.train()

    # Evaluate the student model
    metrics = student_trainer.evaluate()
    eval_accuracy = metrics.get('eval_accuracy', 0.0)
    print(f"Trial {trial.number}: eval_accuracy = {eval_accuracy}")

    return eval_accuracy

# Create the study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)




[I 2024-07-31 18:33:08,201] A new study created in memory with name: no-name-94166e39-ac20-4b92-b387-948316d7ec77


tensor([[0.0798, 0.1371, 0.4915, 0.1743, 0.1173],
        [0.0798, 0.1371, 0.4915, 0.1743, 0.1173],
        [0.0798, 0.1371, 0.4915, 0.1743, 0.1173],
        ...,
        [0.0798, 0.1371, 0.4915, 0.1743, 0.1173],
        [0.0797, 0.1371, 0.4916, 0.1743, 0.1173],
        [0.0797, 0.1371, 0.4916, 0.1743, 0.1173]], device='cuda:0')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,1.1941,1.206057
2,1.0275,1.161718
3,1.0751,1.173511


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[I 2024-07-31 18:39:55,657] Trial 0 finished with value: 0.0 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'learning_rate': 4.8679589930000486e-05, 'weight_decay': 0.1621103540375082}. Best is trial 0 with value: 0.0.


Trial 0: eval_accuracy = 0.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,1.1798,1.216031
2,1.1043,1.179709
3,1.1023,1.144197


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[I 2024-07-31 18:45:43,659] Trial 1 finished with value: 0.0 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'learning_rate': 2.2933436955260507e-05, 'weight_decay': 0.07761419086981353}. Best is trial 0 with value: 0.0.


Trial 1: eval_accuracy = 0.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,1.173,1.213091
2,1.1265,1.175756


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[I 2024-07-31 18:49:51,808] Trial 2 finished with value: 0.0 and parameters: {'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'learning_rate': 1.3239341070028873e-05, 'weight_decay': 0.2787053703050778}. Best is trial 0 with value: 0.0.


Trial 2: eval_accuracy = 0.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,1.1883,1.225573
2,1.1511,1.186128
3,1.1255,1.16434


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[I 2024-07-31 18:55:45,805] Trial 3 finished with value: 0.0 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'learning_rate': 1.590663431894074e-05, 'weight_decay': 0.20137201804520344}. Best is trial 0 with value: 0.0.


Trial 3: eval_accuracy = 0.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,1.1487,1.201258
2,1.1222,1.195758
3,1.1005,1.143459


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[I 2024-07-31 19:01:45,714] Trial 4 finished with value: 0.0 and parameters: {'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'learning_rate': 1.772610950704358e-05, 'weight_decay': 0.021715811047876833}. Best is trial 0 with value: 0.0.


Trial 4: eval_accuracy = 0.0
Best hyperparameters:  {'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'learning_rate': 4.8679589930000486e-05, 'weight_decay': 0.1621103540375082}


In [35]:
from transformers import TrainingArguments, DistilBertForSequenceClassification, DataCollatorWithPadding

from sklearn.metrics import accuracy_score

# Define the function to compute accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Initialize the training arguments
training_args = TrainingArguments(
    output_dir='./student_results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=3e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    disable_tqdm=False,
)

# Initialize the student model
student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Create the dataset with soft targets included
student_train_dataset = SentimentDataset(train_dataset.encodings, train_dataset.labels, soft_targets=soft_targets)

# Initialize the student trainer
student_trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=student_train_dataset,
    eval_dataset=test_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=xlmr_tokenizer),
    compute_metrics = compute_metrics
)

# Train the student model
student_trainer.train()

# Evaluate the student model
metrics = student_trainer.evaluate()

# Extract and print the accuracy from the metrics
accuracy = metrics.get('eval_accuracy')
print(f"Evaluation accuracy: {accuracy * 100:.2f}%")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['soft_targets'] = torch.tensor(self.soft_targets[idx])


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1675,1.209884,0.521512
2,1.1093,1.178737,0.552803
3,1.0934,1.135193,0.560626


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['soft_targets'] = torch.tensor(self.soft_targets[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['soft_targets'] = torch.tensor(self.soft_targets[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation accuracy: 56.06%


In [37]:
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Define a dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Label encoding
label_encoder = LabelEncoder()
final_train_df['sentiment_encoded'] = label_encoder.fit_transform(final_train_df['Sentiment'])
final_test_df['sentiment_encoded'] = label_encoder.transform(final_test_df['Sentiment'])

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create datasets
train_dataset = SentimentDataset(final_train_df['Reviews'].tolist(), final_train_df['sentiment_encoded'].tolist(), tokenizer)
test_dataset = SentimentDataset(final_test_df['Reviews'].tolist(), final_test_df['sentiment_encoded'].tolist(), tokenizer)

# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increase the number of epochs if needed
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,  # Default learning rate for BERT
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    disable_tqdm=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train and evaluate
trainer.train()
metrics = trainer.evaluate()

# Print accuracy
print("DistilBERT Evaluation Accuracy: {:.2f}%".format(metrics["eval_accuracy"] * 100))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2895,1.32713,0.47588
2,1.2788,1.32203,0.479791
3,1.3276,1.317142,0.483703
4,1.3395,1.311742,0.48631
5,1.1464,1.310373,0.485007


DistilBERT Evaluation Accuracy: 48.50%
