In [2]:
from backend import config
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer




In [3]:
# Load the CSV email dataset
data_frame = pd.read_csv(os.path.join(config.DATADIR, 'dataset', 'raw', 'sample_1000_email_dataset.csv'))
data_frame.head(5)

Unnamed: 0,subject,messages,label
0,Webinar Confirmation: GenAI: The Shift to Data...,"Hi Nilay,\n\nThank you for registering for Gen...",2
1,Invitation: Connor <> Nilay - building RAG job...,"When Wednesday Aug 21, 2024 ⋅ 11am – 11:45am (...",2
2,Thank you for applying to Johns Hopkins APL!,"Dear abc,\n\nThank you for applying to the Joh...",0
3,Garner Health Application Follow Up,"Hi Nilay,\n\nThank you for applying to our Dat...",3
4,Thank you for applying to Perplexity AI,"Hi Nilay,\n\nThank you for applying to the AI ...",3


In [4]:
# Count of each label
data_frame['messages'].groupby(data_frame['label']).count()

label
0    243
1    233
2    257
3    267
Name: messages, dtype: int64

In [5]:
# Download the essential NLTK files for stopwords, punctuations, and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download necessary resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nilay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nilay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nilay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nilay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function for removing stopwords, lemmaztize and tokenize the data
def clean_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())  # Convert to lowercase
    # Remove stopwords and punctuation, then lemmatize
    cleaned_tokens = [
        lemmatizer.lemmatize(word) for word in tokens
        if word not in stop_words and word not in string.punctuation
    ]
    return ' '.join(cleaned_tokens)

In [7]:
# Creating a copy of the original data frame for a separate tokanized data frame
data_frame_tokanized = data_frame.copy()
data_frame_tokanized.head(3)

Unnamed: 0,subject,messages,label
0,Webinar Confirmation: GenAI: The Shift to Data...,"Hi Nilay,\n\nThank you for registering for Gen...",2
1,Invitation: Connor <> Nilay - building RAG job...,"When Wednesday Aug 21, 2024 ⋅ 11am – 11:45am (...",2
2,Thank you for applying to Johns Hopkins APL!,"Dear abc,\n\nThank you for applying to the Joh...",0


In [8]:
# Apply the function to the 'subject' and 'messages' columns and store in 'tokenized_subject' and 'tokenized_messages'
data_frame_tokanized['tokenized_subject'] = data_frame_tokanized['subject'].apply(clean_text)
data_frame_tokanized['tokenized_messages'] = data_frame_tokanized['messages'].apply(clean_text)
data_frame_tokanized = data_frame_tokanized.drop(['subject', 'messages'], axis=1)
data_frame_tokanized.head(5)

Unnamed: 0,label,tokenized_subject,tokenized_messages
0,2,webinar confirmation genai shift data intellig...,hi nilay thank registering genai shift data in...
1,2,invitation connor nilay building rag job chatb...,wednesday aug 21 2024 ⋅ 11am – 11:45am pacific...
2,0,thank applying john hopkins apl,dear abc thank applying john hopkins applied p...
3,3,garner health application follow,hi nilay thank applying data analyst role ’ re...
4,3,thank applying perplexity ai,hi nilay thank applying ai software engineer r...


In [9]:
# Concatenate the tokenized_subject and tokenized_messages
data_frame_tokanized['email_content'] = data_frame_tokanized['tokenized_subject'] + data_frame_tokanized['tokenized_messages']
data_frame_tokanized.head(3)

Unnamed: 0,label,tokenized_subject,tokenized_messages,email_content
0,2,webinar confirmation genai shift data intellig...,hi nilay thank registering genai shift data in...,webinar confirmation genai shift data intellig...
1,2,invitation connor nilay building rag job chatb...,wednesday aug 21 2024 ⋅ 11am – 11:45am pacific...,invitation connor nilay building rag job chatb...
2,0,thank applying john hopkins apl,dear abc thank applying john hopkins applied p...,thank applying john hopkins apldear abc thank ...


In [10]:
# Using embed model, convert tokenized frame into embeddings
model_embedding = SentenceTransformer('all-MiniLM-L6-v2')
data_frame_tokanized['email_embeddings'] = data_frame_tokanized['email_content'].apply(model_embedding.encode)
data_frame_tokanized.head(3)

Unnamed: 0,label,tokenized_subject,tokenized_messages,email_content,email_embeddings
0,2,webinar confirmation genai shift data intellig...,hi nilay thank registering genai shift data in...,webinar confirmation genai shift data intellig...,"[-0.068945006, -0.0932267, -0.00037362284, -0...."
1,2,invitation connor nilay building rag job chatb...,wednesday aug 21 2024 ⋅ 11am – 11:45am pacific...,invitation connor nilay building rag job chatb...,"[-0.10992242, 0.008053303, 0.09707958, -0.0057..."
2,0,thank applying john hopkins apl,dear abc thank applying john hopkins applied p...,thank applying john hopkins apldear abc thank ...,"[-0.09812444, 0.028659057, 0.0556533, -0.02935..."


In [28]:
data_frame_embeddings = data_frame_tokanized.copy()
data_frame_embeddings = data_frame_embeddings.drop(['tokenized_subject', 'tokenized_messages', 'email_content'], axis=1)
data_frame_embeddings.head(3)

Unnamed: 0,label,email_embeddings
0,2,"[-0.068945006, -0.0932267, -0.00037362284, -0...."
1,2,"[-0.10992242, 0.008053303, 0.09707958, -0.0057..."
2,0,"[-0.09812444, 0.028659057, 0.0556533, -0.02935..."


In [12]:
# Install required libraries for model fine-tuning
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [29]:
# Convert the embeddings to tensor format
data_frame_embeddings['email_embeddings'] = data_frame_embeddings['email_embeddings'].apply(lambda x: torch.tensor(x))
data_frame_embeddings.head(1)

Unnamed: 0,label,email_embeddings
0,2,"[tensor(-0.0689), tensor(-0.0932), tensor(-0.0..."


In [30]:
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(data_frame_embeddings, test_size=0.2, random_state=42)

In [31]:
# Custom Dataset class to load the data into the format required by transformers
class EmailDataset(Dataset):
    def __init__(self, df, device=None):
        self.input_ids = df['email_embeddings'].tolist()
        self.labels = df['label'].tolist()
        self.device = device

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
        labels_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        if self.device:
            input_ids_tensor = input_ids_tensor.to(self.device)
            labels_tensor = labels_tensor.to(self.device)
        return {'input_ids': input_ids_tensor, 'labels': labels_tensor}

# Load the dataset
train_dataset = EmailDataset(train_df, device='cpu')  # Pass device as needed
val_dataset = EmailDataset(val_df, device='cpu')

In [16]:
# Load the DistilBertForSequenceClassification model 
from transformers import DistilBertTokenizer

# Load the tokenizer and model
tokenizer_DBSC = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_DBSC = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Defining the training arguments for fine-tuning of the DBSC model.
training_args = TrainingArguments(
    output_dir=os.path.join(config.DATADIR, 'ml_models'),          # Output directory for saving model
    num_train_epochs=5,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir=os.path.join(config.DATADIR, 'ml_models', 'logs'),            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluation strategy
    save_strategy="epoch",           # Save strategy
    load_best_model_at_end=True,     # Load the best model when finished training
)



In [33]:
from transformers import Trainer, TrainingArguments

trainer = Trainer(
    model=model_DBSC,                         # The model to train
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Validation dataset
    tokenizer=tokenizer_DBSC                  # Tokenizer
)

  trainer = Trainer(


In [34]:
# Fine-tune the model
trainer.train()

  0%|          | 0/500 [23:29<?, ?it/s]
  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
                                                  
  2%|▏         | 10/500 [03:32<2:30:48, 18.47s/it]

{'loss': 1.3914, 'grad_norm': 3.272550582885742, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.1}


                                                  
  4%|▍         | 20/500 [06:11<2:07:54, 15.99s/it]

{'loss': 1.3895, 'grad_norm': 2.399317502975464, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.2}


                                                  
  6%|▌         | 30/500 [08:49<2:03:25, 15.76s/it]

{'loss': 1.3839, 'grad_norm': 2.3556902408599854, 'learning_rate': 3e-06, 'epoch': 0.3}


                                                  
  8%|▊         | 40/500 [11:24<1:59:19, 15.56s/it]

{'loss': 1.3845, 'grad_norm': 2.388061046600342, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.4}


                                                  
 10%|█         | 50/500 [14:12<2:03:45, 16.50s/it]

{'loss': 1.3954, 'grad_norm': 2.360504627227783, 'learning_rate': 5e-06, 'epoch': 0.5}


                                                  
 12%|█▏        | 60/500 [17:01<2:08:49, 17.57s/it]

{'loss': 1.3839, 'grad_norm': 2.3211779594421387, 'learning_rate': 6e-06, 'epoch': 0.6}


                                                  
 14%|█▍        | 70/500 [19:51<1:53:55, 15.90s/it]

{'loss': 1.3779, 'grad_norm': 1.8253506422042847, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.7}


                                                  
 16%|█▌        | 80/500 [22:20<1:44:00, 14.86s/it]

{'loss': 1.3891, 'grad_norm': 3.7629635334014893, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.8}


                                                  
 18%|█▊        | 90/500 [25:05<1:59:11, 17.44s/it]

{'loss': 1.395, 'grad_norm': 4.3131256103515625, 'learning_rate': 9e-06, 'epoch': 0.9}


                                                   
 20%|██        | 100/500 [27:41<1:39:41, 14.95s/it]

{'loss': 1.3811, 'grad_norm': 2.844799757003784, 'learning_rate': 1e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                    

 20%|██        | 100/500 [29:33<1:39:41, 14.95s/it]
[A
[A

{'eval_loss': 1.3989975452423096, 'eval_runtime': 112.3452, 'eval_samples_per_second': 1.78, 'eval_steps_per_second': 0.116, 'epoch': 1.0}


  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
                                                   
 22%|██▏       | 110/500 [32:09<1:45:22, 16.21s/it]

{'loss': 1.3914, 'grad_norm': 1.7851414680480957, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.1}


                                                   
 24%|██▍       | 120/500 [34:41<1:32:46, 14.65s/it]

{'loss': 1.3835, 'grad_norm': 1.5576601028442383, 'learning_rate': 1.2e-05, 'epoch': 1.2}


                                                   
 26%|██▌       | 130/500 [37:09<1:30:14, 14.63s/it]

{'loss': 1.399, 'grad_norm': 2.065403461456299, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.3}


                                                   
 28%|██▊       | 140/500 [39:41<1:29:39, 14.94s/it]

{'loss': 1.3846, 'grad_norm': 1.287466049194336, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.4}


                                                   
 30%|███       | 150/500 [42:11<1:26:40, 14.86s/it]

{'loss': 1.392, 'grad_norm': 2.494753360748291, 'learning_rate': 1.5e-05, 'epoch': 1.5}


                                                   
 32%|███▏      | 160/500 [44:46<1:25:14, 15.04s/it]

{'loss': 1.4045, 'grad_norm': 1.2497637271881104, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.6}


                                                   
 34%|███▍      | 170/500 [47:16<1:21:20, 14.79s/it]

{'loss': 1.4036, 'grad_norm': 2.271383762359619, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.7}


                                                   
 36%|███▌      | 180/500 [49:44<1:18:45, 14.77s/it]

{'loss': 1.3987, 'grad_norm': 1.9331021308898926, 'learning_rate': 1.8e-05, 'epoch': 1.8}


                                                   
 38%|███▊      | 190/500 [52:11<1:16:19, 14.77s/it]

{'loss': 1.376, 'grad_norm': 2.8755645751953125, 'learning_rate': 1.9e-05, 'epoch': 1.9}


                                                   
 40%|████      | 200/500 [54:43<1:14:44, 14.95s/it]

{'loss': 1.4164, 'grad_norm': 2.2024426460266113, 'learning_rate': 2e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                    

 40%|████      | 200/500 [56:30<1:14:44, 14.95s/it]
[A
[A

{'eval_loss': 1.4012902975082397, 'eval_runtime': 106.637, 'eval_samples_per_second': 1.876, 'eval_steps_per_second': 0.122, 'epoch': 2.0}


  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
                                                   
 42%|████▏     | 210/500 [59:15<1:22:41, 17.11s/it]

{'loss': 1.3831, 'grad_norm': 1.4199646711349487, 'learning_rate': 2.1e-05, 'epoch': 2.1}


                                                     
 44%|████▍     | 220/500 [1:01:53<1:13:26, 15.74s/it]

{'loss': 1.3901, 'grad_norm': 3.175156354904175, 'learning_rate': 2.2000000000000003e-05, 'epoch': 2.2}


                                                     
 46%|████▌     | 230/500 [1:04:36<1:12:07, 16.03s/it]

{'loss': 1.3952, 'grad_norm': 2.1122868061065674, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.3}


                                                     
 48%|████▊     | 240/500 [1:08:36<1:31:11, 21.04s/it]

{'loss': 1.351, 'grad_norm': 2.7856268882751465, 'learning_rate': 2.4e-05, 'epoch': 2.4}


                                                     
 50%|█████     | 250/500 [1:11:18<1:06:49, 16.04s/it]

{'loss': 1.4295, 'grad_norm': 2.3211731910705566, 'learning_rate': 2.5e-05, 'epoch': 2.5}


                                                     
 52%|█████▏    | 260/500 [1:14:05<1:08:25, 17.11s/it]

{'loss': 1.4028, 'grad_norm': 1.4549769163131714, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.6}


                                                     
 54%|█████▍    | 270/500 [1:16:51<1:04:15, 16.76s/it]

{'loss': 1.3991, 'grad_norm': 2.578497886657715, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.7}


                                                     
 56%|█████▌    | 280/500 [1:19:35<59:41, 16.28s/it]

{'loss': 1.4017, 'grad_norm': 1.0915743112564087, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.8}


                                                   
 58%|█████▊    | 290/500 [1:22:14<55:47, 15.94s/it]

{'loss': 1.3948, 'grad_norm': 1.2047383785247803, 'learning_rate': 2.9e-05, 'epoch': 2.9}


                                                   
 60%|██████    | 300/500 [1:24:53<52:49, 15.85s/it]

{'loss': 1.3892, 'grad_norm': 1.337282419204712, 'learning_rate': 3e-05, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                      

 60%|██████    | 300/500 [1:26:50<52:49, 15.85s/it]
[A
[A

{'eval_loss': 1.391973614692688, 'eval_runtime': 117.5672, 'eval_samples_per_second': 1.701, 'eval_steps_per_second': 0.111, 'epoch': 3.0}


  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
                                                     
 62%|██████▏   | 310/500 [1:29:39<54:57, 17.36s/it]

{'loss': 1.3929, 'grad_norm': 1.3620362281799316, 'learning_rate': 3.1e-05, 'epoch': 3.1}


                                                   
 64%|██████▍   | 320/500 [1:32:17<47:21, 15.79s/it]

{'loss': 1.3957, 'grad_norm': 1.4959760904312134, 'learning_rate': 3.2000000000000005e-05, 'epoch': 3.2}


                                                   
 66%|██████▌   | 330/500 [1:34:55<44:38, 15.76s/it]

{'loss': 1.3836, 'grad_norm': 1.99321448802948, 'learning_rate': 3.3e-05, 'epoch': 3.3}


                                                   
 68%|██████▊   | 340/500 [1:37:39<45:59, 17.25s/it]

{'loss': 1.3822, 'grad_norm': 1.6870115995407104, 'learning_rate': 3.4000000000000007e-05, 'epoch': 3.4}


                                                   
 70%|███████   | 350/500 [1:40:29<40:46, 16.31s/it]

{'loss': 1.4021, 'grad_norm': 2.3486757278442383, 'learning_rate': 3.5e-05, 'epoch': 3.5}


                                                   
 72%|███████▏  | 360/500 [1:43:08<37:20, 16.01s/it]

{'loss': 1.395, 'grad_norm': 1.4318883419036865, 'learning_rate': 3.6e-05, 'epoch': 3.6}


                                                   
 74%|███████▍  | 370/500 [1:45:47<34:30, 15.92s/it]

{'loss': 1.3827, 'grad_norm': 1.5419650077819824, 'learning_rate': 3.7e-05, 'epoch': 3.7}


                                                   
 76%|███████▌  | 380/500 [1:48:37<33:58, 16.99s/it]

{'loss': 1.388, 'grad_norm': 1.1659373044967651, 'learning_rate': 3.8e-05, 'epoch': 3.8}


                                                   
 78%|███████▊  | 390/500 [1:51:22<29:44, 16.22s/it]

{'loss': 1.4008, 'grad_norm': 1.483149766921997, 'learning_rate': 3.9000000000000006e-05, 'epoch': 3.9}


                                                   
 80%|████████  | 400/500 [1:54:00<26:21, 15.82s/it]

{'loss': 1.3759, 'grad_norm': 2.577026128768921, 'learning_rate': 4e-05, 'epoch': 4.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                      

 80%|████████  | 400/500 [1:55:56<26:21, 15.82s/it]
[A
[A

{'eval_loss': 1.4146380424499512, 'eval_runtime': 115.3516, 'eval_samples_per_second': 1.734, 'eval_steps_per_second': 0.113, 'epoch': 4.0}


  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
                                                     
 82%|████████▏ | 410/500 [1:58:42<25:41, 17.13s/it]

{'loss': 1.3349, 'grad_norm': 2.6703238487243652, 'learning_rate': 4.1e-05, 'epoch': 4.1}


                                                   
 84%|████████▍ | 420/500 [2:01:16<20:32, 15.40s/it]

{'loss': 1.4341, 'grad_norm': 1.5187342166900635, 'learning_rate': 4.2e-05, 'epoch': 4.2}


                                                   
 86%|████████▌ | 430/500 [2:03:47<17:34, 15.07s/it]

{'loss': 1.4072, 'grad_norm': 1.7435709238052368, 'learning_rate': 4.3e-05, 'epoch': 4.3}


                                                   
 88%|████████▊ | 440/500 [2:06:19<15:09, 15.17s/it]

{'loss': 1.3916, 'grad_norm': 1.1443623304367065, 'learning_rate': 4.4000000000000006e-05, 'epoch': 4.4}


                                                   
 90%|█████████ | 450/500 [2:08:59<13:02, 15.64s/it]

{'loss': 1.3928, 'grad_norm': 1.5602588653564453, 'learning_rate': 4.5e-05, 'epoch': 4.5}


                                                   
 92%|█████████▏| 460/500 [2:11:31<10:09, 15.25s/it]

{'loss': 1.3953, 'grad_norm': 2.9333488941192627, 'learning_rate': 4.600000000000001e-05, 'epoch': 4.6}


                                                   
 94%|█████████▍| 470/500 [2:14:03<07:35, 15.17s/it]

{'loss': 1.3789, 'grad_norm': 1.4103642702102661, 'learning_rate': 4.7e-05, 'epoch': 4.7}


                                                   
 96%|█████████▌| 480/500 [2:16:35<05:02, 15.12s/it]

{'loss': 1.4087, 'grad_norm': 1.889042615890503, 'learning_rate': 4.8e-05, 'epoch': 4.8}


                                                   
 98%|█████████▊| 490/500 [2:19:13<02:34, 15.46s/it]

{'loss': 1.4148, 'grad_norm': 2.696460723876953, 'learning_rate': 4.9e-05, 'epoch': 4.9}


                                                   
100%|██████████| 500/500 [2:21:44<00:00, 15.19s/it]

{'loss': 1.3974, 'grad_norm': 2.035902261734009, 'learning_rate': 0.0, 'epoch': 5.0}


  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                   
[A                                      

100%|██████████| 500/500 [2:23:45<00:00, 15.19s/it]
[A
[A

{'eval_loss': 1.3923410177230835, 'eval_runtime': 114.085, 'eval_samples_per_second': 1.753, 'eval_steps_per_second': 0.114, 'epoch': 5.0}


                                                   
100%|██████████| 500/500 [2:23:52<00:00, 15.19s/it]

{'train_runtime': 8633.0258, 'train_samples_per_second': 0.463, 'train_steps_per_second': 0.058, 'train_loss': 1.3922538299560547, 'epoch': 5.0}


100%|██████████| 500/500 [2:23:55<00:00, 17.27s/it]


TrainOutput(global_step=500, training_loss=1.3922538299560547, metrics={'train_runtime': 8633.0258, 'train_samples_per_second': 0.463, 'train_steps_per_second': 0.058, 'total_flos': 397416370176000.0, 'train_loss': 1.3922538299560547, 'epoch': 5.0})

In [35]:
# Evaluate the model
trainer.evaluate()

  input_ids_tensor = torch.tensor(self.input_ids[idx], dtype=torch.long)
100%|██████████| 13/13 [01:51<00:00,  8.57s/it]


{'eval_loss': 1.391973614692688,
 'eval_runtime': 119.5655,
 'eval_samples_per_second': 1.673,
 'eval_steps_per_second': 0.109,
 'epoch': 5.0}

In [None]:
# The email message to classify
email_message = """

"""

# Tokenize and encode the message
inputs = tokenizer_DBSC(email_message, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Make sure the model is in evaluation mode
model_DBSC.eval()

# Perform prediction
with torch.no_grad():
    outputs = model_DBSC(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

print(f"Predicted label: {predicted_label}")

Predicted label: 3
