In [11]:
%pip install transformers



In [12]:
import transformers

# Echo the installed transformers release so the run can be matched to a version later.
installed_version = transformers.__version__
print(installed_version)


4.31.0


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
import joblib

In [14]:
# Load and preprocess the data.
# 'training.csv' must provide a 'text' column and a 'label' column; labels are
# converted to integer category codes below.
try:
    data = pd.read_csv('training.csv')  # Replace with your CSV file containing the data
except pd.errors.ParserError as exc:
    # Stop here: continuing with an empty frame would leave `texts` and `labels`
    # undefined (or stale from an earlier run) for the cells that follow.
    raise RuntimeError("ParserError: Error while parsing CSV.") from exc

missing_columns = {'text', 'label'} - set(data.columns)
if missing_columns:
    raise KeyError(f"training.csv is missing required column(s): {sorted(missing_columns)}")

# Rows without a text or a label cannot be used for training: a missing label
# would be encoded as category code -1, which is not a valid class index.
data = data.dropna(subset=['text', 'label'])
if data.empty:
    raise ValueError("No valid data in the CSV.")

# Cast to str so that non-string cells (e.g. numbers) are still accepted by the tokenizer.
texts = data['text'].astype(str).tolist()
labels = data['label'].astype('category').cat.codes.tolist()

In [15]:
# Hold out 20% of the examples for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [16]:

# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
checkpoint_name = 'distilbert-base-uncased'
# One output unit per distinct value found in the label column.
num_classes = len(data['label'].unique())

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=num_classes,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Tokenize the training texts with truncation and padding enabled so the
# resulting id lists can be stacked into tensors.
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
)

In [18]:
# Tokenize the validation texts with the same truncation/padding settings
# used for the training texts.
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
)

In [19]:
def _as_tensor_dataset(encodings, label_ids):
    """Bundle token ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(label_ids),
    )


train_dataset = _as_tensor_dataset(train_encodings, train_labels)
val_dataset = _as_tensor_dataset(val_encodings, val_labels)


In [20]:

# Training batches are shuffled; the validation loader keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)


In [21]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are passed explicitly with the defaults of the deprecated
# class, because torch.optim.AdamW uses different defaults for both.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)



In [22]:
# Fine-tune the classifier for three passes over the training set.
for epoch in range(3):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        # Batch tensors get their own names: unpacking into `labels` would
        # overwrite the full label list built when the CSV was loaded, and
        # re-running the train/validation split afterwards would then break.
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # max(..., 1) guards against division by zero when the loader is empty.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}: mean training loss = {mean_loss:.4f}")


In [23]:
# # Save the model using joblib
# model_filename = "distilbert_sequence_classifier_model.pkl"
# joblib.dump(model, model_filename)


['distilbert_sequence_classifier_model.pkl']

In [24]:
# # Save the model state_dict using joblib
# model_state_dict_filename = "distilbert_sequence_classifier_state_dict.pkl"
# joblib.dump(model.state_dict(), model_state_dict_filename)


['distilbert_sequence_classifier_state_dict.pkl']

In [25]:
# # Load the model using the saved state_dict
# loaded_model_state_dict = joblib.load(model_state_dict_filename)
# loaded_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(data['label'].unique()))
# loaded_model.load_state_dict(loaded_model_state_dict)
# loaded_model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [28]:
# # Save the model state_dict using Python's pickle
# import pickle

# model_state_dict_filename = "distilbert_sequence_classifier_state_dict_pick.pkl"
# with open(model_state_dict_filename, 'wb') as f:
#     pickle.dump(model.state_dict(), f)


In [27]:
# Persist the model's state_dict with torch.save (PyTorch's native serialization).
model_state_dict_filename = "distilbert_sequence_classifier_state_dict_pt.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, model_state_dict_filename)
