In [1]:
import pandas as pd

import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
import torch
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments



In [2]:
# load sata
df = pd.read_csv('text_data_unclean.csv')

In [3]:
def clean_text(text):
    text = str(text).lower()          # lowercase
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text

df["text"] = df["text"].apply(clean_text)

In [4]:
df.head()

Unnamed: 0,id,category,text,keyword_count,sentiment,risk_score
0,1,storage_issue,pharmacies in giza reported issues with antide...,2,-0.5,4.5
1,2,storage_issue,a weekly report from delta region connected an...,2,-0.5,4.5
2,3,import_delay,"in giza, anticoagulants availability has dropp...",4,-1.0,9.6
3,4,manufacturing_issue,"in coastal governorates, anticoagulants availa...",3,-0.5,7.65
4,5,manufacturing_issue,pharmacies in giza reported issues with asthma...,2,-0.5,5.1


In [5]:

#df = df.drop(columns=['id'])
df = df.drop(columns=['keyword_count', 'sentiment', 'risk_score','id' ])

In [6]:
df.columns

Index(['category', 'text'], dtype='object')

In [7]:


label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["category"])


In [19]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)


In [20]:
#tokenization

tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding=True
)

val_encodings = tokenizer(
    val_texts.tolist(),
    truncation=True,
    padding=True
)


In [10]:


class IssueDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IssueDataset(train_encodings, train_labels)
val_dataset = IssueDataset(val_encodings, val_labels)


In [21]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_)
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
9,80
8,29
1,28
4,25
12,24
5,24
13,23
2,21
14,21
11,20


In [32]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,

)

trainer.train()


  trainer = Trainer(


Step,Training Loss
10,2.3798
20,2.3443
30,2.2208
40,2.2004
50,2.0314
60,2.0984
70,1.9816
80,1.9379
90,1.8333
100,1.9184


TrainOutput(global_step=100, training_loss=2.0946289253234864, metrics={'train_runtime': 1252.0795, 'train_samples_per_second': 1.278, 'train_steps_per_second': 0.08, 'total_flos': 28984478208000.0, 'train_loss': 2.0946289253234864, 'epoch': 5.0})

In [34]:
trainer.evaluate()


{'eval_loss': 1.9733314514160156,
 'eval_runtime': 13.4113,
 'eval_samples_per_second': 5.965,
 'eval_steps_per_second': 0.373,
 'epoch': 5.0}