In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from hdbscan import HDBSCAN
from umap import UMAP
from copy import deepcopy
from bertopic.representation import TextGeneration

In [40]:
df = pd.read_csv("df_file.csv")

In [41]:
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [42]:
df.shape

(2225, 2)

In [43]:
df.nunique()

Text     2127
Label       5
dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2225 non-null   object
 1   Label   2225 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 34.9+ KB


In [45]:
df["Label"].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [46]:
df["Text"].isnull().sum()

0

In [47]:
duplicates_df= df["Text"].duplicated()

In [48]:
def has_duplicates(duplicates_df):
    if duplicates_df.duplicated().any():
        print("Yes")
    else:
        print("No")


has_duplicates(duplicates_df)

Yes


In [49]:
duplicates = df.duplicated().any()

In [50]:
df = df.drop_duplicates()

In [51]:
df.shape

(2127, 2)

In [52]:
## Train-test split

In [53]:
X=df["Text"]
y=df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("Train", X_train.shape)
print("Test", X_test.shape)

Train (1701,)
Test (426,)


In [54]:
from transformers import pipeline
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

In [55]:
pipe = pipeline(
    task="text-classification",     
    model="bert-base-uncased",
    tokenizer="bert-base-uncased",
    return_all_scores=True,          
    device=-1                        
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
#Fine-tuning model
from transformers import( BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [57]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [58]:
def preprocess_function(batch):
    return tokenizer(batch["Text"], truncation=True, padding="max_length", max_length=128)

In [69]:
labels = list(set(df["Label"]))
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

In [72]:
def encode_labels(batch):
    batch["Labels"] = [label2id[label] for label in batch["Label"]]
    return batch


In [73]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset = encoded_dataset.map(encode_labels, batched=True)


Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

In [75]:
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = encoded_dataset["train"]
eval_dataset = encoded_dataset["test"]

In [76]:
training_args = TrainingArguments(
    output_dir="./email_bert_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    
\
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

In [78]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


Downloading builder script: 0.00B [00:00, ?B/s]

In [79]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [80]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.