In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from hdbscan import HDBSCAN
from umap import UMAP
from copy import deepcopy
from bertopic.representation import TextGeneration

In [2]:
df = pd.read_csv("df_file.csv")

In [3]:
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [4]:
df.shape

(2225, 2)

In [5]:
df.nunique()

Text     2127
Label       5
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2225 non-null   object
 1   Label   2225 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 34.9+ KB


In [7]:
df["Label"].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [8]:
df["Text"].isnull().sum()

0

In [9]:
duplicates_df= df["Text"].duplicated()

In [10]:
def has_duplicates(duplicates_df):
    if duplicates_df.duplicated().any():
        print("Yes")
    else:
        print("No")


has_duplicates(duplicates_df)

Yes


In [11]:
duplicates = df.duplicated().any()

In [12]:
df = df.drop_duplicates()

In [13]:
df.shape

(2127, 2)

In [14]:
## Train-test split

In [15]:
X=df["Text"]
y=df["Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("Train", X_train.shape)
print("Test", X_test.shape)

Train (1701,)
Test (426,)


In [16]:
from transformers import pipeline
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer

In [17]:
pipe = pipeline(
    task="text-classification",     
    model="bert-base-uncased",
    tokenizer="bert-base-uncased",
    return_all_scores=True,          
    device=-1                        
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
#Fine-tuning model
from transformers import( AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

In [32]:
model_id = "bert-base-cased"

model= AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=2       
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [34]:
#Tokenize our data

In [37]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
def preprocess_function(examples):
    return tokenizer(examples["Text"], truncation=True)

In [41]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset = encoded_dataset.map(encode_labels, batched=True)


Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

In [42]:
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = encoded_dataset["train"]
eval_dataset = encoded_dataset["test"]

In [44]:
import evaluate

def compute_metrics(eval_pred):
    """Calculate F1 score"""
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    f1 = evaluate.load("f1")
    return f1.compute(predictions=predictions, references=labels)

In [46]:
training_args = TrainingArguments(
    "model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to="none"
)

In [48]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [49]:
trainer.evaluate()

{'eval_model_preparation_time': 0.0044,
 'eval_runtime': 312.8228,
 'eval_samples_per_second': 1.362,
 'eval_steps_per_second': 0.086}

In [51]:
trainer.save_model("my_finetuned_model")
tokenizer.save_pretrained("my_finetuned_model")


('my_finetuned_model\\tokenizer_config.json',
 'my_finetuned_model\\special_tokens_map.json',
 'my_finetuned_model\\vocab.txt',
 'my_finetuned_model\\added_tokens.json',
 'my_finetuned_model\\tokenizer.json')

In [52]:
#predictions

pipe = pipeline("text-classification", model="my_finetuned_model", tokenizer="my_finetuned_model")

text = ["The scars of the Boksâ€™ 29-28 victory over Les Bleus in the 2023 Rugby World Cup are still showing clearly in the French capital"]
predictions = pipe(text)

print(predictions)


[{'label': 'LABEL_1', 'score': 0.6312353610992432}]
