In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import warnings 
# Suppress every Python warning from this point on; placed before the
# remaining imports so anything they emit at import time is hidden as well.
warnings.filterwarnings("ignore")
import evaluate
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that forces
# synchronous CUDA kernel launches and can slow training -- confirm it is
# still wanted for regular runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Read the raw tweet/label table and show its (rows, columns) shape.
csv_path = "GPTsentimentanalysis.csv"
df = pd.read_csv(csv_path)
df.shape

(219294, 3)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [4]:
# Remove the "Unnamed: 0" column (it appears to be a leftover row index from
# an earlier export -- its values mirror the index in the preview above).
# errors="ignore" keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Per-column count of missing values.
df.isnull().sum()

tweets    0
labels    0
dtype: int64

In [6]:
# Number of rows that repeat an earlier row exactly (first occurrence not counted).
df.duplicated(keep="first").sum()

np.int64(1671)

In [7]:
# Discard exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [8]:
# Summarise the de-duplicated frame: index range, column dtypes, non-null
# counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 217623 entries, 0 to 219293
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweets  217623 non-null  object
 1   labels  217623 non-null  object
dtypes: object(2)
memory usage: 5.0+ MB


In [9]:
# Integer class ids for the three sentiment labels.
LABEL2ID = {"bad": 0, "neutral": 1, "good": 2}

# Only encode while the column still holds the raw label strings. Without this
# guard a second run of the cell would map the integer ids through LABEL2ID
# and silently turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["labels"]):
    label_ids = df["labels"].map(LABEL2ID)

    # .map() yields NaN for any value missing from LABEL2ID; fail loudly here
    # rather than letting NaN labels reach the model.
    unknown_labels = df.loc[label_ids.isna(), "labels"].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unmapped label values: {list(unknown_labels)}")

    df["labels"] = label_ids.astype("int64")

In [10]:
# Work on a fixed-size random subset (the raw file has 219294 rows) and split
# it 80/10/10 into train / validation / test.
SEED = 42
SAMPLE_SIZE = 100000

# Seeding the sample makes the subset -- and therefore every split and metric
# derived from it -- reproducible across kernel restarts. min() guards against
# a frame that has fewer rows than the requested sample size.
data = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED)
train_df , test_df = train_test_split(data, test_size=0.2, random_state=SEED)

# Halve the 20% hold-out into validation and test.
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=SEED)
print(len(train_df)  ,  len(val_df)  ,  len(test_df))

80000 10000 10000


In [11]:
# Turn each pandas split into a Dataset; the shuffled pandas index is reset
# (and dropped) first.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their conventional names.
dataset = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tweets', 'labels'],
        num_rows: 80000
    })
    validation: Dataset({
        features: ['tweets', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tweets', 'labels'],
        num_rows: 10000
    })
})


In [12]:
# Checkpoint shared by the tokenizer and the classifier.
model_path = 'bert-base-uncased'

# Classifier with a three-way output head, then the matching tokenizer.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
)
tokenizer = BertTokenizer.from_pretrained(model_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze most of the parameters

In [13]:
# Single pass over the base model: every parameter is frozen except those
# whose name contains "pooler", which stay trainable.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

#### Data Pre-processing

In [14]:
def preprocess_function(examples):
    """Tokenize the "tweets" field of ``examples`` with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for that input.
    """
    tweets = examples["tweets"]
    return tokenizer(tweets, truncation=True)

# Tokenize every split, passing examples to the function in batches.
tokenized_data = dataset.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 80000/80000 [00:28<00:00, 2808.48 examples/s]
Map: 100%|██████████| 10000/10000 [00:03<00:00, 2859.61 examples/s]
Map: 100%|██████████| 10000/10000 [00:03<00:00, 2859.47 examples/s]


In [15]:
# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

#### Define Evaluation Metrics

In [16]:
# Load the `evaluate` metric objects (same order as before: accuracy, then roc_auc).
# NOTE(review): compute_metrics below calls sklearn's roc_auc_score directly,
# so `auc_score` looks unused in this notebook -- confirm before removing.
accuracy, auc_score = (
    evaluate.load(metric_id) for metric_id in ("accuracy", "roc_auc")
)


def compute_metrics(eval_pred):
    """Compute accuracy and one-vs-rest ROC AUC from raw model outputs.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (logits) with classes on the last axis;
            ``labels`` holds the reference class ids.

    Returns:
        dict with keys ``"accuracy"`` and ``"roc_auc"``, each rounded to
        three decimals.
    """
    predictions, labels = eval_pred

    # Numerically stable softmax: subtracting the per-row maximum leaves the
    # probabilities unchanged but keeps np.exp from overflowing to inf (and
    # the division from producing NaN) when logits are large.
    shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)

    # multi-class AUC
    auc = np.round(
        roc_auc_score(labels, probabilities, multi_class="ovr"), 3
    )

    # Hard predictions: the class with the highest score.
    predicted_classes = np.argmax(predictions, axis=1)
    acc = np.round(
        accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'], 3
    )
    return {"accuracy": acc, "roc_auc": auc}


#### Training Parameters

In [17]:
# Optimisation hyper-parameters.
lr = 2e-5
batch_size = 16
num_epochs = 5

training_args = TrainingArguments(
    output_dir="testTraining",
    # optimisation
    num_train_epochs=num_epochs,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # log, evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # restore the best checkpoint once training finishes
    load_best_model_at_end=True,
)

In [18]:
# Wire the model, data splits, collator and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    compute_metrics=compute_metrics,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

 20%|██        | 5000/25000 [09:49<41:36,  8.01it/s]  

{'loss': 0.9531, 'grad_norm': 3.027066469192505, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


                                                    
 20%|██        | 5000/25000 [11:01<41:36,  8.01it/s]

{'eval_loss': 0.9109494090080261, 'eval_accuracy': 0.578, 'eval_roc_auc': 0.716, 'eval_runtime': 72.0645, 'eval_samples_per_second': 138.765, 'eval_steps_per_second': 8.673, 'epoch': 1.0}


 40%|████      | 10000/25000 [21:40<32:51,  7.61it/s]   

{'loss': 0.9083, 'grad_norm': 2.33327579498291, 'learning_rate': 1.2e-05, 'epoch': 2.0}


                                                     
 40%|████      | 10000/25000 [22:50<32:51,  7.61it/s]

{'eval_loss': 0.89178866147995, 'eval_accuracy': 0.588, 'eval_roc_auc': 0.728, 'eval_runtime': 70.0506, 'eval_samples_per_second': 142.754, 'eval_steps_per_second': 8.922, 'epoch': 2.0}


 60%|██████    | 15000/25000 [32:55<20:46,  8.02it/s]   

{'loss': 0.8983, 'grad_norm': 2.821110248565674, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


                                                     
 60%|██████    | 15000/25000 [34:08<20:46,  8.02it/s]

{'eval_loss': 0.8906595706939697, 'eval_accuracy': 0.588, 'eval_roc_auc': 0.733, 'eval_runtime': 73.0988, 'eval_samples_per_second': 136.801, 'eval_steps_per_second': 8.55, 'epoch': 3.0}


 80%|████████  | 20000/25000 [59:03<15:16,  5.46it/s]     

{'loss': 0.8933, 'grad_norm': 1.6373945474624634, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


                                                     
 80%|████████  | 20000/25000 [1:00:51<15:16,  5.46it/s]

{'eval_loss': 0.8880722522735596, 'eval_accuracy': 0.589, 'eval_roc_auc': 0.736, 'eval_runtime': 108.0077, 'eval_samples_per_second': 92.586, 'eval_steps_per_second': 5.787, 'epoch': 4.0}


100%|██████████| 25000/25000 [1:14:27<00:00,  7.37it/s]   

{'loss': 0.892, 'grad_norm': 3.4508755207061768, 'learning_rate': 0.0, 'epoch': 5.0}


                                                       
100%|██████████| 25000/25000 [1:15:38<00:00,  7.37it/s]

{'eval_loss': 0.88504958152771, 'eval_accuracy': 0.592, 'eval_roc_auc': 0.736, 'eval_runtime': 70.6983, 'eval_samples_per_second': 141.446, 'eval_steps_per_second': 8.84, 'epoch': 5.0}


100%|██████████| 25000/25000 [1:15:39<00:00,  5.51it/s]

{'train_runtime': 4539.705, 'train_samples_per_second': 88.111, 'train_steps_per_second': 5.507, 'train_loss': 0.90898078125, 'epoch': 5.0}





TrainOutput(global_step=25000, training_loss=0.90898078125, metrics={'train_runtime': 4539.705, 'train_samples_per_second': 88.111, 'train_steps_per_second': 5.507, 'total_flos': 1.931526288523843e+16, 'train_loss': 0.90898078125, 'epoch': 5.0})

In [19]:
# Score the held-out test split with the same metric function used during training.
predictions = trainer.predict(tokenized_data["test"])
logits, labels = predictions.predictions, predictions.label_ids

metrics = compute_metrics((logits, labels))
print(metrics)


100%|██████████| 625/625 [01:11<00:00,  8.78it/s]


{'accuracy': np.float64(0.602), 'roc_auc': np.float64(0.753)}
