In [1]:
!pip install transformers scikit-learn pandas torch imbalanced-learn



In [2]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
import torch

from tqdm import tqdm

In [3]:
# Load dataset
main_df = pd.read_excel("topical_chat.xlsx")
main_df = main_df.dropna()
main_df = main_df[main_df['message'].notna() & (main_df['message'] != '')]
main_df = main_df.reset_index(drop=True)

# Shuffle the data
shuffled_df = main_df.sample(frac=1, random_state=42)  # frac=1 means 100% of the data, random_state for reproducibility

# Use only 10% of the data
percentage_to_use = 0.10
subset_df = shuffled_df.sample(frac=percentage_to_use, random_state=42)

# Display the first few rows of the subset
print(subset_df.head())

        conversation_id                                            message  \
31320              1434   Hello,  I do not follow it but I watch it som...   
80091              3668  Ender's game and its sequel ender's shadow are...   
172750             7915   Wow, I didn't know that about The Saints. I w...   
6030                276   You're right. That's probably why baseball ca...   
140050             6413  I really love her idea about what she's making...   

                     sentiment  
31320                  Neutral  
80091                  Neutral  
172750  Curious to dive deeper  
6030                 Surprised  
140050  Curious to dive deeper  


In [4]:
unique_labels = subset_df['sentiment'].unique()
print("Unique Labels:", unique_labels)
print("Num of labels :",len(unique_labels))


Unique Labels: ['Neutral' 'Curious to dive deeper' 'Surprised' 'Happy' 'Sad' 'Angry'
 'Disgusted' 'Fearful']
Num of labels : 8


In [5]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Assuming your DataFrame is named 'subset_df'

# Check class distribution before balancing
class_distribution_before = subset_df['sentiment'].value_counts()
print("Class Distribution Before Balancing:")
print(class_distribution_before)

# Separate features and labels
X = subset_df['message']
y = subset_df['sentiment']

# Convert labels to numerical values for imbalanced-learn
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Balancing the classes
over_sampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
under_sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

X_resampled, y_resampled = over_sampler.fit_resample(X.values.reshape(-1, 1), y_encoded)
X_resampled, y_resampled = under_sampler.fit_resample(X_resampled, y_resampled)

# Convert back to DataFrame
resampled_df = pd.DataFrame({'message': X_resampled.flatten(), 'sentiment': label_encoder.inverse_transform(y_resampled)})

# Check class distribution after balancing
class_distribution_after = resampled_df['sentiment'].value_counts()
print("\nClass Distribution After Balancing:")
print(class_distribution_after)
df = resampled_df
# Now 'resampled_df' contains a balanced subset of the data


Class Distribution Before Balancing:
Curious to dive deeper    8070
Neutral                   4087
Surprised                 3103
Happy                     2954
Sad                        263
Disgusted                  143
Fearful                    111
Angry                      106
Name: sentiment, dtype: int64

Class Distribution After Balancing:
Angry                     8070
Curious to dive deeper    8070
Disgusted                 8070
Fearful                   8070
Happy                     8070
Neutral                   8070
Sad                       8070
Surprised                 8070
Name: sentiment, dtype: int64


In [6]:
# Split the data into training, validation, and testing sets
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Encode sentiment labels
label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['sentiment'])
val_data['label'] = label_encoder.transform(val_data['sentiment'])
test_data['label'] = label_encoder.transform(test_data['sentiment'])

# Load the pre-trained model and tokenizer
# model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, num_labels=len(unique_labels))
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
# model = AutoModelForSequenceClassification.from_pretrained(model_name)



# Tokenize the text data
train_texts = list(train_data['message'].astype(str).values)
val_texts = list(val_data['message'].astype(str).values)
test_texts = list(test_data['message'].astype(str).values)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_labels = torch.tensor(list(train_data['label'])).unsqueeze(1)
val_labels = torch.tensor(list(val_data['label'])).unsqueeze(1)
test_labels = torch.tensor(list(test_data['label'])).unsqueeze(1)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)

# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 3
current_patience = 0

for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss}')

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        current_patience = 0
    else:
        current_patience += 1
        if current_patience == patience:
            print("Early stopping as validation loss is not improving.")
            break


100%|██████████| 807/807 [15:34<00:00,  1.16s/it]


Epoch 1/10, Validation Loss: 0.48922985276268494


100%|██████████| 807/807 [15:36<00:00,  1.16s/it]


Epoch 2/10, Validation Loss: 0.32952173135218543


100%|██████████| 807/807 [15:35<00:00,  1.16s/it]


Epoch 3/10, Validation Loss: 0.31586814931359564


100%|██████████| 807/807 [15:35<00:00,  1.16s/it]


Epoch 4/10, Validation Loss: 0.31586814931359564


100%|██████████| 807/807 [15:35<00:00,  1.16s/it]


Epoch 5/10, Validation Loss: 0.31586814931359564


100%|██████████| 807/807 [15:36<00:00,  1.16s/it]


Epoch 6/10, Validation Loss: 0.31586814931359564
Early stopping as validation loss is not improving.


In [8]:
save_path = "assign_model"
# Save model configuration
config = AutoConfig.from_pretrained(model_name, num_labels=len(unique_labels))
config.save_pretrained(save_path)

# Save model weights
model.save_pretrained(save_path)


from transformers import AutoModelForSequenceClassification

# Load saved model
# loaded_model = AutoModelForSequenceClassification.from_pretrained(save_path)


In [9]:
# Testing
model.eval()
correct_predictions = 0
total_samples = 0
predictions_list = []
labels_list = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)

        correct_predictions += (predictions == labels.squeeze()).sum().item()
        total_samples += labels.size(0)

        predictions_list.extend(predictions.cpu().numpy())
        labels_list.extend(labels.squeeze().cpu().numpy())

accuracy = correct_predictions / total_samples
print(f'Testing Accuracy: {accuracy}')

# Calculate additional metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print("Classification Report:")
print(classification_report(labels_list, predictions_list, target_names=label_encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(labels_list, predictions_list))

print(f'Testing Accuracy: {accuracy}')


Testing Accuracy: 0.8997831474597274
Classification Report:
                        precision    recall  f1-score   support

                 Angry       1.00      1.00      1.00       813
Curious to dive deeper       0.74      0.62      0.68       828
             Disgusted       1.00      1.00      1.00       792
               Fearful       1.00      1.00      1.00       854
                 Happy       0.87      0.89      0.88       809
               Neutral       0.79      0.78      0.78       799
                   Sad       0.99      1.00      0.99       816
             Surprised       0.80      0.91      0.85       745

              accuracy                           0.90      6456
             macro avg       0.90      0.90      0.90      6456
          weighted avg       0.90      0.90      0.90      6456

Confusion Matrix:
[[813   0   0   0   0   0   0   0]
 [  0 516   0   2  61 123   6 120]
 [  0   0 792   0   0   0   0   0]
 [  0   0   0 854   0   0   0   0]
 [  2  43  