In [1]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install datasets



In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [3]:
train_data = pd.read_excel("traindata.xlsx")
test_data = pd.read_excel("testdata.xlsx")

In [4]:
emotion_labels = ['affinity', 'compassion', 'dismay', 'fear', 'confidence', 'careless', 'anger']
belief_labels = ['pcb', 'dcb']

In [5]:
emotion_label_map = {label: i for i, label in enumerate(emotion_labels)}
belief_label_map = {label: i for i, label in enumerate(belief_labels)}

In [6]:
train_data['emot2'] = train_data['emot2'].map(emotion_label_map)
train_data['blf_value'] = train_data['blf_value'].map(belief_label_map)

In [7]:
train_data['combined_label'] = train_data['emot2'] * len(belief_labels) + train_data['blf_value']

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_tokens = tokenizer(list(train_data['text']), padding=True, truncation=True, return_tensors='pt')

In [9]:
train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(train_data['combined_label']))
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [10]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(emotion_labels) * len(belief_labels))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()



In [12]:
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        input_ids, attention_mask, combined_labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, combined_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

Epoch 1: 100%|██████████████████████████████████| 87/87 [02:23<00:00,  1.65s/it]
Epoch 2: 100%|██████████████████████████████████| 87/87 [02:23<00:00,  1.65s/it]
Epoch 3: 100%|██████████████████████████████████| 87/87 [02:21<00:00,  1.62s/it]
Epoch 4: 100%|██████████████████████████████████| 87/87 [02:22<00:00,  1.64s/it]
Epoch 5: 100%|██████████████████████████████████| 87/87 [02:21<00:00,  1.63s/it]
Epoch 6: 100%|██████████████████████████████████| 87/87 [02:23<00:00,  1.65s/it]
Epoch 7: 100%|██████████████████████████████████| 87/87 [02:22<00:00,  1.64s/it]
Epoch 8: 100%|██████████████████████████████████| 87/87 [02:24<00:00,  1.66s/it]
Epoch 9: 100%|██████████████████████████████████| 87/87 [02:24<00:00,  1.67s/it]
Epoch 10: 100%|█████████████████████████████████| 87/87 [02:23<00:00,  1.65s/it]
Epoch 11: 100%|█████████████████████████████████| 87/87 [02:22<00:00,  1.64s/it]
Epoch 12: 100%|█████████████████████████████████| 87/87 [02:23<00:00,  1.65s/it]
Epoch 13: 100%|█████████████

In [13]:
test_data['text'] = test_data['text'].astype(str)
test_tokens = tokenizer(list(test_data['text']), padding=True, truncation=True, return_tensors='pt')

In [14]:
test_dataset = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [15]:
model.eval()
predicted_combined_labels = []

In [16]:
for batch in tqdm(test_loader, desc='Testing'):
    input_ids, attention_mask = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predicted_combined_labels.extend(torch.argmax(outputs.logits, axis=1).tolist())

Testing: 100%|██████████████████████████████████| 33/33 [00:14<00:00,  2.35it/s]


In [17]:
predicted_emotions = [label // len(belief_labels) for label in predicted_combined_labels]
predicted_beliefs = [label % len(belief_labels) for label in predicted_combined_labels]

In [18]:
predicted_emotions = [emotion_labels[i] for i in predicted_emotions]
predicted_beliefs = [belief_labels[i] for i in predicted_beliefs]

In [19]:
results_df = pd.DataFrame({'text': test_data['text'], 'actual_emotion': test_data['emot2'], 'actual_belief': test_data['blf_value'],
                           'predicted_emotion': predicted_emotions, 'predicted_belief': predicted_beliefs})

In [20]:
correct_emotion_predictions = sum(1 for a, b in zip(test_data['emot2'], predicted_emotions) if a == b)
correct_belief_predictions = sum(1 for a, b in zip(test_data['blf_value'], predicted_beliefs) if a == b)
total_samples = len(test_data)

emotion_accuracy = correct_emotion_predictions / total_samples * 100
belief_accuracy = correct_belief_predictions / total_samples * 100

print(f"Emotion Accuracy: {emotion_accuracy:.2f}%")
print(f"Belief Accuracy: {belief_accuracy:.2f}%")

Emotion Accuracy: 46.77%
Belief Accuracy: 77.57%


In [21]:
emotion_accuracy_dict = {}
for emotion in emotion_labels:
    correct_predictions = sum(1 for a, b in zip(test_data['emot2'], predicted_emotions) if a == emotion)
    total_samples = len(test_data)
    accuracy = correct_predictions / total_samples * 100
    emotion_accuracy_dict[emotion] = accuracy
    print(f"{emotion} Accuracy: {accuracy:.2f}%")

affinity Accuracy: 31.56%
compassion Accuracy: 17.49%
dismay Accuracy: 27.38%
fear Accuracy: 4.56%
confidence Accuracy: 2.28%
careless Accuracy: 6.08%
anger Accuracy: 0.76%


In [22]:
belief_accuracy_dict = {}
for belief in belief_labels:
    correct_predictions = sum(1 for a, b in zip(test_data['blf_value'], predicted_beliefs) if a == belief)
    total_samples = len(test_data)
    accuracy = correct_predictions / total_samples * 100
    belief_accuracy_dict[belief] = accuracy
    print(f"{belief} Accuracy: {accuracy:.2f}%")

pcb Accuracy: 79.09%
dcb Accuracy: 20.91%


In [23]:
for emotion in emotion_labels:
    emotion_results_df = results_df[results_df['actual_emotion'] == emotion]
    emotion_results_df['total_accuracy'] = emotion_accuracy_dict[emotion]
    emotion_results_df.to_excel(f'{emotion}_results.xlsx', index=False)
    print(f"Saved results for {emotion} in {emotion}_results.xlsx")

Saved results for affinity in affinity_results.xlsx
Saved results for compassion in compassion_results.xlsx
Saved results for dismay in dismay_results.xlsx
Saved results for fear in fear_results.xlsx
Saved results for confidence in confidence_results.xlsx
Saved results for careless in careless_results.xlsx
Saved results for anger in anger_results.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emotion_results_df['total_accuracy'] = emotion_accuracy_dict[emotion]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emotion_results_df['total_accuracy'] = emotion_accuracy_dict[emotion]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emotion_results_df['total_accuracy'] = emotion_accuracy_dict[emot

In [24]:
for belief in belief_labels:
    belief_results_df = results_df[results_df['actual_belief'] == belief]
    belief_results_df['total_accuracy'] = belief_accuracy_dict[belief]
    belief_results_df.to_excel(f'{belief}_results.xlsx', index=False)
    print(f"Saved results for {belief} in {belief}_results.xlsx")

Saved results for pcb in pcb_results.xlsx
Saved results for dcb in dcb_results.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  belief_results_df['total_accuracy'] = belief_accuracy_dict[belief]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  belief_results_df['total_accuracy'] = belief_accuracy_dict[belief]


In [27]:
model.save_pretrained("Desktop/CU Denver/Machine Learning RA /5 . belief model")