In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch

# Fine-tune bert-base-uncased to classify dark-pattern strings and report
# weighted metrics on the held-out test split. Predictions for the validation
# split are also produced here and scored in a later cell.

# --- Configuration ----------------------------------------------------------
selected_classification = 'Pattern Category'
SEED = 42
BATCH_SIZE = 8
NUM_EPOCHS = 3  # Adjust the number of epochs as needed
LEARNING_RATE = 5e-5

# Seed torch so the freshly initialised classifier head and the shuffling of
# the training DataLoader are repeatable across kernel restarts.
torch.manual_seed(SEED)

# Use a GPU when one is present; otherwise everything stays on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load the dataset -------------------------------------------------------
df = pd.read_csv('dark_patterns.csv')
col = ['Pattern String', selected_classification]
# Drop rows missing either the text or its label: a missing label cannot be
# encoded, and a missing text cannot be tokenized.
df = df.dropna(subset=col)
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
df = df[col].copy()

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df[selected_classification])

# --- Split: ~70% train, ~20% test, ~10% validation --------------------------
# 30% is held out first; of that hold-out, 33% becomes the validation split
# and the remaining 67% becomes the test split.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=SEED)
test_df, val_df = train_test_split(temp_df, test_size=0.33, random_state=SEED)

# --- Tokenizer and model ----------------------------------------------------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_labels = df['encoded_labels'].nunique()  # Number of unique classes
new_model1 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
new_model = new_model1
new_model.to(device)


def encode_texts(frame):
    """Tokenize the 'Pattern String' column of `frame` into padded PyTorch tensors."""
    return tokenizer(list(frame['Pattern String']), truncation=True, padding=True, return_tensors='pt')


# Tokenize and encode the training, testing and validation data
train_encodings = encode_texts(train_df)
train_labels = torch.tensor(train_df['encoded_labels'].tolist())

test_encodings = encode_texts(test_df)
test_labels = torch.tensor(test_df['encoded_labels'].tolist())

val_encodings = encode_texts(val_df)

# --- Datasets and DataLoaders -----------------------------------------------
# The validation dataset carries no labels; they are built from val_df when
# the validation metrics are computed.
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'])
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Optimizer --------------------------------------------------------------
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers version used
# by default, so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(new_model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
# Not used by the loop below: the model returns its own loss when `labels`
# is passed. Kept so the name remains defined for any later cell.
criterion = torch.nn.CrossEntropyLoss()

# --- Train the model --------------------------------------------------------
new_model.train()
for epoch in range(NUM_EPOCHS):
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}', unit='batches'):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = new_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


# --- Inference helpers ------------------------------------------------------
def predict_proba(model, dataloader, desc):
    """Return per-example class probabilities as a list of lists of floats.

    Runs `model` in eval mode without gradient tracking over `dataloader`,
    whose batches must start with (input_ids, attention_mask). `desc` is the
    label shown on the progress bar.
    """
    model.eval()
    probabilities = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc, unit='batches'):
            outputs = model(input_ids=batch[0].to(device), attention_mask=batch[1].to(device))
            # Bring results back to the CPU before converting to Python lists.
            probabilities.extend(torch.softmax(outputs.logits, dim=1).cpu().tolist())
    return probabilities


def argmax_labels(probabilities):
    """Return the index of the largest probability in each row (the predicted class id)."""
    if not probabilities:
        return []
    return torch.tensor(probabilities).argmax(dim=1).tolist()


# --- Predict on the test and validation splits ------------------------------
y_pred_proba = predict_proba(new_model, test_dataloader, 'Evaluating')
y_pred_labels = argmax_labels(y_pred_proba)

y_pred_proba_val = predict_proba(new_model, val_dataloader, 'Evaluating on Validation Set')
y_pred_labels_val = argmax_labels(y_pred_proba_val)

# Return the model to the CPU so later cells (e.g. saving the weights) see
# the same device-independent state they would have without GPU support.
new_model.to('cpu')

# --- Evaluate the model on the test split -----------------------------------
y_true = test_labels.tolist()
accuracy = accuracy_score(y_true, y_pred_labels)
# zero_division=0 scores a class with no predicted/true samples as 0, which is
# what scikit-learn does by default, but without emitting a warning each time.
precision = precision_score(y_true, y_pred_labels, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred_labels, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|███████████████████████████| 133/133 [05:43<00:00,  2.58s/batches]
Epoch 2: 100%|███████████████████████████| 133/133 [16:45<00:00,  7.56s/batches]
Epoch 3: 100%|███████████████████████████| 133/133 [05:31<00:00,  2.50s/batches]
Evaluating: 100%|██████████████████████████| 38/38 [00:13<00:00,  2.78batches/s]
Evaluating on Validation Set: 100%|████████| 19/19 [00:06<00:00,  2.94batches/s]

Accuracy: 0.9736842105263158
Precision: 0.9735368437541221
Recall: 0.9736842105263158
F1 Score: 0.9732862688184764



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Reduce every validation probability row to the position of its largest
# entry, i.e. the predicted class id for that example.
y_pred_labels_val = [int(torch.tensor(row).argmax()) for row in y_pred_proba_val]

In [20]:
# Print the predicted class ids for the validation split, one integer per
# example, in the same order as the rows of val_df.
print(y_pred_labels_val)

[3, 3, 6, 5, 3, 3, 6, 6, 3, 5, 1, 3, 3, 3, 3, 3, 5, 3, 5, 3, 3, 3, 1, 3, 3, 6, 5, 5, 3, 5, 3, 5, 5, 6, 5, 3, 6, 6, 3, 3, 3, 6, 3, 3, 3, 5, 6, 3, 3, 3, 3, 5, 3, 3, 6, 6, 6, 3, 2, 3, 3, 3, 3, 3, 1, 3, 5, 3, 3, 3, 6, 3, 3, 1, 1, 5, 6, 3, 3, 3, 6, 3, 1, 1, 3, 5, 1, 3, 3, 3, 1, 1, 3, 3, 3, 3, 1, 5, 3, 3, 5, 5, 3, 3, 5, 3, 5, 3, 3, 1, 1, 5, 5, 6, 1, 3, 3, 6, 3, 3, 3, 3, 3, 5, 3, 5, 6, 1, 3, 3, 6, 1, 1, 3, 1, 1, 3, 5, 1, 3, 3, 3, 2, 3, 3, 3, 3, 1, 5, 3]


In [21]:
# Display the validation rows (text, category and encoded label) so they can
# be compared by eye with the predicted class ids.
val_df

Unnamed: 0,Pattern String,Pattern Category,encoded_labels
1535,Availability: Only 4 Left!,Scarcity,3
1264,Only 498 left,Scarcity,3
593,02 DAYS :23 HOURS :00 MINS :54 SECS,Urgency,6
308,232 people are viewing this offer!,Social Proof,5
1637,3 Left!,Scarcity,3
...,...,...,...
1131,Only 2 units left in stock,Scarcity,3
1616,In Stock only 3 left,Scarcity,3
481,"No thanks, I don't want to save up to 75%",Misdirection,1
54,166 people have added this item to cart,Social Proof,5


In [22]:
# Score the validation predictions against the true encoded labels.
val_labels = torch.tensor(val_df['encoded_labels'].tolist())

# Convert the reference labels once and reuse them for every metric.
val_true = val_labels.tolist()
val_pred = y_pred_labels_val

accuracy_val = accuracy_score(val_true, val_pred)
# Precision, recall and F1 are averaged per class, weighted by class support.
precision_val = precision_score(val_true, val_pred, average='weighted')
recall_val = recall_score(val_true, val_pred, average='weighted')
f1_val = f1_score(val_true, val_pred, average='weighted')

print(f"Accuracy_val: {accuracy_val}")
print(f"Precision_val: {precision_val}")
print(f"Recall_val: {recall_val}")
print(f"F1 Score_val: {f1_val}")


Accuracy_val: 0.9933333333333333
Precision_val: 0.9936842105263158
Recall_val: 0.9933333333333333
F1 Score_val: 0.9934032214032215


In [23]:
import joblib

# Persist the fine-tuned weights and the fitted label encoder side by side,
# so predicted class ids can later be mapped back to category names.
model_artifact, encoder_artifact = (
    'determine_category_bert_model.joblib',
    'determine_category_label_encoder.joblib',
)

model_state_dict = new_model.state_dict()
joblib.dump(value=model_state_dict, filename=model_artifact)
# Save the label encoder as well; this call is the cell's last expression, so
# the list of written file names it returns is shown as the cell output.
joblib.dump(value=label_encoder, filename=encoder_artifact)


['determine_category_label_encoder.joblib']