In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
selected_classification = 'classification'

df1 = pd.read_csv('normie.csv')
df2 = pd.read_csv('dark_patterns.csv')

df1 = df1[pd.notnull(df1['Pattern String'])]
df1 = df1[df1[selected_classification] == 0]
df1['classification'] = 'Not Dark'
df1.drop_duplicates(subset='Pattern String', inplace=True)

df2 = df2[pd.notnull(df2['Pattern String'])]
df2['classification'] = 'Dark'
col = ["Pattern String", "classification"]
df2 = df2[col]

df = pd.concat([df1, df2])

In [4]:

label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df[selected_classification])

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_labels = len(df['encoded_labels'].unique())  # Number of unique classes
model2 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Tokenize and encode the training data
train_encodings = tokenizer(list(train_df['Pattern String']), truncation=True, padding=True, return_tensors='pt')
train_labels = torch.tensor(train_df['encoded_labels'].tolist())

# Tokenize and encode the testing data
test_encodings = tokenizer(list(test_df['Pattern String']), truncation=True, padding=True, return_tensors='pt')
test_labels = torch.tensor(test_df['encoded_labels'].tolist())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Create DataLoaders for training and testing
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

optimizer = AdamW(model2.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()




In [7]:
# Train the model
model2.train()
for epoch in range(4):  # Adjust the number of epochs as needed
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}', unit='batches'):
        optimizer.zero_grad()
        outputs = model2(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Epoch 1: 100%|███████████████████████████| 298/298 [14:26<00:00,  2.91s/batches]
Epoch 2: 100%|███████████████████████████| 298/298 [14:21<00:00,  2.89s/batches]
Epoch 3: 100%|███████████████████████████| 298/298 [18:42<00:00,  3.77s/batches]
Epoch 4: 100%|███████████████████████████| 298/298 [14:15<00:00,  2.87s/batches]


In [9]:
# Evaluate the model on the test set
model2.eval()
test_true_all = []
test_pred_all = []

with torch.no_grad():
    for test_batch in tqdm(test_dataloader, desc='Evaluating', unit='batches'):
        outputs = model2(input_ids=test_batch[0], attention_mask=test_batch[1])
        test_true_all.extend(test_batch[2].tolist())
        test_pred_all.extend(torch.argmax(outputs.logits, dim=1).tolist())

# Decode labels back to original values
decoded_test_true = label_encoder.inverse_transform(test_true_all)
decoded_test_pred = label_encoder.inverse_transform(test_pred_all)

# Evaluate the model
test_accuracy = accuracy_score(decoded_test_true, decoded_test_pred)
test_precision = precision_score(decoded_test_true, decoded_test_pred, average='weighted')
test_recall = recall_score(decoded_test_true, decoded_test_pred, average='weighted')
test_f1 = f1_score(decoded_test_true, decoded_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")

Evaluating: 100%|██████████████████████████| 75/75 [00:33<00:00,  2.21batches/s]

Test Accuracy: 0.9446308724832215
Test Precision: 0.9492419937396476
Test Recall: 0.9446308724832215
Test F1 Score: 0.9446987208807014





In [11]:
import joblib
model_state_dict = model2.state_dict()
joblib.dump(model_state_dict, 'determine_presence_model.joblib')
#save the label encoder as well
joblib.dump(label_encoder, 'determine_presence_label_encoder.joblib')


['determine_presence_label_encoder.joblib']