In [27]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
from tqdm import tqdm

# Empty results table: one row per probed layer will be added further down
results_df = pd.DataFrame(columns=['Layer', 'F1_Score', 'Accuracy'])

# Read the full CSV (pass nrows=100 to read_csv for a quick smoke run)
DATASET_CSV = '/home/pgajo/working/incels/data/datasets/English/Incels.is/IFD-EN-5203.csv'
df = pd.read_csv(DATASET_CSV)
# Keep only the rows whose 'incel_terms' column equals 0
df = df.loc[df['incel_terms'] == 0]

# Tokenizer and encoder come from the same pretrained checkpoint; the encoder
# is asked to return the hidden states of every layer, not just the last one
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

def get_cls_embeddings(sentence, model, tokenizer):
    """Return the first-token ([CLS]) embedding of ``sentence`` from every layer.

    Args:
        sentence: Text handed unchanged to ``tokenizer``.
        model: Callable encoder whose output exposes ``hidden_states`` as a
            sequence of rank-3 tensors. If it has a ``device`` attribute, the
            tokenized inputs are moved there before the forward pass.
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            invoked with ``return_tensors="pt"``.

    Returns:
        A list of 2-D numpy arrays, one per entry of ``hidden_states`` after
        the first (the embedding-layer output is skipped). Each array holds
        position 0 of the sequence axis.
    """
    tokens = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    # Keep the inputs on the same device as the model so the function also
    # works when the caller has moved the model to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

    with torch.no_grad():
        # Request hidden states explicitly rather than relying on how the
        # model happened to be configured at load time.
        outputs = model(**tokens, output_hidden_states=True)

    # .cpu() is a no-op for CPU tensors and is required before .numpy() on
    # CUDA tensors.
    return [
        layer[:, 0, :].detach().cpu().numpy()
        for layer in outputs.hidden_states[1:]  # Skip the embedding layer
    ]

# Run every text through BERT once; each call yields one [CLS] array per layer.
per_sample_embeddings = [
    get_cls_embeddings(text, model, tokenizer)
    for text in tqdm(df['text'], desc="Extracting embeddings")
]
if not per_sample_embeddings:
    raise ValueError("No rows in df after filtering; nothing to embed")

# Regroup from sample-major to layer-major and stack each layer into one
# 2-D array (n_samples, hidden_size). The number of layers is taken from the
# extracted embeddings instead of being hard-coded.
all_layer_embeddings = [np.vstack(layer) for layer in zip(*per_sample_embeddings)]

# Targets for the probing classifier, read from the 'hs' column
labels = df['hs'].values

# One linear probe per layer. The fixed random_state makes train_test_split
# produce the same partition for every layer, so the scores are comparable.
# NOTE(review): the split is not stratified; consider stratify=labels if the
# classes are imbalanced (this would change the reported numbers).
metric_rows = []
for i, embeddings in enumerate(tqdm(all_layer_embeddings, desc="Training and Evaluating")):
    print(f"Shape of embeddings for layer {i+1}: {embeddings.shape}")  # Debugging line

    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    print(f"Layer {i+1}: F1 Score = {f1}, Accuracy = {acc}")

    # Collect plain records; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row copies it on every iteration.
    metric_rows.append({'Layer': i+1, 'F1_Score': f1, 'Accuracy': acc})

# Build the results table in one go (keeps 'Layer' as an integer column)
results_df = pd.DataFrame(metric_rows, columns=['Layer', 'F1_Score', 'Accuracy'])

# Save the DataFrame to a CSV file
results_df.to_csv('/home/pgajo/working/incels/results/bert_layer_probing/layerwise_evaluation_metrics_NOincel_terms.csv', index=False)

Extracting embeddings: 100%|██████████| 3156/3156 [01:32<00:00, 34.07it/s]
Training and Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Shape of embeddings for layer 1: (3156, 768)


Training and Evaluating:   8%|▊         | 1/12 [00:00<00:07,  1.51it/s]

Layer 1: F1 Score = 0.2706766917293233, Accuracy = 0.8465189873417721
Shape of embeddings for layer 2: (3156, 768)


Training and Evaluating:  17%|█▋        | 2/12 [00:01<00:07,  1.41it/s]

Layer 2: F1 Score = 0.4102564102564103, Accuracy = 0.8544303797468354
Shape of embeddings for layer 3: (3156, 768)


Training and Evaluating:  25%|██▌       | 3/12 [00:02<00:06,  1.29it/s]

Layer 3: F1 Score = 0.47852760736196326, Accuracy = 0.865506329113924
Shape of embeddings for layer 4: (3156, 768)


Training and Evaluating:  33%|███▎      | 4/12 [00:03<00:06,  1.15it/s]

Layer 4: F1 Score = 0.5841584158415841, Accuracy = 0.8670886075949367
Shape of embeddings for layer 5: (3156, 768)


Training and Evaluating:  42%|████▏     | 5/12 [00:04<00:06,  1.15it/s]

Layer 5: F1 Score = 0.5853658536585367, Accuracy = 0.865506329113924
Shape of embeddings for layer 6: (3156, 768)


Training and Evaluating:  50%|█████     | 6/12 [00:05<00:05,  1.04it/s]

Layer 6: F1 Score = 0.5170731707317073, Accuracy = 0.8433544303797469
Shape of embeddings for layer 7: (3156, 768)


Training and Evaluating:  58%|█████▊    | 7/12 [00:06<00:04,  1.07it/s]

Layer 7: F1 Score = 0.5226130653266331, Accuracy = 0.8496835443037974
Shape of embeddings for layer 8: (3156, 768)


Training and Evaluating:  67%|██████▋   | 8/12 [00:07<00:03,  1.02it/s]

Layer 8: F1 Score = 0.5743589743589744, Accuracy = 0.8686708860759493
Shape of embeddings for layer 9: (3156, 768)


Training and Evaluating:  75%|███████▌  | 9/12 [00:08<00:02,  1.05it/s]

Layer 9: F1 Score = 0.5263157894736843, Accuracy = 0.8575949367088608
Shape of embeddings for layer 10: (3156, 768)


Training and Evaluating:  83%|████████▎ | 10/12 [00:08<00:01,  1.11it/s]

Layer 10: F1 Score = 0.5, Accuracy = 0.8322784810126582
Shape of embeddings for layer 11: (3156, 768)


Training and Evaluating:  92%|█████████▏| 11/12 [00:09<00:00,  1.19it/s]

Layer 11: F1 Score = 0.47000000000000003, Accuracy = 0.8322784810126582
Shape of embeddings for layer 12: (3156, 768)


Training and Evaluating: 100%|██████████| 12/12 [00:10<00:00,  1.17it/s]

Layer 12: F1 Score = 0.5148514851485148, Accuracy = 0.8449367088607594



