In [5]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"  # Disable TensorFlow/Keras integration

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Collapse the raw stance annotations found in the CSV into the three class ids
# the classifier is trained on.
label_mapping = {
    'with israel': 0,
    'with palestine': 1,
    'neutral': 2,
    'inquisitive': 2,  # Assuming inquisitive is neutral
    'indifferent': 2   # Assuming indifferent is neutral
}

# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

raw_dataset = load_dataset('csv', data_files='data/reddit_comments_clean.csv')
# A label that is missing from `label_mapping` raises KeyError here instead of
# slipping through unmapped.
labelled_dataset = raw_dataset.map(lambda e: {'label': label_mapping[e['label']]})
dataset = labelled_dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
# One pretrained checkpoint name drives both the tokenizer and the classifier,
# which is created with a 3-class sequence-classification head.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [24]:

def tokenize(batch):
    """Tokenize the ``text`` column of a batch with the notebook-level ``tokenizer``.

    Every example is truncated and padded to exactly 512 tokens. The tokenizer's
    output for the batch is returned unchanged.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(batch['text'], **encoding_options)

# Tokenize in batches, then expose only the columns the model consumes as torch tensors.
dataset = dataset.map(tokenize, batched=True)
dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

Map: 100%|██████████| 21054/21054 [00:05<00:00, 4168.33 examples/s]
Map: 100%|██████████| 5264/5264 [00:01<00:00, 4379.28 examples/s]


In [26]:
# NOTE(review): this upgrade runs after `transformers` has already been imported in the
# first cell, so the kernel must be restarted before the new version is actually used.
# Consider moving the install to the very first cell and pinning an exact version.
%pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [28]:
# Fine-tuning hyperparameters, collected in one place before being handed to
# TrainingArguments; the output directory is ./results.
training_config = dict(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_config)

def compute_metrics(eval_pred):
    """Compute support-weighted precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair as passed in by ``Trainer``.

    Returns:
        dict with ``precision``, ``recall`` and ``f1``, taken from the
        "weighted avg" row of scikit-learn's classification report.
    """
    logits, labels = eval_pred
    # Trainer may hand over a tuple of arrays when the model returns more than
    # the logits; by convention the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=1)
    # zero_division=0: a class that is never predicted scores 0 instead of warning.
    report = classification_report(labels, predictions, output_dict=True, zero_division=0)
    weighted = report["weighted avg"]
    return {
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1": weighted["f1-score"]
    }

# Name the two splits explicitly before wiring everything into the Trainer.
train_split = dataset['train']
eval_split = dataset['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

# Kept as the cell's last expression so the TrainOutput summary is displayed.
trainer.train()

Step,Training Loss
500,1.043
1000,0.9624
1500,0.9316
2000,0.9527
2500,0.9078
3000,0.8947
3500,0.9083
4000,0.8821
4500,0.9007
5000,0.911


TrainOutput(global_step=15792, training_loss=0.7230867890239124, metrics={'train_runtime': 1937.9324, 'train_samples_per_second': 32.592, 'train_steps_per_second': 8.149, 'total_flos': 8367055045797888.0, 'train_loss': 0.7230867890239124, 'epoch': 3.0})

In [34]:
import numpy as np

# Evaluate on the Trainer's eval dataset; the metrics dict is the cell's displayed value.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 1.3108025789260864,
 'eval_precision': 0.6301010555646825,
 'eval_recall': 0.6295592705167173,
 'eval_f1': 0.6297878664019076,
 'eval_runtime': 42.8576,
 'eval_samples_per_second': 122.825,
 'eval_steps_per_second': 30.706,
 'epoch': 3.0}

In [29]:
# Persist the tokenizer next to the fine-tuned weights so the directory can be reloaded
# on its own with `from_pretrained`. The Trainer in this notebook was created without a
# tokenizer, so `save_model` by itself writes only the model files.
MODEL_DIR = "./models/my-distilbert-base-uncased"
tokenizer.save_pretrained(MODEL_DIR)
trainer.save_model(MODEL_DIR)


In [30]:
# Class id -> display name. The ids follow the training label mapping
# (0 = with Israel, 1 = with Palestine, 2 = neutral); the names are the ones the
# evaluation cells compare against.
ID_TO_LABEL = {0: "pro_israel", 1: "pro_palestine", 2: "neutral"}


def predict(text):
    """Classify a single text and return the name of the predicted stance.

    Args:
        text: The string to classify.

    Returns:
        One of ``"pro_israel"``, ``"pro_palestine"`` or ``"neutral"``.
    """
    device = next(model.parameters()).device  # get model device (cuda or cpu)
    # Make sure dropout is off: the model may still be in training mode after
    # `trainer.train()`, which would make predictions non-deterministic.
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # move inputs to model device
    with torch.no_grad():  # inference only, no autograd graph needed
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return ID_TO_LABEL[prediction]

# Example predictions: one hand-written sentence per line, printed in order.
example_texts = (
    "Hamas is a terrorist organization.",
    "End Israeli apartheid.",
    "This conflict is tragic for both sides.",
)
for example in example_texts:
    print(predict(example))

pro_israel
pro_palestine
neutral


In [32]:
from sklearn.metrics import accuracy_score

# --- Pro-Israel Texts ---
pro_israel_texts = [
    "I stand with Israel in their right to defend themselves",
    "The IDF is protecting their citizens",
    "Israel has the right to exist as a Jewish state",
    "Israel has the right to defend its borders.",
    "Hamas is a terrorist organization targeting civilians.",
    "Supporting Israel is supporting democracy in the Middle East.",
    "The Iron Dome saves countless Israeli lives.",
    "Criticism of Israel often masks antisemitism.",
    "Israeli citizens live under constant rocket threat.",
    "The Jewish people have a historical right to this land.",
    "IDF operations aim to eliminate terrorist threats.",
    "Israel withdrew from Gaza, yet rockets still fly.",
    "The UN is biased against Israel in its resolutions."
]

# --- Pro-Palestine Texts ---
pro_palestine_texts = [
    "Free Palestine from occupation",
    "Palestinians deserve equal rights and freedom",
    "End the occupation of Palestinian territories",
    "Israel's blockade has devastated Gaza's economy.",
    "The occupation must end for peace to begin.",
    "Palestinian families are being evicted from their homes.",
    "The West Bank is under illegal military control.",
    "Free Gaza from siege and suffering.",
    "Palestinian children deserve safety and education.",
    "The wall separates families and stifles lives.",
    "The Nakba is an ongoing tragedy for Palestinians.",
    "Settlements violate international law.",
    "We must stand against apartheid policies."
]

# --- Neutral Texts ---
neutral_texts = [
    "This is a neutral comment about the situation",
    "The conflict is complex with valid arguments on both sides",
    "Both sides have suffered greatly in this conflict.",
    "Dialogue and understanding are essential for peace.",
    "The conflict has a long and complex history.",
    "Civilians on both sides deserve protection.",
    "International law should guide the resolution process.",
    "It's important to listen to all voices in this debate.",
    "War affects everyone, not just combatants.",
    "Social media often simplifies complex issues.",
    "Peace will require compromise from both parties.",
    "We must seek truth before taking sides."
]

# Pair each group with its gold label and flatten both in the same order, so that
# sample_texts[i] is labelled by true_labels[i].
labelled_groups = [
    ("pro_israel", pro_israel_texts),
    ("pro_palestine", pro_palestine_texts),
    ("neutral", neutral_texts),
]
sample_texts = [text for _, group in labelled_groups for text in group]
true_labels = [label for label, group in labelled_groups for _ in group]

# --- Predict and evaluate ---
# One prediction per sample, in the same order as `true_labels`.
predicted_labels = [predict(text) for text in sample_texts]

print("\nSample Predictions:")
for text, prediction in zip(sample_texts, predicted_labels):
    # Only add an ellipsis when the 50-character preview really cuts the text short.
    preview = text if len(text) <= 50 else f"{text[:50]}..."
    print(f"Text: '{preview}' -> {prediction}")

# --- Accuracy ---
# Fraction of hand-written samples whose predicted label matches the intended one.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Sample Predictions:
Text: 'I stand with Israel in their right to defend thems...' -> pro_israel
Text: 'The IDF is protecting their citizens...' -> pro_israel
Text: 'Israel has the right to exist as a Jewish state...' -> pro_israel
Text: 'Israel has the right to defend its borders....' -> pro_israel
Text: 'Hamas is a terrorist organization targeting civili...' -> pro_israel
Text: 'Supporting Israel is supporting democracy in the M...' -> pro_palestine
Text: 'The Iron Dome saves countless Israeli lives....' -> pro_palestine
Text: 'Criticism of Israel often masks antisemitism....' -> pro_palestine
Text: 'Israeli citizens live under constant rocket threat...' -> pro_israel
Text: 'The Jewish people have a historical right to this ...' -> pro_israel
Text: 'IDF operations aim to eliminate terrorist threats....' -> pro_israel
Text: 'Israel withdrew from Gaza, yet rockets still fly....' -> pro_israel
Text: 'The UN is biased against Israel in its resolutions...' -> pro_palestine
Text: 'Free Pal