# POLITICAL CLASSIFIER

## IMPORTS

Set `DRIVE` to `True` and set `your_folder` to the Google Drive mount point (for example `/content/drive`) if you want to run on Colab.

In [2]:
# Colab toggle: set DRIVE to True to mount Google Drive before running the notebook.
DRIVE = False
your_folder = ""  # change this to your folder (the Drive mount point, e.g. "/content/drive")
if DRIVE:
    # Fail early with a readable message instead of handing an empty mount
    # point to drive.mount.
    if not your_folder:
        raise ValueError(
            'Set your_folder to a mount point such as "/content/drive" before enabling DRIVE.'
        )
    # Imported only when requested -- presumably google.colab is unavailable
    # outside a Colab runtime (confirm).
    from google.colab import drive
    drive.mount(your_folder)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers -U
!pip install accelerate -U

In [3]:
import re
import os
import torch
import pandas as pd
import numpy as np
import json
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, XLMRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report



## DATA PROCESSING

In [5]:
# READING IN THE DATA
# Only posts from these two subreddits are used as labelled training data.
list_labels = ["r/Israel", "r/Palestine"]
train = pd.read_csv("RedditSubmissionsClean.csv").loc[
    lambda posts: posts["subreddit"].isin(list_labels)
]

In [6]:
# ENCODING LABELS
# Each subreddit gets the integer id of its position in `list_labels`
# (r/Israel -> 0, r/Palestine -> 1). Deriving the ids from the order in which
# subreddits happen to appear in the CSV would let a re-sorted file silently
# swap the classes.
map_dict = {subreddit: idx for idx, subreddit in enumerate(list_labels)}
inverse_map_dict = {idx: subreddit for subreddit, idx in map_dict.items()}

# Guarded so that re-running this cell does not try to encode the labels a
# second time (after the first run the "subreddit" column no longer exists).
if "subreddit" in train.columns:
    train = train.rename(columns={"subreddit": "label"})
    train["label"] = train["label"].map(map_dict)

In [7]:
# EXPLORING CLASS IMBALANCE
# Count posts per encoded label and show the result as a one-column table.
train.groupby("label")["post_id"].count().to_frame("Number of Posts")

Unnamed: 0_level_0,Number of Posts
label,Unnamed: 1_level_1
0,5960
1,1412


In [8]:
# BASIC PREPROCESSING: remove literal "\n" and "\t" sequences, double quotes (") and emails
def preprocess(x):
    """Return ``x`` with a fixed set of patterns stripped out.

    The patterns are removed one after another, in this order:
    the two-character sequence backslash + ``n``, the two-character sequence
    backslash + ``t``, every double quote, and anything shaped like an
    e-mail address. Real newline and tab characters are left untouched.
    """
    patterns_to_drop = (
        r"\\n",
        r"\\t",
        r'"',
        # r"https?:\/\/.*?[\s+]"  (URL removal -- disabled in the original code)
        r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})",  # Remove emails
    )
    for pattern in patterns_to_drop:
        x = re.sub(pattern, "", x)
    return x

In [9]:
# DATA FORMAT REQUIRED BY ROBERTA
class RedditDataset(torch.utils.data.Dataset):
    """Dataset that pairs per-example encodings with per-example labels.

    Indexing returns a dict holding one tensor per key of ``encodings`` plus a
    ``'labels'`` tensor built from the label at the same position.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of feature name -> indexable collection of examples.
        # labels: indexable collection with one entry per example.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# LOAD THE TOKENIZER
# AutoTokenizer is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [11]:
# PREPROCESS AND CHANGE FORMAT
# Clean every post, then expose texts and integer labels as plain Python lists.
train["text"] = train["text"].map(preprocess)
X_bert = train["text"].tolist()
y_bert = train["label"].astype(int).tolist()

In [12]:
# EVAL = True  -> hold out 20% of the data as a dev set.
# EVAL = False -> encode the whole dataset for training.
EVAL = False


def _encode(texts, labels):
    """Tokenize ``texts`` and wrap the encodings with ``labels`` for roberta."""
    encodings = tokenizer(texts, truncation=True, padding=True)
    return encodings, RedditDataset(encodings, labels)


if EVAL:
    # stratified random split
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_bert,
        y_bert,
        test_size=0.2,
        stratify=y_bert,
        random_state=42,
    )
    # Encode the split dataset for roberta
    train_encodings, train_dataset = _encode(X_train, y_train)
    dev_encodings, dev_dataset = _encode(X_dev, y_dev)
else:
    # Encode the entire dataset for roberta
    train_encodings, train_dataset = _encode(X_bert, y_bert)

## ROBERTA FINE-TUNING

In [13]:
# Set the TOKENIZERS_PARALLELISM environment variable to "false" before training.
# NOTE(review): presumably this silences the tokenizers fork/parallelism
# warning once the Trainer starts -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
# TRAINING, SET RETRAIN = True TO REPEAT THE TRAINING PROCESS
RETRAIN = False

# The model fit on the 80% split (EVAL) and the one fit on the full dataset
# are stored in separate directories.
model_dir = "roberta_eval" if EVAL else "roberta_pred"

if RETRAIN:
    training_args = TrainingArguments(
        output_dir='results_pred',
        num_train_epochs=5,              # training epochs
        per_device_train_batch_size=16,  # batch size training
        per_device_eval_batch_size=32,   # batch size evaluation
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='logs_pred',
        logging_steps=10,
    )

    # Not consumed below; kept so the name stays defined for anything else
    # in the notebook that may rely on it.
    device = torch.device(type='cuda') if torch.cuda.is_available() else torch.device(type='cpu')
    model = XLMRobertaForSequenceClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=2,
        ignore_mismatched_sizes=True,
    )

    def compute_metrics(predictions_in):
        """Return the macro-averaged F1 for one prediction pass.

        ``predictions_in`` must expose ``label_ids`` (gold labels) and
        ``predictions`` (per-class scores; the argmax over the last axis is
        taken as the predicted class).
        """
        labels = predictions_in.label_ids
        predictions = predictions_in.predictions.argmax(-1)
        # Gold labels go in y_true, model output in y_pred.
        macrof1 = f1_score(y_true=labels, y_pred=predictions, average='macro')
        return {'macro_f1': macrof1}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # Without an eval dataset compute_metrics can never be invoked.
        eval_dataset=dev_dataset if EVAL else None,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.save_model(model_dir)

    # Later cells use the name `roberta`; expose the freshly trained model
    # under it, and leave train mode so dropout is off at inference time.
    roberta = trainer.model
    roberta.eval()
else:
    roberta = AutoModelForSequenceClassification.from_pretrained(model_dir)

## EVALUATION

In [None]:
# Score the model on the held-out dev split (only exists when EVAL is True).
if EVAL:
    trainer = Trainer(model=roberta)
    preds2 = trainer.predict(dev_dataset)
    # Predicted class = index of the highest score; computed once and reused.
    dev_predictions = preds2.predictions.argmax(-1)
    # Gold labels are passed as y_true and predictions as y_pred, matching
    # the scikit-learn signatures.
    print(f"Accuracy_score: {accuracy_score(y_true=y_dev, y_pred=dev_predictions)}")
    print(f"F1-score: {f1_score(y_true=y_dev, y_pred=dev_predictions)}")
else:
    print("Set EVAL = True to evaluate the model.")

Accuracy_score: 0.8867796610169492
F1-score: 0.6969147005444647


## PREDICTION

In [15]:
# Read the submissions file again: `train` was reduced to the two labelled
# subreddits earlier, so the remaining posts have to come from the file.
test = pd.read_csv("RedditSubmissionsClean.csv")

In [16]:
# Keep only the posts from the subreddit we want to label.
is_target_subreddit = test["subreddit"] == "r/IsraelPalestine"
test = test.loc[is_target_subreddit]

In [None]:
# ENCODING IsraelPalestine
# Clean the posts the same way as the training data, then tokenize them into
# PyTorch tensors.
X_test = [preprocess(text) for text in test.text.tolist()]
test_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors="pt")

# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of failing on machines without CUDA.
input_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = test_encodings["input_ids"].to(input_device)
attention_mask = test_encodings["attention_mask"].to(input_device)

In [None]:
# Function to divide into batches
def chunks(xs, n):
    """Return a generator over consecutive slices of ``xs`` of length ``n``.

    ``n`` is raised to 1 when it is smaller than that. The final slice may be
    shorter than ``n``. An empty ``xs`` yields nothing.
    """
    step = max(1, n)
    starts = range(0, len(xs), step)
    return (xs[start:start + step] for start in starts)

In [None]:
# Dividing into batches to save RAM
# Both tensors are cut with the same batch size so that batch k of the ids
# lines up with batch k of the attention masks.
INFERENCE_BATCH_SIZE = 16
batches_ids = [*chunks(input_ids, INFERENCE_BATCH_SIZE)]
batches_attention = [*chunks(attention_mask, INFERENCE_BATCH_SIZE)]

In [None]:
# MAKING INFERENCE, ONLY WORKS IF EVAL = False, meaning that we are not evaluating the model
if not EVAL:
    # A model loaded from disk may sit on the CPU while the inputs were moved
    # to the GPU. Pick one device and put both the model and every batch on it.
    inference_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    roberta.to(inference_device)
    roberta.eval()  # make sure dropout is disabled while predicting

    final = []  # one model output (with .logits) per batch, in input order
    with torch.no_grad():
        for batch_ids, batch_mask in zip(batches_ids, batches_attention):
            outputs = roberta(
                input_ids=batch_ids.to(inference_device),
                attention_mask=batch_mask.to(inference_device),
            )
            final.append(outputs)
else:
    print("Set EVAL = False to make inference.")


In [None]:
# EXTRACTING A LABEL (0 israel or 1 palestine)
if EVAL == False:
    # One numpy array of logits per batch.
    out = [batch_output.logits.detach().cpu().numpy() for batch_output in final]
    # Flatten the batches; the predicted label is the index of the largest logit.
    preds = [np.argmax(row_logits) for batch_logits in out for row_logits in batch_logits]
else:
    print("Set EVAL = False to make inference.")

In [None]:
# SAVING ENTIRE DATASET
if EVAL == False:
    # New frame = the test posts plus one column holding the predicted label id.
    test_predicted = test.assign(predicted_label=preds)
    test_predicted.to_csv("IsraelPalestinePredicted.csv")
else:
    print("Set EVAL = False to make inference.")

In [None]:
# DATASET TO FACILITATE ANNOTATION
if EVAL == False:
    def _readable_label(subreddit_name):
        """Swap a leading "r/" for "Pro " (e.g. "r/Israel" -> "Pro Israel")."""
        return re.sub(r"^r\/", "Pro ", subreddit_name)

    # Keep only the text and the prediction, renumbering rows from 0.
    test_inspect = test_predicted[["text", "predicted_label"]].reset_index(drop=True)
    # Label id -> subreddit name -> readable label.
    subreddit_names = test_inspect["predicted_label"].map(inverse_map_dict)
    test_inspect["predicted_label"] = subreddit_names.apply(_readable_label)
    test_inspect.to_csv("IsraelPalestine_toread.csv")

Unnamed: 0,text,predicted_label
0,Either for a two-state solution or a one-state...,Pro Israel
1,Antizionists always claim that they're not aga...,Pro Israel
2,Reject the false dichotomy both states are the...,Pro Israel
3,Looking for a pro Palestinian to discuss the c...,Pro Palestine
4,I have never been a proponent of a one state s...,Pro Israel
...,...,...
6662,"So, long story short, I'm an Israeli, around 3...",Pro Israel
6663,The University protestors have crossed the lin...,Pro Israel
6664,"The short answer is ""Yes"" - for the very long ...",Pro Israel
6665,Today I spent over 100 USD on food and deliver...,Pro Palestine
