In [1]:
# Data processing
import pandas as pd
import numpy as np

# %pip show tensorflow
# Modeling

import tensorflow as tf

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# Hugging Face Dataset
from datasets import Dataset

# Model performance evaluation
import evaluate

In [2]:
import os,random,math
TRAINING_DIR = "propaganda_dataset_v2"

# os.listdir() returns entries in arbitrary, filesystem-dependent order and
# also lists hidden entries (e.g. .DS_Store, .ipynb_checkpoints), so indexing
# its raw result can silently swap or mis-pick the train/test files.
# Keep only regular, non-hidden files and sort them for a stable order.
files = sorted(
    name
    for name in os.listdir(TRAINING_DIR)
    if not name.startswith(".") and os.path.isfile(os.path.join(TRAINING_DIR, name))
)
if len(files) != 2:
    raise ValueError(
        f"Expected exactly two data files in {TRAINING_DIR!r}, found: {files}"
    )

# Pick the training split by name when exactly one file name mentions "train";
# the remaining file is then the test split. Otherwise fall back to the
# positional convention (first file = train, second file = test).
# NOTE(review): assumes the training file has "train" in its name — confirm
# against the actual file names in the dataset directory.
train_matches = [name for name in files if "train" in name.lower()]
if len(train_matches) == 1:
    train_file = train_matches[0]
    test_file = next(name for name in files if name != train_file)
else:
    train_file, test_file = files

# Both files are read as tab-separated tables.
traindf = pd.read_csv(os.path.join(TRAINING_DIR, train_file), sep="\t")
testdf = pd.read_csv(os.path.join(TRAINING_DIR, test_file), sep="\t")

In [3]:
def convert_text(sentences):
    """Return the text enclosed by the ``<BOS>`` and ``<EOS>`` markers.

    The result is the text after the first ``<BOS>`` up to the next ``<BOS>``
    or ``<EOS>``, whichever comes first. Surrounding whitespace is kept as-is.

    Missing markers are tolerated symmetrically:
    - no ``<EOS>``: the span runs to the end of the text;
    - no ``<BOS>``: the span starts at the beginning of the text
      (previously this raised ``IndexError``).

    Args:
        sentences: A string, optionally containing ``<BOS>`` / ``<EOS>`` tags.

    Returns:
        The extracted span as a string.
    """
    pieces = sentences.split("<BOS>")
    # With no <BOS> the split yields a single piece: use it rather than
    # indexing past the end of the list.
    after_bos = pieces[1] if len(pieces) > 1 else pieces[0]
    return after_bos.split("<EOS>")[0]

# Replace every tagged sentence with the span returned by convert_text,
# for the test frame first and then the training frame.
for split_df in (testdf, traindf):
    split_df["tagged_in_context"] = split_df["tagged_in_context"].map(convert_text)

testdf.head(10)

Unnamed: 0,label,tagged_in_context
0,not_propaganda,according to a UN estimate.
1,causal_oversimplification,the country would not last long without an ou...
2,appeal_to_fear_prejudice,gets Earl Warren and Sen. Richard Russel to j...
3,not_propaganda,You
4,repetition,infidels
5,"name_calling,labeling","the ""gay lifestyle"
6,loaded_language,devastating communities
7,not_propaganda,Jacob
8,flag_waving,Iran’s long rap sheet of aggression against A...
9,doubt,"Now, the pope’s reply to my testimony was: “I..."


In [4]:
# Class distribution of the training labels (missing values counted too).
traindf.label.value_counts(dropna=False)

label
not_propaganda               1191
exaggeration,minimisation     164
causal_oversimplification     158
name_calling,labeling         157
loaded_language               154
appeal_to_fear_prejudice      151
flag_waving                   148
repetition                    147
doubt                         144
Name: count, dtype: int64

In [5]:
def convert_labels_multiclass(label):
    """Map a propaganda-technique label string to its integer class id.

    Args:
        label: One of the eight technique names listed in the table below.

    Returns:
        The class id (0-7), or ``None`` when the label is not in the table
        (for example ``"not_propaganda"``).
    """
    label_ids = {
        "flag_waving": 0,
        "appeal_to_fear_prejudice": 1,
        "causal_oversimplification": 2,
        "doubt": 3,
        "exaggeration,minimisation": 4,
        "loaded_language": 5,
        "name_calling,labeling": 6,
        "repetition": 7,
    }
    return label_ids.get(label)
     
# Drop the rows labelled "not_propaganda".
# .copy() gives each filtered frame its own data, so the column assignment
# below does not write into a slice of the unfiltered frame (this is what
# triggers pandas' SettingWithCopyWarning).
testdf = testdf[testdf.label != "not_propaganda"].copy()
traindf = traindf[traindf.label != "not_propaganda"].copy()

# Encode the technique names as integer class ids.
traindf["label"] = traindf["label"].map(convert_labels_multiclass)
testdf["label"] = testdf["label"].map(convert_labels_multiclass)

# convert_labels_multiclass returns None for a label it does not know, which
# .map() stores as a missing value. Fail here with a clear message instead of
# passing missing labels on to training.
for split_name, split_df in (("train", traindf), ("test", testdf)):
    if split_df["label"].isna().any():
        raise ValueError(
            f"Unmapped labels found in the {split_name} split; "
            "extend convert_labels_multiclass to cover them."
        )

# Renumber the rows from 0; the previous row index is kept as an "index" column.
traindf = traindf.reset_index()
testdf = testdf.reset_index()


In [6]:
# Display the training frame as it stands after the filtering and label encoding above.
traindf

Unnamed: 0,index,label,tagged_in_context
0,2,0,American people
1,5,5,annihilated
2,8,3,so-called evidence
3,10,6,hateful conduct
4,12,1,point to Iran’s positioning itself for more a...
...,...,...,...
1218,2403,7,Nazi
1219,2405,4,absolutely no place for anti-Semitism
1220,2406,0,Prosecutors Doing Mueller’s ‘Dirty Work Are A...
1221,2407,2,Neither the Democrat leadership nor the Democ...


In [7]:
# Class distribution of the training labels (missing values counted too).
traindf.label.value_counts(dropna=False)

label
4    164
2    158
6    157
5    154
1    151
0    148
7    147
3    144
Name: count, dtype: int64

In [8]:
# Wrap both pandas frames (train first, then test) as Hugging Face Datasets.
hg_train_data, hg_test_data = (
    Dataset.from_pandas(frame) for frame in (traindf, testdf)
)

# Peek at the first three test examples.
hg_test_data[:3]

{'index': [1, 2, 4],
 'label': [2, 1, 7],
 'tagged_in_context': [' the country would not last long without an outside high IQ elite to run the country ',
  ' gets Earl Warren and Sen. Richard Russel to join the Warren Commission by telling them that the assassination could lead to World War III ',
  ' infidels ']}

In [18]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_data(data, max_length=140):
    """Tokenize the ``tagged_in_context`` text of ``data`` to a fixed length.

    Args:
        data: A mapping with a ``"tagged_in_context"`` entry holding the text
            to tokenize.
        max_length: Target sequence length; longer inputs are truncated to it
            and shorter ones are padded up to it. Defaults to 140, the value
            that was previously hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the text.
    """
    return tokenizer(
        data["tagged_in_context"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )



In [19]:
# Tokenize both splits. batched=True hands tokenize_data a batch of rows per
# call instead of one row at a time, which avoids a Python-level tokenizer
# call per example. Every sequence is padded/truncated to the same fixed
# length, so the resulting encodings do not depend on how rows are batched.
dataset_train = hg_train_data.map(tokenize_data, batched=True)
dataset_test = hg_test_data.map(tokenize_data, batched=True)

Map:   0%|          | 0/1223 [00:00<?, ? examples/s]

Map:   0%|          | 0/279 [00:00<?, ? examples/s]

In [20]:
# Sequence classifier on top of the bert-base-cased checkpoint;
# num_labels=8 matches the eight class ids (0-7) used for the labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=8,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [21]:
# Training configuration, grouped by concern.
# NOTE(review): logging, saving and evaluation all use the 'epoch' strategy;
# the accompanying *_steps=100 values presumably have no effect in that mode —
# confirm against the TrainingArguments documentation before relying on them.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./sentiment_transfer_learning_transformer/",
    logging_dir='./sentiment_transfer_learning_transformer/logs',
    # Optimisation schedule.
    num_train_epochs=12,
    learning_rate=5e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    seed=42,
    # Log, evaluate and checkpoint once per epoch.
    logging_strategy='epoch',
    logging_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    save_strategy='epoch',
    save_steps=100,
    # Reload the best checkpoint once training finishes.
    load_best_model_at_end=True,
)



In [22]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with numpy instead of calling
    ``evaluate.load("accuracy")`` on every evaluation, so the metric is not
    reloaded each time this function runs.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            (example, class) and ``labels`` holds one integer class id per
            example.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    predictions = np.argmax(logits, axis=1)
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

In [23]:
# Early-stopping callback configured with a patience of 1.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# NOTE(review): eval_dataset is the test split, so early stopping and the
# best-checkpoint reload are both driven by test data — confirm this is
# intended, or hold out a validation split from the training data instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()



  0%|          | 0/14676 [00:00<?, ?it/s]

{'loss': 1.8394, 'learning_rate': 4.583333333333333e-06, 'epoch': 1.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.5490914583206177, 'eval_accuracy': 0.4336917562724014, 'eval_runtime': 7.9304, 'eval_samples_per_second': 35.181, 'eval_steps_per_second': 35.181, 'epoch': 1.0}
{'loss': 1.3242, 'learning_rate': 4.166666666666667e-06, 'epoch': 2.0}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 1.2762616872787476, 'eval_accuracy': 0.5483870967741935, 'eval_runtime': 8.4068, 'eval_samples_per_second': 33.187, 'eval_steps_per_second': 33.187, 'epoch': 2.0}
