In [1]:
import pandas as pd
import numpy as np

# %pip show tensorflow
# Modeling

import tensorflow as tf

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

# # Hugging Face Dataset
from datasets import Dataset

# # Model performance evaluation
import evaluate

In [2]:
import os,random,math
TRAINING_DIR="propaganda_dataset_v2"
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the files[0]/files[1] assignment below repeatable across machines and
# runs. Hidden entries (e.g. .ipynb_checkpoints, .DS_Store) and directories are
# skipped so they can never be mistaken for a dataset file.
files=sorted(
    entry for entry in os.listdir(TRAINING_DIR)
    if not entry.startswith(".") and os.path.isfile(os.path.join(TRAINING_DIR,entry))
)
# Fail with a readable message instead of an IndexError further down.
if len(files)<2:
    raise FileNotFoundError(
        f"Expected two dataset files in {TRAINING_DIR!r}, found {len(files)}: {files}"
    )
#since we have just two files we dont need loops
# NOTE(review): assumes the training file sorts before the test file by name - confirm
# against the directory contents.
traindf= pd.read_csv(os.path.join(TRAINING_DIR,files[0]),sep = '\t')
testdf= pd.read_csv(os.path.join(TRAINING_DIR,files[1]),sep = '\t')




In [3]:
def convert_text(sentences):
    """Strip the ``<BOS>`` and ``<EOS>`` marker tokens from a string.

    Every occurrence of ``<BOS>`` is removed first, then every occurrence of
    ``<EOS>``. All other characters, including whitespace that surrounded a
    marker, are left exactly as they were.
    """
    cleaned = sentences
    for marker in ("<BOS>", "<EOS>"):
        cleaned = cleaned.replace(marker, "")
    return cleaned


In [4]:
# Remove the <BOS>/<EOS> markers from the text column of both splits.
for frame in (testdf, traindf):
    frame["tagged_in_context"] = frame["tagged_in_context"].map(convert_text)

In [5]:
# Work on independent copies. Plain assignment would only create a second name
# for the same DataFrame, so any in-place change made to the two-class frames
# would also modify traindf/testdf.
two_class_traindf=traindf.copy()
two_class_testdf=testdf.copy()

two_class_traindf


Unnamed: 0,label,tagged_in_context
0,not_propaganda,"No, he will not be confirmed."
1,not_propaganda,This declassification effort won’t make thing...
2,flag_waving,The Obama administration misled the American ...
3,not_propaganda,“It looks like we’re capturing the demise of t...
4,not_propaganda,"Location: Westerville, Ohio"
...,...,...
2409,not_propaganda,We support and appreciate your business.”
2410,not_propaganda,International Atomic Energy Agency (IAEA) Dire...
2411,not_propaganda,What has been done: there has been work on for...
2412,not_propaganda,This is the law of gradualness not the gradua...


In [6]:
def convert_labels(label):
    """Collapse a label into a binary class id.

    Returns 1 when ``label`` equals "not_propaganda" and 0 for every other value.
    """
    if label == "not_propaganda":
        return 1
    return 0
# Build the binary-label frames from the source frames instead of overwriting
# the "label" column in place. Overwriting made this cell non-idempotent: a
# second run would feed the already-encoded integers back into convert_labels,
# which maps anything other than "not_propaganda" to 0. assign() returns a new
# frame and leaves traindf/testdf untouched, so re-running is safe.
two_class_testdf = testdf.assign(label=testdf["label"].map(convert_labels))
two_class_traindf = traindf.assign(label=traindf["label"].map(convert_labels))
two_class_traindf

Unnamed: 0,label,tagged_in_context
0,1,"No, he will not be confirmed."
1,1,This declassification effort won’t make thing...
2,0,The Obama administration misled the American ...
3,1,“It looks like we’re capturing the demise of t...
4,1,"Location: Westerville, Ohio"
...,...,...
2409,1,We support and appreciate your business.”
2410,1,International Atomic Energy Agency (IAEA) Dire...
2411,1,What has been done: there has been work on for...
2412,1,This is the law of gradualness not the gradua...


In [7]:
# Wrap the train and test pandas frames as Hugging Face Dataset objects.
hg_train_data, hg_test_data = (
    Dataset.from_pandas(frame) for frame in (two_class_traindf, two_class_testdf)
)

In [8]:
# Show the first three training examples.
hg_train_data[:3]

{'label': [1, 1, 0],
 'tagged_in_context': ['No,  he  will not be confirmed. ',
  'This declassification effort  won’t make things any worse than they are for President Trump.  ',
  'The Obama administration misled the  American people  and Congress because they were desperate to get a deal with Iran, said Sen. ']}

In [9]:
# Load the tokenizer that belongs to the pretrained checkpoint.
checkpoint_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

def tokenize_data(data):
    """Tokenize the ``tagged_in_context`` field of ``data``.

    The module-level ``tokenizer`` is called with truncation enabled,
    ``max_length=140`` and ``padding="max_length"``; its result is returned
    unchanged.
    """
    encoding_options = {
        "max_length": 140,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(data["tagged_in_context"], **encoding_options)
# Tokenize the dataset
# batched=True passes a batch of rows to tokenize_data per call instead of a
# single row, so the tokenizer receives a list of texts at once. The columns
# added to the dataset are the same; only the number of tokenizer calls drops.
dataset_train = hg_train_data.map(tokenize_data, batched=True)
dataset_test = hg_test_data.map(tokenize_data, batched=True)


Map:   0%|          | 0/2414 [00:00<?, ? examples/s]

Map:   0%|          | 0/580 [00:00<?, ? examples/s]

In [10]:
# Sequence-classification model built on the pretrained checkpoint, configured
# with two output labels.
num_classes = 2
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=num_classes,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [11]:
# Settings for the fine-tuning run, collected in one mapping and handed to
# TrainingArguments in a single call.
training_options = dict(
    # Where checkpoints and logs are written.
    output_dir="./sentiment_transfer_learning_transformer/",
    logging_dir='./sentiment_transfer_learning_transformer/logs',
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=5e-6,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    seed=42,
    # Logging, checkpointing and evaluation all use the 'epoch' strategy.
    # NOTE(review): the *_steps values below presumably have no effect while the
    # matching strategy is 'epoch' - confirm against the TrainingArguments docs.
    logging_strategy='epoch',
    logging_steps=100,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_options)

In [12]:
# Fetch the catalogue of evaluation modules once and reuse it for both the
# count and the listing, instead of requesting it twice.
evaluation_modules = evaluate.list_evaluation_modules()

# Number of evaluation modules
print(f'There are {len(evaluation_modules)} evaluation models in Hugging Face.\n')

# List all evaluation metrics
evaluation_modules

There are 158 evaluation models in Hugging Face.



['lvwerra/test',
 'precision',
 'code_eval',
 'roc_auc',
 'cuad',
 'xnli',
 'rouge',
 'pearsonr',
 'mse',
 'super_glue',
 'comet',
 'cer',
 'sacrebleu',
 'mahalanobis',
 'wer',
 'competition_math',
 'f1',
 'recall',
 'coval',
 'mauve',
 'xtreme_s',
 'bleurt',
 'ter',
 'accuracy',
 'exact_match',
 'indic_glue',
 'spearmanr',
 'mae',
 'squad',
 'chrf',
 'glue',
 'perplexity',
 'mean_iou',
 'squad_v2',
 'meteor',
 'bleu',
 'wiki_split',
 'sari',
 'frugalscore',
 'google_bleu',
 'bertscore',
 'matthews_correlation',
 'seqeval',
 'trec_eval',
 'rl_reliability',
 'jordyvl/ece',
 'angelina-wang/directional_bias_amplification',
 'cpllab/syntaxgym',
 'lvwerra/bary_score',
 'kaggle/amex',
 'kaggle/ai4code',
 'hack/test_metric',
 'yzha/ctc_eval',
 'codeparrot/apps_metric',
 'mfumanelli/geometric_mean',
 'daiyizheng/valid',
 'poseval',
 'erntkn/dice_coefficient',
 'mgfrantz/roc_auc_macro',
 'Vlasta/pr_auc',
 'gorkaartola/metric_for_tp_fp_samples',
 'idsedykh/metric',
 'idsedykh/codebleu2',
 'idsed

In [13]:
# Function to compute the metric
_accuracy_metric = None  # cached metric object, created on the first call


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``. ``logits`` holds one row of
            class scores per example; ``labels`` holds the reference class ids.

    Returns:
        Whatever the "accuracy" metric's ``compute`` returns for the argmax
        predictions and the reference labels.
    """
    global _accuracy_metric
    # Load the metric only once. Loading inside every call repeated the
    # lookup on each evaluation pass for no benefit.
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predictions = np.argmax(logits, axis=1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [14]:
# Early-stopping callback configured with a patience of one.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# NOTE(review): dataset_test is used here as the evaluation set during training
# (with load_best_model_at_end=True and early stopping) and is also the set the
# final metrics are reported on - consider holding out a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    callbacks=[early_stopping],
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: tagged_in_context. If tagged_in_context are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2414
  Num Epochs = 5
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 1210
  Number of trainable parameters = 108311810


  0%|          | 0/1210 [00:00<?, ?it/s]

In [None]:
# Run the trained model over the test set.
y_test_predict = trainer.predict(test_dataset=dataset_test)

# Keep the raw model outputs; they are turned into probabilities afterwards.
y_test_logits = y_test_predict.predictions

# (The first 5 predicted probabilities are displayed in the next cell.)


  0%|          | 0/580 [00:00<?, ?it/s]

In [None]:
# Convert the logits to per-class probabilities along the last axis.
y_test_probabilities = tf.nn.softmax(y_test_logits, axis=-1)
# First 5 predicted probabilities
y_test_probabilities[:5]

<tf.Tensor: shape=(5, 2), dtype=float32, numpy=
array([[0.00254825, 0.9974517 ],
       [0.96868247, 0.0313175 ],
       [0.9934496 , 0.00655044],
       [0.00200245, 0.99799746],
       [0.9967861 , 0.00321387]], dtype=float32)>

In [None]:
# Predicted class = index of the largest probability in each row.
y_test_pred_labels = np.argmax(y_test_probabilities, axis=-1)

In [None]:
# Reference labels carried on the prediction result; compared against the
# predicted labels when F1 is computed.
y_test_actual_labels = y_test_predict.label_ids

In [None]:
# Show the first five reference labels.
y_test_actual_labels[:5]

array([1, 0, 0, 1, 0], dtype=int64)

In [None]:
# Evaluate the trained model on the test set and display the metrics it reports.
trainer.evaluate(eval_dataset=dataset_test)

  0%|          | 0/580 [00:00<?, ?it/s]

{'eval_loss': 1.2690449953079224,
 'eval_accuracy': 0.7517241379310344,
 'eval_runtime': 22.4253,
 'eval_samples_per_second': 25.864,
 'eval_steps_per_second': 25.864,
 'epoch': 2.0}

In [None]:
# Load the F1 metric.
metric_f1 = evaluate.load("f1")

# Score the predicted labels against the reference labels.
# NOTE(review): label 1 encodes "not_propaganda" (see convert_labels). If F1 is
# computed for class 1 by default, this score describes the not_propaganda
# class - confirm which class should be treated as positive.
metric_f1.compute(
    references=y_test_actual_labels,
    predictions=y_test_pred_labels,
)

{'f1': 0.7575757575757576}

In [None]:
# Directory that receives both the tokenizer files and the model.
save_dir = './sentiment_transfer_learning_transformer/'

# Save the tokenizer to disk.
tokenizer.save_pretrained(save_dir)

# Save the model to disk.
trainer.save_model(save_dir)