## **Notebook for NLP Coursework: Patronising and Condescending Language Detection**

The final best model we ended up using can be seen in 'Roberta with augmented data' under the Data Augmentation section

# Main imports and code

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

In [None]:
# The below needs to be altered to wherever the datafiles are being kept
%cd ./drive/MyDrive/NLP
!pwd

/content/drive/MyDrive/NLP
/content/drive/MyDrive/NLP


In [None]:
# check which gpu we're using
!nvidia-smi

Fri Mar  4 08:38:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install simpletransformers
!pip install tensorboardx
!pip install sklearn

Collecting simpletransformers
  Downloading simpletransformers-0.63.4-py3-none-any.whl (248 kB)
[?25l[K     |█▎                              | 10 kB 27.4 MB/s eta 0:00:01[K     |██▋                             | 20 kB 9.9 MB/s eta 0:00:01[K     |████                            | 30 kB 8.5 MB/s eta 0:00:01[K     |█████▎                          | 40 kB 7.9 MB/s eta 0:00:01[K     |██████▋                         | 51 kB 4.2 MB/s eta 0:00:01[K     |████████                        | 61 kB 5.0 MB/s eta 0:00:01[K     |█████████▎                      | 71 kB 5.4 MB/s eta 0:00:01[K     |██████████▌                     | 81 kB 5.6 MB/s eta 0:00:01[K     |███████████▉                    | 92 kB 6.3 MB/s eta 0:00:01[K     |█████████████▏                  | 102 kB 5.0 MB/s eta 0:00:01[K     |██████████████▌                 | 112 kB 5.0 MB/s eta 0:00:01[K     |███████████████▉                | 122 kB 5.0 MB/s eta 0:00:01[K     |█████████████████▏              | 133 kB 5.0

Collecting tensorboardx
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 28.0 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 17.8 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 9.8 MB/s eta 0:00:01[K     |██████████▌                     | 40 kB 8.2 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 4.4 MB/s eta 0:00:01[K     |███████████████▊                | 61 kB 5.2 MB/s eta 0:00:01[K     |██████████████████▎             | 71 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████           | 81 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████▌        | 92 kB 5.9 MB/s eta 0:00:01[K     |██████████████████████████▏     | 102 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████▊   | 112 kB 5.1 MB/s eta 0:00:01[K     |███████████████████████████████▍| 122 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 5.1 MB/s 
Ins

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
import numpy as np
from collections import Counter
from ast import literal_eval
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.utils import shuffle

In [None]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [None]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
!pwd

/content/drive/MyDrive/NLP


# Fetch Don't Patronize Me! data manager module

In [None]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [None]:
from dont_patronize_me import DontPatronizeMe

In [None]:
dpm = DontPatronizeMe('.', 'task4_test.tsv')

In [None]:
dpm.load_task1()
# dpm.load_task2(return_one_hot=True)

# Load paragraph IDs

In [None]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [None]:
trids.head()

Unnamed: 0,par_id,label
0,4341,"[1, 0, 0, 1, 0, 0, 0]"
1,4136,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,"[1, 0, 0, 1, 1, 1, 0]"


In [None]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)



# Rebuild training set (Task 1)

In [None]:
rows_train = [] # will contain par_id, label and text
for idx in range(len(trids)):  
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
  rows_train.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

In [None]:
trdf1 = pd.DataFrame(rows_train)

In [None]:
print(len(trdf1[trdf1.label==1]))
print(len(trdf1[trdf1.label==0]))

794
7581


# Rebuild test set (Task 1)

In [None]:
rows_test = [] # will contain par_id, label and text
for idx in range(len(teids)):  
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
  rows_test.append({
      'par_id':parid,
      'text':text,
      'label':label
  })
  

# Decipher relation between sentence length and pat/non-pat
This is just splitting up the dev set into sentences larger than 55 and ones smaller than 40. It can be seen from the print statements that there's a ratio of 77:894 (pat:non-pat) for shorter sentences and a ratio of 80:512 (pat:non-pat) for longer sentences. So longer sentences have a higher chance of being patronising than shorter ones in this training set.

In [None]:
rows_test_short = [] 
rows_test_long = [] # will contain par_id, label and text
av_len = []
for idx in range(len(teids)):  
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  label = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].label.values[0]
  
#   print(text.split(' '))
  text_len = len(text.split(' '))
  av_len.append(text_len)
#   print(text_len)
 
  if text_len < 35:
    rows_test_short.append({
        'par_id':parid,
        'text':text,
        'label':label
    })

  if text_len > 60:
    rows_test_long.append({
        'par_id':parid,
        'text':text,
        'label':label
    })
# print(sum(av_len)/len(av_len))

rows_test_short = pd.DataFrame(rows_test_short)
rows_test_long = pd.DataFrame(rows_test_long)

In [None]:
print(len(rows_test_short[rows_test_short.label==1]))
print(len(rows_test_short[rows_test_short.label==0]))

print(len(rows_test_long[rows_test_long.label==1]))
print(len(rows_test_long[rows_test_long.label==0]))

54
686
67
414


In [None]:
len(rows_test)

2094

In [None]:
tedf1 = pd.DataFrame(rows_test)

In [None]:
print(len(trdf1[trdf1.label==1]))
print(len(trdf1[trdf1.label==0]))

print(len(tedf1[tedf1.label==1]))
print(len(tedf1[tedf1.label==0]))

794
7581
199
1895


# RoBERTa Baseline using RAW data

In [None]:
# Basline with raw data

training_set1 = shuffle(trdf1, random_state=0)

In [None]:
# downsample negative instances
# pat_samples = trdf1[trdf1.label==1]
# npos = len(pat_samples)

# training_set1 = pd.concat([pat_samples,trdf1[trdf1.label==0][:npos*2]])
# training_set1 = shuffle(training_set1, random_state=0)

In [None]:
training_set1

Unnamed: 0,par_id,text,label
2265,1656,The anti-immigrant Slovenian Democratic Party ...,0
3647,3169,German police on Sunday took two Afghan nation...,0
2817,2251,Kuria has called on the International Criminal...,0
196,2780,"""At such times , the principle of Christ would...",1
5362,5036,Three I cry a lot aki I cry even on air that '...,0
...,...,...,...
4373,3961,""""""" All administrative formalities , related t...",0
7891,7852,"POIPET , June 17- The number of Cambodians fle...",0
4859,4497,The most recent high-profile case saw an intel...,0
3264,2744,Reform actions considered include proposals on...,0


In [None]:
!pwd

/content/drive/MyDrive/NLP


In [None]:

task1_model_args_roberta = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model_roberta = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args_roberta, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model_roberta.train_model(training_set1[['text', 'label']])

# run predictions
preds_task1_roberta, _ = task1_model_roberta.predict(tedf1.text.tolist())


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/8375 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1047 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
real_labels = np.array(tedf1['label'])

In [None]:
print(len(real_labels[real_labels==1]))

199


In [None]:
print(len(preds_task1_roberta[preds_task1_roberta==1]))

0


In [None]:
def evaluation_metrics(true_labels, pred_labels):
    F1_score = f1_score(true_labels, pred_labels)
    precision_sc = precision_score(true_labels, pred_labels)
    conf_mat = confusion_matrix(true_labels, pred_labels)
    recall_sc = recall_score(true_labels, pred_labels)
    accuracy_sc = accuracy_score(true_labels, pred_labels)
    print(f"This is the positive f1 score: {F1_score}")
    print(f"This is the precison score: {precision_sc}")
    print(f"This is the recall score: {recall_sc}")
    print(f"This is the accuracy score: {accuracy_sc}")
    print(f"This is the confusion matrix:\n {conf_mat}")

In [None]:
evaluation_metrics(real_labels, preds_task1_roberta)

This is the positive f1 score: 0.0
This is the precison score: 0.0
This is the recall score: 0.0
This is the accuracy score: 0.9049665711556829
This is the confusion matrix:
 [[1895    0]
 [ 199    0]]


  _warn_prf(average, modifier, msg_start, len(result))


# RoBERTa using downsampled data

In [None]:
# downsample negative instances
pat_samples = trdf1[trdf1.label==1]
npos = len(pat_samples)
print(npos)
print(len(trdf1[trdf1.label==0]))

training_set1_down = pd.concat([pat_samples, trdf1[trdf1.label==0][:int(npos*3)]]) # got best results for *3 (0.53)
training_set1_down = shuffle(training_set1_down, random_state=0)

print(len(training_set1_down))

794
7581
3176


In [None]:
training_set1_down

Unnamed: 0,par_id,text,label
1093,328,"Earlier this year , a leaked confidential poli...",0
641,5835,"At least 400,000 hungry and traumatised refuge...",1
1554,848,"Yavatmal : In a forward push for women power ,...",0
575,1326,""""""" We 've seen in the past that Kenyans who a...",1
117,5432,"' A large-scale , publicly-funded energy effic...",1
...,...,...,...
763,8907,( Bloomberg ) -- First Lady Melania Trump soug...,1
835,45,"Currently , what 's left to the Palestinian pe...",0
1653,961,Ann 's Cottage has cared for disabled children...,0
2607,2028,""""""" This is the first advisory in some time , ...",0


In [None]:

task1_model_args_roberta_down = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model_roberta_down = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args_roberta_down, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model_roberta_down.train_model(training_set1_down[['text', 'label']])
# run predictions
preds_task1_roberta_down, _ = task1_model_roberta_down.predict(tedf1.text.tolist())


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/3176 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/397 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
real_labels = np.array(tedf1['label'])

In [None]:
print(len(real_labels[real_labels==1]))
print(len(real_labels[real_labels==0]))

199
1895


In [None]:
print(len(preds_task1_roberta_down[preds_task1_roberta_down==1]))
print(len(preds_task1_roberta_down[preds_task1_roberta_down==0]))

# Although these results may be similar, it doesn't mean they are mapping correctly. We want these numbers to be kinda similar ideallu, since then we know that there are x many 1s in the test
# data, and our prediction is also saying there are approx x amounts of 1s too.
# We seem to peak at 0.53 by downsampling alone.

307
1787


In [None]:
evaluation_metrics(real_labels, preds_task1_roberta_down)

This is the positive f1 score: 0.5098814229249012
This is the precison score: 0.4201954397394137
This is the recall score: 0.6482412060301508
This is the accuracy score: 0.8815663801337154
This is the confusion matrix:
 [[1717  178]
 [  70  129]]


# DeBERTa for Task 1

In [None]:
# downsample negative instances
pat_samples = trdf1[trdf1.label==1]
npos = len(pat_samples)
print(npos)
print(len(trdf1[trdf1.label==0][:npos]))
print(len(trdf1[trdf1.label==0]))

training_set1_down_deb = pd.concat([pat_samples, trdf1[trdf1.label==0][:int(npos*4)]]) # got 0.53 with *4. Started getting lower at *7
training_set1_down_deb = shuffle(training_set1_down_deb, random_state=0)

print(len(training_set1_down_deb))

794
794
7581
3970


In [None]:
training_set1_down_deb

Unnamed: 0,par_id,text,label
1694,1010,""""""" WHO welcomes and is very appreciative of t...",0
3285,2765,My duties at CAF have never conflicted or disa...,0
1867,1203,Government soldiers stand on guard in front of...,0
2161,1538,The government will continue paying millions o...,0
3430,2934,The combination of being molested by adult mal...,0
...,...,...,...
835,45,"Currently , what 's left to the Palestinian pe...",0
3264,2744,Reform actions considered include proposals on...,0
1653,961,Ann 's Cottage has cared for disabled children...,0
2607,2028,""""""" This is the first advisory in some time , ...",0


In [None]:

task1_model_args_deberta = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model_deberta = ClassificationModel("deberta", 
                                  'microsoft/deberta-base', 
                                  args = task1_model_args_deberta, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model_deberta.train_model(training_set1_down_deb[['text', 'label']])
# run predictions
preds_task1_deberta, _ = task1_model_deberta.predict(tedf1.text.tolist())


Downloading:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/3970 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/497 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of deberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
real_labels = np.array(tedf1['label'])

In [None]:
print(len(real_labels[real_labels==1]))

199


In [None]:
print(len(preds_task1_deberta[preds_task1_deberta==1]))

245


In [None]:
evaluation_metrics(real_labels, preds_task1_deberta)

This is the positive f1 score: 0.536036036036036
This is the precison score: 0.4857142857142857
This is the recall score: 0.5979899497487438
This is the accuracy score: 0.9016236867239733
This is the confusion matrix:
 [[1769  126]
 [  80  119]]


# **Upsample Data**
Duplicating the original data (minority class) to up the amount of training data

In [None]:
# upsample training data
pat_samples_copy = pat_samples.copy(deep=True)


pat_samples_big = pd.concat([pat_samples, pat_samples_copy])
npos = len(pat_samples_big)
print(npos)
print(len(trdf1[trdf1.label==0]))

training_set1_big = pd.concat([pat_samples_big, trdf1[trdf1.label==0][:int(3*npos)]]) # Again, best was 0.52 at 3*npos

print(len(training_set1_big))

training_set1_big_shuff = shuffle(training_set1_big, random_state=0)
# print(len(training_set1_big_shuff))


1588
7581
6352


## **roBERTa basline with upsampled data**

In [None]:
training_set1_big_shuff

Unnamed: 0,par_id,text,label
5534,5231,Brit bomber 's wife Shukee Begum on ' gangster...,0
4741,4371,She will highlight her government 's initiativ...,0
4063,3626,""""""" I 'm very happy with the assistance becaus...",0
1930,1272,I had travelled to Kent on a hopeless mission ...,0
292,2321,Rio 2016 produced many memorable moments . The...,1
...,...,...,...
4137,3710,The program depicted new refugee housing conta...,0
2470,1876,"I 've been in London since last May , and you ...",0
859,69,Developing countries shoulder the most signifi...,0
1813,1142,Italy 's new coalition government has set a go...,0


In [None]:

task1_model_args_roberta_up = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model_roberta_up = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args_roberta_up, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model_roberta_up.train_model(training_set1_big_shuff[['text', 'label']])
# run predictions
preds_task1_roberta_up, _ = task1_model_roberta_up.predict(tedf1.text.tolist())


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

  0%|          | 0/6352 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/794 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
real_labels = np.array(tedf1['label'])

In [None]:
print(len(real_labels[real_labels==1]))

199


In [None]:
print(len(preds_task1_roberta_up[preds_task1_roberta_up==1]))

238


In [None]:
evaluation_metrics(real_labels, preds_task1_roberta_up)

This is the positive f1 score: 0.5491990846681922
This is the precison score: 0.5042016806722689
This is the recall score: 0.6030150753768844
This is the accuracy score: 0.9059216809933143
This is the confusion matrix:
 [[1777  118]
 [  79  120]]


# **Data Augmentation**
Use symonymns to increase the minority class with some differenitation 



## Setting up functions and stopwords

In [None]:
#for the first time you use wordnet
import nltk
import random
random.seed(69)
nltk.download('wordnet')
from nltk.corpus import wordnet 

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
			'ours', 'ourselves', 'you', 'your', 'yours', 
			'yourself', 'yourselves', 'he', 'him', 'his', 
			'himself', 'she', 'her', 'hers', 'herself', 
			'it', 'its', 'itself', 'they', 'them', 'their', 
			'theirs', 'themselves', 'what', 'which', 'who', 
			'whom', 'this', 'that', 'these', 'those', 'am', 
			'is', 'are', 'was', 'were', 'be', 'been', 'being', 
			'have', 'has', 'had', 'having', 'do', 'does', 'did',
			'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
			'because', 'as', 'until', 'while', 'of', 'at', 
			'by', 'for', 'with', 'about', 'against', 'between',
			'into', 'through', 'during', 'before', 'after', 
			'above', 'below', 'to', 'from', 'up', 'down', 'in',
			'out', 'on', 'off', 'over', 'under', 'again', 
			'further', 'then', 'once', 'here', 'there', 'when', 
			'where', 'why', 'how', 'all', 'any', 'both', 'each', 
			'few', 'more', 'most', 'other', 'some', 'such', 'no', 
			'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
			'very', 's', 't', 'can', 'will', 'just', 'don', 
			'should', 'now', '']

In [None]:
def get_synonyms(word):
	synonyms = set()
	for syn in wordnet.synsets(word): 
		for l in syn.lemmas(): 
			synonym = l.name().replace("_", " ").replace("-", " ").lower()
			synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
			synonyms.add(synonym) 
	if word in synonyms:
		synonyms.remove(word)
	return list(synonyms)

In [None]:
def synonym_replacement(words, n):
	new_words = words.copy()
	random_word_list = list(set([word for word in words if word not in stop_words]))
	random.shuffle(random_word_list)
	num_replaced = 0
	for random_word in random_word_list:
		synonyms = get_synonyms(random_word)
		if len(synonyms) >= 1:
			synonym = random.choice(list(synonyms))
			new_words = [synonym if word == random_word else word for word in new_words]
			#print("replaced", random_word, "with", synonym)
			num_replaced += 1
		if num_replaced >= n: #only replace up to n words
			break

	#this is stupid but we need it, trust me
	sentence = ' '.join(new_words)
	new_words = sentence.split(' ')

	return new_words

In [None]:
#cleaning up text
import re
def get_only_chars(line):

    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line

In [None]:
# main data augmentation function
# final sentence in output is same as input

def eda(sentence, num_aug, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1):
	
	sentence = get_only_chars(sentence)
	words = sentence.split(' ')
	words = [word for word in words if word is not '']
	num_words = len(words)
	
	augmented_sentences = []
	num_new_per_technique = int(num_aug) 

	#sr
	if (alpha_sr > 0):
		n_sr = max(1, int(alpha_sr*num_words))
		for _ in range(num_new_per_technique):
			a_words = synonym_replacement(words, n_sr)
			augmented_sentences.append(' '.join(a_words))

	augmented_sentences = [get_only_chars(sentence) for sentence in augmented_sentences]
	shuffle(augmented_sentences)

	#trim so that we have the desired number of augmented sentences
	if num_aug >= 1:
		augmented_sentences = augmented_sentences[:num_aug]
	else:
		keep_prob = num_aug / len(augmented_sentences)
		augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

	#append the original sentence
	augmented_sentences.append(sentence)

	return augmented_sentences

In [None]:
def augment_data(data_df, num_aug):

    aug_df = pd.DataFrame(columns=['par_id', 'text', 'label'])
    # print(aug_df)

    for index, sample in data_df.iterrows():
        # print(f"sample: {sample}")
        text = sample['text']
        # print(text)
        aug_sentences = eda(text, num_aug)
        for sentence in aug_sentences:
            id = sample['par_id']
            label = sample['label']
            aug_df = aug_df.append({'par_id': id, 'text': sentence, 'label': label}, ignore_index=True)
            # print(aug_df)

    return aug_df

In [None]:
print(len(trdf1[trdf1.label==0]))
print(len(trdf1[trdf1.label==1]))

print(len(tedf1[tedf1.label==0]))
print(len(tedf1[tedf1.label==1]))

7581
794
1895
199


In [None]:
num_aug = 4

# Make an augements training dataset of the minority class
pat_samples = trdf1[trdf1.label==1]
print(len(pat_samples))
pat_samples_aug = augment_data(pat_samples, num_aug=num_aug)
npos_aug = len(pat_samples_aug)

# Make an augements training dataset of the majority class
not_pat_samples = trdf1[trdf1.label==0]
print(len(not_pat_samples))
not_pat_samples_aug = augment_data(not_pat_samples, num_aug=1)

# Join together
training_set_aug = pd.concat([pat_samples_aug, not_pat_samples_aug[:int(npos_aug*3)]])
# training_set_aug = shuffle(training_set_aug, random_state=0)


794
7581


In [None]:
print(npos_aug)
print(len(pat_samples_aug))
print(len(not_pat_samples_aug))
print(len(not_pat_samples_aug[:int(npos_aug*4)]))
print(len(training_set_aug))


3970
3970
15162
15162
15880


In [None]:
assert len(pat_samples_aug) == (num_aug + 1) * len(pat_samples)

print(training_set_aug[:10])

  par_id                                               text label
0   4341  the scheme figure an estimated children from p...     1
1   4341  the scheme saw an estimated children from mise...     1
2   4341  the outline understand an estimated children f...     1
3   4341  the scheme saw an estimated children from poor...     1
4   4341  the scheme saw an estimated children from poor...     1
5   4136  durban s homeless communities reconciliation d...     1
6   4136  durban s roofless communities reconciliation l...     1
7   4136  durban s homeless community of interests recon...     1
8   4136  durban s homeless communities rapprochement lunch     1
9   4136  durban s homeless communities reconciliation l...     1


## Roberta with augemented data

In [None]:

task1_model_args_roberta_aug = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model_roberta_aug = ClassificationModel("roberta", 
                                  'roberta-base', 
                                  args = task1_model_args_roberta_aug, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model_roberta_aug.train_model(training_set_aug[['text', 'label']])
# run predictions
preds_task1_roberta_aug, _ = task1_model_roberta_aug.predict(tedf1.text.tolist())


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/15880 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1985 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
real_labels = np.array(tedf1['label'])

In [None]:
print(len(real_labels[real_labels==1]))

199


In [None]:
print(len(preds_task1_roberta_aug[preds_task1_roberta_aug==1]))

155


In [None]:
evaluation_metrics(real_labels, preds_task1_roberta_aug)

This is the positive f1 score: 0.5423728813559321
This is the precison score: 0.6193548387096774
This is the recall score: 0.4824120603015075
This is the accuracy score: 0.9226361031518625
This is the confusion matrix:
 [[1836   59]
 [ 103   96]]


### Testing to see F1 scores for short and long sentences
Seemingly no. This is likely because the likelihood of being patronising or not was not that different between our long and short sentences (see section 'Decipher relation between sentence length and pat/non-pat')

In [None]:
# run predictions on Long
preds_task1_roberta_aug_long, _ = task1_model_roberta_aug.predict(rows_test_long.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/481 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

In [None]:
real_labels_long = np.array(rows_test_long['label'])

In [None]:
evaluation_metrics(real_labels_long, preds_task1_roberta_aug_long)

This is the positive f1 score: 0.43283582089552236
This is the precison score: 0.43283582089552236
This is the recall score: 0.43283582089552236
This is the accuracy score: 0.841995841995842
This is the confusion matrix:
 [[376  38]
 [ 38  29]]


In [None]:
# run predictions on short
preds_task1_roberta_aug_short, _ = task1_model_roberta_aug.predict(rows_test_short.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/740 [00:00<?, ?it/s]

  0%|          | 0/93 [00:00<?, ?it/s]

In [None]:
real_labels_short = np.array(rows_test_short['label'])

In [None]:
evaluation_metrics(real_labels_short, preds_task1_roberta_aug_short)

This is the positive f1 score: 0.5
This is the precison score: 0.45454545454545453
This is the recall score: 0.5555555555555556
This is the accuracy score: 0.918918918918919
This is the confusion matrix:
 [[650  36]
 [ 24  30]]


### Testing on highly patronising test data

In [None]:
print(len(tedf1[tedf1==1]))

2094


In [None]:

# find amount of patronising samples in dev set
pat_samples_even = tedf1[tedf1.label==1]
pat_len = len(pat_samples)
print(pat_len)


# collect same amount of test dataset of the majority class
not_pat_samples_even = tedf1[tedf1.label==0][:int(pat_len/6)]
print(len(not_pat_samples_even))


# Join together
pat_test_even = pd.concat([pat_samples_even, not_pat_samples_even])
print(len(pat_test_even))

794
132
331


In [None]:
preds_task1_roberta_aug_even, _ = task1_model_roberta_aug.predict(pat_test_even.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/331 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]

In [None]:
real_labels_even = np.array(pat_test_even['label'])

In [None]:
print(len(preds_task1_roberta_aug_even[preds_task1_roberta_aug_even==1]))
print(len(preds_task1_roberta_aug_even[preds_task1_roberta_aug_even==0]))

99
232


In [None]:
evaluation_metrics(real_labels_even, preds_task1_roberta_aug_even)

This is the positive f1 score: 0.5973154362416107
This is the precison score: 0.898989898989899
This is the recall score: 0.4472361809045226
This is the accuracy score: 0.6374622356495468
This is the confusion matrix:
 [[122  10]
 [110  89]]


## Deberta with augemented data

In [None]:

task1_model_args_deberta_aug = ClassificationArgs(num_train_epochs=1, 
                                      no_save=True, 
                                      no_cache=True, 
                                      overwrite_output_dir=True)
task1_model_deberta_aug = ClassificationModel("deberta", 
                                  'microsoft/deberta-base', 
                                  args = task1_model_args_deberta_aug, 
                                  num_labels=2, 
                                  use_cuda=cuda_available)
# train model
task1_model_deberta_aug.train_model(training_set_aug[['text', 'label']])
# run predictions
preds_task1_deberta_aug, _ = task1_model_deberta_aug.predict(tedf1.text.tolist())


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.d

  0%|          | 0/15880 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1985 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of deberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2094 [00:00<?, ?it/s]

  0%|          | 0/262 [00:00<?, ?it/s]

In [None]:
real_labels = np.array(tedf1['label'])

In [None]:
print(len(real_labels[real_labels==1]))

199


In [None]:
print(len(preds_task1_deberta_aug[preds_task1_deberta_aug==1]))

201


In [None]:
evaluation_metrics(real_labels, preds_task1_deberta_aug)

This is the positive f1 score: 0.555
This is the precison score: 0.5522388059701493
This is the recall score: 0.5577889447236181
This is the accuracy score: 0.9149952244508118
This is the confusion matrix:
 [[1805   90]
 [  88  111]]


# **Predicting for Actual test set**
This is to predict the labels for the samples in the official test set, and save these predicitions as 'task1.txt' in your working directory.

In [None]:
dpm.load_test()
test_df = dpm.test_set_df

In [None]:
# print(test_df.text.tolist())

In [None]:
# run predictions
preds_task1_roberta_aug_test, _ = task1_model_roberta_aug.predict(test_df.text.tolist())

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/3832 [00:00<?, ?it/s]

  0%|          | 0/479 [00:00<?, ?it/s]

In [None]:
print(preds_task1_roberta_aug_test)

[0 0 0 ... 0 0 0]


In [None]:
labels2file([[k] for k in preds_task1_roberta_aug_test], 'task1.txt')