In [90]:
#!pip install checklist

In [91]:
#!pip install simpletransformers

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import checklist
import pandas as pd
import spacy
from checklist.editor import Editor
from checklist.perturb import Perturb
from simpletransformers.classification import ClassificationModel,ClassificationArgs
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,average_precision_score
import warnings
warnings.filterwarnings("ignore")
from checklist.test_types import MFT, INV, DIR

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
data = pd.read_csv('/content/data/olid-subset-diagnostic-tests.csv')
train_data = pd.read_csv('/content/data/olid-train.csv')

data = data[['text','labels']]
train_data = train_data[['text','labels']]
data.head(3)

Unnamed: 0,text,labels
0,@USER @USER Who the hell does he think he is?,1
1,#BREAKING. #Greece: Molotov cocktails fly afte...,1
2,"#OrrinHatch I can’t believe this sexist , clue...",1


In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
perturbed_data = list(nlp.pipe(data['text']))

In [7]:
perturbed_data[:3]

[@USER @USER Who the hell does he think he is?,
 #BREAKING. #Greece: Molotov cocktails fly after protest honouring killed antifa arti... URL via @USER URL,
 #OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!]

In [8]:
np.random.seed(42)
ret_data_typo2 = Perturb.perturb(data['text'], Perturb.add_typos, typos = 2)
ret_data_typo4 = Perturb.perturb(data['text'], Perturb.add_typos, typos = 4)
ret_data_typo6 = Perturb.perturb(data['text'], Perturb.add_typos, typos = 6)
ret_data_typo8 = Perturb.perturb(data['text'], Perturb.add_typos, typos = 8)
ret_data_typo10 = Perturb.perturb(data['text'], Perturb.add_typos, typos = 10)

ret_data_typo2.data[:3][1]

['#BREAKING. #Greece: Molotov cocktails fly after protest honouring killed antifa arti... URL via @USER URL',
 '#BREAKING. #Greece: Molotov cocktails fly after protest honouring kille dantifa arti... URL via @USER URL']

In [9]:
def make_data_list(data):
  typo_data = []
  for i in range(len(data)):
    typo_data.append(data[i][1]) # tweet with typos added
  
  return typo_data

In [10]:
typo_data2 = make_data_list(ret_data_typo2.data)
typo_data4 = make_data_list(ret_data_typo4.data)
typo_data6 = make_data_list(ret_data_typo6.data)
typo_data8 = make_data_list(ret_data_typo8.data)
typo_data10 = make_data_list(ret_data_typo10.data)

In [11]:
print(typo_data2[0])
print(typo_data10[0])

@USER @USER Who the hell doe she thinkh e is?
U@SER U@SE RhWo th eehl dloes he think h eis?


In [12]:
data_typo2 = pd.DataFrame({'text':typo_data2,'label':data['labels']})
data_typo4 = pd.DataFrame({'text':typo_data4,'label':data['labels']})
data_typo6 = pd.DataFrame({'text':typo_data6,'label':data['labels']})
data_typo8 = pd.DataFrame({'text':typo_data8,'label':data['labels']})
data_typo10 = pd.DataFrame({'text':typo_data10,'label':data['labels']})

data_typo2.head()

Unnamed: 0,text,label
0,@USER @USER Who the hell doe she thinkh e is?,1
1,#BREAKING. #Greece: Molotov cocktails fly afte...,1
2,"#OrrinHatch I can’t eblieve this sexist , clue...",1
3,@USER @USER I'll use that one the next time im...,1
4,0-1 lost my acca o nth efirst fucking fight cba,1


In [13]:
model_args = ClassificationArgs()
model_args.num_train_epochs = 1
model_args.overwrite_output_dir= True
model_args.output_dir = '/content/drive/MyDrive/NLP/error_analyse_model'


model = ClassificationModel("bert", "/content/drive/MyDrive/NLP/error_analyse_model")

In [14]:
#model.train_model(train_data)

In [15]:
predictions, raw_output = model.predict(list(data['text']))
predictions2, raw_outputs2 = model.predict(list(data_typo2['text']))
predictions4, raw_outputs4 = model.predict(list(data_typo4['text']))
predictions6, raw_outputs6 = model.predict(list(data_typo6['text']))
predictions8, raw_outputs8 = model.predict(list(data_typo8['text']))
predictions10, raw_outputs10 = model.predict(list(data_typo10['text']))

pred, out = model.predict(list(data['text']))

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [16]:
predicted_typo_labels2 = pd.DataFrame({'text':data_typo2.text, 'labels': predictions2})
predicted_typo_labels2.head()

Unnamed: 0,text,labels
0,@USER @USER Who the hell doe she thinkh e is?,1
1,#BREAKING. #Greece: Molotov cocktails fly afte...,1
2,"#OrrinHatch I can’t eblieve this sexist , clue...",1
3,@USER @USER I'll use that one the next time im...,1
4,0-1 lost my acca o nth efirst fucking fight cba,1


In [17]:
predicted_typo_labels10 = pd.DataFrame({'text':data_typo10.text, 'labels': predictions10})
predicted_typo_labels10.head()

Unnamed: 0,text,labels
0,U@SER U@SE RhWo th eehl dloes he think h eis?,0
1,#BREAKING. #Greece :oMlotov cocktails fly afte...,0
2,"#OrrinHatch I can’t believe this sexist , clue...",1
3,@USER @USERI 'll uset hat one the next tim eim...,0
4,0-1l ost m ycaac o nhet first fucikgn fight cba,0


In [18]:
result2, model_outputs2, wrong_predictions2 = model.eval_model(data_typo2, report_typo2 = classification_report)
result4, model_outputs4, wrong_predictions4 = model.eval_model(data_typo4, report_typo4 = classification_report)
result6, model_outputs6, wrong_predictions6 = model.eval_model(data_typo6, report_typo6 = classification_report)
result8, model_outputs8, wrong_predictions8 = model.eval_model(data_typo8, report_typo8 = classification_report)
result10, model_outputs10, wrong_predictions10 = model.eval_model(data_typo10, report_typo10 = classification_report)

result_d, model_outputs_d, wrong_predictions_d = model.eval_model(data, report_normaldata = classification_report)

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

In [19]:
print("Original data prediction")
print(result_d['report_normaldata'])
print("\n")
print("With 2 typos")
print(result2['report_typo2'])
print("\n")
print("With 4 typos")
print(result4['report_typo4'])
print("\n")
print("With 6 typos")
print(result6['report_typo6'])
print("\n")
print("With 8 typos")
print(result8['report_typo8'])
print("\n")
print("With 10 typos")
print(result10['report_typo10'])


Original data prediction
              precision    recall  f1-score   support

         0.0       0.79      0.88      0.83        50
         1.0       0.86      0.76      0.81        50

    accuracy                           0.82       100
   macro avg       0.82      0.82      0.82       100
weighted avg       0.82      0.82      0.82       100



With 2 typos
              precision    recall  f1-score   support

         0.0       0.80      0.88      0.84        50
         1.0       0.87      0.78      0.82        50

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100



With 4 typos
              precision    recall  f1-score   support

         0.0       0.74      0.92      0.82        50
         1.0       0.89      0.68      0.77        50

    accuracy                           0.80       100
   macro avg       0.82      0.80      0.80       100
weighted avg       

In [20]:
print(model_outputs10[:5])

[[ 0.28710938 -1.25585938]
 [ 0.14892578 -0.42089844]
 [-0.56787109  1.50585938]
 [ 0.06188965 -0.23583984]
 [ 0.1583252  -0.95361328]]


In [21]:
print("Confusion Matrix (2 typos): ")
print(confusion_matrix(data['labels'], predictions2))
print("\n")
print("Confusion Matrix (4 typos): ")
print(confusion_matrix(data['labels'], predictions4))
print("\n")
print("Confusion Matrix (6 typos): ")
print(confusion_matrix(data['labels'], predictions6))
print("\n")
print("Confusion Matrix (8 typos): ")
print(confusion_matrix(data['labels'], predictions8))
print("\n")
print("Confusion Matrix (10 typos): ")
print(confusion_matrix(data['labels'], predictions10))
print("\n")

Confusion Matrix (2 typos): 
[[44  6]
 [11 39]]


Confusion Matrix (4 typos): 
[[46  4]
 [16 34]]


Confusion Matrix (6 typos): 
[[46  4]
 [19 31]]


Confusion Matrix (8 typos): 
[[45  5]
 [18 32]]


Confusion Matrix (10 typos): 
[[41  9]
 [23 27]]




### Provide 3 examples when the model failed to assign the correct label after perturbation.

In [22]:
print(data['labels'][:10])

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: labels, dtype: int64


In [23]:
print(predicted_typo_labels10['labels'][:10])

0    0
1    0
2    1
3    0
4    0
5    1
6    0
7    1
8    0
9    1
Name: labels, dtype: int64


In [24]:
failed_texts=[]
failed_labels=[]
for i in range(len(data)):
  if predicted_typo_labels10['labels'][i] != data['labels'][i]:
    failed_texts.append(predicted_typo_labels10['text'][i])
    failed_labels.append(predicted_typo_labels10['labels'][i])

In [25]:
failed_examples = pd.DataFrame({'text':failed_texts, 'labels': failed_labels})
failed_examples

Unnamed: 0,text,labels
0,U@SER U@SE RhWo th eehl dloes he think h eis?,0
1,#BREAKING. #Greece :oMlotov cocktails fly afte...,0
2,@USER @USERI 'll uset hat one the next tim eim...,0
3,0-1l ost m ycaac o nhet first fucikgn fight cba,0
4,#hCristian #America – If we go by #Trump’s exa...,0
5,#Democrats #Liberals you are being t#hreaetned...,0
6,(crt1ikal voice) emsf m ayss chekes,0
7,t*ge sall teh ibthce*s,0
8,And apparently I'm committed to gonig to a enw...,0
9,#Nigeri a#Naija #9ja 'You areth e msot incopme...,0


In [26]:
failed_examples['text'].iloc[30]

'U@SRE He is obivously getting suspendde. He isn ot an asste for anoyne'

In [27]:
success_df = pd.DataFrame({'typo_text':predicted_typo_labels10['text'], 'predicted_typolabels': predicted_typo_labels10['labels'],
                           'original_text':data['text'],'original_labels':data['labels'],
                           'predicted_label':predictions})
success_df.head()

Unnamed: 0,typo_text,predicted_typolabels,original_text,original_labels,predicted_label
0,U@SER U@SE RhWo th eehl dloes he think h eis?,0,@USER @USER Who the hell does he think he is?,1,1
1,#BREAKING. #Greece :oMlotov cocktails fly afte...,0,#BREAKING. #Greece: Molotov cocktails fly afte...,1,1
2,"#OrrinHatch I can’t believe this sexist , clue...",1,"#OrrinHatch I can’t believe this sexist , clue...",1,1
3,@USER @USERI 'll uset hat one the next tim eim...,0,@USER @USER I'll use that one the next time im...,1,0
4,0-1l ost m ycaac o nhet first fucikgn fight cba,0,0-1 lost my acca on the first fucking fight cba,1,1


In [28]:
filtered_df = success_df[((success_df['original_labels'] == 0) & (success_df['predicted_label'] == 0)) | ((success_df['original_labels'] == 1) & (success_df['predicted_label'] == 1))]
filtered_df.head()

Unnamed: 0,typo_text,predicted_typolabels,original_text,original_labels,predicted_label
0,U@SER U@SE RhWo th eehl dloes he think h eis?,0,@USER @USER Who the hell does he think he is?,1,1
1,#BREAKING. #Greece :oMlotov cocktails fly afte...,0,#BREAKING. #Greece: Molotov cocktails fly afte...,1,1
2,"#OrrinHatch I can’t believe this sexist , clue...",1,"#OrrinHatch I can’t believe this sexist , clue...",1,1
4,0-1l ost m ycaac o nhet first fucikgn fight cba,0,0-1 lost my acca on the first fucking fight cba,1,1
5,#Bakersifeld is why we nee udgn cotnrol! Sorry...,1,#Bakersfield is why we need gun control! Sorry...,1,1


In [29]:
filtered_df[((filtered_df['predicted_typolabels'] == 0) & (filtered_df['predicted_label'] == 1)) | ((filtered_df['predicted_typolabels'] == 1) & (filtered_df['predicted_label'] == 0))]

Unnamed: 0,typo_text,predicted_typolabels,original_text,original_labels,predicted_label
0,U@SER U@SE RhWo th eehl dloes he think h eis?,0,@USER @USER Who the hell does he think he is?,1,1
1,#BREAKING. #Greece :oMlotov cocktails fly afte...,0,#BREAKING. #Greece: Molotov cocktails fly afte...,1,1
4,0-1l ost m ycaac o nhet first fucikgn fight cba,0,0-1 lost my acca on the first fucking fight cba,1,1
11,(crt1ikal voice) emsf m ayss chekes,0,(cr1tikal voice) smef my ass cheeks,1,1
12,t*ge sall teh ibthce*s,0,*gets all the bitches*,1,1
19,And apparently I'm committed to gonig to a enw...,0,And apparently I'm committed to going to a new...,1,1
20,#Nigeri a#Naija #9ja 'You areth e msot incopme...,0,#Nigeria #Naija #9ja 'You are the most incompe...,1,1
28,#eFinstei nyou cants ay anythingt utrhful. Yo...,0,#Feinstein you cant say anything truthful. Yo...,1,1
29,#Kavanaugh Th emoment #BalseyFord is exposed a...,0,#Kavanaugh The moment #BlaseyFord is exposed a...,1,1
33,@USER Ho wis seh hdini gher ugyl presonality. ...,0,@USER How is she hiding her ugly personality. ...,1,1


In [31]:
len(filtered_df[((filtered_df['predicted_typolabels'] == 0) & (filtered_df['predicted_label'] == 1)) | ((filtered_df['predicted_typolabels'] == 1) & (filtered_df['predicted_label'] == 0))])

17

# 6. Negation *(4.5 points)* 

In [32]:
perturbed_data = list(nlp.pipe(data['text']))

In [33]:
org_predictions, org_raw_output = model.predict(list(data['text']))

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [35]:
ret = Perturb.perturb(perturbed_data, Perturb.add_negation) # negated data
ret.data[:5]

[['@USER @USER Who the hell does he think he is?',
  "@USER @USER Who the hell doesn't he think he is?"],
 ['#BREAKING. #Greece: Molotov cocktails fly after protest honouring killed antifa arti... URL via @USER URL',
  "#BREAKING. #Greece: Molotov cocktails don't fly after protest honouring killed antifa arti... URL via @USER URL"],
 ['#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!',
  '#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is not spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!'],
 ["@USER @USER I'll use that one the next time im in a gun control debate or in a debate about free speech or taxes.

In [36]:
#ret.data[40:60]

In [37]:
negated_text = []
for i in range(len(ret.data)):
  negated_text.append(ret.data[i][1])

negated_text[:5]

["@USER @USER Who the hell doesn't he think he is?",
 "#BREAKING. #Greece: Molotov cocktails don't fly after protest honouring killed antifa arti... URL via @USER URL",
 '#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is not spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!',
 "@USER @USER I'll use that one the next time im in a gun control debate or in a debate about free speech or taxes. Yes you can't choose to be irresponsible or choose not to be. I argue responsible. Whats wrong with that? Don't justify murder by saying it was never alive or its my right.",
 "0-1 didn't lose my acca on the first fucking fight cba"]

In [38]:
neg_predictions, neg_raw_outputs = model.predict(negated_text)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [39]:
predicted_neg_labels = pd.DataFrame({'text':negated_text, 'labels': data['labels']})
predicted_neg_labels.head()

Unnamed: 0,text,labels
0,@USER @USER Who the hell doesn't he think he is?,1
1,#BREAKING. #Greece: Molotov cocktails don't fl...,1
2,"#OrrinHatch I can’t believe this sexist , clue...",1
3,@USER @USER I'll use that one the next time im...,1
4,0-1 didn't lose my acca on the first fucking f...,1


In [40]:
result, model_outputs, wrong_predictions = model.eval_model(predicted_neg_labels, neg_report = classification_report)

  0%|          | 0/100 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/13 [00:00<?, ?it/s]

In [41]:
print(result['neg_report'])

              precision    recall  f1-score   support

         0.0       0.80      0.94      0.86        50
         1.0       0.93      0.76      0.84        50

    accuracy                           0.85       100
   macro avg       0.86      0.85      0.85       100
weighted avg       0.86      0.85      0.85       100



In [42]:
print("Confusion Matrix : ")
print(confusion_matrix(data['labels'], neg_predictions))

Confusion Matrix : 
[[47  3]
 [12 38]]


### Check the first 10 negated messages. For which of these negated messages should the label be flipped, in your opinion?

In [43]:
list(predicted_neg_labels['text'])[:10]

["@USER @USER Who the hell doesn't he think he is?",
 "#BREAKING. #Greece: Molotov cocktails don't fly after protest honouring killed antifa arti... URL via @USER URL",
 '#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is not spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!',
 "@USER @USER I'll use that one the next time im in a gun control debate or in a debate about free speech or taxes. Yes you can't choose to be irresponsible or choose not to be. I argue responsible. Whats wrong with that? Don't justify murder by saying it was never alive or its my right.",
 "0-1 didn't lose my acca on the first fucking fight cba",
 '#Bakersfield is not why we need gun control! Sorry for the victims other than the gunman himself I hope he rots in hell!',
 "#Christian #America – If we go by #Trump’s example, where liberals support open

In [44]:
list(data['text'])[:10]

['@USER @USER Who the hell does he think he is?',
 '#BREAKING. #Greece: Molotov cocktails fly after protest honouring killed antifa arti... URL via @USER URL',
 '#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!',
 "@USER @USER I'll use that one the next time im in a gun control debate or in a debate about free speech or taxes. Yes you can choose to be irresponsible or choose not to be. I argue responsible. Whats wrong with that? Don't justify murder by saying it was never alive or its my right.",
 '0-1 lost my acca on the first fucking fight cba',
 '#Bakersfield is why we need gun control! Sorry for the victims other than the gunman himself I hope he rots in hell!',
 '#Christian #America – If we go by #Trump’s example, where liberals support open borders, I guess conserva

In [45]:
predicted_neg_labels_df = pd.DataFrame({'negated_text':negated_text, 'predicted_neglabels': neg_predictions,'original_text':data['text'],'original_labels':data['labels']})
predicted_neg_labels_df[(predicted_neg_labels_df['original_labels'] == 0) & (predicted_neg_labels_df['predicted_neglabels'] == 1)]

Unnamed: 0,negated_text,predicted_neglabels,original_text,original_labels
54,#StayOnYourPath #HealthQuote If you are wishin...,1,#StayOnYourPath #HealthQuote If you are wishin...,0
66,#WednesdayWisdom Antifa doesn't call the right...,1,#WednesdayWisdom Antifa calls the right fascis...,0
83,@USER I don't guess that’s where swamp ass ori...,1,@USER I guess that’s where swamp ass originated,0


In [46]:
predicted_neg_labels_df['negated_text'].iloc[83]

"@USER I don't guess that’s where swamp ass originated"

In [47]:
predicted_neg_labels_df[(predicted_neg_labels_df['original_labels'] == 1) & (predicted_neg_labels_df['predicted_neglabels'] == 0)]

Unnamed: 0,negated_text,predicted_neglabels,original_text,original_labels
3,@USER @USER I'll use that one the next time im...,0,@USER @USER I'll use that one the next time im...,1
6,#Christian #America – If we go by #Trump’s exa...,0,#Christian #America – If we go by #Trump’s exa...,1
8,#Democrats #Liberals you are being #threatened...,0,#Democrats #Liberals you are being #threatened...,1
17,"#Emmys Well, most of those ex-crackheads are ...",0,"#Emmys Well, most of those ex-crackheads are ...",1
23,#BoycottNike campaign must not continue and gr...,0,#BoycottNike campaign must continue and grow ...,1
24,#ANTIFA are not bedfellows with The Democratic...,0,#ANTIFA are bedfellows with The Democratic Soc...,1
26,@USER Antifa doesn't have TS level influence. ...,0,@USER Antifa has TS level influence. It's scary.,1
35,#Cuckservative Traitors Are not Worse Than For...,0,#Cuckservative Traitors Are Worse Than Fortnit...,1
42,#MSNBC #Hardball @USER Bingo you are not absol...,0,#MSNBC #Hardball @USER Bingo you are absolutel...,1
46,#FemiNazis are not Liberals so #MeToo only cou...,0,#FemiNazis are Liberals so #MeToo only counts ...,1


In [48]:
predicted_neg_labels_df['negated_text'].iloc[35]



In [49]:
failed_neg_texts=[]
failed_neg_labels=[]
for i in range(len(data)):
  if predicted_neg_labels_df['predicted_neglabels'][i] != data['labels'][i]:
    failed_neg_texts.append(predicted_neg_labels['text'][i])
    failed_neg_labels.append(predicted_neg_labels['labels'][i])

In [50]:
failed_neg_examples = pd.DataFrame({'text':failed_neg_texts, 'labels': failed_neg_labels})
failed_neg_examples

Unnamed: 0,text,labels
0,@USER @USER I'll use that one the next time im...,1
1,#Christian #America – If we go by #Trump’s exa...,1
2,#Democrats #Liberals you are being #threatened...,1
3,"#Emmys Well, most of those ex-crackheads are ...",1
4,#BoycottNike campaign must not continue and gr...,1
5,#ANTIFA are not bedfellows with The Democratic...,1
6,@USER Antifa doesn't have TS level influence. ...,1
7,#Cuckservative Traitors Are not Worse Than For...,1
8,#MSNBC #Hardball @USER Bingo you are not absol...,1
9,#FemiNazis are not Liberals so #MeToo only cou...,1


In [51]:
success_neg_texts=[]
success_neg_labels=[]
for i in range(len(data)):
  if predicted_neg_labels_df['predicted_neglabels'][i] == data['labels'][i]:
    success_neg_texts.append(predicted_neg_labels['text'][i])
    success_neg_labels.append(predicted_neg_labels['labels'][i])

In [52]:
successful_neg_examples = pd.DataFrame({'text':success_neg_texts, 'labels': success_neg_labels})
successful_neg_examples

Unnamed: 0,text,labels
0,@USER @USER Who the hell doesn't he think he is?,1
1,#BREAKING. #Greece: Molotov cocktails don't fl...,1
2,"#OrrinHatch I can’t believe this sexist , clue...",1
3,0-1 didn't lose my acca on the first fucking f...,1
4,#Bakersfield is not why we need gun control! S...,1
...,...,...
80,@USER He is obviously getting not suspended. H...,0
81,#Canada - EXCLUSIVE: #Trudeau #Liberals don't ...,0
82,@USER @USER ...than why didn't you show us how...,0
83,@USER @USER @USER You don't have yet to answer...,0


## 7. Creating negated examples

In [53]:
editor = Editor()

In [54]:
hate_mask = editor.template("I hate {mask}")
dont_hate_mask = editor.template("I don't hate {mask}")

hate_nationality = editor.template("I hate {nationality}")
dont_hate_nationality = editor.template("I don't hate {nationality}")

hate_religion = editor.template("I hate {religion}")
dont_hate_religion = editor.template("I don't hate {religion}")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

In [55]:
hate_mask = hate_mask.data
dont_hate_mask = dont_hate_mask.data

hate_nationality = hate_nationality.data
dont_hate_nationality = dont_hate_nationality.data

hate_religion = hate_religion.data
dont_hate_religion = dont_hate_religion.data

In [56]:
dont_hate_mask[10:15]

["I don't hate anybody",
 "I don't hate Muslims",
 "I don't hate Jews",
 "I don't hate gays",
 "I don't hate religion"]

In [57]:
hate_religion[:5]

['I hate Christianity',
 'I hate Judaism',
 'I hate Islam',
 'I hate Buddhism',
 'I hate Zoroastrianism']

In [58]:
hatemask_predictions, hatemask_outputs = model.predict(hate_mask)
donthatemask_predictions, donthatemask_outputs = model.predict(dont_hate_mask)

hatenati_predictions, hatenati_outputs = model.predict(hate_nationality)
donthatenati_predictions, donthatenati_outputs = model.predict(dont_hate_nationality)

hatereligion_predictions, hatereligion_outputs = model.predict(hate_religion)
donthatereligion_predictions, donthatereligion_outputs = model.predict(dont_hate_religion)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/215 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [59]:
hatemask_df = pd.DataFrame({'text':hate_mask,'predicted_label':hatemask_predictions})
hatemask_df.head()

Unnamed: 0,text,predicted_label
0,I hate it,1
1,I hate capitalism,1
2,I hate Trump,1
3,I hate racism,1
4,I hate them,1


In [60]:
donthatemask_df = pd.DataFrame({'text':dont_hate_mask,'predicted_label':donthatemask_predictions})
donthatemask_df.head()

Unnamed: 0,text,predicted_label
0,I don't hate them,1
1,I don't hate you,1
2,I don't hate him,1
3,I don't hate people,1
4,I don't hate her,1


In [61]:
hatenati_df = pd.DataFrame({'text':hate_nationality,'predicted_label':hatenati_predictions})
hatenati_df.head()

Unnamed: 0,text,predicted_label
0,I hate Chinese,1
1,I hate Indian,1
2,I hate American,1
3,I hate Indonesian,0
4,I hate Pakistani,1


In [62]:
donthatenati_df = pd.DataFrame({'text':dont_hate_nationality,'predicted_label':donthatenati_predictions})
donthatenati_df.head()

Unnamed: 0,text,predicted_label
0,I don't hate Chinese,1
1,I don't hate Indian,1
2,I don't hate American,1
3,I don't hate Indonesian,0
4,I don't hate Pakistani,1


In [63]:
hatereligion_df = pd.DataFrame({'text':hate_religion,'predicted_label':hatereligion_predictions})
hatereligion_df.head()

Unnamed: 0,text,predicted_label
0,I hate Christianity,1
1,I hate Judaism,0
2,I hate Islam,1
3,I hate Buddhism,1
4,I hate Zoroastrianism,1


In [64]:
donthatereligion_df = pd.DataFrame({'text':dont_hate_religion,'predicted_label':donthatereligion_predictions})
donthatereligion_df.head()

Unnamed: 0,text,predicted_label
0,I don't hate Christianity,0
1,I don't hate Judaism,0
2,I don't hate Islam,1
3,I don't hate Buddhism,0
4,I don't hate Zoroastrianism,1


In [65]:
hatereligion_df[hatereligion_df['predicted_label'] == 0] # wrong

Unnamed: 0,text,predicted_label
1,I hate Judaism,0
5,I hate Hinduism,0
8,I hate Baha'i,0


In [66]:
hatereligion_df[hatereligion_df['predicted_label'] == 1] # correct

Unnamed: 0,text,predicted_label
0,I hate Christianity,1
2,I hate Islam,1
3,I hate Buddhism,1
4,I hate Zoroastrianism,1
6,I hate Sikhism,1
7,I hate Shintoism,1
9,I hate Taoism,1
10,I hate Confucianism,1
11,I hate Jain,1
12,I hate Atheism,1


In [67]:
hatenati_df[hatenati_df['predicted_label'] == 0].head(10) # wrong

Unnamed: 0,text,predicted_label
3,I hate Indonesian,0
11,I hate Ethiopian,0
12,I hate Philippine,0
18,I hate Thai,0
19,I hate French,0
21,I hate Italian,0
23,I hate Tanzanian,0
24,I hate Burmese,0
25,I hate Kenyan,0
29,I hate Argentine,0


In [68]:
hatenati_df[hatenati_df['predicted_label'] == 1].head(10) # correct

Unnamed: 0,text,predicted_label
0,I hate Chinese,1
1,I hate Indian,1
2,I hate American,1
4,I hate Pakistani,1
5,I hate Brazilian,1
6,I hate Nigerian,1
7,I hate Bangladeshi,1
8,I hate Russian,1
9,I hate Japanese,1
10,I hate Mexican,1


In [69]:
hatemask_df[hatemask_df['predicted_label'] == 0].head(10)

Unnamed: 0,text,predicted_label
14,I hate […],0
28,I hate Facebook,0
30,I hate Twitter,0
32,I hate voting,0
38,I hate writing,0
45,I hate math,0
50,I hate Brexit,0
52,I hate diversity,0
83,I hate Netflix,0
85,I hate to,0


In [70]:
for texts in train_data['text']:
  if "Islam" in texts or "islam" in texts:
    print(texts)

@USER @USER SOROS is at the root of everything and anything anti American. He has his money in Hollywood all the way to Islam and ANTIFA to illegals . URL
#SJW crap and/or #Paedophilia ..  #Hollywood #TV #Movies #Gaming #Politics #Comics #Police #PublicAuthorities like #Councils .. #DWP #DWPCrimes  #CatholicChurch or #Church  #Vatican  #Islam and #Muslims used as a weapon ..  Missed anything?! #NHS maybe?  #MBGA #Conservatives
#Labour #Councillor shouting Nazi Scum ..  At victims of #GroomingGangs and this is going to get worse under #JeremyCorbyn's socialist #Britain   #Muslims #Islam #Leftists #MAGA #MEGA #MCGA #MEGA #Patriots #Conservatives URL
@USER @USER @USER @USER allows actual Islamic terrorists to recruit on Twitter despite repeated attempts to have them removed.
@USER @USER Liberals defend Islam yet know little to nothing about the ideology. As a middle eastern Christian I pay close attention to what the Muslim people are up to.
@USER @USER Look at Sweden, France, UK, Europe 

In [71]:
for texts in train_data['text']:
  if "Italian" in texts or "italian" in texts or "italy" in texts or "Italy" in texts:
    print(texts)

@USER you are so wonderful Elizabeth...kisses from Italy....
@USER @USER guys im literally italian and i know what that shit is wtf
97% of Americans want a form of gun control. 62% of Americans want the government to protect the environment. 75% of Americans say immigration is a good thing &amp; 100% of Americans believe that HotDogs belong in the crusts of pizzas &amp; if the Italians don't like it they can suck it.
@USER She is not wrong tho. Some Tunisians look like Italians
@USER @USER 😍 She is coming to Italy! 🇮🇹 URL
@USER @USER @USER You are such a lier and all for money. Nobody wants “ francoism” here in Spain! Except ppl like you and your fascists separatists friends to smoke screen all the money embezzlement here in Catalonia! Why all your friends are with Vlaams Belang? &amp;the Italian nazis? NAZI URL


In [72]:
for texts in train_data['text']:
  if "french" in texts or "French" in texts or "France" in texts or "france" in texts:
    print(texts)

Still canny get er somecunt in maga telling us no to go into bananas cos French boys were spiking folks drinks &amp; kerr turns round n shouts fuck it free dunt 😂😂😂😂
@USER @USER @USER How dare you for questioning my service in a little thing called the Vietnam War? France surrendered Vietnam to Snowflake Supremo Ho Chi Minh. Liberals JFK and LBJ went in but refused to fight to win. We should go back to Vietnam to finish the fight. Restore the draft?
@USER @USER @USER Actually my ancestors were here long before anyone got here even your French ass! Native Americans were here first. So get your facts straight! So if you wanna throw punches make sure you know who and where you are throwing them first.
.@USER It’s sad and scary that the French so-called justice system is ordering @USER Le Pen to undergo psych eval because she dared to reveal the horrors of DAESH/ISIS. The true crazies are radical Muslims and liberals who protect them.
@USER @USER Look at Sweden, France, UK, Europe is chang

### 3 examples when the model fails to assign the correct label *(choose both from masking and lexicon suggestions)*

In [73]:
print(list(editor.lexicons.keys()))

['male', 'female', 'first_name', 'first_pronoun', 'last_name', 'country', 'nationality', 'city', 'religion', 'religion_adj', 'sexual_adj', 'sentiment', 'country_city', 'male_from', 'female_from', 'last_from']


In [74]:
list(editor.lexicons.keys())[6]

'nationality'

In [75]:
editor.suggest('I hate {mask}.')[:5]

['it', 'them', 'capitalism', 'cats', 'politics']

In [76]:
editor.related_words('I hate Canadian.', 'Canadian')[:5]

['people', 'war', 'change', 'feminism', 'life']

In [77]:
editor.related_words('I hate Islam.', 'Islam')[:5]

['politics', 'people', 'death', 'change', 'feminism']

In [78]:
', '.join(editor.suggest('This is not {a:mask} {thing}.', thing=list(editor.lexicons.keys())[6]))

'French, European, American, Arab, Italian, African, English, alien, Iranian, Irish, official, Indian, Asian, EU, Islamic, Israeli, Australian, honorary, Austrian, Arabic, individual, ordinary, Icelandic, acceptable, invented, independent, indigenous, anonymous, Indonesian, ethnic, advanced, automatic, enemy, Egyptian, illegal, exotic, imperial, old, ideal, international, exceptional, exclusive, endangered, Afghan, foreign, Armenian, authentic, new, Eastern, alternative, assumed, easy, oppressed, Japanese, British, open, immigrant, German, apartheid, Ethiopian, identifiable, single, island, established, adopted, original, important, aggressive, Western, existing, dual, attractive, accepted, inferior, Argentine, Hungarian, artificial, alternate, offensive, common, equal, isolated, actual, unusual, imaginary, special, Russian, Chinese, active, particular, allied, Iraqi, Ottoman, undocumented, ancestral, ancient, Spanish, Oriental, honorable, Greek, Turkish, legitimate, arbitrary, unknown

In [79]:
pos = ['like','love', 'don\'t hate','do not hate', 'respect', 'admire', 'adore', 'relish']
neg = ['hate','dislike', 'do not like','deset','distaste','hatred','despise']

In [80]:
ret = editor.template('I {pos} {nationality}.', pos=pos, labels=0, save=True, nsamples=100)
ret += editor.template('I {neg} {nationality}.', neg=neg, labels=1, save=True, nsamples=100)


In [81]:
test = MFT(ret.data, labels=ret.labels, name='Simple negation',
           capability='Negation', description='Very simple negations.')

In [82]:
from pattern.en import sentiment

In [83]:
import numpy as np
def predict_proba(inputs):
    p1 = np.array([(sentiment(x)[0] + 1)/2. for x in inputs]).reshape(-1, 1)
    p0 = 1- p1
    return np.hstack((p0, p1))

In [84]:
predict_proba(['hate', "don't hate"])


array([[0.9, 0.1],
       [0.3, 0.7]])

In [85]:
from checklist.pred_wrapper import PredictorWrapper
wrapped_pp = PredictorWrapper.wrap_softmax(predict_proba)

In [86]:
wrapped_pp(['hate'])

(array([0]), array([[0.9, 0.1]]))

In [87]:
test.run(wrapped_pp,overwrite=True)

Predicting 200 examples


In [88]:
test.summary()

Test cases:      200
Fails (rate):    133 (66.5%)

Example fails:
0.1 I hate Marshallese.
----
0.5 I despise Samoan.
----
0.8 I love Indonesian.
----


In [89]:
test.summary()

Test cases:      200
Fails (rate):    133 (66.5%)

Example fails:
0.5 I hatred Basotho.
----
0.5 I distaste Burmese.
----
0.5 I despise Djiboutian.
----
