# Test applying a model

In [1]:
# Load model and tokenizer

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_dirpath = '/home/mamille3/white_supremacist_lang/output/bert/6topic_annotated_5x/checkpoint-21210'

# The classifier and its tokenizer are both restored from the same checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(model_dirpath)
tokenizer = AutoTokenizer.from_pretrained(model_dirpath)

In [2]:
# Run example data through (preprocess, get output)
import pandas as pd
import torch

example_data = [ # taken from Alatawi+2021 annotated dataset
    'of course this is happening to one of your elite white male appointees for decades the radical left has been marching through the educational system with hatred for everything white historically western while flooding the west with new voters they are trying to overthrow us',
    'africans and asians are not projected to be minorities in their countries why do you support this happening to whites',
    'happy birthday king',
    'it ok went back to normal',
]

# Tokenize the whole batch at once; padding makes the rows the same length so
# they can be stacked into PyTorch tensors.
inputs = tokenizer(example_data, padding=True, truncation=True, return_tensors="pt")

# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    output = model(**inputs)  # output.logits holds the raw (unnormalized) class scores

# Convert logits to binary predictions (index of the largest logit in each row)
preds = torch.argmax(output.logits, dim=1)
# Convert it to a numpy array
preds.numpy()

array([1, 1, 0, 0])

# Other

In [65]:
# Rows predicted "white_supremacist" whose label is also 1.
is_pred_ws = data['predicted'] == 'white_supremacist'
is_labeled_ws = data['label'] == 1
ws = data[is_pred_ws & is_labeled_ws]
len(ws)

165

In [67]:
# Rows predicted "neutral" whose label is also 0.
not_ws = data.query('predicted == "neutral" and label==0')
# Fixed seed so the displayed sample is the same on every run.
not_ws.sample(10, random_state=0)[['text']]

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
alatawi2021_588,it ok went back to normal
alatawi2021_1949,believe scientists refer to it as the jay double virus
alatawi2021_1069,angus taylor must be removed from his energy portfolio he has been total failure when he was given it he said he would get elect prices down with his big stick that was years ago and all we have got from him is stalling tactics sackthebastered
alatawi2021_1509,in 1940 my 20 year old future mother rode the trolley to downtown st paul mn to see the picture show nights per week after 1965 however women stopped going out at night it was safer to stay home and watch tv this loss of women freedom ought to be regretted
alatawi2021_1232,where do they make these creatures
alatawi2021_486,thank you for calling this out look up terry bean and dennis hastert there are more
alatawi2021_553,just one
alatawi2021_288,happy birthday king
alatawi2021_1037,often wonder when the second coming of christ will be but it seems merely waiting on damn cat to move
alatawi2021_1025,aware the left looks at compromise as an opening but they will try to ban semi autos large capacity mags they ll want the 20 30 rd ones but we can point to heller say nope these are in common use we can not say that about 100 rd ones so why not win some good pr


In [66]:
# Fixed seed so the displayed sample is the same on every run.
ws.sample(10, random_state=0)[['text']]

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
alatawi2021_630,firstly he did not make children do anything he merely rewarded them if they did fight secondly anti whites use children as political soldiers every day in white western nations so could not possibly care less what they say
alatawi2021_182,of course this is happening to one of your elite white male appointees for decades the radical left has been marching through the educational system with hatred for everything white historically western while flooding the west with new voters they are trying to overthrow us
alatawi2021_1405,you can document illegals because the system is broken your fellow congressional members have had decades to correct it their plan was all along to let it get to this point and open the flood gates not on my watch too many illegal criminals already getting in
alatawi2021_309,part of frederick the great jewish policy part in frederick correspondence jews appear as currency manipulating financiers as wartime merchants as spies used by both sides and as the unwanted residents of conquered territories
alatawi2021_18,25 000 fake child refugees in france cost 800 million per year via
alatawi2021_438,africans and asians are not projected to be minorities in their countries why do you support this happening to whites
alatawi2021_469,sweden tyrannic invader resettlement law being used to eliminate homogeneous white areas via
alatawi2021_1970,the fraud of feminism 1913 by bax ernest belfort 1854 1926
alatawi2021_103,there reason they say one death is tragedy one million is statistic it part and parcel of living in society
alatawi2021_652,yes this is not hidden fact that hitler helped in part to send jews to palestine indirectly helping to create the stage of israel but why should that matter he obviously was doing it because it got the jews out of germany with all parties being fine not because he was shill


In [62]:
# Show full text instead of pandas' default truncated column display.
# This is a global option and stays in effect for every later cell.
pd.set_option('display.max_colwidth', None)
# Fixed seed so the displayed sample is the same on every run.
pred_ws.sample(10, random_state=0)[['text']]

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
alatawi2021_788,congrats destroying isis just he said he would real leader get it done
alatawi2021_1970,the fraud of feminism 1913 by bax ernest belfort 1854 1926
alatawi2021_309,part of frederick the great jewish policy part in frederick correspondence jews appear as currency manipulating financiers as wartime merchants as spies used by both sides and as the unwanted residents of conquered territories
alatawi2021_1964,there term for those that call your existence privilege your continued existence supremacy that cheer on your demographic replacement and push for it every chance they get that try to erase your history and take future away from your children enemies
alatawi2021_1548,trashy show for trashy people they make the trash look bad
alatawi2021_911,those are the self interested conservative types who will never be on board until we already have successful organizations up and running talking about getting to that point
alatawi2021_1650,all mass shootings are eventually turned into fake amp
alatawi2021_778,trump prosecutes his own white supporters for defending themselves from antifa ambushes but earnestly pleads the case for black rappers promoting crude violence and filth
alatawi2021_1007,it can be starting point to explain that differences in outcomes between races has better explanation than white exploitation whites care for fairness and this will allow them to see the differences as fair other than that it is not important for our cause
alatawi2021_1988,hebrew immigrant aid society again demands us take in refugees but not in israel


In [58]:
# Preprocess text and apply tokenizer
import pandas as pd
from datasets import Dataset

# Two short probe sentences (taken from annotated datasets).
example_data = [
    'this flower smells really nice',
    'white genocide is real',
]
# Single-column frame; the column is named 'text', which is the field `preprocess` reads.
example_df = pd.DataFrame(example_data, columns=['text'])
example_df

Unnamed: 0,text
0,this flower smells really nice
1,white genocide is real


In [52]:
def preprocess(df):
    """Run the tokenizer over the ``"text"`` field of ``df`` with truncation enabled."""
    texts = df["text"]
    return tokenizer(texts, truncation=True)

# NOTE(review): `pred_ws` is only assigned in later cells of this notebook, so
# this cell relies on kernel state when the notebook is run top to bottom.
dataset = Dataset.from_pandas(pred_ws)
# batched=True hands `preprocess` a batch of rows per call instead of one row.
tokenized = dataset.map(preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

## Classify (as just a PyTorch Module)

In [54]:
# Text of the second row (position 1) of the predicted subset.
pred_ws['text'].iloc[1]

'the muslim thug needed some 00 buck where it counts'

In [56]:
# Classify one example: the row at position 3 of the predicted subset.
# truncation=True matches the other tokenizer calls in this notebook, so an
# over-long text is cut rather than passed to the model at full length.
# TODO: classify several examples at once by passing a list of texts with
# padding=True.
inputs = tokenizer(pred_ws.iloc[3]['text'], truncation=True, return_tensors='pt')

# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    output = model(**inputs)
output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.7837,  1.9144]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [57]:
# Binary predictions
# import numpy as np
import torch

# Predicted class per row = index of the largest logit along dim 1.
preds = output.logits.argmax(dim=1)
preds

tensor([1])

## Classify (with Trainer)

In [6]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Trainer configuration. In the visible cells the trainer is only used via
# `trainer.predict`.
args = TrainingArguments(
    output_dir='output',
    logging_dir='logs',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
)

# Collator built from the same tokenizer as the model inputs; by its name it
# pads each batch (confirm against the transformers docs).
collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
import numpy as np

# Run the tokenized dataset through the Trainer's prediction loop.
output = trainer.predict(tokenized)

# Binary predictions: position of the largest score along the last axis
preds = output.predictions.argmax(axis=-1)
preds

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


array([0, 0, 0, 0])

In [10]:
# Numeric class probability predictions
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.special` has been loaded.
import scipy.special

# Softmax over the last axis turns each row of scores into probabilities that sum to 1.
prob = scipy.special.softmax(output.predictions, axis=-1)
class_prob = pd.DataFrame(prob)
class_prob
# class_prob.columns = class_prob.columns.map(self.id2label)
# class_prob.to_json(prob_outpath, orient='records', lines=True)

Unnamed: 0,0,1
0,0.999845,0.000155
1,0.981697,0.018303
2,0.999157,0.000843
3,0.998917,0.001083


# Load some data that has been predicted as white supremacist

In [35]:
# Load predictions
import os
import pandas as pd

exp_name = '6topic_annotated_5x_21210_test30'
# Alternative prediction file for the same experiment:
# preds_path = os.path.join('../output/bert/', exp_name, 'hatecheck_identity_nonhate_predictions.json')
preds_path = os.path.join('../output/bert/', exp_name, 'alatawi2021_predictions.json')
preds = pd.read_json(preds_path, typ='series')

# Relabel predictions: any label listed as a key here is replaced by its
# value; every other label is kept unchanged.
test_label_combine = {'antiracist': 'neutral'}
if test_label_combine is not None:
    preds = preds.map(lambda label: test_label_combine.get(label, label))
preds

0      white_supremacist
1                neutral
2      white_supremacist
3                neutral
4      white_supremacist
             ...        
576              neutral
577    white_supremacist
578    white_supremacist
579              neutral
580    white_supremacist
Length: 581, dtype: object

In [36]:
preds.value_counts()

neutral              359
white_supremacist    222
dtype: int64

In [45]:
# Load data (Alatawi+2021 from annotated test30)
# NOTE(review): read_pickle can execute arbitrary code; only load this file if
# it was produced by this project.
corpus_path = '../tmp/annotated_corpus_test30.pkl'
corpus = pd.read_pickle(corpus_path)
# .copy() makes `data` an independent frame, so adding a column below does not
# write into a slice of `corpus`.
data = corpus[corpus.dataset=='alatawi2021'].copy()
# data.info()

# data_path = '../data/alatawi2021_white_supremacist_annotated_tweets.csv'
# data = pd.read_csv(data_path)
# data['text'] = data['input.text'].map(tokenize_lowercase)
# data['label'] = data['Voting and Final Labels']

# `preds.values` is attached by position, not by index, so the two must have
# the same length (and, presumably, the same row order — confirm upstream).
assert len(preds) == len(data), (
    f'{len(preds)} predictions do not match {len(data)} rows of data'
)
data['predicted'] = preds.values
data[data['predicted']=='white_supremacist'][['text', 'predicted', 'label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['predicted'] = preds.values


Unnamed: 0_level_0,text,predicted,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alatawi2021_1717,we will do this using the magic of government ...,white_supremacist,1
alatawi2021_1946,the muslim thug needed some 00 buck where it c...,white_supremacist,1
alatawi2021_1132,over the course of 15 years or so ve talked to...,white_supremacist,1
alatawi2021_487,sorry bro no racism on the tl had to respond w...,white_supremacist,1
alatawi2021_1732,watch what happened when this muslim lad inter...,white_supremacist,1
...,...,...,...
alatawi2021_1485,ban the burqa wants burqas banned from governm...,white_supremacist,1
alatawi2021_1504,2020 is definitely going to be national populi...,white_supremacist,1
alatawi2021_1706,notice how omits the role of iran regime and h...,white_supremacist,0
alatawi2021_732,you degenerate cultureless philistine your ins...,white_supremacist,1


In [46]:
# Keep only the rows the model labeled "white_supremacist".
is_pred_ws = data['predicted'] == 'white_supremacist'
pred_ws = data.loc[is_pred_ws]
len(pred_ws)

222

In [47]:
# Re-run the model on the rows stored as predicted white supremacist, to check
# that the loaded checkpoint gives the same prediction.

def preprocess(df):
    """Tokenize the ``"text"`` entries of ``df``, with truncation turned on."""
    encoded = tokenizer(df["text"], truncation=True)
    return encoded

dataset = Dataset.from_pandas(pred_ws)
# batched=True passes a batch of rows to `preprocess` on each call.
tokenized = dataset.map(function=preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [48]:
import numpy as np

output = trainer.predict(tokenized)

# Binary predictions: index of the highest score in each row
# NOTE(review): this rebinds `preds`, which an earlier cell used for the
# predictions loaded from JSON.
scores = output.predictions
preds = np.argmax(scores, axis=-1)
preds

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: domain, source, id, word_count, predicted, text, dataset. If domain, source, id, word_count, predicted, text, dataset are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 222
  Batch size = 32


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [16]:
# Load data (HateCheck)
data_path = '../../data/hate_speech/hatecheck-data/test_suite_cases.csv'
# Keep the raw frame under its own name so the filtered frame below does not overwrite it.
hatecheck_raw = pd.read_csv(data_path, index_col=0)
selected_identities = ['trans people', 'gay people', 'women',
                'black people', 'Muslims', 'immigrants']
selected_cols = ['ident_neutral_nh', 'ident_pos_nh',
                'slur_homonym_nh', 'slur_reclaimed_nh',
                'negate_neg_nh',
                'counter_quote_nh', 'counter_ref_nh',
                ]
# In DataFrame.query, `column == @list` acts as a membership test.
# .copy() gives an independent frame, so columns added in later cells do not
# trigger SettingWithCopyWarning.
data = hatecheck_raw.query(
    'target_ident==@selected_identities and functionality==@selected_cols'
).copy()
len(data)

762

In [39]:
import nltk

def tokenize_lowercase(inp, nlp=None):
    """Tokenize text and return the tokens lowercased and joined by single spaces.

    Args:
        inp: Text to tokenize. Non-string values are coerced with ``str()``
            before tokenization, whichever tokenizer is used.
        nlp: Optional object whose ``tokenizer(text)`` yields tokens that
            expose a ``.text`` attribute (presumably a spaCy pipeline; confirm
            with callers). When ``None``, ``nltk.word_tokenize`` is used.

    Returns:
        The lowercased, space-joined token string.
    """
    # Coerce once up front so both tokenizers receive a string.
    text = str(inp)
    if nlp is not None:
        tokens = [tok.text for tok in nlp.tokenizer(text)]
    else:
        tokens = nltk.word_tokenize(text)
    return ' '.join(tokens).lower()

In [38]:
# Build the normalized `text` column from the `test_case` column.
# NOTE(review): this needs `data` to have a `test_case` column; the KeyError
# recorded below presumably arose because `data` held a different frame when
# the cell ran — confirm by re-running after the HateCheck load cell.
normalized = data['test_case'].map(tokenize_lowercase)
data['text'] = normalized
data[['text', 'test_case']].head()

KeyError: 'test_case'

In [29]:
# `preds.values` is attached by position, not by index, so the two must have
# the same length (and, presumably, the same row order — confirm upstream).
assert len(preds) == len(data), (
    f'{len(preds)} predictions do not match {len(data)} rows of data'
)
data['predicted'] = preds.values
data[data['predicted']=='white_supremacist'][['text', 'predicted']]

Unnamed: 0,text,predicted
985,she bought some faggots at the butcher .,white_supremacist
1041,calling myself a faggot to offend the homophob...,white_supremacist
1043,"i 'm a proud faggot , and i should not be bann...",white_supremacist
1044,"well , i am happily a faggot and proud !",white_supremacist
1047,happy pride month ! i love being a faggot !,white_supremacist
...,...,...
2710,there have to be consequences for saying you w...,white_supremacist
2742,why would you think it 's okay to call me a fa...,white_supremacist
2745,why would you think it 's okay to call me a ni...,white_supremacist
2754,why would you think it 's okay to call me a ra...,white_supremacist


In [30]:
# Subset of rows whose stored prediction is "white_supremacist".
pred_ws = data.query('predicted == "white_supremacist"')
len(pred_ws)

199

In [33]:
# Re-run the model on the rows stored as predicted white supremacist, to check
# that the loaded checkpoint gives the same prediction.

def preprocess(df):
    """Return the tokenizer output for ``df["text"]`` with truncation enabled."""
    batch_texts = df["text"]
    return tokenizer(batch_texts, truncation=True)

dataset = Dataset.from_pandas(pred_ws)
# Tokenize a batch of rows per call rather than one row at a time.
tokenized = dataset.map(preprocess, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [34]:
import numpy as np

output = trainer.predict(tokenized)

# Binary predictions: argmax over the last axis of the returned scores
# NOTE(review): this rebinds `preds`, which an earlier cell used for the
# predictions loaded from JSON.
preds = output.predictions.argmax(axis=-1)
preds

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: functionality, focus_words, direction, label_gold, templ_id, focus_lemma, target_ident, ref_templ_id, test_case, case_id, case_templ, __index_level_0__, predicted, text, ref_case_id. If functionality, focus_words, direction, label_gold, templ_id, focus_lemma, target_ident, ref_templ_id, test_case, case_id, case_templ, __index_level_0__, predicted, text, ref_case_id are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 199
  Batch size = 32


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1])