### "Replication" Code for "How Well Does Supervised Machine Learning Code Ethnographic Interview Data?"


Author: Zhuofan Li

Date: 07-30-2021

Note: Data and data cleaning procedures are not provided to protect the confidentiality of human subjects data.

### BERT Fine-tuning

In [1]:
from AtlasToDataframe import read_atlas, code_selector, splitter

In [2]:
from sklearn.metrics import accuracy_score, f1_score, accuracy_score, precision_score, recall_score, cohen_kappa_score
import numpy as np, scipy as sp
import pandas as pd
import krippendorff

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [4]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset that pairs tokenizer encodings with labels.

    Parameters
    ----------
    encodings : mapping
        Object exposing ``.items()`` whose values can be indexed by example
        position (one entry per example under every key).
    labels : sequence
        One label per example; its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every key of ``encodings`` is carried over with the ``idx``-th value
        converted to a tensor, and the label is stored under ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [5]:
def compute_metrics(p):
    """Score model predictions against the reference labels.

    ``p`` is unpacked into ``(pred, labels)``. The predictions are reduced to
    class indices with an arg-max over axis 1 before any score is computed.

    Returns a dict with the keys ``accuracy``, ``precision``, ``recall``,
    ``f1``, ``alpha`` (Krippendorff's alpha) and ``kappa`` (Cohen's kappa).
    Precision, recall and F1 use scikit-learn's default averaging.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "precision": precision_score(y_true=gold, y_pred=predicted),
        "recall": recall_score(y_true=gold, y_pred=predicted),
        "f1": f1_score(y_true=gold, y_pred=predicted),
        # Agreement is computed by treating the reference labels and the
        # predictions as two stacked rows of ratings.
        "alpha": krippendorff.alpha(np.stack((gold, predicted))),
        "kappa": cohen_kappa_score(gold, predicted),
    }

In [6]:
# Read "Interviews.csv" through the project helper read_atlas (imported from
# AtlasToDataframe). The structure of the returned object is not visible here.
data = read_atlas("Interviews.csv")

In [24]:
# Narrow the loaded data to the code 'PI' with the project helper code_selector.
# NOTE(review): presumably yields two columns (text, label) for that code, since
# later cells unpack each row into a text and a label — confirm in AtlasToDataframe.
data2 = code_selector(data, 'PI')

In [None]:
# Free memory before training: collect unreachable Python objects first, then
# ask PyTorch to release its cached, currently unused CUDA memory.
import gc  # no earlier cell imports gc, so the call below would raise NameError

gc.collect()
torch.cuda.empty_cache()

In [26]:
# Split the selected data into three parts with the project helper splitter.
# Note the return order used here: train, test, valid.
# NOTE(review): 0.5 and 0.2 are presumably split proportions and None a random
# seed / state (i.e. an unseeded split) — confirm against AtlasToDataframe.
train, test, valid = splitter(data2, 0.5, 0.2, None)

In [27]:
# Turn every split into two parallel tuples: one of texts, one of labels.
# Each row of a split is expected to hold exactly two values (text, label).
(
    (train_texts, train_labels),
    (test_texts, test_labels),
    (valid_texts, valid_labels),
) = (zip(*part.values.tolist()) for part in (train, test, valid))

In [31]:
MAX_LENGTH = 128  # passed to the tokenizer as max_length (truncation limit)
BATCH_SIZE = 16   # training batch size; same value as per_device_train_batch_size in TrainingArguments

In [32]:
# Load the tokenizer belonging to the same "bert-base-uncased" checkpoint that
# the classification model is initialised from.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [33]:
# Tokenize the three splits with identical settings: pad, truncate to
# MAX_LENGTH, and add the special tokens.
train_encodings, valid_encodings, test_encodings = (
    tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        add_special_tokens=True,
    )
    for texts in (train_texts, valid_texts, test_texts)
)

In [34]:
# Wrap each split's encodings and labels in the torch Dataset defined above.
train_dataset, valid_dataset, test_dataset = (
    Dataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

In [35]:
# Fine-tuning configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training (config constant, 16)
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=50,                # report the training loss every 50 steps
    # NOTE(review): no evaluation strategy is configured, and the training log
    # of this cell lists only the training loss. It therefore looks as if the
    # validation set is never scored during training and there is no "best"
    # checkpoint to reload — confirm, and set matching evaluation/save
    # strategies if best-checkpoint selection is intended.
    load_best_model_at_end=True
)

# Pretrained BERT encoder with a freshly initialised classification head
# (the library default of two labels is used).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # metrics computed whenever the Trainer evaluates/predicts
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss
50,0.5999
100,0.4425
150,0.4298
200,0.3674
250,0.3595
300,0.3546
350,0.2952
400,0.3355
450,0.3772
500,0.3003


TrainOutput(global_step=1205, training_loss=0.239240137781345, metrics={'train_runtime': 431.5411, 'train_samples_per_second': 2.792, 'total_flos': 1616085667706880.0, 'epoch': 5.0})

In [36]:
# Score the held-out test set. The result is unpacked into the raw model
# outputs, an ignored middle element, and the metrics dict (whose keys carry
# the "test_" prefix seen in the outputs below).
raw_pred, _ , metrics = trainer.predict(test_dataset)

In [380]:
# Display the metrics dict from trainer.predict.
# NOTE(review): execution count In [380] is out of sequence with the predict
# cell (In [36]); this output presumably stems from an earlier run — confirm.
metrics

{'test_loss': 0.1403384506702423,
 'test_accuracy': 0.9705882352941176,
 'test_precision': 0.583011583011583,
 'test_recall': 0.592156862745098,
 'test_f1': 0.5875486381322956,
 'test_alpha': 0.5723287022498232,
 'test_kappa': 0.5722999914353415,
 'test_runtime': 38.7505,
 'test_samples_per_second': 186.01}

In [402]:
# Display the metrics dict from trainer.predict.
# NOTE(review): execution count In [402] is out of sequence with the predict
# cell (In [36]); this output presumably stems from an earlier run — confirm.
metrics

{'test_loss': 0.11096487939357758,
 'test_accuracy': 0.9698945615982242,
 'test_precision': 0.5115511551155115,
 'test_recall': 0.6919642857142857,
 'test_f1': 0.588235294117647,
 'test_alpha': 0.5726410659302961,
 'test_kappa': 0.5729755230321999,
 'test_runtime': 48.5846,
 'test_samples_per_second': 148.36}

In [419]:
# Display the metrics dict from trainer.predict.
# NOTE(review): execution count In [419] is out of sequence with the predict
# cell (In [36]); this output presumably stems from an earlier run — confirm.
metrics

{'test_loss': 0.1333872377872467,
 'test_accuracy': 0.9721143174250833,
 'test_precision': 0.6175298804780877,
 'test_recall': 0.5961538461538461,
 'test_f1': 0.6066536203522506,
 'test_alpha': 0.5922266765463999,
 'test_kappa': 0.59220303718115,
 'test_runtime': 45.0191,
 'test_samples_per_second': 160.11}

In [431]:
# Display the metrics dict from trainer.predict.
# NOTE(review): execution count In [431] is out of sequence with the predict
# cell (In [36]); this output presumably stems from an earlier run — confirm.
metrics

{'test_loss': 0.1403384506702423,
 'test_accuracy': 0.9705882352941176,
 'test_precision': 0.583011583011583,
 'test_recall': 0.592156862745098,
 'test_f1': 0.5875486381322956,
 'test_alpha': 0.5723287022498232,
 'test_kappa': 0.5722999914353415,
 'test_runtime': 45.126,
 'test_samples_per_second': 159.731}

In [37]:
# Display the metrics dict from trainer.predict.
# Execution count In [37] directly follows the predict cell (In [36]), so this
# is presumably the output that belongs to the run shown above — confirm.
metrics

{'test_loss': 0.7333071827888489,
 'test_accuracy': 0.8520291363163371,
 'test_precision': 0.6711409395973155,
 'test_recall': 0.5899705014749262,
 'test_f1': 0.6279434850863422,
 'test_alpha': 0.5356421545908121,
 'test_kappa': 0.536070882523925,
 'test_runtime': 29.6996,
 'test_samples_per_second': 161.786}

In [38]:
# transform logits into probabilities
prob = sp.special.expit(raw_pred)

# print machine predictions and human codings with the texts presented in Figure 1B
pd.DataFrame([[prob[i,1], test_labels[i], test_texts[i]] for i in range(len(test_texts))], columns = ['Predicted Probability', 'True Label', 'Text']).sort_values(by = 'Predicted Probability', ascending=False).to_csv("Pr_BERT_PI.txt", sep = "\t")