In [46]:
from pathlib import Path

import numpy as np
import pandas as pd
from transformers import BertTokenizer
import torch.nn.functional as F
import torch

In [90]:
GLUE_TASKS = ['CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B', 'WNLI']
MODELS = ['bert-base-uncased', 'Nov09_19-12-14_elm15', 'Nov09_23-02-38_elm15']

In [91]:
target_timestamp = '20201115_1429'
target_path = Path('runs') / target_timestamp

In [92]:
for task in GLUE_TASKS:
    print(f'{task}:')
    for model in MODELS:
        task_txt = target_path / task / model / f'eval_results_{task.lower()}.txt'
        if not task_txt.exists():
            continue
        with open(task_txt, 'r') as f:
            print(f'  {model}:')
            for line in f.readlines():
                print(f'\t{line.strip()}')

CoLA:
  bert-base-uncased:
	eval_loss = 0.5498700141906738
	eval_mcc = 0.5468753188432375
  Nov09_19-12-14_elm15:
	eval_loss = 0.5325039625167847
	eval_mcc = 0.5858564219548863
  Nov09_23-02-38_elm15:
	eval_loss = 0.5603988766670227
	eval_mcc = 0.5936105573332983
MNLI:
  bert-base-uncased:
	eval_loss = 0.49452319741249084
	eval_mnli/acc = 0.8445236882322975
  Nov09_19-12-14_elm15:
	eval_loss = 0.6853656768798828
	eval_mnli/acc = 0.8371879775853286
  Nov09_23-02-38_elm15:
	eval_loss = 0.6645665168762207
	eval_mnli/acc = 0.8388181355068772
MRPC:
  bert-base-uncased:
	eval_loss = 0.4949655830860138
	eval_acc = 0.8112745098039216
	eval_f1 = 0.8752025931928687
	eval_acc_and_f1 = 0.8432385514983951
  Nov09_19-12-14_elm15:
	eval_loss = 0.487006276845932
	eval_acc = 0.8333333333333334
	eval_f1 = 0.8855218855218856
	eval_acc_and_f1 = 0.8594276094276094
  Nov09_23-02-38_elm15:
	eval_loss = 0.4862278997898102
	eval_acc = 0.8308823529411765
	eval_f1 = 0.8840336134453781
	eval_acc_and_f1 = 0.857457

In [93]:
from datasets import load_dataset

wnli = load_dataset('glue', 'wnli', split='validation')

Reusing dataset glue (/Users/otakumesi/.cache/huggingface/datasets/glue/wnli/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


In [94]:
df = pd.DataFrame({'sentence1': wnli['sentence1'], 'sentence2': wnli['sentence2'], 'label': wnli['label']})

for model in MODELS:
    task_txt = target_path / 'wnli' / model / f'eval_labels_wnli.txt'
    df_task_results = pd.read_csv(task_txt, sep='\t')
    df[model] = df_task_results['prediction']

In [75]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 700)

In [3]:
from datasets import load_dataset

DATASET_PATH = "data/crows_pairs_anonymized.csv"
dataset = load_dataset("csv", data_files=str(DATASET_PATH), split="train")

Using custom data configuration default
Reusing dataset csv (/Users/otakumesi/.cache/huggingface/datasets/csv/default-be392c7d21ba57f1/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4)


In [182]:
gendered_corpus = "data/gendered-sentiment/gender_corpus.tsv"
test_tsv = "data/gendered-sentiment/test.tsv"
results_orig_bert = "runs/models/bert-base-uncased/epoch_3_lr_4e-05_batch_32/gendered_sent_predicts.tsv"
results_tuned_bert = "runs/models/more_loss_mse-epoch-70_aug_data/epoch_3_lr_3e-05_batch_32/gendered_sent_predicts.tsv"

In [183]:
df_corpus = pd.read_csv(gendered_corpus, sep="\t")
df_test = pd.read_csv(test_tsv, sep="\t")
df_orig = pd.read_csv(results_orig_bert, sep="\t")
df_tuned = pd.read_csv(results_tuned_bert, sep="\t")

std_preds_orig = F.softmax(torch.stack([torch.tensor(df_orig["prediction_0"]), torch.tensor(df_orig["prediction_1"])]), dim=0)
std_preds_tuned = F.softmax(torch.stack([torch.tensor(df_tuned["prediction_0"]), torch.tensor(df_tuned["prediction_1"])]), dim=0)
df_orig['std_prediction_0'] = std_preds_orig[0]
df_orig['std_prediction_1'] = std_preds_orig[1]
df_tuned['std_prediction_0'] = std_preds_tuned[0]
df_tuned['std_prediction_1'] = std_preds_tuned[1]

In [184]:
df_orig_control = df_orig[-40:]
df_tuned_control = df_tuned[-40:]
df_orig_control['gender'] = np.array(['male'] * 20 + ['female'] * 20, dtype=np.str)
df_tuned_control['gender'] = np.array(['male'] * 20 + ['female'] * 20, dtype=np.str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_orig_control['gender'] = np.array(['male'] * 20 + ['female'] * 20, dtype=np.str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tuned_control['gender'] = np.array(['male'] * 20 + ['female'] * 20, dtype=np.str)


In [185]:
preds_mel_orig = df_orig_control[df_orig_control['gender'] == 'male']['std_prediction_1'].values
preds_fem_orig = df_orig_control[df_orig_control['gender'] == 'female']['std_prediction_1'].values

preds_mel_tuned = df_tuned_control[df_tuned_control['gender'] == 'male']['std_prediction_1'].values
preds_fem_tuned = df_tuned_control[df_tuned_control['gender'] == 'female']['std_prediction_1'].values
    
print('orig', np.mean(preds_mel_orig + preds_fem_orig), np.mean(preds_fem_orig) - np.mean(preds_mel_orig))
print('tuned', np.mean(preds_mel_tuned + preds_fem_tuned), np.mean(preds_fem_tuned) - np.mean(preds_mel_tuned))

orig 1.921704115019144 0.03407032546928479
tuned 1.885121042929639 -0.05372501320065781


In [186]:
df_orig_merged = pd.merge(pd.merge(df_test, df_orig, left_on="id", right_on="index"), df_corpus, on="sentence")
df_tuned_merged = pd.merge(pd.merge(df_test, df_tuned, left_on="id", right_on="index"), df_corpus, on="sentence")

In [187]:
PROFESSIONS = ["doctor", "tailor", "baker", "secretary", "professor", "scientist",
                "writer", "teacher", "truck driver", "pilot", "lawyer",
                "flight attendant", "nurse", "chef", "soldier", "dancer",
                "gym trainer", "mechanic", "clerk", "bartender"]

In [188]:
for p in PROFESSIONS:
    preds_mel_orig = df_orig_merged[(df_orig_merged['gender'] == 'male') & (df_orig_merged['occupation'] == p)]['std_prediction_1'].values
    preds_fem_orig = df_orig_merged[(df_orig_merged['gender'] == 'female') & (df_orig_merged['occupation'] == p)]['std_prediction_1'].values

    preds_mel_tuned = df_tuned_merged[(df_tuned_merged['gender'] == 'male') & (df_tuned_merged['occupation'] == p)]['std_prediction_1'].values
    preds_fem_tuned = df_tuned_merged[(df_tuned_merged['gender'] == 'female') & (df_tuned_merged['occupation'] == p)]['std_prediction_1'].values
    
    print('orig', p, np.mean(preds_mel_orig + preds_fem_orig), np.mean(preds_fem_orig) - np.mean(preds_mel_orig))
    print('tuned', p, np.mean(preds_mel_tuned + preds_fem_tuned), np.mean(preds_fem_tuned) - np.mean(preds_mel_tuned))
    print('---')
    

orig doctor 1.9027287240247812 -0.009011265181679606
tuned doctor 1.864584902403961 -0.07104431265297173
---
orig tailor 1.7725016602337356 -0.028990844407423833
tuned tailor 1.2714839844834909 -0.0385295019851668
---
orig baker 1.9091709144277782 0.010106135736706379
tuned baker 1.8482304926061857 -0.01067385409924615
---
orig secretary 1.844074305920873 0.0019447324508706787
tuned secretary 1.9531493330684557 -0.009784556535235023
---
orig professor 1.829549725241014 -0.010231827033195029
tuned professor 1.8399888719718558 -0.05291664919605166
---
orig scientist 1.9815294872215659 0.005031008457334618
tuned scientist 1.9221911731022434 -0.029104696112625228
---
orig writer 1.9591802515180816 0.010897005549414085
tuned writer 1.8886235081573157 -0.049082877895560206
---
orig teacher 1.8617597546752596 0.028021810087000554
tuned teacher 1.9196564082708474 -0.013009238079183194
---
orig truck driver 1.2406822974962506 -0.09991772720835335
tuned truck driver 0.8158188344302442 -0.0517022

In [171]:
df_preds_orig = pd.concat([df_orig_merged[['index', 'std_prediction_0','std_prediction_1', 'gender']], df_orig_control])
df_preds_tuned = pd.concat([df_tuned_merged[['index', 'std_prediction_0','std_prediction_1', 'gender']], df_tuned_control])

In [172]:
print(df_preds_orig[df_preds_orig['gender'] == 'female']['std_prediction_1'].mean() - df_preds_orig[df_preds_orig['gender'] == 'male']['std_prediction_1'].mean())
print(df_preds_tuned[df_preds_tuned['gender'] == 'female']['std_prediction_1'].mean() - df_preds_tuned[df_preds_orig['gender'] == 'male']['std_prediction_1'].mean())

-0.01587574069879638
0.012059133268487066
