# Import Statements

In [94]:
import argparse
import torch
import gc
import numpy as np
import pandas as pd
from process_data import get_jigsaw_datasets, init_embed_lookup, get_ctf_datasets, get_CivilComments_Datasets, get_jigsaw_dev_data, get_CivilComments_idents_Datasets
from models import CNNClassifier
from train_eval import train, evaluate, CTF
from loss import CLP_loss, ERM_loss
from torch.utils.data import DataLoader

## Only run the below code if you are using Google Colab

In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
# Colab only: clone the project repository and make it the working directory.
# The %cd target assumes the clone landed under /content; on a machine without
# that layout both commands fail (see the errors recorded in this cell's output).
!git clone https://github.com/mtzig/NLP_CTF.git
%cd /content/NLP_CTF/

fatal: could not create work tree dir 'NLP_CTF': Read-only file system
[Errno 2] No such file or directory: '/content/NLP_CTF/'
/


In [41]:
%cd /content/NLP_CTF/data
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1JXm1N6SHmzIawgH7Aa4Ag-ZVuqLX7ba7' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1JXm1N6SHmzIawgH7Aa4Ag-ZVuqLX7ba7" -O GoogleNews-vectors-negative300.bin && rm -rf /tmp/cookies.txt
%cd ./civil_comments
!wget wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1pVM0PGHDXrhE4dqQf-offz_Xv8SoPx0X' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1pVM0PGHDXrhE4dqQf-offz_Xv8SoPx0X" -O civil_comments.csv && rm -rf /tmp/cookies.txt
%cd ..
%cd ..

[Errno 2] No such file or directory: '/content/NLP_CTF/data'
/
GoogleNews-vectors-negative300.bin: Read-only file system
[Errno 2] No such file or directory: './civil_comments'
/
civil_comments.csv: Read-only file system
/
/


In [22]:
# Run parameters, mirroring the command-line args of run.py

# DEVICE is the device PyTorch should use (cuda, mps, cpu); run.py reads it from args.device
# train_method is the method used to train the model (baseline, blind, CLP, augment)
# lmbda is the lambda in the CLP method
# nontoxic determines whether only nontoxic comments will be used (only for CLP)
# verbose prints the results
# trials is the number of trials to run
# epochs is the number of epochs to train the model

In [76]:
# NOTE(review): machine-specific path with no leading slash, so it is resolved
# relative to the current directory (the earlier cells' output shows that was `/`).
# Point this at your own checkout of NLP_CTF before running.
%cd Users/teaganjohnson/Desktop/NLP_CTF/

(Alfred Adult All-in-One Course) Willard A. Palmer, Morton Manus, Amanda Vick Lethco - Adult All-In-One Course_ Lesson-Theory-Technic_ Level 1. 1-Alfred Publishing Co., Inc. (1994).pdf
2022+Total+Rewards+Overview+-+Classified (1).pdf
2022-08-29 Concussion Summary Report.pdf
2022-10-21 11-21.pdf
Acceptable+Docs+for+I-9+and+Citizenship+Verification.pdf
Bayes_HW_4_Teagan.pdf
[34mCAC[m[m/
CAC Client Questions Week 2.pdf
[34mCarleton College[m[m/
Code+of+Ethical+Conduct.pdf
Comps_Paper.pdf
[34mConsulting[m[m/
Corporate+Onboarding+Guide+(AZ+NC+PA).pdf
[34mCover Letter + Resumes[m[m/
Cracking the Coding Interview - 189 Programming Questions and Solutions (6th Edition) [EnglishOnlineClub.com].pdf
Entry+Level+Offer+-+FAQ.doc
GoogleNews-vectors-negative300.bin
IMG_0528.HEIC
[34mImportant Documents[m[m/
[34mInstallers[m[m/
JOHNSON Teagan Summer 2022 Poster.pdf
Johnson_Teagan_Asana_Cover_Letter.pdf
Johnson_Teagan_Fast_Cover_Letter.pdf
Johnson_Teagan_Fortinet_Cover_Letter.pdf
John

## Set parameters below

In [95]:
# Run configuration (the same knobs run.py takes as command-line args).
train_method = "baseline"  # training method: baseline, blind, CLP or augment
DEVICE = 'cpu'             # device PyTorch should use: cuda, mps or cpu
lmbda = 1                  # the lambda in the CLP method
nontoxic = True            # use only nontoxic comments (only for CLP)
verbose = 1                # print results while training / evaluating
trials = 3                 # number of trials to run
epochs = 15                # number of epochs to train the model in each trial
# Stem of the results CSV written after the last trial. The results cell reads
# `test_name`, which was never defined anywhere in this notebook, so a full run
# ended in a NameError; default it to the training method.
test_name = train_method

## Set up word embeddings

In [96]:
# Load the pretrained word-embedding lookup through the project's
# init_embed_lookup() (per the original note: read from the GoogleNews file in the repo).
embed_lookup = init_embed_lookup()
# Wrap the lookup's NumPy `vectors` array as a torch tensor; torch.from_numpy
# shares memory with the source array instead of copying it.
pretrained_embed = torch.from_numpy(embed_lookup.vectors)

In [97]:
# just gets the training data from jigsaw (used to train the model)
# can specify if we want the CLP trained data or not (CLP returns adversarial examples)
def get_training_data():
    """Load the Jigsaw training set and the Jigsaw dev split.

    Reads the module-level ``train_method``, ``DEVICE`` and ``embed_lookup``.

    Side effect: binds the module-level name ``A``. When ``train_method`` is
    ``'CLP'``, ``get_jigsaw_datasets`` returns a second value (per the original
    note, the adversarial examples used by CLP) that the training cell passes
    to ``CLP_loss`` as ``A``. It used to be bound to a local and thrown away,
    so the training cell failed with a NameError on ``A``. For every other
    training method ``A`` is reset to ``None`` so a stale value from an
    earlier CLP run cannot leak into a later run.

    Returns:
        tuple: ``(train_data, jig_dev_data)`` -- unchanged two-value interface.
    """
    global A

    if train_method == 'CLP':
        train_data, A = get_jigsaw_datasets(device=DEVICE, data_type='CLP', embed_lookup=embed_lookup)
    else:
        A = None
        train_data = get_jigsaw_datasets(device=DEVICE, data_type=train_method, embed_lookup=embed_lookup)

    # Dev split of the Jigsaw data; the training cell evaluates on it after training.
    jig_dev_data = get_jigsaw_dev_data(device=DEVICE, embed_lookup=embed_lookup)

    return train_data, jig_dev_data

In [98]:
def get_civil_info():
    """Load the two Civil Comments datasets used for testing.

    Reads the module-level ``DEVICE`` and ``embed_lookup``.

    Returns:
        tuple: ``(cc_data, cc_idents_data)`` -- the Civil Comments test set,
        followed by the Civil Comments identities dataset.
    """
    loader_kwargs = dict(device=DEVICE, embed_lookup=embed_lookup)
    return (
        get_CivilComments_Datasets(**loader_kwargs),
        get_CivilComments_idents_Datasets(**loader_kwargs),
    )

In [99]:
# Build one CTF dataset per evaluation set, in the order the results table expects:
# civil_eval, civil_train, synth_toxic, synth_nontoxic.
ctf_datas = [
    get_ctf_datasets(device=DEVICE, dataset=ctf_name, embed_lookup=embed_lookup)
    for ctf_name in ('civil_eval', 'civil_train', 'synth_toxic', 'synth_nontoxic')
]

100%|██████████| 310/310 [00:09<00:00, 31.88it/s]
100%|██████████| 531/531 [00:00<00:00, 697.58it/s] 
100%|██████████| 11683/11683 [00:05<00:00, 2114.19it/s]
100%|██████████| 11478/11478 [00:03<00:00, 3356.42it/s]


In [100]:
# Build the datasets, then wrap each one in a DataLoader that serves batches of 64.
train_data, jig_dev_data = get_training_data()
cc_data, cc_idents_data = get_civil_info()

# Loader order: Jigsaw train set, Jigsaw dev split, Civil Comments data,
# Civil Comments identities.
train_loader, jig_loader, cc_loader, cc_idents_loader = (
    DataLoader(split, batch_size=64)
    for split in (train_data, jig_dev_data, cc_data, cc_idents_data)
)

100%|██████████| 159571/159571 [00:21<00:00, 7261.28it/s] 
100%|██████████| 63978/63978 [00:06<00:00, 9687.49it/s] 
100%|██████████| 133782/133782 [00:11<00:00, 11229.44it/s]
100%|██████████| 768/768 [00:00<00:00, 60957.26it/s]


In [101]:
# One DataLoader (batch size 64) per CTF dataset, in the same order as ctf_datas.
ctf_loaders = [DataLoader(ctf_data, batch_size=64) for ctf_data in ctf_datas]
print('done')

done


In [103]:
# Train a fresh model `trials` times. After each trial, evaluate it on the
# Jigsaw dev split, the Civil Comments set and the Civil Comments identities
# set, and measure the CTF gap on every CTF loader. Each trial contributes one
# row to `results`; all rows are written to a CSV at the end.
results = []
n_trials = int(trials)
n_epochs = int(epochs)

for trial in range(n_trials):
    print('{:=^50}'.format(f'Trial {trial+1}/{n_trials}'))

    print('initializing model')
    # first we do garbage collection,
    # as torch sometimes does not free model when we reinitialize it
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    # initialize models
    model = CNNClassifier(pretrained_embed, device=DEVICE)
    if train_method == 'CLP':
        # The CLP weight is the `lmbda` parameter from the configuration cell
        # (this used to read an undefined name, `lambda_clp`).
        # `A` is read from the notebook namespace: the cell that loads the CLP
        # training data must have made it available.
        loss_fn = CLP_loss(torch.nn.CrossEntropyLoss(), A, lmbda=float(lmbda), only_nontox=nontoxic)
    else:
        loss_fn = ERM_loss(torch.nn.CrossEntropyLoss())
    optimizer = torch.optim.AdamW(model.parameters())

    print('done')
    # train model
    for epoch in range(n_epochs):
        print(f'Epoch {epoch+1}/{n_epochs}')
        train(train_loader, model, loss_fn, optimizer, verbose=verbose)

    print('evaluating model')
    # evaluate loss/accuracy/sensitivity/specificity/AUC on Jigsaw dev set
    jig_results = evaluate(jig_loader, model, get_loss=True, verbose=verbose)

    # evaluate loss/accuracy/sensitivity/specificity/AUC on civil comments test set
    cc_results = evaluate(cc_loader, model, get_loss=True, verbose=verbose)

    # evaluate loss/accuracy/sensitivity/specificity/AUC on civil comments idents only test set
    cc_idents_results = evaluate(cc_idents_loader, model, get_loss=True, verbose=verbose)

    # evaluate CTF gap over every eval dataset
    ctf_gaps = tuple(CTF(ctf_loader, model) for ctf_loader in ctf_loaders)

    # TODO: evaluate tp, tn on training identity in Civil Comments

    # One row per trial; the concatenation order must match `columns` below.
    results.append(jig_results + cc_results + cc_idents_results + ctf_gaps)


# output results as csv
columns = ('jig_loss', 'jig_accuracy', 'jig_tp', 'jig_tn', 'jig_auc',
            'cc_loss', 'cc_accuracy', 'cc_tp', 'cc_tn', 'cc_auc',
            'cci_loss', 'cci_accuracy', 'cci_tp', 'cci_tn', 'cci_auc',
            'ctf_cc_eval', 'ctf_cc_train',
            'ctf_synth_toxic', 'ctf_synth_nontoxic',
            )

# `test_name` is not defined by the parameter cell of this notebook. Fall back
# to the training method so that hours of training do not end in a NameError
# before the results are saved.
try:
    csv_stem = test_name
except NameError:
    csv_stem = train_method

print('outputting results to csv')
df_results = pd.DataFrame(np.array(results), columns=columns)
df_results.to_csv(f'{csv_stem}.csv', index=False)

initializing model


  0%|          | 0/2494 [00:00<?, ?it/s]

done
Epoch 1/15


 14%|█▎        | 338/2494 [02:52<18:21,  1.96it/s]


KeyboardInterrupt: 