## Get sentences

In [1]:
import pandas as pd
import os
import json
from nltk.tokenize import sent_tokenize


In [18]:
# Root directory of the text corpus and the shard sub-directories to scan.
root_dir = '/john1/scr1/baom/text'
wiki_dirs = [f'{root_dir}/{shard}' for shard in ('AA', 'AB', 'AC')]

# Substrings to search for in article text and in individual sentences.
search_tokens = ['Muslim']
# Number of matching sentences to collect before the search stops.
num_samples = 100
# CUDA device index for the model; -1 keeps everything on the CPU.
device = -1


In [19]:
def get_samples(wiki_dirs):
    """Collect sentences that contain any of the module-level ``search_tokens``.

    Every file found under each directory in ``wiki_dirs`` is read as JSON
    lines: one JSON object per line with at least ``title`` and ``text`` keys.
    Articles whose text contains none of the search tokens are skipped before
    sentence tokenization, so they do not contribute to the sentence count.

    The search stops as soon as ``num_samples`` (module-level) hits have been
    collected; if the corpus is exhausted first, whatever was found is
    returned instead of falling through and returning ``None``.

    Args:
        wiki_dirs: Iterable of directory paths that are walked recursively.

    Returns:
        A ``(hits, num_sents)`` tuple. ``hits`` is a list of dicts with keys
        ``title``, ``sentence``, ``sentence_idx`` (index of the sentence within
        its article), ``path`` (file the article came from) and ``toks``
        (comma-joined search tokens found in the sentence). ``num_sents`` is
        the number of sentences tokenized across all matching articles.
    """
    hits = []
    num_sents = 0

    for wiki_dir in wiki_dirs:
        for subdir, _dirs, files in os.walk(wiki_dir):
            for fname in files:
                wiki_path = os.path.join(subdir, fname)
                # JSON text is UTF-8; do not depend on the platform's default encoding.
                with open(wiki_path, "r", encoding="utf-8") as wiki_file:
                    # Stream the file line by line instead of loading it whole.
                    for line in wiki_file:
                        if not line.strip():
                            continue  # tolerate blank lines between records
                        article = json.loads(line)
                        title = article['title']
                        text = article['text']

                        # Cheap whole-article pre-filter before the costly sentence split.
                        contained_tokens = [tok for tok in search_tokens if tok in text]
                        if not contained_tokens:
                            continue

                        sentences = sent_tokenize(text)
                        num_sents += len(sentences)

                        for i, sent in enumerate(sentences):
                            toks = [tok for tok in contained_tokens if tok in sent]
                            if toks:
                                hits.append({
                                    "title": title,
                                    "sentence": sent,
                                    "sentence_idx": i,
                                    "path": wiki_path,
                                    "toks": ",".join(toks),
                                })

                            if len(hits) == num_samples:
                                return hits, num_sents

    # Corpus exhausted before reaching num_samples: return the partial result.
    return hits, num_sents


In [4]:
# Run the search over the configured shard directories and report how many
# sentences were tokenized along the way.
hits, num_sents = get_samples(wiki_dirs)
print(f'searched thru {num_sents} sentences')
df = pd.DataFrame(hits)
# Give the sentence column a minimum width so long sentences stay readable.
# The CSS property is 'min-width'; 'width-min' is not a CSS property and is
# ignored when the table is rendered.
df.style.set_properties(subset=['sentence'], **{'min-width': '300px'})

searched thru 599 sentences


## Get activations

In [None]:
"""Given any data directory containing a doc.txt file, uses BERT or GPT to generate
a corresponding tokens.pickle file and a corresponding activations.npz file."""
from transformers import GPT2Tokenizer, GPT2Model, BertTokenizer, BertModel
import os
import argparse
import pickle
import torch
import numpy as np
import sys
import ast
import pandas as pd
from tqdm.notebook import tqdm
sys.path.insert(0, os.path.abspath('.'))  # add CWD to path


In [15]:
# Upper bound on the number of documents to process; None disables the bound.
max_docs = None
# Upper bound on the number of contexts to read; None disables the bound.
max_contexts = None
# Upper bound on the token count of an acceptable document; None disables the bound.
max_toks = 30

# Which pretrained model family to load: 'bert' or 'gpt'.
model_type = 'bert'

# Seed and fraction used when sub-sampling rows (e.g. frac = 0.02 for a 2% sample).
random_state = 1
frac = 1.0


In [None]:
# Load the tokenizer/model pair selected by `model_type`. Both models are
# configured to return the hidden states of every layer.
tokenizer = None
model = None

if model_type == 'bert':
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertModel.from_pretrained('bert-base-cased', output_hidden_states=True)
elif model_type == 'gpt':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
else:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down rather than surfacing the configuration error.
    raise ValueError(f"Incorrect model_type set: {model_type!r} (expected 'bert' or 'gpt').")

if device != -1:
    # `device` starts as a CUDA index and is rebound to a torch.device below.
    # The isinstance guard keeps the cell re-runnable after that rebinding.
    if not isinstance(device, torch.device):
        torch.cuda.set_device(device)
        device = torch.device("cuda", device)
    # move the model to the GPU
    model.to(device)

# Keep only rows whose sentence is shorter than 512 characters, then draw the
# configured (seeded) sample from what is left.
is_short_enough = df['sentence'].map(len) < 512
df = df[is_short_enough]
df_sub = df.sample(frac=frac, random_state=random_state)

# A 'tokens' column, when present, is not carried into the sampled frame.
if 'tokens' in df_sub.columns:
    df_sub = df_sub.drop(columns=['tokens'])

# Create a list of contexts. Each context will be a tuple: (doc's tokens, position in doc).
contexts = []
# Create a dictionary to map layer name ('arr_<l>') to the activations of all
# processed docs, concatenated along the first axis (one row per context).
layers = {}
# Per-layer list of per-doc arrays. They are concatenated once after the loop;
# concatenating inside the loop re-copies everything gathered so far each time.
layer_chunks = {}
n_docs_consolidated = 0
# NOTE(review): n_long_docs is never incremented and max_toks is never enforced
# in this cell, so the "Ignored ..." message below always reports 0. Confirm
# whether length filtering is intended before relying on that number.
n_long_docs = 0
n_failed_docs = 0

for _, row in tqdm(df_sub.iterrows(), total=len(df_sub)):

    sent = row['sentence']
    inputs = tokenizer(sent, return_tensors="pt")
    tokens = [tokenizer.decode(i).replace(' ', '') for i in inputs['input_ids'].tolist()[0]]

    try:
        # Inference only: no autograd graph is needed. The inputs must live on
        # the same device as the model, which may have been moved to a GPU.
        with torch.no_grad():
            outputs = model(**inputs.to(model.device))
        # Stack the per-layer hidden states and drop the size-1 batch axis.
        hidden_state = torch.stack(outputs.hidden_states, dim=0)
        hidden_state = torch.squeeze(hidden_state, dim=1)
    except Exception as e:
        # Best effort: report the offending sentence and skip it entirely, so
        # that `contexts` and `layers` stay aligned row for row.
        print(str(e))
        print(row['sentence'])
        n_failed_docs += 1
        continue

    # One context per token position of this doc.
    contexts.extend((tokens, tok_i) for tok_i in range(len(tokens)))

    # Bring activations back to host memory before converting to NumPy.
    doc_activations = hidden_state.detach().cpu().numpy()
    for l, layer_activations in enumerate(doc_activations):
        layer_chunks.setdefault(f'arr_{l}', []).append(layer_activations)

    print(f'Doc {n_docs_consolidated}: ({len(tokens)} tokens) --> {len(contexts)} total contexts')
    n_docs_consolidated += 1
    print(n_docs_consolidated)
    if n_docs_consolidated == max_docs:
        break  # Done

layers = {name: np.concatenate(chunks) for name, chunks in layer_chunks.items()}

# Report the shape of the last layer's activations; None if no doc was processed.
activations_shape = layers[list(layers)[-1]].shape if layers else None
print(f'Found {n_docs_consolidated} docs & {len(contexts)} contexts and obtained activations of shape {activations_shape}')
if n_failed_docs:
    print(f'Skipped {n_failed_docs} docs whose forward pass failed.')
if max_toks:
    print(f'Ignored {n_long_docs} docs longer than {max_toks} tokens.')
