In [1]:
import os
import json
from dotenv import load_dotenv
from pathlib import Path

dotenv_path = Path("../../.env")
if dotenv_path.exists():
    print("Loaded .env file!")
    load_dotenv(str(dotenv_path))


data = json.load(open(Path(os.environ["PROJECT_HOME_DIR"]) / "data/train.json"))

In [2]:
matches = list()

total_num_tokens = 0

for d in data:
    matches.extend(list(set([x for x in d["tokens"] if x.istitle() and len(x) > 2])))
    total_num_tokens += len(d["tokens"])

len(matches), len(set(matches)), total_num_tokens

(163888, 13668, 4992533)

In [3]:
matches[:20]

['What',
 'Éditions',
 'Annex1',
 'Application',
 'Approach',
 'Annex',
 'This',
 'Mind',
 'The',
 'Thinking',
 'Avril',
 'Around',
 'After',
 'Creativity',
 'They',
 'Buzan',
 'According',
 'Paris',
 'Sylla',
 'Dessine']

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.float16, device_map=0)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
s = slice(0, 20)

prompt = '[INST] You are an expert at identifying names.\n# Context\n{context}\n\nIn the above context, could "{name}" be a first or last name? Respond Yes/No [/INST]'
prompts = [prompt.format(name=name, context=" ".join(ctx)) for name, ctx in zip(matches[s], [""]*len(matches[s]))]

tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id

inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    logits = model(**inputs.to(model.device)).logits[:, -1, :].softmax(-1)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
print(matches[s])

['What', 'Éditions', 'Annex1', 'Application', 'Approach', 'Annex', 'This', 'Mind', 'The', 'Thinking', 'Avril', 'Around', 'After', 'Creativity', 'They', 'Buzan', 'According', 'Paris', 'Sylla', 'Dessine']


In [7]:
logits.shape

torch.Size([20, 32000])

In [8]:
print(tokenizer.encode([".", " ."]), tokenizer.encode(" yes Yes .yes.Yes"), tokenizer.encode(" No no .no.No"))

yes_tokens = [5081, 5592, 9780, 5613]
no_tokens = [1770, 708, 1510, 2501]

print(tokenizer.convert_ids_to_tokens(yes_tokens))
print(tokenizer.convert_ids_to_tokens(no_tokens))

[1, 842, 1, 28705, 842] [1, 28705, 5081, 5592, 842, 9780, 28723, 5613] [1, 28705, 1770, 708, 842, 1510, 28723, 2501]
['▁yes', '▁Yes', 'yes', 'Yes']
['▁No', '▁no', 'no', 'No']


In [9]:
logits[:, yes_tokens+no_tokens].round(decimals=2)

tensor([[0.0000, 0.3000, 0.0000, 0.0000, 0.5900, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0900, 0.0000, 0.0000, 0.8400, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2600, 0.0000, 0.0000, 0.5800, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1500, 0.0000, 0.0000, 0.7800, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2300, 0.0000, 0.0000, 0.6800, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4500, 0.0000, 0.0000, 0.4700, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2000, 0.0000, 0.0000, 0.5400, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5400, 0.0000, 0.0000, 0.3600, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1900, 0.0000, 0.0000, 0.7100, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1800, 0.0000, 0.0000, 0.7400, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7000, 0.0000, 0.0000, 0.1700, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1200, 0.0000, 0.0000, 0.8200, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2200, 0.0000, 0.0000, 0.6900, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1100, 0.0000, 0.000

In [17]:
from functools import partial

for p, _, l in zip(matches[s], [""]*len(matches[s]), logits[:, [yes_tokens[1], no_tokens[0]]]):
    print(p, list(map(partial(round, ndigits=3), l.tolist())))

What [0.3, 0.592]
Éditions [0.095, 0.838]
Annex1 [0.255, 0.575]
Application [0.152, 0.782]
Approach [0.226, 0.679]
Annex [0.446, 0.471]
This [0.203, 0.536]
Mind [0.536, 0.36]
The [0.191, 0.71]
Thinking [0.182, 0.738]
Avril [0.7, 0.174]
Around [0.119, 0.816]
After [0.22, 0.693]
Creativity [0.108, 0.857]
They [0.242, 0.568]
Buzan [0.803, 0.113]
According [0.091, 0.774]
Paris [0.627, 0.246]
Sylla [0.648, 0.183]
Dessine [0.441, 0.389]


In [37]:
from tqdm.auto import tqdm


@torch.inference_mode()
def get_preds(names, contexts=None):

    if contexts is None:
        contexts = [""] * len(names)

    prompt = '[INST] # Context\n{context}\n\nIn the above context, could "{name}" be a first or last name? Respond Yes/No [/INST]'
    prompts = [
        prompt.format(name=name, context=ctx) for name, ctx in zip(names, contexts)
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

    bs = 32

    all_probs = []
    # [5592, 1770] _Yes, _No 
    # [5613, 2501] (yes, no)
    yes_no_tokens = [5592, 1770] 

    for i in tqdm(range(0, len(inputs["input_ids"]), bs)):
        input_ids = inputs["input_ids"][i : i + bs].to(model.device)
        attention_mask = inputs["attention_mask"][i : i + bs].to(model.device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        probs = logits[:, -1, :].softmax(-1)[:, yes_no_tokens].cpu().tolist()

        all_probs.extend(probs)

    return all_probs

In [21]:
unq_matches = list(set(matches))

probs = get_preds(list(set(matches)))

  0%|          | 0/428 [00:00<?, ?it/s]

In [22]:
probs[:10]

[[0.2097785472869873, 0.6986692547798157],
 [0.5639786124229431, 0.36986619234085083],
 [0.483333021402359, 0.4400797188282013],
 [0.2821289896965027, 0.552381694316864],
 [0.6366611123085022, 0.22348907589912415],
 [0.4513509273529053, 0.4340599477291107],
 [0.7455400228500366, 0.15028692781925201],
 [0.25410473346710205, 0.653968334197998],
 [0.3341909945011139, 0.5298798084259033],
 [0.5592300891876221, 0.3261960446834564]]

In [23]:
unq_matches = list(set(matches))
unq_matches[:10]

['Defer',
 'Champions',
 'Query',
 'My\u200b',
 'Rya',
 'Fossil',
 'Kings',
 'Chai',
 'Gulf',
 'Konnect']

In [24]:
len(probs), len([x for x in probs if x[0] > x[1]])

(13668, 4733)

In [25]:
gt_names = []

for d in data:
    for t, l in zip(d["tokens"], d["labels"]):
        if "NAME_STUDENT" in l and len(t) > 2:
            gt_names.append(t)

len(set(gt_names))

1249

"You are an expert name identifier.\nIs the word \"{word}\" a name? Respond yes/no\n"
1207/1249

"You are an expert name identifier.\nCould \"{word}\" a first or last name? Respond yes/no\n"
1247/1249

In [30]:
pos_matches = [y for x,y  in zip(probs, unq_matches) if x[0] > x[1]]
missed = list(set(gt_names) - set(pos_matches))

print(len(probs))
print(len(pos_matches))
print("False positives", len(set(pos_matches) - set(gt_names)))

len(set(gt_names) & (set(unq_matches))), len(set(gt_names) & (set(pos_matches)))

13668
4733
False positives 3526


(1249, 1207)

In [27]:
missed_probs = get_preds(missed)
missed_probs[:10]

  0%|          | 0/2 [00:00<?, ?it/s]

[[0.3675011098384857, 0.5062541365623474],
 [0.2925245761871338, 0.5465078353881836],
 [0.4013194441795349, 0.5033673048019409],
 [0.4014034867286682, 0.43402066826820374],
 [0.36264950037002563, 0.4340346157550812],
 [0.31771907210350037, 0.47695276141166687],
 [0.22140875458717346, 0.5437273979187012],
 [0.31792664527893066, 0.5756908059120178],
 [0.4214402735233307, 0.4965779483318329],
 [0.29537197947502136, 0.6253024339675903]]

In [28]:
missed

['Sin',
 'Maga',
 'Llamas',
 'Jhonatan',
 'Elnazer',
 'Fofa',
 'Dla',
 'Corona',
 'Yessica',
 'Rehab',
 'Buonincontro',
 'Born',
 'Yap',
 'Juber',
 'Street',
 'Francy',
 'Murcia',
 'Asia',
 'Sales',
 'Frutos',
 'Pink',
 'Islam',
 'Saly',
 'Villa',
 'Okey',
 'Cuda',
 'Zizza',
 'Ramadan',
 'Pong',
 'Momentos',
 'Maprok',
 'Montenegro',
 'Der',
 'Peroni',
 'Bhai',
 'Papa',
 'Bravo',
 'Telfah',
 'Sunday',
 'Puerta',
 'Barbie',
 'Mossad']

# With more context

In [31]:
num_tokens_before = 20
num_tokens_after = 20

contexts = []
gt_names = []

for d in data:
    for i, (t, l) in enumerate(zip(d["tokens"], d["labels"])):
        if "NAME_STUDENT" in l:
            start = max(0, i - num_tokens_before)
            end = min(len(d["tokens"]), i + num_tokens_after)
            contexts.append(d["tokens"][start:end])
            gt_names.append(t)

In [32]:
len(contexts), len(set(["".join(t) for t in contexts]))

(2461, 2461)

In [33]:
s = slice(-20, None)

prompt = '[INST] # Context\n{context}\n\nIn the above context, could "{name}" be a first or last name? Respond Yes/No [/INST]'
prompts = [prompt.format(name=name, context=" ".join(ctx)) for name, ctx in zip(gt_names[s], contexts[s])]

tokenizer.padding_side = "left"
tokenizer.pad_token_id = tokenizer.eos_token_id

inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

with torch.no_grad():
    logits = model(**inputs.to(model.device)).logits[:, -1, :].softmax(-1)

print(gt_names[s])
print(contexts[s])

logits[:, yes_tokens+no_tokens].round(decimals=2)

['Walsh', 'Said', 'Catania', 'Brittany', 'Jackson', 'Gopal', 'Kumar', 'Jose', 'Luis', 'Llamas', 'Jose', 'Luis', 'Llamas', 'Oscar', 'Silvia', 'Cervantes', 'Carlos', 'Hernandez', 'Carlos', 'Hernandez']
[['Mind', 'Mapping', '–', 'Reflection', '–', 'Robert', 'Walsh', '\n\n', '1', '.', 'Challenge', ':', 'Describe', 'your', 'challenge', ',', 'including', 'all', 'relevant', 'information', '.', '\n\n', 'I', 'wanted', 'to', 'apply'], ['Said', 'Catania', '\n\n', 'Peer', '-', 'graded', 'Assignment', ':', 'Reﬂection', '\n\n', 'October', '13', ',', '2020', '\n\n', 'Using', 'Visualization', '\n\n', 'As', 'a'], ['Said', 'Catania', '\n\n', 'Peer', '-', 'graded', 'Assignment', ':', 'Reﬂection', '\n\n', 'October', '13', ',', '2020', '\n\n', 'Using', 'Visualization', '\n\n', 'As', 'a', 'tool'], ['Using', 'Visualization', 'to', 'create', 'lessons', 'about', 'educational', 'platforms', '\n\n', 'by', 'Brittany', 'Jackson', '\n\n', 'Challenge', '\n\n', 'I', 'am', 'a', 'high', 'school', 'Spanish', 'teacher', 

tensor([[0.0000, 0.7300, 0.0000, 0.0000, 0.1400, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5200, 0.0000, 0.0000, 0.3500, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6300, 0.0000, 0.0000, 0.2800, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7000, 0.0000, 0.0000, 0.1900, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7000, 0.0000, 0.0000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5500, 0.0000, 0.0000, 0.2700, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6100, 0.0000, 0.0000, 0.1900, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4800, 0.0000, 0.0000, 0.4400, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4800, 0.0000, 0.0000, 0.4500, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5400, 0.0000, 0.0000, 0.3900, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4600, 0.0000, 0.0000, 0.3700, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4200, 0.0000, 0.0000, 0.4600, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5800, 0.0000, 0.0000, 0.3400, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7900, 0.0000, 0.000

In [38]:
ctx = [" ".join(contexts[5])]*5
print(ctx[0])
n = ["Design", "Thinking", "Avril", "Sylla", " ", " "]

get_preds(n, ctx)

  thus facilitating the implementation of related actions . 

 Design Thinking for innovation reflexion - Avril 2021 - Nathalie Sylla 

 Annex 1 : Mind Map Shared facilities project 




  0%|          | 0/1 [00:00<?, ?it/s]

[[0.32757481932640076, 0.6119903326034546],
 [0.07549765706062317, 0.8984438180923462],
 [0.6531009078025818, 0.270134299993515],
 [0.7930570244789124, 0.15616217255592346],
 [0.3426230847835541, 0.3453103303909302]]

In [40]:
with_ctx = get_preds(gt_names, [" ".join(c) for c in contexts])

  0%|          | 0/77 [00:00<?, ?it/s]

In [42]:
sum([x[0] > x[1] for x in with_ctx]), len(with_ctx)

(2426, 2461)

In [56]:
import random
negatives = []

for gt, ctx in zip(gt_names, contexts):
    titled = [x for x in ctx if x.istitle() and len(x) > 2 and x not in gt_names]
    if len(titled) > 0:
        negatives.append(random.choice(titled))
    else:
        rand = random.choice(ctx)
        while rand in gt_names:
            rand = random.choice(ctx)
        negatives.append(rand)

negatives[:10]

['Design',
 'Challenge',
 'Design',
 'Avril',
 'Design',
 'Design',
 'The',
 'Design',
 'This',
 'This']

In [57]:
neg_with_ctx = get_preds(negatives, [" ".join(c) for c in contexts])

  0%|          | 0/77 [00:00<?, ?it/s]

In [58]:
sum([x[0] > x[1] for x in neg_with_ctx]), len(neg_with_ctx)

(234, 2461)

In [59]:
for p, gt in zip(neg_with_ctx, negatives):
    if p[0] > p[1]:
        print(gt, p)

Avril [0.6200621128082275, 0.29061728715896606]
The [0.4301931858062744, 0.4169575273990631]
Gitam [0.5238758325576782, 0.4079948961734772]
Vice [0.4675936698913574, 0.37572208046913147]
The [0.4841519296169281, 0.2780250608921051]
Name [0.8964372873306274, 0.040955688804388046]
Interviewer [0.7718675136566162, 0.14846888184547424]
Interviewee [0.5190407633781433, 0.36805498600006104]
Kasyap [0.48509907722473145, 0.3719383776187897]
General [0.45022788643836975, 0.43297991156578064]
The [0.45732221007347107, 0.4099409580230713]
my [0.6711328625679016, 0.17783252894878387]
Wise [0.5046449899673462, 0.39920708537101746]
Wise [0.4775359332561493, 0.4148904085159302]
member [0.48131704330444336, 0.41817548871040344]


 [0.40514621138572693, 0.2918158769607544]
P [0.8111827373504639, 0.11505018174648285]
By [0.6241849064826965, 0.2748248875141144]
Lengua [0.4959598779678345, 0.3516882061958313]
Profesor [0.45655497908592224, 0.3366425633430481]
Example [0.46362417936325073, 0.33655506372451