## Model Analysis for Indirect Object Identification (IOI) in GPT2
Blackbox analysis of GPT-neo-20b parameters to try to better understand IOI behavior.

In [39]:
import openai
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from tabulate import tabulate
from math import exp
import random
import plotly.express as px

# NOTE(review): "api" looks like a placeholder, not a usable key — set a real
# key before running any cell that calls the API.
openai.api_key = "api"

# Send all openai client requests to the goose.ai endpoint.
openai.api_base = "https://api.goose.ai/v1"

In [4]:
# completion generation
def get_completion(prompt, engine, logprobs=20):
    """Return a printable completion of `prompt` produced by `engine`.

    A single, non-streamed completion of at most 30 tokens is requested at
    temperature 0. The result is the engine name on one line, followed by
    the generated text wrapped in square brackets on the next line, with a
    trailing newline.
    """
    request = dict(
        engine=engine,
        prompt=prompt,
        max_tokens=30,
        n=1,
        stream=False,
        temperature=0,
        logprobs=logprobs,
    )
    response = openai.Completion.create(**request)
    generated = response['choices'][0]['text']

    return "".join([engine, '\n', "[", generated, "]", '\n'])

# get the logprobabilities for next token possibilities
def get_logprobs(prompt, engine, logprobs=20):
    """Return the next-token candidates for `prompt`, mapped to probabilities.

    Exactly one token is requested from `engine`, and the `top_logprobs`
    entry of that first position is read. Each log-probability is passed
    through `exp`, so the values of the returned dict are plain
    probabilities, not logs.
    """
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=1,
        n=1,
        stream=False,
        logprobs=logprobs,
    )
    first_position = response['choices'][0]['logprobs']['top_logprobs'][0]

    probabilities = {}
    for token, logprob in first_position.items():
        probabilities[token] = exp(logprob)
    return probabilities

    
# plot a scatter of the next-token probabilities that the same model assigns under two different prompts
def plot_compare_prompt_logprobs(prompt1, prompt2, xaxis, yaxis, model="gpt-neo-125m"):
    """Scatter the next-token probabilities of `prompt1` against `prompt2`.

    Both prompts are sent to `model`. Every candidate token returned for
    either prompt becomes one labelled marker whose x value is its
    probability under `prompt1` and whose y value is its probability under
    `prompt2`; a token returned for only one prompt gets 0 on the other axis.
    `xaxis` and `yaxis` are the axis titles, and the model name is used as
    the centred figure title. The figure is shown, nothing is returned.
    """
    probs_x = get_logprobs(prompt1, model)
    probs_y = get_logprobs(prompt2, model)

    # Union of the candidates of both prompts, in one fixed order for x/y/text.
    tokens = list(probs_x.keys() | probs_y.keys())
    scatter = go.Scatter(
        x=[probs_x.get(token, 0) for token in tokens],
        y=[probs_y.get(token, 0) for token in tokens],
        mode='markers+text',
        text=tokens,
        textposition="bottom center",
    )
    figure = go.Figure(data=scatter)

    centred_title = {
        'text': model,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    figure.update_layout(
        title=centred_title,
        xaxis_title={'text': xaxis},
        yaxis_title={'text': yaxis},
    )

    figure.show()
    
    # plot two graphs side by side
def plot_two(prompt1, prompt2, prompt3, prompt4, xaxis1, yaxis1, xaxis2, yaxis2, model="gpt-neo-125m"):
    """Show two prompt-vs-prompt probability scatter plots side by side.

    The left panel plots `prompt1` (x) against `prompt2` (y) and is labelled
    with `xaxis1` / `yaxis1`; the right panel plots `prompt3` (x) against
    `prompt4` (y) and is labelled with `xaxis2` / `yaxis2`. All four prompts
    are sent to `model`, whose name is used as the centred figure title.
    The figure is shown, nothing is returned.
    """

    def pair_scatter(prompt_x, prompt_y):
        # One labelled marker per candidate token; a token returned for only
        # one of the two prompts is placed at 0 on the other axis.
        probs_x = get_logprobs(prompt_x, model)
        probs_y = get_logprobs(prompt_y, model)
        tokens = list(probs_x.keys() | probs_y.keys())
        return go.Scatter(
            x=[probs_x.get(token, 0) for token in tokens],
            y=[probs_y.get(token, 0) for token in tokens],
            mode='markers+text',
            text=tokens,
            textposition="bottom center",
        )

    # (x prompt, y prompt, x title, y title) for each subplot column.
    panels = (
        (prompt1, prompt2, xaxis1, yaxis1),
        (prompt3, prompt4, xaxis2, yaxis2),
    )

    fig = make_subplots(rows=1, cols=len(panels))
    for col, (prompt_x, prompt_y, x_title, y_title) in enumerate(panels, start=1):
        fig.add_trace(pair_scatter(prompt_x, prompt_y), row=1, col=col)
        fig.update_xaxes(title_text=x_title, row=1, col=col)
        fig.update_yaxes(title_text=y_title, row=1, col=col)

    fig.update_layout(
        title={
            'text': model,
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        },
    )

    fig.show()
    

## First steps
I would like to confirm the observations that emerged in neo 20b. First one would be gender bias.
***
Initial observation: interestingly, the smaller model performs better than the bigger model by far.

In [22]:
A = "Jim" #person A
B = "Alicia" #person B
P = "palace" #place
O = "vase" #object

In [23]:
prompt1 = f"When {A} and {B} got a {O} at the {P}, {B} decided to give it to"
prompt2 = f"When {B} and {A} got a {O} at the {P}, {B} decided to give it to"
prompt3 = f"When {A} and {B} got a {O} at the {P}, {A} decided to give it to"
prompt4 = f"When {B} and {A} got a {O} at the {P}, {A} decided to give it to"

In [24]:
# Pairing 1 - same deciding person within a panel, name order swapped between
# the axes (prompt1 vs prompt2, prompt3 vs prompt4); default model, then 20b.
plot_two(prompt1, prompt2, prompt3, prompt4, f"{A} {B} {O} {P} {B} []", f"{B} {A} {O} {P} {B} []", f"{A} {B} {O} {P} {A} []", f"{B} {A} {O} {P} {A} []")
plot_two(prompt1, prompt2, prompt3, prompt4, f"{A} {B} {O} {P} {B} []", f"{B} {A} {O} {P} {B} []", f"{A} {B} {O} {P} {A} []", f"{B} {A} {O} {P} {A} []", "gpt-neo-20b")

# Pairing 2 - same name order within a panel, deciding person swapped between
# the axes (prompt1 vs prompt3, prompt2 vs prompt4); default model, then 20b.
plot_two(prompt1, prompt3, prompt2, prompt4, f"{A} {B} {O} {P} {B} []", f"{A} {B} {O} {P} {A} []", f"{B} {A} {O} {P} {B} []", f"{B} {A} {O} {P} {A} []")
# The 20b call previously repeated pairing 1; it now mirrors the line above.
plot_two(prompt1, prompt3, prompt2, prompt4, f"{A} {B} {O} {P} {B} []", f"{A} {B} {O} {P} {A} []", f"{B} {A} {O} {P} {B} []", f"{B} {A} {O} {P} {A} []", "gpt-neo-20b")


### test the different models' performance

In [185]:
# Engine identifiers queried by the multi-model comparison loops.
engines = ['gpt-neo-125m', 'gpt-neo-1-3b', 'gpt-neo-2-7b', 'gpt-neo-20b']
# Male first names (later cells pair names drawn from here with "he").
Alist = ['Bob', 'Josh', 'Nathan', 'John', 'George', 'Michael', 'Steve']
# Female first names (later cells pair names drawn from here with "she").
Blist = ['Alice', 'Jess', 'Kim', 'Lucy', 'Sophia', 'Mia', 'Gina']
# Places that fill the {P} slot of the prompt templates.
Plist = ['palace', 'house', 'grocery store', 'market', 'warehouse', 'restaurant']
# Objects that fill the {O} slot of the prompt templates.
Olist = ['ring', 'coin', 'vase', 'flower', 'card', 'key']

In [186]:
# For every engine, estimate how much probability goes to the correct name
# (" {A}") and to the pronoun " him" on randomly filled IOI prompts, and print
# the per-engine averages as percentages.
ntests = 10
results = []
for _ in range(ntests):
    A = random.choice(Alist)
    B = random.choice(Blist)
    P = random.choice(Plist)
    O = random.choice(Olist)
    prompt = f"When {A} and {B} got a {O} at the {P}, {B} decided to give it to"
    lps = {engine: get_logprobs(prompt, engine=engine) for engine in engines}
    # get_logprobs only returns the top candidates, so a token may be absent;
    # count it as probability 0 instead of failing with KeyError.
    results.append([[e, v.get(f' {A}', 0), v.get(' him', 0)] for e, v in lps.items()])

# Per engine: [summed P(correct name), summed P(" him")].
avg_vals = {engine: [0, 0] for engine in engines}
for result in results:
    for e, a, b in result:
        avg_vals[e][0] += a
        avg_vals[e][1] += b

# Turn the sums into mean percentages; ntests itself keeps the test count.
to_percent = 100 / ntests
for k in avg_vals:
    avg_vals[k][0] *= to_percent
    avg_vals[k][1] *= to_percent
print(tabulate([['engine', 'correct name', 'correct pronoun']] + [[e, avg_vals[e][0], avg_vals[e][1]] for e in avg_vals]))

------------  ------------------  ------------------
engine        correct name        correct pronoun
gpt-neo-125m  49.111893764484115  12.337547895376142
gpt-neo-1-3b  49.98841900436324   7.14554920576402
gpt-neo-2-7b  20.141514087995077  5.936571005762362
gpt-neo-20b   12.918405029236848  7.1659668506484895
------------  ------------------  ------------------


In [134]:
MA = "Eric" #male A
MB = "George" #male B
FA = "Jess" #female A
FB = "Kim" #female B
P = "shop" #place
O = "plate" #object

In [135]:
prompt1 = f"When {MA} and {MB} got a {O} at the {P}, {MB} decided to give it to"
prompt2 = f"When {FB} and {FA} got a {O} at the {P}, {FB} decided to give it to"

In [130]:
plot_compare_prompt_logprobs(prompt1, prompt2, "All Male", "All Female")

### gender bias
the gender bias by context does not seem to appear in this model. Smaller models have better performance for some reason.
### surnames
Next thing to try: Surnames and anaphors

In [187]:
ASlist = ["Smith", "Washington", "Johnson", "Williams", "Brown"]
BSlist = ["Scott", "Anderson", "Miller", "Thompson", "Robinson"]

In [191]:
A = random.choice(Alist) #person A
AS = random.choice(ASlist) #person A's surname
B = random.choice(Blist) #person B 
BS = random.choice(BSlist) #person B's surname
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [192]:
prompt1 = f"When {A} {AS} and {A} {BS} got a {O} at the {P}, {BS} decided to give it to"
prompt2 = f"When {A} {AS} and {B} {BS} got a {O} at the {P}, {B} decided to give it to"

In [193]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"{A} {AS} {A} {BS} {O} {P} {BS}", f"{A} {AS} {B} {BS} {O} {P} {B}")
plot_compare_prompt_logprobs(prompt1, prompt2, f"{A} {AS} {A} {BS} {O} {P} {BS}", f"{A} {AS} {B} {BS} {O} {P} {B}", "gpt-neo-20b")

In [194]:
A = "George Scott" #person A
MB = "Michael Smith" #person B male version
FB = "Jess Smith" #person B female version
P = "grocery store" #place
O = "ring" #object

In [195]:
prompt1 = f"{A} and {MB} got a {O} at the {P}. {MB} decided to give it to"
prompt2 = f"{A} and {FB} got a {O} at the {P}. {FB} decided to give it to"

In [198]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"{A} {MB} {O} {P} {MB}", f"{A} {FB} {O} {P} {FB}")
plot_compare_prompt_logprobs(prompt1, prompt2, f"{A} {MB} {O} {P} {MB}", f"{A} {FB} {O} {P} {FB}", "gpt-neo-20b")

Still performs better than 20b <br>
Now we check the behavior by carefully removing surnames to sort out where the issue is.

### Findings
So far, this model performs better at IOI than 20b, which makes me think that, rather than requiring actual IOI, this test can be solved more simply and reliably by applying simple rules. An expert linguist could probably come up with a set of rules over the mentioned entities that covers most test cases without any proper understanding of the context.
#### next step
Check if a pronoun anaphor allows the model to understand the agents at play.

In [211]:
A = "Michael" #person A
B = "George" #person B
MC = random.choice(Alist) #person C male version
FC = random.choice(Blist) #person C female version
BP = "he" #person B's pronoun
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [212]:
prompt1 = f"{A}, {B}, and {MC} got a {O} at the {P}. When {B} left, {BP} gave it to"
prompt2 = f"{A}, {B}, and {FC} got a {O} at the {P}. When {B} left, {BP} gave it to"

In [213]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"{A} {B} {MC} {O} {P} {B} {BP} []", f"{A} {B} {FC} {O} {P} {B} {BP} []")

In [217]:
A = random.choice(Blist) #person A
AP = "she" #person A pronoun
B = random.choice(Alist) #person B
BP = "he" #person B pronoun
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [218]:
prompt1 = f"{A} and {B} got a {O} at the {P}, {AP} decided to give it to"
prompt2 = f"{A} and {B} got a {O} at the {P}, {BP} decided to give it to"

In [219]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"{AP} gave it to", f"{BP} gave it to")

### Takeaway
In terms of pronouns as anaphors, this model, like the bigger model, understands their function correctly and correlates them to the correct names. We can tell that the model still uses a proximity rule: it makes an unambiguous prediction that the person most likely being referred to (in the prediction) is the name closest to the prediction site, e.g. in prompt1. <br>
### sanity check
When 3 entities show up and 2 of them are grouped together by a 'they' anaphor, it should be understood that the remaining person, who is not mentioned again, is the correct answer. 

In [214]:
A = "Michael" #person A
B = "George" #person B
MC = random.choice(Alist) #person C male version
FC = random.choice(Blist) #person C female version
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [215]:
prompt1 = f"{A}, {B}, and {MC} got a {O} at the {P}. When {B} and {A} left, they gave it to"
prompt2 = f"{A}, {B}, and {FC} got a {O} at the {P}. When {B} and {A} left, they gave it to"

In [216]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"{A} {B} {MC} {O} {P} {B} {A} []", f"{A} {B} {FC} {O} {P} {B} {A} []")

The model still understands anaphors and they/them grouping.
## Confirm Hypothesis
Now we want to confirm the main hypothesis

In [224]:
LlistA = ["the singer", "the farmer", "the pianist", "the guitarist", "the actor"]
LlistB = ["the butcher", "the worker", "the painter", "the musician", "the scientist"]

In [225]:
A = random.choice(Alist) #person A
Al = random.choice(LlistA) #person A's alias
B = random.choice(Blist) #person B
Bl = random.choice(LlistB)#person B's alias
P = random.choice(Plist)#place
O = random.choice(Olist)#object

In [226]:
prompt1 = f"When {A} {Al} and {B} {Bl} got a {O} at the {P}, {B} decided to give it to the"
prompt2 = f"When {A} {Al} and {B} {Bl} got a {O} at the {P}, {Bl} decided to give it to the"
prompt3 = f"When {A} {Al} and {B} {Bl} got a {O} at the {P}, {B} decided to give it to"
prompt4 = f"When {A} {Al} and {B} {Bl} got a {O} at the {P}, {Bl} decided to give it to"

In [228]:
# Labels are "<subject> to <expected target>": prompt1/prompt2 end in "to the"
# (alias expected), prompt3/prompt4 end in "to" (name expected); prompt1 and
# prompt3 use the name {B} as subject, prompt2 and prompt4 the alias {Bl}.
plot_two(prompt1, prompt2, prompt3, prompt4, "name to alias", "alias to alias", "name to name", "alias to name")

The model does not seem to understand anaphors in this case. Regarding aliases, the model does a much worse job than 20b: it only gets the correct answers in the 'name to name' case and fails every test case involving aliases (alias to alias, alias to name, name to alias).
***
### next steps
Counting hypothesis will be tested now.


In [233]:
A = random.choice(Alist) #example A person
B = random.choice(Blist) #example B person
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [236]:
prompt1 = f"When {A} and I got a {O} at the {P}, {A} decided to give it to"
prompt2 = f"When {B} and I got a {O} at the {P}, {B} decided to give it to"
prompt3 = f"When {A} and I got a {O} at the {P}, I decided to give it to"
prompt4 = f"When you and {A} got a {O} at the {P}, you decided to give it to"

In [237]:
plot_two(prompt1,prompt2,prompt3,prompt4, f"{A} I {A}", f"{B} I {B}", f"{A} I I", f"you {A} you")


## furthermore
Now we will test the counting entities hypothesis in greater depth.

In [258]:
A = random.choice(Blist) #person A
AP = "she" #person A pronoun
B = random.choice(Alist) #person B
BP = "he" #person B pronoun
C = random.choice(Alist) #person C
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [259]:
prompt1 = f"When {A}, {B}, and I got the {O} at the {P}, I apologized to {B} since I decided to give it to"
prompt2 = f"When {C}, {A}, and I got the {O} at the {P}, I apologized to {A} since I decided to give it to"
prompt3 = f"When {A}, {B}, and I got the {O} at the {P}, {B} apologized to {A} since {BP} decided to give it to"
prompt4 = f"When {C}, {A}, and I got the {O} at the {P}, {A} apologized to {C} since {AP} decided to give it to"

In [260]:
# Each label paraphrases the tail of its prompt. The fourth label names {A}
# as the one apologizing, matching prompt4 ("{A} apologized to {C} since {AP}").
plot_two(prompt1, prompt2, prompt3, prompt4, f"I apologized to {B}, I", f"I apologized to {A}, I", f"{B} apologized to {A}, {BP}", f"{A} apologized to {C}, {AP}")

plot_two(prompt1, prompt2, prompt3, prompt4, f"I apologized to {B}, I", f"I apologized to {A}, I", f"{B} apologized to {A}, {BP}", f"{A} apologized to {C}, {AP}", "gpt-neo-20b")


The model almost successfully predicts that the next token of prompts 3 and 4 should be 'me'; however, that is only the second most probable answer, with the first one being the closest entity. This seems very similar to what happens in 20b: the two rules override each other depending on the context. <br>
If we compare the graphs to those of 20b, it seems like this model relies more on the counting rule than the proximity, since it predicted ‘Josh’ correctly on prompt 2 (left side y axis.)

In [263]:
A = random.choice(Blist) #person A (female)
AP = "she" #person A pronoun
B = random.choice(Alist)  #person B (male)
C = random.choice(Alist)  #person C (gender m)
CP = "he" #person C pronoun
D = random.choice(Blist)  #person D (gender f)
DP = "she" #person D pronoun
P = random.choice(Plist)  #place
O = random.choice(Olist)  #object

In [266]:
prompt1 = f"When {A}, {C}, and {D} got the {O} at the {P}, {D} apologized to {C} since {DP} decided to give it to"
prompt2 = f"When {A}, {B}, and {D} got the {O} at the {P}, {D} apologized to {A} since {DP} decided to give it to"
prompt3 = f"When {A}, {C}, and {D} got the {O} at the {P}, {C} apologized to {A} since {CP} decided to give it to"
prompt4 = f"When {B}, {A}, and {D} got the {O} at the {P}, {A} apologized to {B} since {AP} decided to give it to"

In [268]:
plot_two(prompt1,prompt2,prompt3,prompt4, f"{D} apologized to {C} since {DP}", f"{D} apologized to {A} since {DP}", f"{C} apologized to {A} since {CP}", f"{A} apologized to {B} since {AP}")

Effects are still similar to 20b
## hypothesis
If we force the entity counts, under the right context, we can force a prediction.

In [275]:
A = random.choice(Blist) #person A
B = random.choice(Alist) #person B
C = random.choice(Alist) #person C
P = random.choice(Plist) #place
O = random.choice(Olist) #object

In [276]:
# prompt1 deliberately puts {A} into the second-name, object and place slots,
# so {A} is mentioned four times and {B} twice before the prediction site.
prompt1 = f"When {A}, {A}, and {B} got {A} at the {A}, {B} decided to give it to"
# prompt2 is the ordinary three-person prompt, plotted as the "Normal" baseline.
prompt2 = f"When {A}, {C}, and {B} got the {O} at the {P}, {B} decided to give it to"


In [278]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"Force {B}", "Normal")

Similar to 20b, the self object penalty is still strong and does not allow a subject to target itself.

In [292]:
# Draw four distinct names from Alist for persons A-D, then pick a place.
samples = random.sample(Alist, 4)
A, B, C, D = samples
P = random.choice(Plist)

In [293]:
prompt1 = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {A}, and {B} was with"
prompt2 = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {A}, and {B} followed"


In [294]:
plot_compare_prompt_logprobs(prompt1, prompt2, f"and {B} was with... [{C}]", f"and {B} followed [{C}]")

The model's understanding of context and syntactic patterns diminished greatly. We can see this by comparing its performance to other sizes.

In [304]:
# For every engine, estimate how much probability goes to the remaining person
# (" {C}", reported as "correct") versus a "trap" token, and print the
# per-engine averages as percentages.
ntests = 10
results = []
for _ in range(ntests):
    A, B, C, D = random.sample(Alist, 4)  # four distinct people
    P = random.choice(Plist)  # place
    prompt = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {A}, and {B} was with"
    lps = {engine: get_logprobs(prompt, engine=engine) for engine in engines}
    # Trap = first available of " {A}", " {B}", " them". get_logprobs only
    # returns the top candidates, so absent tokens are skipped (final default
    # 0) rather than raising KeyError; the last fallback is a real lookup, not
    # a list literal, so the sums below stay numeric.
    results.append([
        [e, v.get(f' {C}', 0), v.get(f' {A}') or v.get(f' {B}') or v.get(' them', 0)]
        for e, v in lps.items()
    ])

# Per engine: [summed P(correct), summed P(trap)].
avg_vals = {engine: [0, 0] for engine in engines}
for result in results:
    for e, a, b in result:
        avg_vals[e][0] += a
        avg_vals[e][1] += b

# Turn the sums into mean percentages; ntests itself keeps the test count.
to_percent = 100 / ntests
for k in avg_vals:
    avg_vals[k][0] *= to_percent
    avg_vals[k][1] *= to_percent
print(tabulate([['engine', 'correct', 'trap']] + [[e, avg_vals[e][0], avg_vals[e][1]] for e in avg_vals]))

------------  ------------------  ------------------
engine        correct             trap
gpt-neo-125m  28.353792598529242  19.36630228797114
gpt-neo-1-3b  74.51157708880282   3.916497067908838
gpt-neo-2-7b  76.42862547764796   2.7538671000565316
gpt-neo-20b   63.43078530067686   8.21147281459468
------------  ------------------  ------------------


In [318]:
# Draw four distinct names from Alist for persons A-D, then pick a place.
samples = random.sample(Alist, 4)
A, B, C, D = samples
P = random.choice(Plist)

In [319]:
prompt1 = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {A}, and {B} was with"
prompt2 = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {C} and {A}, and {B} was with"
prompt3 = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {C} and {A}, {A}, and {B} was with"


In [320]:
plot_two(prompt1,prompt2,prompt1,prompt3, f"Normal [{C}]", f"Force {D}, 2 counts each", f"Normal [{C}]", f"Force {D}, 3 {A}s")
plot_compare_prompt_logprobs(prompt2, prompt3, f"Force {D}, 2 counts each.", f"Force {D}, 3 {A}s")

As predicted, the model can be forced into generating the desired token. However, due to the inconsistent context understanding and the lack of a clear AND behavior, the model sometimes resorts to proximity (ABAB) even though the context is the same and only the names have changed, yielding inconsistent results.
***
We shall test the 'forceability' of the different model sizes, meaning the number of times we can force an incorrect answer out of them by tweaking the context.

In [323]:
# For every engine, estimate how often the repeated-name context pushes
# probability onto " {D}" (reported as "force%") versus one of the other
# names ("nonforce"), and print the per-engine averages as percentages.
ntests = 20
results = []
for _ in range(ntests):
    A, B, C, D = random.sample(Alist, 4)  # four distinct people
    P = random.choice(Plist)  # place
    prompt = f"When {A}, {B}, {C}, and {D} were at the {P}, {D} was with {C} and {A}, {A}, and {B} was with"
    lps = {engine: get_logprobs(prompt, engine=engine) for engine in engines}
    # Nonforce = first available of " {A}", " {C}", " {B}". get_logprobs only
    # returns the top candidates, so absent tokens are skipped (final default
    # 0) rather than raising KeyError; the last fallback is a real lookup, not
    # a list literal, so the sums below stay numeric.
    results.append([
        [e, v.get(f' {D}', 0), v.get(f' {A}') or v.get(f' {C}') or v.get(f' {B}', 0)]
        for e, v in lps.items()
    ])

# Per engine: [summed P(forced token), summed P(non-forced token)].
avg_vals = {engine: [0, 0] for engine in engines}
for result in results:
    for e, a, b in result:
        avg_vals[e][0] += a
        avg_vals[e][1] += b

# Turn the sums into mean percentages; ntests itself keeps the test count.
to_percent = 100 / ntests
for k in avg_vals:
    avg_vals[k][0] *= to_percent
    avg_vals[k][1] *= to_percent
print(tabulate([['engine', 'force%', 'nonforce']] + [[e, avg_vals[e][0], avg_vals[e][1]] for e in avg_vals]))

------------  ------------------  ------------------
engine        force%              nonforce
gpt-neo-125m  11.64691699710971   5.716365316971288
gpt-neo-1-3b  31.355363482368404  3.3114103310323224
gpt-neo-2-7b  20.514813240586953  5.29060320549657
gpt-neo-20b   36.9139503249268    4.755359160935678
------------  ------------------  ------------------


## Summary
The hypothesis has been disproven for this smaller model, which is more similar to GPT2 small. This would mean that the analysis on GPT2 small will have to be conducted on narrow cases, since generalization is inconsistent in smaller models.
