In [None]:
from collections import Counter
import pandas as pd

This notebook takes a list of verbs from female and male characters generated by [movie-dialogue.ipynb](movie-dialogue.ipynb). It computes PPMI scores for those words as described in [Gálvez et al.](https://link.springer.com/content/pdf/10.1007/s11199-019-01019-x.pdf).

In [None]:
# Intelligence lexicon developed by Gálvez and colleagues in "Half a Century of
# Stereotyping Associations Between Gender and Intellectual Ability in Films".
# Each row below groups the entries that share a stem.
words_connoting_intelligence = {
    "ingenious", "genius", "ingeniousness", "ingeniously",
    "bright", "brightness", "brightly",
    "brilliant", "brilliance", "brilliantly",
    "clever", "cleverness", "cleverly",
    "intelligent", "intelligence", "intelligently",
}

## Construct Contingency Table

In [None]:
# Concatenate all tokens in all examples containing the pronoun "she"
# (currently a hard-coded sample)
tokens_in_she_examples = ["oh", ",", "for", "chrissake", ",", "she", "was", "an", "actor"]

# Count each type, leaving out the pronoun "she" itself because it would make
# the count of words in the vocab too large. Filtering while counting (rather
# than popping afterwards) cannot raise KeyError when the pronoun is absent and
# does not leave a stray value as the cell's output.
types_in_she_examples = Counter(
    token for token in tokens_in_she_examples if token != "she"
)

In [None]:
# Sum the occurrence counts of all types, split by lexicon membership
vocab = ["the", "oh"] # TODO: find vocabulary

# CT00 accumulates occurrences of lexicon words; CT01 accumulates occurrences of
# the remaining in-vocab words. Types in neither the lexicon nor the vocab are
# ignored.
lexicon_she_CT00, not_lexicon_she_CT01 = 0, 0
for word_type, count in types_in_she_examples.items():
    # Add the number of occurrences (not 1 per distinct type) so the
    # contingency table reflects token frequencies. The loop variable is
    # named `word_type` to avoid shadowing the builtin `type`.
    if word_type in words_connoting_intelligence:
        lexicon_she_CT00 += count
    elif word_type in vocab:
        not_lexicon_she_CT01 += count


In [None]:
# Counts for the "he" row (hard-coded values)
lexicon_he_CT10 = 4
not_lexicon_he_CT11 = 0

# 2x2 contingency table: one row per pronoun, one column per word category
cont_table = pd.DataFrame(
    data=[
        [lexicon_she_CT00, not_lexicon_she_CT01],
        [lexicon_he_CT10, not_lexicon_he_CT11],
    ],
    index=["she", "he"],
    columns=["Words in lexicon", "Other words"],
)


## Compute Odds Ratio

In [None]:
# Odds of a lexicon word versus any other word, computed per pronoun row
# NOTE(review): a zero "Other words" count makes the corresponding odds a
# division by zero — confirm how that case should be handled.
she_odds = cont_table.loc["she", "Words in lexicon"] / cont_table.loc["she", "Other words"]
he_odds = cont_table.loc["he", "Words in lexicon"] / cont_table.loc["he", "Other words"]

# Ratio of the "she" odds to the "he" odds
odds_ratio = she_odds / he_odds

In [None]:
# Bare expression: shows the computed odds ratio as this cell's output
odds_ratio