In [None]:
from collections import Counter
import pandas as pd

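This notebook takes the tokenized lines associated with female ("she") and male ("he") characters generated by [movie-dialogue.ipynb](movie-dialogue.ipynb). It compares how often words connoting intelligence occur in each group, following [Gálvez et al.](https://link.springer.com/content/pdf/10.1007/s11199-019-01019-x.pdf). The measure computed below is an odds ratio; PPMI scores are not computed yet (see the TODO in the last cell).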

In [None]:
# Intelligence lexicon developed by Gálvez and colleagues in "Half a Century of
# Stereotyping Associations Between Gender and Intellectual Ability in Films".
# Each row below holds one word family.
words_connoting_intelligence = {
    "ingenious", "genius", "ingeniousness", "ingeniously",
    "bright", "brightness", "brightly",
    "brilliant", "brilliance", "brilliantly",
    "clever", "cleverness", "cleverly",
    "intelligent", "intelligence", "intelligently",
}

## Read in Dataset
**Run the base movie-dialogue notebook ([movie-dialogue.ipynb](movie-dialogue.ipynb)) up to its `%store` magic before running the cell below.**

In [None]:
# Restore `movie_lines` from IPython's %store database into this kernel.
# It must have been saved beforehand by the movie-dialogue notebook.
%store -r movie_lines

In [None]:
# Split the dialogue rows by the pronoun recorded in the "Pronoun" column.
shes = movie_lines[movie_lines["Pronoun"].eq("she")]
hes = movie_lines[movie_lines["Pronoun"].eq("he")]

In [None]:
def get_context_words(one_gender_df):
    """Tally every token found in the ``Tokenized_Line`` column.

    Parameters
    ----------
    one_gender_df : mapping or DataFrame
        Object whose ``"Tokenized_Line"`` entry is an iterable of token
        sequences (one sequence per line).

    Returns
    -------
    collections.Counter
        Maps each distinct token (type) to its number of occurrences across
        all lines.
    """
    return Counter(
        token
        for line_tokens in one_gender_df["Tokenized_Line"]
        for token in line_tokens
    )

# Word-type frequency tables for the contexts of each pronoun.
she_context_types = get_context_words(shes)
he_context_types = get_context_words(hes)

In [None]:
# Spot check: occurrences of "my" in the "she" contexts vs. the "he" contexts.
she_context_types["my"], he_context_types["my"]

## Construct Contingency Table

In [None]:
# Sum counts of all examples in vocab and not
# The vocabulary is all of the words that occur in the context of either pronoun,
# excluding the intelligence lexicon itself (lexicon words are counted separately).
vocab = (set(she_context_types) | set(he_context_types)) - words_connoting_intelligence

def count_tokens_in_lexicon(types, lexicon=None, vocabulary=None):
    """Count token occurrences inside and outside the intelligence lexicon.

    Parameters
    ----------
    types : mapping of str -> int
        Word types mapped to their occurrence counts (e.g. a ``Counter``).
    lexicon : collection of str, optional
        Words counted as lexicon hits. Defaults to the notebook-level
        ``words_connoting_intelligence``.
    vocabulary : collection of str, optional
        Non-lexicon words that count as "other". Defaults to the
        notebook-level ``vocab``.

    Returns
    -------
    tuple of (int, int)
        ``(lexicon_tokens, other_tokens)``. Words found in neither collection
        are ignored.
    """
    if lexicon is None:
        lexicon = words_connoting_intelligence
    if vocabulary is None:
        vocabulary = vocab

    lexicon_tokens, other_tokens = 0, 0
    for word, count in types.items():
        if word in lexicon:
            # Add the word's frequency rather than 1: the contingency table
            # is built from token counts, not from distinct types.
            lexicon_tokens += count
        elif word in vocabulary:
            other_tokens += count
    return lexicon_tokens, other_tokens
            
# Fill the four cells of the 2x2 contingency table. The CTrc suffix gives the
# cell position: row 0 = "she", row 1 = "he"; column 0 = lexicon words,
# column 1 = other words.
lexicon_she_CT00, not_lexicon_she_CT01 = count_tokens_in_lexicon(she_context_types)
lexicon_he_CT10, not_lexicon_he_CT11 = count_tokens_in_lexicon(he_context_types)

In [None]:
# 2x2 contingency table: one row per pronoun, one column per word category.
cont_table = pd.DataFrame(
    data=[
        [lexicon_she_CT00, not_lexicon_she_CT01],
        [lexicon_he_CT10, not_lexicon_he_CT11],
    ],
    index=["she", "he"],
    columns=["Words in lexicon", "Other words"],
)
cont_table

## Compute Odds Ratio

In [None]:
# Odds of a lexicon word (vs. any other word) in "she" contexts, divided by
# the same odds in "he" contexts.
odds_ratio = (
    (cont_table.loc["she", "Words in lexicon"] / cont_table.loc["she", "Other words"])
    / (cont_table.loc["he", "Words in lexicon"] / cont_table.loc["he", "Other words"])
)

In [None]:
# Above 1: lexicon words are relatively more frequent in "she" contexts;
# below 1: relatively more frequent in "he" contexts.
odds_ratio

In [None]:
# TODO: look at whether PMI would be useful
# TODO: (optional): temporal and genre analysis