In [1]:
# This is necessary to include the project directory into system paths
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Now we can import from project directory
from project.src.data_classes import (
    Opinion,
    ProjectUtils,
    gender_words,
    charlesworth_2021_words
)
# 
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from wefe.query import Query
plt.rcParams["figure.figsize"] = (10,10)

In [3]:
aclu_opinions = pd.read_csv('aclu_opinions.csv').sort_values('year_filed')
ProjectUtils.summarize_opinions_metadata(aclu_opinions)
# ProjectUtils.plot_corpora_scatter(aclu_opinions, type_token='type_count')
# ProjectUtils.plot_corpora_scatter(aclu_opinions, type_token='token_count')

Number of opinions: 92
Number of unique cases: 37
case_name
Harris v. McRae                                                      4
Thornburgh v. American College of Obstetricians and Gynecologists    4
Craig v. Boren                                                       4
Orr v. Orr                                                           4
Nevada Department of Human Resources v. Hibbs                        4
Griswold v. Connecticut                                              4
Green v. Brennan                                                     3
United States v. Virginia                                            3
Taylor v. Louisiana                                                  3
Stanton v. Stanton                                                   3
Rust v. Sullivan                                                     3
Rostker v. Goldberg                                                  3
Roe v. Wade                                                          3
Kahn v. Shevin   

In [4]:
# Only interested in 'Roe v. Wade' decision
# roe_wade_opinions = aclu_opinions[aclu_opinions['case_name'] == 'Roe v. Wade']

# ... or over time
# Only interested in 'Second wave feminism' i.e. 1960-1980
# opinions_second_wave = aclu_opinions[aclu_opinions['year_filed'].between(1962, 1982)]

# ProjectUtils.summarize_opinions_metadata(opinions_second_wave)
# ProjectUtils.plot_corpora_scatter(opinions_second_wave, type_token='type_count')
# ProjectUtils.plot_corpora_scatter(opinions_second_wave, type_token='token_count')

In [15]:
target_words_male = charlesworth_2021_words['malemax']
target_words_female = charlesworth_2021_words['femalemax']
attribute_words_home = charlesworth_2021_words['homemax']
attribute_words_work = charlesworth_2021_words['workmax']
attribute_words_good = charlesworth_2021_words['good']
attribute_words_bad = charlesworth_2021_words['bad']

['he', 'his', 'him', 'man', 'men', 'himself', 'son', 'father', 'husband', 'guy', 'boy', 'brother', 'male', 'dad', 'boyfriend', 'king', 'grandfather', 'uncle', 'grandson', 'nephew', 'lad', 'gentleman', 'fraternity', 'bachelor', 'prince', 'dude', 'gentlemen', 'stepfather', 'daddy', 'sir', 'bloke', 'groom', 'stepson', 'suitor', 'godfather', 'grandpa', 'fella', 'hero', 'fatherhood', 'fraternities', 'papa', 'pa']
['her', 'hers', 'she', 'women', 'woman', 'herself', 'daughter', 'mother', 'wife', 'gal', 'girl', 'sister', 'female', 'mom', 'girlfriend', 'queen', 'grandmother', 'aunt', 'granddaughter', 'niece', 'lady', 'gentlewoman', 'sorority', 'bachelorette', 'princess', 'dame', 'gentlewomen', 'stepmother', 'mommy', 'ms', 'madam', 'bride', 'stepdaughter', 'maiden', 'godmother', 'grandma', 'missus', 'heroine', 'motherhood', 'sororities', 'mama', 'ma']


In [6]:
# opinions = roe_wade_opinions['text'].values

# opinions = opinions_second_wave['text'].values

# pregnancy_cases = ["Griswold v. Connecticut", "Eisenstadt v. Baird", "Roe v. Wade", "Doe v. Bolton", "Geduldig v. Aiello", "Bellotti v. Baird"]
# opinions = aclu_opinions.loc[aclu_opinions['case_name'].isin(pregnancy_cases)]['text'].values

opinions = aclu_opinions['text'].values

# len(text)
# tokenized opinion text
# print(opinions)
print(len(opinions))
ugram = Counter()
bigram = Counter()
for i, text in enumerate(opinions):
    print(i)
    clean_text = ProjectUtils.clean_text(text)
    ugram, bigram = ProjectUtils.count_grams(clean_text, ugram, bigram)
    # print(ugram.most_common(5))

92
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91


In [23]:
male_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in target_words_male]))
female_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in target_words_female]))
print('Male target word counts')
for (w,c) in male_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
print('Female target word counts')
for (w,c) in female_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
    
work_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_work]))
home_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_home]))
print('Work attribute word counts')
for (w,c) in work_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
print('Home attribute word counts')
for (w,c) in home_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
    
good_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_good]))
bad_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_bad]))
print('Good attribute word counts')
for (w,c) in good_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
print('Bad attribute word counts')
for (w,c) in bad_ugram.most_common(20):
    print('\t{} {}'.format(w, c))


Male target word counts
	his 449
	he 303
	men 246
	male 147
	him 90
	father 82
	husband 28
	man 26
	brother 24
	pa 23
	himself 17
	son 11
	gentleman 9
	king 7
	prince 6
	bachelor 2
	gentlemen 1
	boy 1
Female target word counts
	women 967
	her 463
	woman 269
	she 220
	female 163
	mother 126
	wife 32
	herself 21
	daughter 11
	lady 5
	girl 5
	motherhood 3
	hers 3
	ma 2
	princess 1
	sister 1
Work attribute word counts
	work 201
	job 98
	business 49
	professional 42
	office 27
	executive 23
	money 23
	corporation 17
	hiring 16
	hire 7
	career 7
	salary 7
	trade 4
	corporate 2
	manage 1
Home attribute word counts
	family 229
	children 125
	house 80
	marriage 43
	parent 42
	home 35
	relative 26
	marry 12
	cousins 2
	kitchen 2
	caregiving 1
	baby 1


In [8]:
# ProjectUtils.plot_ngram_barchart(ugram)
# ProjectUtils.plot_ngram_barchart(bigram)
# ProjectUtils.plot_unigram_wordcloud(ugram)
# ProjectUtils.most_common_unigram_heatmap(ugram, bigram)
# ProjectUtils.most_common_bigram_heatmap(bigram)


In [9]:
# Should only need to do this once per weat test condition
ppmi, cooccurence, x2i, i2x = ProjectUtils.model_matrices(ugram, bigram)


In [17]:
svd = ProjectUtils.model_svd(ppmi, k=500)

<class 'scipy.sparse._csc.csc_matrix'>
U shape (13233, 500)
S shape (500,)
VT shape (500, 13233)


In [18]:
U = svd['U']

most_common_gender_words = dict(male_ugram.most_common(10) + female_ugram.most_common(10)).keys()
most_common_words = dict(ugram.most_common(20)).keys()
most_common_attribute_words = dict(work_ugram.most_common(10) + home_ugram.most_common(10)).keys()

k = 5
# for x in most_common_gender_words:
# for x in most_common_words:
for x in most_common_attribute_words:
    dd = np.dot(U, U[x2i[x]]) # Cosine similarity for this unigram against all others.
    s = ''
    # Compile the list of nearest neighbor descriptions.
    # Argpartition is faster than argsort and meets our needs.
    for i in np.argpartition(-1 * dd, k + 1)[:k + 1]:
        if i2x[i] == x: continue
        s += '%s, %.3lf\n ' % (i2x[i], dd[i])
    print('%s, %d\n %s' % (x, ugram[x], s))
    print('-' * 10)

his, 449
 he, 0.809
 him, 0.738
 illegitimate, 0.497
 holding, 0.504
 holds, 0.495
 
----------
he, 303
 holding, 0.461
 his, 0.809
 him, 0.669
 gave, 0.401
 holds, 0.456
 
----------
men, 246
 methodology, 0.514
 military, 0.546
 meet, 0.500
 michael, 0.579
 male, 0.451
 
----------
male, 147
 maintains, 0.468
 men, 0.451
 man, 0.557
 january, 0.456
 lower, 0.442
 
----------
him, 90
 he, 0.669
 his, 0.738
 himself, 0.411
 greens, 0.379
 helpful, 0.368
 
----------
father, 82
 extension, 0.686
 fathers, 0.800
 evidences, 0.726
 fathercontrols, 0.661
 familyaid, 0.655
 
----------
husband, 28
 husbands, 0.634
 illegitimate, 0.464
 holds, 0.416
 improvidently, 0.338
 iii, 0.336
 
----------
man, 26
 male, 0.557
 long, 0.484
 market, 0.383
 many, 0.453
 matter, 0.368
 
----------
brother, 24
 brothers, 0.431
 capricious, 0.403
 brethren, 0.393
 avoiding, 0.373
 broadens, 0.338
 
----------
pa, 23
 pennsylvanias, 0.571
 paramilitary, 0.600
 parade, 0.517
 patent, 0.539
 nonviability, 0.51

In [19]:
model_kv = ProjectUtils.svd_keyed_vectors(svd, x2i, k=500)
model_kv

<gensim.models.keyedvectors.KeyedVectors at 0x1d500d5a0>

In [20]:
gender_query = Query(
    target_sets=[
        charlesworth_2021_words['femalemax'],
        charlesworth_2021_words['malemax'],
    ],
    attribute_sets=[
        charlesworth_2021_words['homemax'],
        charlesworth_2021_words['workmax'],
    ],
    target_sets_names=["Female terms", "Male Terms"],
    attribute_sets_names=["Home terms", "Work terms"],
)

weat_results = ProjectUtils.kv_weat_test(model_kv, gender_query)
print(weat_results)

{'query_name': 'Female terms and Male Terms wrt Home terms and Work terms', 'result': -0.15872989992106645, 'weat': -0.15872989992106645, 'effect_size': -0.3685438874299413, 'p_value': 0.8556144385561444}
{'query_name': 'Female terms and Male Terms wrt Home terms and Work terms', 'result': -0.15872989992106645, 'weat': -0.15872989992106645, 'effect_size': -0.3685438874299413, 'p_value': 0.8556144385561444}


In [21]:
moral_query = Query(
    target_sets=[
        charlesworth_2021_words['femalemax'],
        charlesworth_2021_words['malemax'],
    ],
    attribute_sets=[
        charlesworth_2021_words['good'],
        charlesworth_2021_words['bad'],
    ],
    target_sets_names=["Female terms", "Male Terms"],
    attribute_sets_names=["Good terms", "Bad terms"],
)

weat_results = ProjectUtils.kv_weat_test(model_kv, moral_query)
print(weat_results)

{'query_name': 'Female terms and Male Terms wrt Good terms and Bad terms', 'result': -0.29545366181991994, 'weat': -0.29545366181991994, 'effect_size': -0.2157163486273141, 'p_value': 0.7375262473752625}
{'query_name': 'Female terms and Male Terms wrt Good terms and Bad terms', 'result': -0.29545366181991994, 'weat': -0.29545366181991994, 'effect_size': -0.2157163486273141, 'p_value': 0.7375262473752625}
