In [1]:
# This is necessary to include the project directory into system paths
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Now we can import from project directory
from project.src.data_classes import (
    Opinion,
    ProjectUtils,
    gender_words,
    charlesworth_2021_words
)
# 
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from wefe.query import Query
plt.rcParams["figure.figsize"] = (10,10)

In [3]:
aclu_opinions = pd.read_csv('aclu_opinions.csv').sort_values('year_filed')
ProjectUtils.summarize_opinions_metadata(aclu_opinions)
# ProjectUtils.plot_corpora_scatter(aclu_opinions, type_token='type_count')
# ProjectUtils.plot_corpora_scatter(aclu_opinions, type_token='token_count')

Number of opinions: 92
Number of unique cases: 37
case_name
Harris v. McRae                                                      4
Thornburgh v. American College of Obstetricians and Gynecologists    4
Craig v. Boren                                                       4
Orr v. Orr                                                           4
Nevada Department of Human Resources v. Hibbs                        4
Griswold v. Connecticut                                              4
Green v. Brennan                                                     3
United States v. Virginia                                            3
Taylor v. Louisiana                                                  3
Stanton v. Stanton                                                   3
Rust v. Sullivan                                                     3
Rostker v. Goldberg                                                  3
Roe v. Wade                                                          3
Kahn v. Shevin   

In [4]:
# Only interested in 'Roe v. Wade' decision
# roe_wade_opinions = aclu_opinions[aclu_opinions['case_name'] == 'Roe v. Wade']

# ... or over time
# Only interested in 'Second wave feminism' i.e. 1960-1980
# opinions_second_wave = aclu_opinions[aclu_opinions['year_filed'].between(1962, 1982)]

# ProjectUtils.summarize_opinions_metadata(opinions_second_wave)
# ProjectUtils.plot_corpora_scatter(opinions_second_wave, type_token='type_count')
# ProjectUtils.plot_corpora_scatter(opinions_second_wave, type_token='token_count')

In [25]:
target_words_male = charlesworth_2021_words['malemax']
target_words_female = charlesworth_2021_words['femalemax']
attribute_words_home = charlesworth_2021_words['homemax']
attribute_words_work = charlesworth_2021_words['workmax']
attribute_words_good = charlesworth_2021_words['good']
attribute_words_bad = charlesworth_2021_words['bad']

In [40]:
# opinions = roe_wade_opinions['text'].values

# opinions = opinions_second_wave['text'].values

# pregnancy_cases = ["Griswold v. Connecticut", "Eisenstadt v. Baird", "Roe v. Wade", "Doe v. Bolton", "Geduldig v. Aiello", "Bellotti v. Baird"]
# opinions = aclu_opinions.loc[aclu_opinions['case_name'].isin(pregnancy_cases)]['text'].values

# opinions = aclu_opinions['text'].values

# Liberal
opinions = aclu_opinions.loc[
    (aclu_opinions['scdb_decision_direction'].isin([1.])) &
    (aclu_opinions['category']=='majority')
]['text'].values

# len(text)
# tokenized opinion text
# print(opinions)
print(len(opinions))
ugram = Counter()
bigram = Counter()
for i, text in enumerate(opinions):
    print(i)
    clean_text = ProjectUtils.clean_text(text)
    ugram, bigram = ProjectUtils.count_grams(clean_text, ugram, bigram)
    # print(ugram.most_common(5))

14
0
1
2
3
4
5
6
7
8
9
10
11
12
13


In [41]:
male_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in target_words_male]))
female_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in target_words_female]))
print('Male target word counts')
for (w,c) in male_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
print('Female target word counts')
for (w,c) in female_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
    
work_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_work]))
home_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_home]))
print('Work attribute word counts')
for (w,c) in work_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
print('Home attribute word counts')
for (w,c) in home_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
    
good_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_good]))
bad_ugram = Counter(dict([(k,v) for k,v in ugram.items() if k in attribute_words_bad]))
print('Good attribute word counts')
for (w,c) in good_ugram.most_common(20):
    print('\t{} {}'.format(w, c))
print('Bad attribute word counts')
for (w,c) in bad_ugram.most_common(20):
    print('\t{} {}'.format(w, c))


Male target word counts
	his 36
	men 31
	he 22
	male 21
	him 4
	man 3
	brother 2
	king 2
	father 1
	son 1
Female target word counts
	women 187
	her 69
	woman 58
	she 49
	mother 18
	female 11
	herself 7
	daughter 1
Work attribute word counts
	job 23
	business 16
	executive 12
	office 11
	corporation 11
	work 9
	professional 6
	hiring 6
	hire 2
	money 2
	salary 1
	corporate 1
Home attribute word counts
	family 66
	house 18
	children 15
	relative 8
	parent 3
	marriage 2
	cousins 2
	home 2
Good attribute word counts
	good 18
Bad attribute word counts
	abuse 4
	stress 3
	death 1
	bad 1


In [42]:
# ProjectUtils.plot_ngram_barchart(ugram)
# ProjectUtils.plot_ngram_barchart(bigram)
# ProjectUtils.plot_unigram_wordcloud(ugram)
# ProjectUtils.most_common_unigram_heatmap(ugram, bigram)
# ProjectUtils.most_common_bigram_heatmap(bigram)


In [43]:
# Should only need to do this once per weat test condition
ppmi, cooccurence, x2i, i2x = ProjectUtils.model_matrices(ugram, bigram)


In [44]:
svd = ProjectUtils.model_svd(ppmi, k=500)

<class 'scipy.sparse._csc.csc_matrix'>
U shape (5601, 500)
S shape (500,)
VT shape (500, 5601)


In [45]:
U = svd['U']

most_common_gender_words = dict(male_ugram.most_common(10) + female_ugram.most_common(10)).keys()
most_common_words = dict(ugram.most_common(20)).keys()
most_common_attribute_words = dict(work_ugram.most_common(10) + home_ugram.most_common(10)).keys()

k = 5
# for x in most_common_gender_words:
# for x in most_common_words:
for x in most_common_attribute_words:
    dd = np.dot(U, U[x2i[x]]) # Cosine similarity for this unigram against all others.
    s = ''
    # Compile the list of nearest neighbor descriptions.
    # Argpartition is faster than argsort and meets our needs.
    for i in np.argpartition(-1 * dd, k + 1)[:k + 1]:
        if i2x[i] == x: continue
        s += '%s, %.3lf\n ' % (i2x[i], dd[i])
    print('%s, %d\n %s' % (x, ugram[x], s))
    print('-' * 10)

job, 23
 issuewhether, 0.951
 jobrelated, 0.951
 ironic, 0.920
 jobrelatedness, 0.890
 invisible, 0.879
 
----------
business, 16
 button, 0.587
 buffers, 0.583
 bylaws, 0.568
 campaign, 0.511
 broadened, 0.483
 
----------
executive, 12
 expand, 0.677
 exceeded, 0.562
 exempt, 0.505
 exempting, 0.544
 eventual, 0.488
 
----------
office, 11
 offering, 0.581
 obligations, 0.631
 officer, 0.563
 ones, 0.608
 okla, 0.519
 
----------
corporation, 11
 continuity, 0.600
 contra, 0.634
 controls, 0.650
 credit, 0.556
 contemporary, 0.480
 
----------
work, 9
 workforce, 0.894
 workmanship, 0.886
 womenthat, 0.891
 worse, 0.871
 words, 0.851
 
----------
professional, 6
 protections, 0.621
 proposals, 0.641
 proclamation, 0.640
 print, 0.601
 profession, 0.594
 
----------
hiring, 6
 hired, 0.829
 holds, 0.779
 holdingthat, 0.795
 heterosexual, 0.761
 hire, 0.799
 
----------
hire, 2
 holdingthat, 0.879
 holds, 0.852
 heterosexual, 0.985
 height, 0.861
 hired, 0.978
 
----------
money, 2
 mi

In [46]:
model_kv = ProjectUtils.svd_keyed_vectors(svd, x2i, k=500)
model_kv

<gensim.models.keyedvectors.KeyedVectors at 0x1d40601f0>

In [47]:
gender_query = Query(
    target_sets=[
        charlesworth_2021_words['femalemax'],
        charlesworth_2021_words['malemax'],
    ],
    attribute_sets=[
        charlesworth_2021_words['homemax'],
        charlesworth_2021_words['workmax'],
    ],
    target_sets_names=["Female terms", "Male Terms"],
    attribute_sets_names=["Home terms", "Work terms"],
)

weat_results = ProjectUtils.kv_weat_test(model_kv, gender_query)
print(weat_results)

{'query_name': 'Female terms and Male Terms wrt Home terms and Work terms', 'result': -0.30621195326481637, 'weat': -0.30621195326481637, 'effect_size': -1.0497267291079782, 'p_value': 0.9859014098590141}
{'query_name': 'Female terms and Male Terms wrt Home terms and Work terms', 'result': -0.30621195326481637, 'weat': -0.30621195326481637, 'effect_size': -1.0497267291079782, 'p_value': 0.9859014098590141}


In [49]:
moral_query = Query(
    target_sets=[
        charlesworth_2021_words['femalemax'],
        charlesworth_2021_words['malemax'],
    ],
    attribute_sets=[
        charlesworth_2021_words['good'],
        charlesworth_2021_words['bad'],
    ],
    target_sets_names=["Female terms", "Male Terms"],
    attribute_sets_names=["Good terms", "Bad terms"],
)

weat_results = ProjectUtils.kv_weat_test(model_kv, moral_query, lost_vocab_threshold=1.)
print(weat_results)

{'query_name': 'Female terms and Male Terms wrt Good terms and Bad terms', 'result': 0.06449852808145806, 'weat': 0.06449852808145806, 'effect_size': 0.2783498618501594, 'p_value': 0.2806719328067193}
{'query_name': 'Female terms and Male Terms wrt Good terms and Bad terms', 'result': 0.06449852808145806, 'weat': 0.06449852808145806, 'effect_size': 0.2783498618501594, 'p_value': 0.2806719328067193}
