In [1]:
import pandas as pd
import os

# Root directory holding the pickled Tumblr data. Set the TUMBLR_DATA_DIR
# environment variable to run the notebook on another machine; otherwise the
# original location is used.
data_dirpath = os.environ.get('TUMBLR_DATA_DIR', '/usr2/mamille2/tumblr/data/')

# Measure proportion of hegemonic labels

## Define set of hegemonic labels

In [2]:
# Per-category terms whose presence in a description counts as a hegemonic label.
# Key order is significant: later cells iterate over this dict to build their tables.
hegemonic_labels = {
    # TODO: should also include 'male', 'female', 'girl', etc. -- anything that
    # isn't trans (make regular expressions).
    'gender': ['cis'],
    'sexual orientation': ['straight'],
    # TODO: should cope with any type of delimiter, or no delimiter, between these.
    'pronouns': ['she/her', 'she her', 'he/him', 'he him'],
    'ethnicity/nationality': ['white'],
}

# Per-category terms that cancel a hegemonic match when they also appear.
# Every category gets its own (initially empty) list; only pronouns has an entry so far.
hegemonic_excl_terms = {category: [] for category in hegemonic_labels}
hegemonic_excl_terms['pronouns'].append('they/them')

In [3]:
# Load the pickled, annotated blog descriptions from the data directory.
# Alternative input file: 'blog_descriptions_recent100_100posts.pkl'
descriptions_path = os.path.join(data_dirpath, 'blog_descriptions_recent100.pkl')
data = pd.read_pickle(descriptions_path)

# Show the row count, then the available columns as the cell output.
print(len(data))
data.columns

5226750


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'age_terms', 'age', 'ethnicity/nationality_terms',
       'ethnicity/nationality', 'fandoms_terms', 'fandoms', 'gender_terms',
       'gender', 'gender/sexuality_terms', 'gender/sexuality',
       'interests_terms', 'interests', 'location_terms', 'location',
       'personality type_terms', 'personality type', 'pronouns_terms',
       'pronouns', 'relationship status_terms', 'relationship status',
       'roleplay_terms', 'roleplay', 'roleplay/fandoms_terms',
       'roleplay/fandoms', 'sexual orientation_terms', 'sexual orientation',
       'weight_terms', 'weight'],
      dtype='object')

In [6]:
# Basic stats: for every annotation category, the number of descriptions that
# carry it and the share of all descriptions that represents, plus a 'none'
# row for descriptions with no annotation in any category.
cats = ['age', 'location', 'gender', 'sexual orientation', 'pronouns', 
        'personality type', 'ethnicity/nationality', 'relationship status', 'roleplay',
        'fandoms', 'interests', 'weight',
        'gender/sexuality', 'roleplay/fandoms']

n_descriptions = len(data)
outlines = []

for col in cats:
    # Series.sum() counts the True flags without a Python-level loop over rows.
    # NOTE(review): assumes the category columns hold booleans -- confirm.
    annotated = data[col].sum()
    outlines.append([col, annotated, annotated / n_descriptions * 100])

# No annotations: rows where none of the category flags is set.
any_annote = data[cats].any(axis=1).sum()
no_annote = n_descriptions - any_annote
outlines.append(['none', no_annote, no_annote / n_descriptions * 100])

# The percentage column must have the same name in `columns` and in
# `sort_values`; a mismatch raises KeyError.
table = (
    pd.DataFrame(outlines, columns=['category', 'n_instances', 'percentage of descriptions'])
    .sort_values('percentage of descriptions', ascending=False)
    .reset_index(drop=True)
)
table

Unnamed: 0,category,n_instances,percentage of descriptions
0,none,2935957,56.171751
1,gender/sexuality,860774,16.468628
2,interests,796236,15.233864
3,age,724675,13.864734
4,gender,480900,9.200746
5,roleplay/fandoms,414261,7.925786
6,location,388432,7.431616
7,fandoms,340216,6.509131
8,pronouns,307186,5.877189
9,sexual orientation,227868,4.35965


In [11]:
# For each category that has hegemonic labels defined, count the descriptions
# whose flag for that category is True.
given_count = {}
for cat in hegemonic_labels:
    is_given = data[cat].eq(True)
    given_count[cat] = int(is_given.sum())
    print(f'{cat} given: {given_count[cat]}')

gender given: 1990
sexual orientation given: 1309
pronouns given: 2494
ethnicity/nationality given: 728


In [9]:
# Annotate for hegemonic labels
def _has_hegemonic_term(description, wanted, excluded):
    """Return True if `description` contains a `wanted` term and no `excluded` term.

    Containment is tested with Python's `in` operator.
    NOTE(review): if descriptions are plain strings this is substring matching,
    so e.g. 'he/him' also matches inside 'she/him' -- confirm that is intended.
    """
    found = any(term in description for term in wanted)
    return found and not any(term in description for term in excluded)


# Adds one boolean '<category>_hegemonic_present' column to `data` per category.
for cat, terms in hegemonic_labels.items():
    excluded_terms = hegemonic_excl_terms[cat]
    data[f'{cat}_hegemonic_present'] = data['parsed_blog_description'].map(
        lambda description: _has_hegemonic_term(description, terms, excluded_terms)
    )

In [10]:
# Count hegemonic labels: descriptions that are annotated with the category
# and whose text was also flagged as containing a hegemonic term.
hegemonic_count = {}

for cat in hegemonic_labels:
    cat_given = data[cat].eq(True)
    cat_hegemonic = data[f'{cat}_hegemonic_present'].eq(True)
    hegemonic_count[cat] = int((cat_given & cat_hegemonic).sum())
    print(f"{cat}: {hegemonic_count[cat]}")

gender: 57
sexual orientation: 33
pronouns: 551
ethnicity/nationality: 52


In [14]:
# Build table of results: per category, the hegemonic count, the number of
# descriptions annotated with the category, and the ratio of the two.
outlines = []
for cat in hegemonic_labels:
    n_hegemonic = hegemonic_count[cat]
    n_given = given_count[cat]
    # A category with no annotated descriptions has no meaningful ratio;
    # report NaN instead of raising ZeroDivisionError.
    share = n_hegemonic / n_given if n_given else float('nan')
    outlines.append([cat, n_hegemonic, n_given, share])

# NOTE: despite its name, 'hegemonic %' holds a fraction (hegemonic / given),
# not a value multiplied by 100.
results = pd.DataFrame(outlines, columns=['category', 'hegemonic number', 'given number', 'hegemonic %'])
results

Unnamed: 0,category,hegemonic number,given number,hegemonic %
0,gender,57,1990,0.028643
1,sexual orientation,33,1309,0.02521
2,pronouns,551,2494,0.22093
3,ethnicity/nationality,52,728,0.071429
