In [9]:
import pandas as pd
import os
import re
from tqdm import tqdm_notebook as tqdm
import numpy as np
from urllib.parse import urlsplit

data_dirpath = '/usr2/mamille2/tumblr/data/'

# Examine quick 50 blog author identity annotations

In [12]:
annotations = pd.read_csv(os.path.join(data_dirpath, 'tumblr_identity_annotation.csv'))
print(annotations.columns)
print(len(annotations))

Index(['Tumblr blog', 'non-personal curation', 'rp', 'gender',
       'sexual orientation', 'pronouns', 'ethnicity/nationality',
       'gender evidence', 'sexual orientation evidence', 'ethnicity evidence',
       'notes'],
      dtype='object')
50


In [17]:
for cat in ['gender', 'sexual orientation', 'pronouns', 'ethnicity/nationality']:
    print(cat)
    print(annotations[cat].value_counts())
    print(f"N/A: {len(annotations) - annotations[cat].count()}")
    print()

gender
female         17
male           8 
LGBTQ "guy"    1 
cis female     1 
Name: gender, dtype: int64
N/A: 23

sexual orientation
lesbian     6
bisexual    4
straight    3
gay         1
LGBTQ       1
bi/aro      1
Name: sexual orientation, dtype: int64
N/A: 34

pronouns
she/her        4
they/them      3
he/she/they    1
Name: pronouns, dtype: int64
N/A: 42

ethnicity/nationality
white                5
Spanish-speaking     2
Turkish              2
Brazilian            1
Hungarian            1
black                1
Italian              1
French               1
Australian/white     1
German               1
Indian, Brazilian    1
Name: ethnicity/nationality, dtype: int64
N/A: 33



# Get list of blogs for sample human annotation (out of data from those who fill out blog descriptions)

In [2]:
# Load textposts_recent100

fpath = os.path.join(data_dirpath, 'textposts_recent100.pkl')
posts = pd.read_pickle(fpath)
print(posts.columns)
print(len(posts))

Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'parsed_tags_minfreq1'],
      dtype='object')
27342192


In [6]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 1000)

In [8]:
# Sample from blogs
non_nan_sources = posts['source_url'].dropna(inplace=False)
blog_sample = non_nan_sources.sample(100)
blog_sample

548499      http://just-shower-thoughts.tumblr.com/post/136782798724/everything-in-the-universe-is-either-a-potato-or     
21547974    http://taubchen.tumblr.com/post/159869605379/can-someone-tell-me-if-this-is-strawberry-or                     
26622512    http://jee-q.tumblr.com/post/132179290518/im-not-really-single-im-dating-myself-i-take                        
23908855    http://azereong.tumblr.com/post/157633624763                                                                  
8386683     https://blackbutler-erabelle.tumblr.com/post/155264886271/cuteness-overload                                   
572788      http://saekou.tumblr.com/post/96128209404/big-owain                                                           
15760018    http://zany-the-nerd.tumblr.com/post/151427111107/child-whats-a-vhs-me                                        
2491788     http://eeveestevie.tumblr.com/post/156719347344/if-ive-ever-made-you-cum-you-owe-me-a-vday-gift               
10499450    http

In [11]:
blog_urls = [urlsplit(url).netloc for url in blog_sample]
blog_urls

['just-shower-thoughts.tumblr.com',
 'taubchen.tumblr.com',
 'jee-q.tumblr.com',
 'azereong.tumblr.com',
 'blackbutler-erabelle.tumblr.com',
 'saekou.tumblr.com',
 'zany-the-nerd.tumblr.com',
 'eeveestevie.tumblr.com',
 'bonjourfrenchwords.tumblr.com',
 '66koi.tumblr.com',
 'drowzylove.tumblr.com',
 'chessys.tumblr.com',
 'acoolsuggestion.tumblr.com',
 'earth-suggestion.tumblr.com',
 'eskidencoksevmisbiri.tumblr.com',
 'brown-nena.tumblr.com',
 'markusaleksander.tumblr.com',
 'gatorbiscuits.tumblr.com',
 'thexfiles.tumblr.com',
 'io-sono-fedele.tumblr.com',
 'depressionfordays.tumblr.com',
 'florpincel.tumblr.com',
 'artofthecatt.tumblr.com',
 'daddy-crowe.tumblr.com',
 'vanillatwilightwallflower.tumblr.com',
 'lez-bruh.tumblr.com',
 'a--z--u--l.tumblr.com',
 'fadedfemales.tumblr.com',
 'hlbstrk.tumblr.com',
 'almejavas.tumblr.com',
 'heartlessharless.tumblr.com',
 'wallpaperprintery.tumblr.com',
 'shittyidea.tumblr.com',
 'darckcarnival.tumblr.com',
 'rad-x.tumblr.com',
 'nacho2x1.tum

# Investigate labels for gender/sexuality or ethnicity outside of blog descriptions

## Sample blogs of people who give gender/sexuality or ethnicity labels

In [3]:
# Load descriptions with category annotations
descs = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100.pkl')) # should annotate and use full set
print(len(descs))
print(descs.columns)

5226750
Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'age_terms', 'age', 'ethnicity/nationality_terms',
       'ethnicity/nationality', 'fandoms_terms', 'fandoms', 'gender_terms',
       'gender', 'gender/sexuality_terms', 'gender/sexuality',
       'interests_terms', 'interests', 'location_terms', 'location',
       'personality type_terms', 'personality type', 'pronouns_terms',
       'pronouns', 'relationship status_terms', 'relationship status',
       'roleplay_terms', 'roleplay', 'roleplay/fandoms_terms',
       'roleplay/fandoms', 'sexual orientation_terms', 'sexual orientation',
       'weight_terms', 'weight', 'ethnicity/nationality_hegemonic_present',
       'gen

In [5]:
descs.loc[descs['gender/sexuality']==True, ['tumblr_blog_url', 'gender/sexuality_terms']].sample(200)

Unnamed: 0,tumblr_blog_url,gender/sexuality_terms
2787979,https://nawtig.tumblr.com/,[them]
3349673,http://soaringstarbolts.tumblr.com/,"[girl, princess]"
5429055,https://subyeollie.tumblr.com/,"[he/, him]"
5557095,https://wrenweasly.tumblr.com/,"[/they/, them, pronouns]"
5559083,https://zeref-darkmoon.tumblr.com/,[man]
3590676,https://reginadellerose0.tumblr.com/,[Queen]
5137910,https://little-slut-baby.tumblr.com/,[her]
4156573,https://publicmaker123.tumblr.com/,[Gay]
4363527,https://fan-girl-lgbt.tumblr.com/,"[girl, Gay, Ace, LGBT]"
2108654,https://feet-and-tickles.tumblr.com/,[female]


# Get tumblog IDs of users who give hegemonic gender/sexuality labels

In [2]:
# Load hegemonic annotations
data = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100.pkl')) # should annotate and use full set
print(len(data))
print(data.columns)

5226750
Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'age_terms', 'age', 'ethnicity/nationality_terms',
       'ethnicity/nationality', 'fandoms_terms', 'fandoms', 'gender_terms',
       'gender', 'gender/sexuality_terms', 'gender/sexuality',
       'interests_terms', 'interests', 'location_terms', 'location',
       'personality type_terms', 'personality type', 'pronouns_terms',
       'pronouns', 'relationship status_terms', 'relationship status',
       'roleplay_terms', 'roleplay', 'roleplay/fandoms_terms',
       'roleplay/fandoms', 'sexual orientation_terms', 'sexual orientation',
       'weight_terms', 'weight', 'ethnicity/nationality_hegemonic_present',
       'gen

In [3]:
heg_gs_label = data[data['gender/sexuality_hegemonic_present'] == True]
len(heg_gs_label)

69381

In [4]:
ids = heg_gs_label['tumblog_id'].tolist()
len(ids)

69381

In [6]:
with open(os.path.join(data_dirpath, 'hegemonic_gender_sexuality_tumblog_ids.txt'), 'w') as f:
    for i in sorted(ids):
        f.write(f'{i}\n')

# Measure proportion of hegemonic labels

## Define set of hegemonic labels

In [5]:
hegemonic_labels = {
    'gender': [
        r'\bcis',
    ],
    'sexual orientation': [
        'straight', r'\bcishet',
    ],
    'pronouns': [ # Should be better about any type of delimiter or no delimiter with these
        'she/her',
        'she her',
        'he/him',
        'he him',
    ],
    'ethnicity/nationality': [
        'white', 'caucasian',
    ],
}

opp_hegemonic_labels = {
    'gender': [
        'trans', 'ftm', 'mtf',
        'nonbinary', r'non-binary', r'non binary', '\bnb\b', '\bbigender', r'\bagender', r'neutrois', 
        r'genderfluid', r'gender-fluid',
        r'lgbt', r'\bqueer\b',
    ],
    'sexual orientation': [
        r'gay', 'r\bhomo\b',
        r'lesbian',
        r'bisexual', r'\bbi\b', r'bisexual', r'pansexual', r'\bpan\b', r'\bwlw\b', r'\bmlm\b',
        r'lgbt', r'\bqueer\b',
        r'\bace\b', r'\basexual', r'aro-ace', r'aro/ace',
        r'demisexual',
    ],
    'pronouns': [ # Should be better about any type of delimiter or no delimiter with these
        r'(?:\W|\b)they(?:\W|\b)', r'(?:\W|\b)them(?:\W|\b)',
        r'(?:\W|\b)xe(?:\W|\b)', r'(?:\W|\b)xem(?:\W|\b)',
        r'theythem',
    ],
    'ethnicity/nationality': [
        r'black', r'african(-| )american', r'afro(-| )american',
        r'latin', r'afrolatin',
        r'cuban', r'mexican',
        r'asian', r'chinese', r'korean', r'japanese', r'indian',
    ],
}

hegemonic_excl_terms = {
    'gender': [
        r"isn't a cishet",
        r'if ur a cishet',
        r"don't call me: cishet",
        r"cishet aces",
        r"anti-cishet",
        r"cishets begone",
        r"cishet scum",
        r"hate cishets",
        r"cishet sims",
        r"isntcishet",
        r"cishets",
        r"you are white or cishet",
        r"cishet men",
        r"cishet males",
        r"cishet guys",
        r"terf/twerf/cishet/man",
        r"if cishet",
        r"not cishet",
        r"isn't a cishet",
        r'if ur a cishet',
    ],
    'pronouns': [
        'they/them',
    ],
    'ethnicity/nationality': [
        r'white lies',
        r'snow white',
        r'great white',
    ],
}
hegemonic_excl_terms['sexual orientation'] = hegemonic_excl_terms['gender']

In [None]:
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 1000)

In [None]:
# Load annotated data
# data = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100_100posts.pkl'))
data = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100.pkl'))
print(len(data))
data.columns

In [None]:
# Basic stats
cats = ['age', 'location', 'gender', 'sexual orientation', 'pronouns', 
        'personality type', 'ethnicity/nationality', 'relationship status', 'roleplay',
        'fandoms', 'interests', 'weight',
        'gender/sexuality', 'roleplay/fandoms']

outlines = []

for col in cats:
    annotated = sum(data[col])
    outlines.append([col, annotated, annotated/len(data) * 100])
#     print(f"{col}: {annotated}\t{annotated/len(data): .1%}")

# No annotations
any_annote = sum([any(line) for line in list(zip(*[data[col] for col in cats]))])
no_annote = len(data) - any_annote
# print()
# print(f'# with no annotations: {no_annote}\t{no_annote/len(data): .1%}')
outlines.append(['none', no_annote, no_annote/len(data) * 100])

table = pd.DataFrame(outlines, columns=['category', 'n_instances', '% descriptions'])\
        .sort_values('% descriptions', ascending=False).reset_index(drop=True)
table

## Annotate hegemonic, opposite labels

In [None]:
# Annotate for hegemonic, opposite labels
for cat, terms in sorted(hegemonic_labels.items()):
    print(cat)
#     data[f'{cat}_hegemonic_present'] = data['parsed_blog_description'].map(lambda x: any(re.search(t, x) for t in terms) and \
#                                                                           not any(re.search(e, x) for e in hegemonic_excl_terms[cat]))
    data[f'{cat}_hegemonic_present'] = list(map(lambda x: any(re.search(t, x) for t in terms) and not any(re.search(e, x) for e in hegemonic_excl_terms[cat]),
                                                                                                          data['parsed_blog_description']))
    
print()
for cat, terms in sorted(opp_hegemonic_labels.items()):
    print(cat)
#     data[f'{cat}_opposite_present'] = data['parsed_blog_description'].map(lambda x: any(re.search(t, x) for t in terms))
    data[f'{cat}_opposite_present'] = list(map(lambda x: any(re.search(t, x) for t in terms), tqdm(data['parsed_blog_description'].tolist())))

In [None]:
# Combine gender, sexual orientation and pronouns to annonate for gender/sexuality/hegemonic
data['gender/sexuality_opposite_present'] = [g or s or p for (g,s,p) in zip(*[data[f'{cat}_opposite_present'] for cat in ['gender', 'sexual orientation', 'pronouns']])]
print(sum(data['gender/sexuality_opposite_present']))
data['gender/sexuality_hegemonic_present'] = [g or s or p for (g,s,p) in zip(*[data[f'{cat}_hegemonic_present'] for cat in ['gender', 'sexual orientation', 'pronouns']])]
print(sum(data['gender/sexuality_hegemonic_present']))

In [None]:
# Save hegemonic annotations
data.to_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100.pkl'))

In [None]:
# Count hegemonic labels
hegemonic_count = {}
cats = list(hegemonic_labels.keys()) + ['gender/sexuality']
for cat in cats:
    hegemonic_count[cat] = len(data[(data[cat]==True) & (data[f'{cat}_hegemonic_present'] == True)])
    
# Count opposite labels
opp_count = {}
for cat in cats:
    opp_count[cat] = len(data[(data[cat]==True) & (data[f'{cat}_opposite_present'] == True)])

# Build table of results
outlines = []
for cat in cats:
    given_count = table.loc[table['category']==cat, 'n_instances'].item()
    outlines.append([cat, given_count, hegemonic_count[cat], hegemonic_count[cat]/given_count * 100,
                    opp_count[cat], opp_count[cat]/given_count * 100])
    
results = pd.DataFrame(outlines, columns=['category', 'given number', 'hegemonic number', 'hegemonic %', 
                                         'opposite number', 'opposite %'])
results

## 1-time

In [None]:
# Check for certain terms
# term = r'\bhet\b'
term = r'\basdfalskdjf ;alkdjs f;lakjsa\b'
term_rows = data.loc[data['parsed_blog_description'].map(lambda x: True if re.search(term, x) is not None else False), ['parsed_blog_description', 'gender', 'sexual orientation']]
print(len(term_rows))
term_rows

In [None]:
min(term_rows['gender'])

# See which factors are associated with giving a hegemonic label

## Count how many hegemonic labels are given w and wo non-hegemonic labels

In [2]:
# Load data
data = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100.pkl'))
print(len(data))
data.columns

5226750


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'age_terms', 'age', 'ethnicity/nationality_terms',
       'ethnicity/nationality', 'fandoms_terms', 'fandoms', 'gender_terms',
       'gender', 'gender/sexuality_terms', 'gender/sexuality',
       'interests_terms', 'interests', 'location_terms', 'location',
       'personality type_terms', 'personality type', 'pronouns_terms',
       'pronouns', 'relationship status_terms', 'relationship status',
       'roleplay_terms', 'roleplay', 'roleplay/fandoms_terms',
       'roleplay/fandoms', 'sexual orientation_terms', 'sexual orientation',
       'weight_terms', 'weight', 'ethnicity/nationality_hegemonic_present',
       'gender_hege

In [3]:
hegemonic_cols = [col for col in data.columns.tolist() if 'hegemonic' in col]
hegemonic_cols = [col for col in hegemonic_cols if col != 'gender/sexuality_hegemonic_present']
print(hegemonic_cols)

opposite_cols = [col for col in data.columns.tolist() if 'opposite' in col]
opposite_cols = [col for col in opposite_cols if col != 'gender/sexuality_opposite_present']
print(opposite_cols)

['ethnicity/nationality_hegemonic_present', 'gender_hegemonic_present', 'pronouns_hegemonic_present', 'sexual orientation_hegemonic_present']
['ethnicity/nationality_opposite_present', 'gender_opposite_present', 'pronouns_opposite_present', 'sexual orientation_opposite_present']


In [5]:
mask = [any(t) for t in zip(*[data[col] for col in opposite_cols])]
opp = data[mask]
count_opp = len(opp)
count_opp

In [9]:
count_nonopp = len(data) - count_opp
count_nonopp

4949845

In [10]:
len(all_hegemonic)/count_nonopp

0.011584201121449257

In [7]:
mask = [any(t) for t in zip(*[data[col] for col in hegemonic_cols])]
any_hegemonic = data[mask]
len(any_hegemonic)

81256

In [8]:
mask = [not any(t) for t in zip(*[any_hegemonic[col] for col in opposite_cols])]
all_hegemonic = any_hegemonic[mask]
len(all_hegemonic)

57340

In [None]:
len(all_hegemonic)/len(any_hegemonic)