In [44]:
import pandas as pd
import os
from collections import Counter
import itertools
from tqdm import tqdm_notebook as tqdm
from operator import itemgetter

data_dirpath = '/usr2/mamille2/tumblr/data'

# Find mutual info between tags and identity categories

In [4]:
# Load data
posts = pd.read_pickle(os.path.join(data_dirpath, 'textposts_100posts.pkl'))
print(posts.shape)
posts.columns

(2026600, 28)


Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'body_toks', 'body_str', 'body_toks_no_titles',
       'body_toks_str_no_titles', 'parsed_tags_minfreq3'],
      dtype='object')

In [6]:
descs = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100_100posts.pkl'))
print(len(descs))
descs.columns

20266


Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description', 'age',
       'gender', 'sexual orientation', 'pronouns', 'personality type',
       'ethnicity/nationality', 'relationship status', 'age_terms',
       'gender_terms', 'sexual orientation_terms', 'pronouns_terms',
       'personality type_terms', 'ethnicity/nationality_terms',
       'relationship status_terms', 'location_terms', 'location',
       'roleplay_terms', 'roleplay', 'fandoms_terms', 'fandoms',
       'interests_terms', 'interests', 'weight_terms', 'weight',
       'gender/sexuality_terms', 'gender/sexuality', 'roleplay/fandoms_terms',
       'roleplay/fandoms'],
      dtype='object')

In [12]:
# Build counters of tags with categories, as well as tags and categories by themselves
with open(os.path.join(data_dirpath, 'identity_categories.txt')) as f:
    categories = f.read().splitlines()
print(len(categories))
    
tags = set([t for l in posts['parsed_tags_minfreq3'].tolist() for t in l])
print(len(tags))

14
101714


In [14]:
category_ctr = {cat: sum(descs[cat]) for cat in categories}
category_ctr

{'age': 3552,
 'ethnicity/nationality': 728,
 'fandoms': 2091,
 'gender': 1990,
 'gender/sexuality': 4717,
 'interests': 3298,
 'location': 1377,
 'personality type': 912,
 'pronouns': 2494,
 'relationship status': 446,
 'roleplay': 2040,
 'roleplay/fandoms': 3472,
 'sexual orientation': 1309,
 'weight': 457}

In [15]:
tag_ctr = Counter([t for l in posts['parsed_tags_minfreq3'].tolist() for t in l])
tag_ctr.most_common(20)

[('text', 11247),
 ('me', 8939),
 ('ooc', 8343),
 ('personal', 8285),
 ('appless_rp', 8159),
 ('ifttt', 7068),
 ('oc_rp', 6325),
 ('nsfw', 6124),
 ('q', 6085),
 ('txt', 6018),
 ('rp', 4539),
 ('ic', 4495),
 ('new_rp', 4471),
 ('twitter', 4370),
 ('chat', 4231),
 ('c', 4094),
 ('queue', 3690),
 ('convo', 3427),
 ('reblog', 2840),
 ('writing', 2442)]

In [27]:
top = {}
top[1000] = [wd for wd,_ in tag_ctr.most_common(1000)]
top[500] = [wd for wd,_ in tag_ctr.most_common(500)]
top[100] = [wd for wd,_ in tag_ctr.most_common(100)]

In [19]:
# Join with identity annotations by tumblr id
merged = pd.merge(posts, descs, how='left', on=['tumblog_id']).loc[:, ['tumblog_id', 'parsed_tags_minfreq3'] + categories]
merged.shape

(2026600, 16)

In [36]:
tag = 'chat'
category = 'sexual orientation'
len(merged[merged[category]==True])
# mask = [tag in t[0] and t[1] for t in zip(merged['parsed_tags_minfreq3'], merged[category])]
# len(merged[mask])

130900

In [48]:
category_tag_ctr = {}
n_items = 1000
# for tag, category in tqdm(itertools.product(top[n_items], categories), total=n_items*len(categories)):
# for c in categories:
for c in ['gender/sexuality']:
    print(c)
    category_tag_ctr[c] = {}
    cat_true = merged[merged[c]==True]
    for tag in tqdm(top[n_items]):
        category_tag_ctr[c][tag] = sum([tag in t for t in cat_true['parsed_tags_minfreq3']])
#         mask = [tag in t[0] and t[1] for t in zip(merged['parsed_tags_minfreq3'], merged[category])]
#         category_tag_ctr[(tag,category)] = sum(mask)

len(category_tag_ctr)

gender/sexuality


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




1

In [49]:
sorted(category_tag_ctr.items(), key=itemgetter(1), reverse=True)

[('gender/sexuality',
  {'text': 3789,
   'me': 2641,
   'ooc': 1438,
   'personal': 2300,
   'appless_rp': 2636,
   'ifttt': 476,
   'oc_rp': 1882,
   'nsfw': 1427,
   'q': 1630,
   'txt': 1786,
   'rp': 1265,
   'ic': 714,
   'new_rp': 1659,
   'twitter': 479,
   'chat': 1341,
   'c': 1040,
   'queue': 769,
   'convo': 1086,
   'reblog': 889,
   'writing': 525,
   'tbd': 314,
   'meme': 444,
   'thinspo': 555,
   'town_rp': 412,
   'lol': 561,
   'about_me': 912,
   'twitter_rp': 688,
   'text_post': 462,
   'poetry': 312,
   'ana': 408,
   'roleplay': 507,
   'important': 724,
   'video': 141,
   'dash': 984,
   'anal': 346,
   'porn': 241,
   'mine': 522,
   'para': 567,
   'rpg': 628,
   'polyvore': 187,
   'same': 537,
   'lmao': 465,
   'tbd.': 231,
   '.txt': 566,
   'love': 351,
   'bts': 333,
   'wordpress': 0,
   'mobile': 258,
   'replies': 502,
   'sex': 180,
   'ooc.': 197,
   'about': 252,
   'yes': 461,
   'submission': 446,
   'promo': 175,
   'fuck': 332,
   'poem': 1

In [None]:
mi = {}
for c in ['gender/sexuality']:
    mi[c] = {}
    for t, count in category_tag_ctr[c].items():
        mi[c][t] = count/(category_ctr[c] * tag_ctr[t])
len(mi)

In [53]:
sorted(mi['gender/sexuality'].items(), key=itemgetter(1), reverse=True)

[('kikgirls', 0.00021199915200339198),
 ('hotbabe', 0.00021199915200339198),
 ('wetpussy', 0.00021199915200339198),
 ('sexygirl', 0.00021199915200339198),
 ('hornygirl', 0.00021199915200339198),
 ('hornyslut', 0.00021199915200339198),
 ('next_gen_rp', 0.00021199915200339198),
 ('hornyasfuck', 0.00021199915200339198),
 ('pussyplay', 0.00021199915200339198),
 ('freakygirl', 0.00021199915200339198),
 ('c:_all', 0.00021091197686491306),
 ('freak', 0.00021033641355630656),
 ('freaky', 0.00021032986734194795),
 ('mistress', 0.00020950504433276385),
 ('batesmb', 0.00020587200888190667),
 ('kikme', 0.00020478215959476588),
 ('kikmeguys', 0.00019255818173233858),
 ('royal_rp', 0.00018637288088210285),
 ('hmu', 0.00017666596000282667),
 ('master', 0.000170210857618108),
 ('dc', 0.00016391687010571546),
 ('nonsims', 0.0001639067517804003),
 ('not_mine', 0.00016008099232909192),
 ('erotic', 0.00015719665012496413),
 ('hottie', 0.00015657453710054441),
 ('dom', 0.00015650199179307995),
 ('dean_winc