In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics import cohen_kappa_score, make_scorer, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cosine, euclidean
from tqdm import tqdm_notebook as tqdm

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, MaxPooling1D, Embedding, Flatten, Dropout
from keras.callbacks import EarlyStopping

import os,sys,inspect
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# from .. import gaussian_mixture_cotrain
from gaussian_mixture_cotrain import GaussianMixtureCotrain

from collections import Counter, defaultdict

from IPython.core.debugger import set_trace
from IPython.display import display

import matplotlib.pyplot as plt

import fasttext as ft
from pprint import pprint

data_dirpath = '/usr2/mamille2/tumblr/data'

Using Theano backend.


In [2]:
# Restrict GPUs
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# PCA over blog description unigrams, annotations

In [4]:
# Load data
data = pd.read_csv(os.path.join(data_dirpath, 'blog_descriptions_unigrams_annotations.csv'))
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(20267, 6068)

In [15]:
data.columns[-10:]

Index([''\\u2265'', ''\\uf8ff'', 'age__column',
       'ethnicity/nationality__column', 'gender__column',
       ''personality type__column'', 'pronouns__column',
       ''relationship status__column'', ''sexual orientation__column'',
       'CLASS'],
      dtype='object')

In [16]:
data["'sexual orientation__column'"].dtype

dtype('int64')

In [17]:
data.drop('CLASS', axis=1, inplace=True)

In [18]:
pca = PCA(n_components=50)
reduced = pca.fit_transform(data.values)
pca.explained_variance_ratio_

array([ 0.06041008,  0.01678474,  0.01618036,  0.01324199,  0.01311127,
        0.01136064,  0.01058365,  0.01005073,  0.00928222,  0.00919893,
        0.00866288,  0.00820081,  0.00798926,  0.00770643,  0.00685603,
        0.00658159,  0.00649684,  0.0063026 ,  0.00611545,  0.00607137,
        0.00600436,  0.00574545,  0.00555798,  0.00545096,  0.00539012,
        0.00503976,  0.00494929,  0.00490496,  0.00474186,  0.00450159,
        0.00440051,  0.00426845,  0.0041858 ,  0.00413887,  0.00407663,
        0.00394609,  0.00389955,  0.00384579,  0.00376493,  0.00370142,
        0.00360812,  0.00352819,  0.00349688,  0.00341546,  0.00335925,
        0.00330033,  0.00320191,  0.00313664,  0.00310182,  0.0030661 ])

In [19]:
reduced.shape

(20267, 50)

In [20]:
pca.components_.shape

(50, 6067)

In [21]:
feat_names = data.columns.tolist()
len(feat_names)

6067

In [47]:
def feats_for_factors(feature_names, pca, n_factors=20, n_feats=40):
    top = np.argsort(pca.components_)[-1*n_factors:]
    vec = np.vectorize(lambda x: feat_names[x])
    return top_vec[:, :n_feats]

In [49]:
top_feats = feats_for_factors(feat_names, pca)
top_feats[0]

array(['pronouns__column', 'and', 'my', 'to', 'is', "'\\\\u201a'",
       '<PERIOD>', "'she\\\\/her'", "'sexual orientation__column'", 'they',
       'she', 'her', 'cents', "'they\\\\/them'", "'\\\\u00e4'", 'them',
       "'he\\\\/him'", 'with', "'\\\\u00f4'", 'are', 'he', 'love',
       "'\\\\u220f'", 'name', 'welcome', "'\\\\u00e8'", 'pronouns',
       "'\\\\uf8ff'", 'life', 'gay', 'we', 'their', 'him', 'free', 'that',
       'all', "'\\\\u00e4\\\\u00f4s'", 'who', 'your', "'\\\\u00a7'"],
      dtype='<U51')

In [50]:
top_feats = feats_for_factors(feat_names, pca)
top_feats[1]

array(['pronouns__column', 'is', '<COMMA>', 'blog', 'i', '|',
       "'she\\\\/her'", 'the', "'n\\\\u2019t'", 'this', 'my', 'they',
       "'\\\\u2019s'", 'but', 'by', 'do', 'she', "'he\\\\/him'",
       "'they\\\\/them'", 'her', "'sexual orientation__column'", 'for',
       'it', 'that', 'them', 'indie', 'rp', 'not', 'he', 'what',
       'pronouns', 'so', 'written', '-lsb-', '-rsb-', '<AMPERSAND>',
       'selective', 'before', "'\\\\u2019m'", 'him'],
      dtype='<U51')

In [51]:
top_feats = feats_for_factors(feat_names, pca)
top_feats[2]

array(['<CLOSEPAREN>', '<OPENPAREN>', 'my', 'is', 'to', 'the', '<COMMA>',
       '<COLON>', 'this', 'of', 'in', 'name', 'not', '<PERIOD>', 'life',
       'welcome', 'gender__column', 'all', "'\\\\u221a'", 'sw', 'cw',
       'but', 'tumblr', 'world', 'old', 'ugw', 'no', 'new', 'body', '5',
       'trying', 'live', 'here', 'years', 'on', 'best', 'favorite', 'am',
       'at', 'art'],
      dtype='<U51')

In [52]:
top_feats = feats_for_factors(feat_names, pca)
top_feats[3]

array(['to', 'blog', 'pronouns__column', '<COMMA>', 'be', '|', 'a',
       '<EXCLAMATIONMARK>', 'for', 'rp', 'i', 'free', 'the', 'me', 'feel',
       'just', 'by', '<COLON>', 'indie', '<AMPERSAND>', 'welcome',
       '<DASH>', 'trying', 'moved', '-lsb-', '-rsb-', 'selective',
       "'\\\\u221a'", 'want', 'read', 'n', "'she\\\\/her'", "'\\\\u2019m'",
       's', 'rules', "'\\\\u2020'", 'e', 'she', 'from', 'message'],
      dtype='<U51')

# Examine non-list annotated descriptions

In [2]:
# Load annotated descriptions
descs = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100_100posts.pkl'))
print(len(descs))
print(descs.columns)
print()

# Basic stats
cats = ['age', 'gender', 'sexual orientation', 'pronouns', 
        'personality type', 'ethnicity/nationality', 'relationship status', 
        'sexuality/gender']
for col in cats:
    annotated = sum(descs[col])
    print(f"{col}: {annotated}\t{annotated/len(descs): .1%}")

# No annotations
any_annote = sum([any(line) for line in list(zip(*[descs[col] for col in cats]))])
no_annote = len(descs) - any_annote
print()
print(f'# with no annotations: {no_annote}\t{no_annote/len(descs): .1%}')

20266
Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description', 'age',
       'gender', 'sexual orientation', 'pronouns', 'personality type',
       'ethnicity/nationality', 'relationship status', 'sexuality/gender',
       'age_terms', 'gender_terms', 'sexual orientation_terms',
       'pronouns_terms', 'personality type_terms',
       'ethnicity/nationality_terms', 'relationship status_terms',
       'sexuality/gender_terms'],
      dtype='object')

age: 4090	 20.2%
gender: 1772	 8.7%
sexual orientation: 1296	 6.4%
pronouns: 2598	 12.8%
personality type: 450	 2.2%
ethnicity/nationality: 768	 3.8%
relationship status: 440	 2.2%
sexuality/gender: 4513	 22.3%

# with no annotations: 12877	 63.

## Evaluate quality of annotations

In [3]:
pd.set_option('display.max_colwidth', -1)

In [4]:
for col in cats:
    display(descs[descs[col]==True].sample(30).loc[:, ['parsed_blog_description', col, f'{col}_terms']])

Unnamed: 0,parsed_blog_description,age,age_terms
3412463,"19. UK. Female. INFP. TJLC. I occasionally write, I'm also teaching myself how to do video editing! ^^ NOT SPOILER-FREE!!",True,[19]
3010724,"warning: this blog is mostly nsfw. 18+ only!! 21. chubby, little virgin exploring my sexual interests. I have a Daddy💕",True,[21]
2108066,jess | she/her | 27 | CST aries | INFP | melancholic | rude bitch,True,[27]
3530273,11 0800+,True,[11]
1966592,HOLLIS REED ; TWENTY-ONE. under construction.,True,[TWENTY]
1950010,And I think the strangest thing is thinking the only man that could ever understand me is fifty-eight. Sixteen Morrissey and David Bowie enthusiast,True,"[fifty, Sixteen]"
1339812,"Just your average 18 year old girl obsessed with bands, poetry, and psychology. Send me messages pls!",True,[18]
3211635,63,True,[63]
3730889,"Hi, I'm Sian! I'm 19 and I'm from Scotland. Last year, I was on/off with my fitness journey but now I'm making a proper commitment to be healthier!",True,[19]
3622497,Bipolar. Married man in his 30s Melbourne Australia. If you're not 18+ do not view,True,[30]


Unnamed: 0,parsed_blog_description,gender,gender_terms
3768925,A blog written by a girl who is not defined by her mental illness.,True,[girl]
3173034,"CARSON E. HART. 20. FRESHMAN. SONGWRITER. RUNAWAY. Weep for yourself, my man, You'll never be what is in your heart Weep, little lion man, You're not as brave as you were at the start Rate yourself and rake yourself Take all the courage you have left And waste it on fixing all the problems that you made in your own head YOUR BOLDNESS STANDS AMONG THE WRECK.",True,"[man, man]"
2516839,"[5'7|Taurus|Hufflepuff|ISFJ] I am a fully""functional""fan girl who is into so many fandoms it's ridiculous should probably go interact with humans but not today satan, not today. [Fandoms] ·Supernatural ·Criminal Minds ·Teen Wolf ·The Vampire Diaries ·The Originals ·Fantastic Beast and Where to Find Them ·Doctor Strange ·Sherlock BBC ·Voltron Legendary Defender ·Gotham ·Arrow ·Miraculous: Tales of Ladybug & Cat Noir ·Etcetera",True,[girl]
3946903,"I walk into a room just as cool as you please, and to a man, the fellows stand or fall down on their knees. Then they swarm around me, a hive of bees",True,[man]
2685268,"Colby Lopez or Seth Rollins. Your choice but regardless, you can just call me THE MAN . The King Of Davenport, Iowa. Former United States Champion and WWE World Heavyweight Champion. First ever NXT Champion and the former poster boy for Triple H and Stephanie. One of the best wrestlers you will find, also one hell of a ladies man.",True,"[MAN, boy, man]"
2509661,just a girl in a world,True,[girl]
2629434,"13/Demigirl?/She/they?/ I reblog whatever's cool and nice (by my standards). Mostly just trying to do the right thing when I can :/ I joined the Bendy blog wagon, except THIS ones gotta twist. A pretty big one: mendyboyistalking-tumblr-com-about",True,[girl]
1665059,I'm just worried about my mom worrying less. 17|UK ig: abha.vela sc: vela.abha,True,[mom]
2724136,meadow. class of '17. eic of the academic . future scottie. bigender baby.,True,[bigender]
2714470,"Wanda Maximoff. scarlet witch. sister.an avenger.Oh, but why did God give us fragile hearts and such brutal bones to house them in? I want to be all t e n d e r n e s s but my hands are much too h a r s h.",True,[sister]


Unnamed: 0,parsed_blog_description,sexual orientation,sexual orientation_terms
3856502,AFART // alexander // he~him pronouns // 16 // canada // super queer // nerd // wcif friendly,True,[queer]
3607052,"""Where there's a will, there's a way."" William Hart. 22. He/Him/His. Student. Bisexual. Father to a dog named Junior. Still a small town boy at heart. You can call me Will.",True,[Bisexual]
3098895,apollo | 17 | capricorn | infp | queer | yeo wool is my spirit animal,True,[queer]
3985946,"Angel / Female / 17 / Virgo / Dom / Bisexual / Single, but looking / I don't like sharing with other doms thiscrush-com-~nerdygirl17",True,[Bisexual]
2729945,"Alex/13/pan/genderqueer Please message me I'm lonely and in need of friends. I don't even know what this blog is, but it's my mind.",True,"[pan, queer]"
3549222,will | bisexual transboy | he/him or they/them | INTP | ravenclaw | future jewish convert,True,[bisexual]
3886612,"Hey there. Name's Bruce. I'm a fae-shifter hybrid, a total bear, and super gay. I'm also the head groundskeeper at Krov's castle.",True,[gay]
1328172,Ace/Aro|She/They/He /Whatever you feel like calling me|16| I'll probably be really socially awkward but please try to put up with me ;u;,True,[Ace]
3565489,Disney lover and the lost princess • German • positivity only • Aladdin • beauty and the beast • makeup addict • winterkind • currently in recovery • aro ace,True,[ace]
2406712,dev; so gay.,True,[gay]


Unnamed: 0,parsed_blog_description,pronouns,pronouns_terms
1170857,"Emilia; Haunted House Enthusiast, Writer?? Human Mess. Gryffindor ♡ ENTP ♡ bi she/her",True,"[she/, her]"
3193301,14 / They-Him / gay /i use a ugee m708 graphics tablet and the used program is firealpaca / and i might do a rant that takes like 80 posts to finish every once in a while / also go follow @otakudraws cause she's a great person even if she never posts / that's all i gotta say about myself,True,"[They-, Him, she', she]"
715767,"The only thing that's a bigger joke this blog is my life// Anna // pronouns: idc, will respond to ""meme loving trash""",True,[pronouns]
3227196,Lisa • 23 • she/her • nsfw will be tagged,True,"[she/, her]"
3435769,Pika | She/They,True,"[She/, They]"
2621607,♡ hi i'm ci and i love jackson wang ♡ [ she/they ] (23) [aries] #stanfx #petmoredogs,True,"[she/, they]"
3309538,"Multifandom interests, just reblogging things tbh, some NSFW. Hufflepuff Nonbinary witch, he/him or they/them",True,"[he/, him, they/, them]"
3608600,"Trashier than you, They/Them. Idiot.",True,"[They/, Them.]"
3299685,instagram: frank.kko | snapchat: immakowala | pan | all pronouns | poly,True,[pronouns]
3122091,"very bored human バカだね I don't condone bullshit behaviors masking as ""constructive criticisms"" I will call you out when I see it. If it's not malicious, you get a pass. There is nothing more annoying than hypocrites. They claim that they hate something only to do the exact thing the same people they're hating on are doing. Stop. (Apparently I need people to validate my asian existence)",True,"[They, they, they']"


Unnamed: 0,parsed_blog_description,personality type,personality type_terms
3709509,ENFP • Anna • 18,True,[ENFP]
3499608,Bunk/Buzz | 15 | she/her | Bi | ♏ | INFP | Multi-fandom blog/Shitpost,True,[INFP]
2695919,"28 years old, she/her, bisexual, married, Hufflepuff, INFJ, Varric Tethras fangirl (I'm not sorry). Multifandom blog and teaching myself how to art. I try to maintain a discourse free and postive blog. I ship all the ships but Varric/Cassandra is otp. Fandoms: Dragon Age, Mass Effect, Horizon: Zero Dawn, Sense8, Orphan Black, and some sprinklings of other things. Everything is tagged. Let me know if you need something tagged specifically.",True,[INFJ]
2583319,•14•CJ/Caitlin•INFJ• •Marvel is mY LIFE BLOOD• 🗺🗺🗺,True,[INFJ]
4169581,93infp doriol#4269 #germij,True,[infp]
3255538,17/INTJ/♑/pan/nonbinary taken as of 4/18/17,True,[INTJ]
1015907,"Courtney/Marble || it/its, she/her INFP-T Please Softblock If You Unfollow Mobile Links Please Read The About/Kin Before Following",True,[INFP]
3271191,INFP ☀️,True,[INFP]
3908521,Intj | Ravenclaw,True,[Intj]
2754255,"So I'm Eliza. Hello, nice to meet you, all of that. I'm INFP, Gryffindor, Thunderbird, Leo. Icon: my lousy attempt at drawing.",True,[INFP]


Unnamed: 0,parsed_blog_description,ethnicity/nationality,ethnicity/nationality_terms
3112183,"love: english, reading, writing, yoga, but most of all, my family :) {10-14-08, 07-26-13, 06-13-14♡}",True,[english]
3620909,15 yo | She/they | Canadian | Dan and Phil blog (if you're interested): teamshibevsthedestroyer,True,[Canadian]
3886868,trixie - 22 - korean - exo-l,True,[korean]
3000721,"20 | Italian | Writer | Atheist | INTJ | I love girls, memes, solitude, books, intersectional feminism, and b&w photography",True,[Italian]
3144577,"Welcome to my blog. I'm an Indonesian foodie based on Medan city. Love to capture love, life, & beautiful moments on my everyday. A professional tarot reader since 2001. Keep your positive vibes, always! Use #DollyRegar for featured.",True,[Indonesian]
3629828,"I’m a fucking freak, I want a guy who can satisfy all my wants and needs, regardless of what they are, I DO NOT SEND ANYTHING, never have, never will, if you ask me to send something or prove myself, you will be blocked immediately, I don’t have any tolerance for the fake, bullshit, just don’t kik me at all, I like when guys talk to me as if I was a dominant male, or treating me like I was there real dad, I've always wanted a guy that I can refer to as my son, and calling me dad, daddy, dude, bro, etc, and allowing me to talk down to them as a dominant male would, telling you how id fuck you with my cock, etc, it turns me on a lot, and makes me feel more in control, if that’s an issue, or you think I’m a guy for wanting that, again, don’t kik me, because I don’t give a fuck, I require full control over you, I hate men who bitch, or complain, ignore me, or talk shit, just because they don’t want to do something I won’t deal with it, you will be blocked, I don’t have time to waste, I’m only here for myself, and I want a guy who will let me have my way with him, you must have zero limits, even when it comes to bathroom play, cause I have my freaky horny moods, where I require it, you must also be willing to change your physical appearance for me, and don’t assume I’m kidding or full of shit, because I’m not, men are garbage to me, and I only want to use them, to get what I want, and after I’m done with you I will ignore you, until I need you to get me off again, you may always send me pics and videos, that have been previously requested while I am away, so that I have something to come back to later on, I prefer guys who send me lots of pics and videos of them showing off, etc, acting arrogant or cocky, it turns me on a lot, especially if you look like a douche bag, it get me going, I will not tolerate it verbally, directed at me, you always refer to me as a slave should, never anything else. First of all, I expect you to send me daily pics, and videos of you, and your body, I want you to completely stop shaving for me, (face included) I hate men who shave, and I honestly want you to prove to me you can handle it, I want you to look like a dirty cavemen, don’t ever wear deodorant, or bathe, if possible ether, my men are dirty ass pigs, and I want them to look like it to, it turns me on a lot, especially when men obey me and give me exactly what I want, exactly how I tell them to do it, I will expect daily pics, of your biceps, armpits, legs and feet, in black or white ankle socks, regardless if you have sent them before, as well as pics of the places on your body where you have hair growing, it turns me on a lot, I’m interested in men developing and changing, especially if they do it specifically for me, I need full control over them, I prefer them to have little, to no will at all, and no boundaries, no restrictions, I also require that you work out for me on a daily basis, even if it’s a little bit, again, I would obviously, want you to send me videos of you working out for me, wearing, black ankle socks, basketball shorts, or boxers, and a tight shirt, to show the sweat off after you’re done, I would also like to see you flex your biceps after a workout, to show off your sweaty armpits, especially if you have big biceps, it gets me going a lot, to see your sweaty armpits in a tight shirt, you may record yourself in the gym as well, that’s fine, or using workout equipment, as long as I get to see it on video i'm also really interested in guys who prove themselves, as alpha males, by showing off, or fighting other men, it turns me on, so if you wanna prove your an alpha to me, than show me on video, I really love when men act cocky, and arrogant, in there pics and videos, it turns me on a lot. Secondly, I require this almost more than anything else out of men, because it turns me on a lot, I love men who get woman pregnant, and record them having sex, I love seeing men Cumming in woman, it turns me on, I really want a guy who will be able to go around fucking woman for me, and attempting to get them pregnant, as I am unable to get pregnant myself, and it turns me on more than anything else to be able to watch it on video, this can be done at a POF angle, or hidden cam angle, ether one is fine with me, as long as I get to see you (the guy), and as long as you somehow let me know it’s you on the video, I love men who talk shit, or talk dirty, to other woman, I wouldn’t mind hearing your voice (the guy) telling her you’re going to make her a mommy, or that you’re going to get her pregnant, you can send me any videos that you have already done as well, as long as she may have, or did, get pregnant on the video, I am one of those people who needs to be able to see everything that is happening, and I would also require a lot of proof, it can be any kind of proof, I would also require that you make her take the pregnancy test, and you record her doing that with you (the guy) in the video standing next to her, or by her, ether one, and of course afterwards you showing me the result, after that you would take pictures of her getting bigger, and bigger, as well as pics of you showing off, for me, or acting arrogant, because at that point I’d be a lot more interested in you, and would expect a lot more out of you, I’d want to see everything that happens from conception, to birth, if possible. Also if you’re able to set up voyeur cams, (camera’s around your house) let me know I would need one in the both the bathroom, and bedroom, any other rooms would be nice to, I need a guy who I can watch whenever I want 24/7, you would pretend that the cams do not exist, or are not there, it turns me on a lot, I love to see guys go about their business, I prefer them to be set up in the corners of rooms where I am able to see everything, if you already have voyeur cams, let me know on kik, and give me the information to them, otherwise you would need to purchase them, but it would be a lot easier than sending me pics and videos on a daily basis. Kik emogalx99 (I often ignore guys right off the bat on kik, make sure you send me a pic, or video, no dick pics, send a full body nude live pic, and flex those biceps) You can also send me amazon gift cards my email is (scannison5@gmail-com) anything other than an amazon gift card, will be deleted.",True,"[black, white, black]"
1394969,SAUDI,True,[SAUDI]
1907968,"Azizi Jordan-Black So if you ride then I'm riding too, by your side kinda stuck on you...",True,[Black]
2911881,"Emi. Part-time studyblr. English Lit, Teaching, & Black Culture. Stationery, Books, & Coffee. Unapologetically black.",True,"[English, Black, black]"
3704406,"Hello! I question why you're here but welcome you all the same! I'm the futuristicallycoldfox(because I couldn't resist the tumblr generated irl) but you can call me Frenchfryem. Or French fry. Or fry. Or Em. Or any variation of that or my irl. Honesty, anything is fine so long I know it's me. OH! AND I AM ACE! (ah, how lovely it is to say so)",True,[French]


Unnamed: 0,parsed_blog_description,relationship status,relationship status_terms
3674544,"Marshall / 18 / Frusciante lover / I make music! Single is out on Spotify, Itunes, and many more",True,[Single]
4252890,Rachel Barbra St. James Hello World! My name is Rachel St. James. Married to the handsome Jesse St. James. We have four beautiful children! Tony Award winner and best mom ever. I'm currently working as a theater teacher so I suggest you come try out for the school musical. It's going to be amazing.,True,[Married]
3638351,SFW BLOG ~ Mya/23/ Little Kitten Taken 💕☺🎀,True,[Taken]
2119163,🎀she/her🎀 🌸mdlg🌸 ∆little space∆ 🐇bunny🐇 💧switch💧 🔐taken&in love💞 nsfw (sometimes) mostly sfw,True,[taken]
3765708,*Read Rules before interacting please* Multi muse Single verse (Maybe multiverse if talked about ooc) RP only Not associated with any company muses are from,True,[Single]
3899304,Hiya I'm Carter Lea Varecielli or you can call me Bailee Rose Madison.I am 24 going to be 25 years old.I am single and waiting for love.If you want to know more send me a message♡.,True,[single]
2019068,"Lol otome games have taken over my life. My ideals for a boyfriend is beyond normal boyfriend material now, thank you otome. Oh and also, I'm @chocolaterulez in both wattpad and insta. I'm a fan of everything.",True,[taken]
3977725,"40 years old, father and married to Amanda Schull,and actor (not really Aaron Stanford rp account)",True,[married]
4213366,"This blog is for every single song on the planet and the perfect moments for it, its about when to listen to a song, enjoy it and feel every single tone, Its about when your heart skips a beat due to the music's beauty and all whats left beating is the sound of the drums in the song you chose. It doesnt matter if you are a metal head, a rocker, a hiphop addict or a rap god lover.. all what matters is the kind of music that speaks for you and makes you feel heard even for a second. that second that worth a life time. Here you just got to be proud of your taste.",True,"[single, single]"
4069561,Read the Bio before interacting. Multi-muse blog. Literate. AU. Single-verse; Except for one character. Hover Below.,True,[Single]


Unnamed: 0,parsed_blog_description,sexuality/gender,sexuality/gender_terms
3728731,"Y'all dudes is a hot damn mess I'm way too blessed to be stressed So I don't want no boyfriend Just give me them checks What if I'm a material girl? Can't blame me I live in a material world It's crazy, this much it should be making me hurl Big baby, I know you want a taste of the pearl",True,"[them, girl]"
3943221,Have you ever used a toy on a girl?,True,[girl]
1721043,"MOBILE | ( ) load up on guns & bring your friends - it's fun to ( l o s e ) and to pretend . . . he's overboard and self assured ! oh no - i know a DIRTY WORD . hello ? hello ? hello ? how low . with the LIGHTS OUT , it's less dangerous . . here we are now ; entertain us ! i feel stupid & contagious . . . here we are now - entertain us !",True,[he']
2150318,Everything OHSHC :3 Emily. 19. Canada. Come talk to me! Side blogs: -fandomsoverfriends (Book Trash) -morelikesasgay (Naruto Trash) -mykawaiifriends (Hobbit/LOTR Trash) -pickabiasanybias (K-Pop Trash) -magicismyweakness (General Movie/TV/Video Game Trash),True,[gay]
2271450,"21/ Female/ INFP/ Canadian Half-blind book addict. Never leave the house without a book. Currently reading: The Way of Shadows, Brent Weeks.",True,[Female]
2308469,"Hewo!. I'm Kayden Joseph| RESPECT MY PRONOUNS: HE HIM,HIS| | 18 | NJ and PA | FTM and demiromantic ||💯🚹|Pre-Med ||Future Medical Examiner | 18+Blog WARNING: SOME NSFW | CSA Survivor, 1 year clean from cutting and burning|Feel free to message me on here or Insta if u just wanna chat,vent,whatever. always here to help. just a warning I'm socially awkward but I am trying my best| Bands , Medicine,quotes, mommy and babyboy posts,science,tattoos,tate Langdon,joker,love,hate,evil,good,music,Lucifer, weed,food,forensic science murder| Full time bookworm | Taurus ♉|Soccer,basketball,Lacrosse,rugby,football,karate,hockey,baseball | Football#45 | Insta: yourneighborhooddork_",True,"[PRONOUNS, HE, HIM,, FTM, boy]"
3051658,Mel. 21. FTM.,True,[FTM]
3979784,"Greta,15,Lesbian,Dancer ""E vorresti avere un paio di ali,il mondo non è come ti aspettavi,solo gli incubi sono reali perché i sogni ci hanno resi schiavi""",True,[Lesbian]
1549573,"17 | Girl | 6,8km | M❤",True,[Girl]
2875374,ENTP | Meme Queen | 19 Hi I'm Nikko (they/them),True,"[(they/, them)]"


# Word attention weights

In [3]:
# Load word attn weights
with open('/usr0/home/mamille2/tumblr/output/model_2018-04-11T12-30_word_attn_weights.pkl', 'rb') as f:
    wts = pickle.load(f)
    
len(wts)

688

In [5]:
wts[0].shape

(100, 200)

# Check attention weights

In [4]:
# Load direct attn weights
with open('/usr0/home/mamille2/tumblr/output/model_2018-03-31T23-30_attn_weights_direct.pkl', 'rb') as f:
    direct_wts = pickle.load(f)
    
len(direct_wts)

688

In [3]:
# Load indirect attn weights
with open('/usr0/home/mamille2/tumblr/output/model_2018-03-31T23-30_attn_weights.pkl', 'rb') as f:
    wts = pickle.load(f)
    
len(wts)

688

In [5]:
print(direct_wts[0])
print(wts[0])

[ 0.30555958 -0.13703799  0.1035867   0.20224634 -0.19880936 -0.18145458
 -0.14962777  0.04433154  0.22017848 -0.18057808 -0.06096431  0.12984486
 -0.06407683  0.18515438  0.05510226  0.31275398  0.25671679 -0.05390156
 -0.10530041  0.34404707  0.02855545  0.00411435  0.07815137 -0.1248413
 -0.25851154  0.08612406 -0.20937416  0.14845489 -0.08660179 -0.19079304
  0.1475722   0.02698137  0.25198489 -0.00841342 -0.10379876  0.24128842
  0.23399304 -0.27723286 -0.15995578 -0.06592993 -0.08365398 -0.02024059
  0.16071615 -0.05126818 -0.35505942 -0.14211391  0.59991693  0.02603234
 -0.12317675  0.00439709 -0.07141152 -0.3717941  -0.22775826  0.11758669
  0.19131546 -0.19860813  0.08542084  0.20094982 -0.13264233  0.15666354
 -0.46880129 -0.01786879 -0.03257189  0.47176182 -0.33390722 -0.01613934
  0.18031847 -0.41216314 -0.1637273  -0.06626824  0.14992854  0.37174448
 -0.26347351  0.019693    0.09005575  0.21576723  0.07597204 -0.13793452
  0.08550265  0.32918209  0.06416348  0.22800489 -0.

In [6]:
print(np.argsort(direct_wts[0])[::-1])
print(np.argsort(wts[0])[::-1])

[ 46  63  85 146  71  19 181 118  79  15   0 113 134 104 161 107  16 159
 122  32 153 160  35  36  81 112  84 115 163   8  75 166 189 199   3  57
  91 110  54 133  13  86 170  66  42  59 120  70  27  30 165  90 176  93
  11 103  96 132 183 123  53 119 117 195   2 171  74  25  78  56  22  76
 186  97 139  80 198  14   7 141  99  83 106  20 126  31  47 196 156  73
 140  49  21 137  33 192 105  65 121  61  88  41  62  98 197 174  82  43
 125  17 102  10  12 108  39  69 172  50 194 187 168 128 180  40  28 131
 142  34  18 185  95 162  48  23 124  58 184 164 182   1 145  77 127 152
 191  45 147 178   6 129 109 177  38 116  68 157   9   5  92  94 148  29
 151 154  55   4 190  26 135 101  52 169 143 138 150 193 149  24  72 114
 100 158  37 188  87  89 144  64 136 175  44  51 179 111  67 155 173  60
 167 130]
[44 40 43 94 36 26 51 38  9 39 32 37 52 34 79 33 75 57 23 45 16 35 42 10 74
 60 58 63 41 21 31 55 69 50 90 22 29 95 30 27 89 65 59 19 25 28 56 20 73 17
  8 49 96 53 18 54 11  5 68 15 97 4

In [7]:
print(len(direct_wts[0]))
print(len(wts[0]))

200
100


# Verify predictions, scores

In [10]:
# Load predictions, gold
data = pd.read_pickle('/usr0/home/mamille2/tumblr/output/model_2018-03-31T23-30_dev_preds.pkl')
print(len(data))
data.columns

690


Index(['tumblog_id', 'pred_age', 'actual_age', 'pred_gender', 'actual_gender',
       'pred_sexual orientation', 'actual_sexual orientation', 'pred_pronouns',
       'actual_pronouns', 'pred_personality type', 'actual_personality type',
       'pred_ethnicity/nationality', 'actual_ethnicity/nationality',
       'pred_relationship status', 'actual_relationship status',
       'pred_sexuality/gender', 'actual_sexuality/gender'],
      dtype='object')

In [11]:
data

Unnamed: 0,tumblog_id,pred_age,actual_age,pred_gender,actual_gender,pred_sexual orientation,actual_sexual orientation,pred_pronouns,actual_pronouns,pred_personality type,actual_personality type,pred_ethnicity/nationality,actual_ethnicity/nationality,pred_relationship status,actual_relationship status,pred_sexuality/gender,actual_sexuality/gender
0,274521774,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,233971649,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,338546429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,335816456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,301415060,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
5,266348896,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,339116812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,280235066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,318357067,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
9,327225020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
cohen_kappa_score(data['pred_age'], data['actual_age'])

0.75339998284521625

In [14]:
cohen_kappa_score(data['pred_sexuality/gender'], data['actual_sexuality/gender'])

0.7784911717495987

# Attention visualizations for category mention prediction

In [5]:
# Load input
with open('/usr0/home/mamille2/dev_inds.pkl', 'rb') as f:
    dev_inds = pickle.load(f)
    
dev_inds

array([1478,  510, 6035, 5521, 2357, 1292, 6117, 1648, 3365, 4103, 4086,
       2527, 3696, 5426, 2769, 1743, 6328, 4670, 6657, 6353, 1763, 6888,
        353, 5032, 3030, 2630, 6086, 2047,  344, 1506, 6171, 3850, 3739,
       1558,   36,  306,   86, 4202, 5892, 5632, 3343, 6804, 1707, 5927,
       3393, 4587,  837, 4834, 4690,  247, 1123, 5429, 3786,  348, 6146,
       4560,  940,  503,  293, 2626, 5037, 2452, 6538, 2855, 1661, 2685,
       3421, 3284, 1924, 2433, 3893,  802, 4401, 5444, 1922, 4731, 5283,
       4434, 2202, 3553, 3722, 2771,  893, 4696, 3773,  414, 5819, 4413,
       6313, 5588, 3414, 5957, 5172, 3148, 1953, 6759, 6844,  181, 1651,
       5011, 4121,  914, 3826, 4864,  214, 4107, 6580, 3912, 1609, 6670,
       1238, 1716, 1364, 4819, 3371, 6896, 1224, 1717,  605, 2408, 2195,
       1104,  793, 1809, 1645,  439, 6113, 6368,  184, 3631, 1788, 6568,
       6501, 5186, 1095, 6417, 2693, 1532, 2620, 2297,  203, 6767, 2845,
       6416, 5991, 2355, 2801, 2252, 6388,  235, 57

In [6]:
dev_inds.shape

(690,)

In [7]:
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_100posts.pkl')
len(descs)

6902

In [8]:
posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/textposts_recent100_100posts.pkl')
len(posts)

690200

In [9]:
tids = sorted(descs['tumblog_id'].tolist())

In [11]:
posts_by_blog = [[p for p in posts[posts['tumblog_id']==tid]['body_str_no_titles'].tolist()] for tid in tids]
len(posts_by_blog)

6902

In [16]:
posts_by_blog[[0,1]]

TypeError: list indices must be integers or slices, not list

In [18]:
dev_posts = []
for i in list(dev_inds[:688]):
    dev_posts.append(posts_by_blog[i])

len(dev_posts)

688

In [19]:
# Load weights on input
with open('/usr0/home/mamille2/tumblr_attn_test.pkl', 'rb') as f:
    wts = pickle.load(f)
    
len(wts)

AttributeError: 'list' object has no attribute 'shape'

In [23]:
def color_attn(val, total_max, total_min):
    """ Returns 0-1 for highlighting """
    
    scale = 1/total_max
    val = (val-total_min) * scale
    return val

In [24]:
total_max = max(d for wt in wts for d in wt)
total_min = min(d for wt in wts for d in wt)

In [None]:
# wts_viz = ""
wts_viz = []
# for i, (wt, sent) in enumerate(tqdm(zip(wts, text_data))):
for wt_arr, posts in tqdm(zip(wts, dev_posts)):
#     sent = ['<sent>'] + sent + ['</sent>']
    vals = [color_attn(d, total_max, total_min) for d in wt_arr]
#     display(HTML(''.join([f"<span style='background-color: rgba(255,0,0,{val})'>{w}</span>&nbsp" for val,w in zip(vals, sent)])))
#     wts_viz += f"Sample {i}:<br>" + \
#             ''.join([f"<span style='background-color: rgba(255,0,0,{val})'>{w}</span>&nbsp" for val,w in zip(vals, sent)]) + \
#             "<br><br>"
    wts_viz.append(''.join([f"<span style='background-color: rgba(255,0,0,{val})'>{html.escape(w)}</span>&nbsp" for val,w in zip(vals, sent)]))
    
# display(HTML(wts_viz))
len(wts_viz)

In [27]:
pd.set_option('display.max_colwidth', -1)

In [65]:
tops = []
bottoms = []

for i in range(688):
    post_wts = pd.DataFrame()
    post_wts['post'] = dev_posts[i] 
    post_wts['weight'] = list(wts[i])

    post_wts.sort_values('weight', inplace=True, ascending=False)
    top_post = post_wts.iloc[0]['post']
    bottom_post = post_wts.iloc[-1]['post']
    tops.append(top_post)
    bottoms.append(bottom_post)
    
print(len(tops))
print(len(bottoms))

688
688


In [66]:
quick_posts = pd.DataFrame()
quick_posts['tumblog_id'] = tids[:688]
quick_posts['top_post'] = tops
quick_posts['bottom_post'] = bottoms
len(quick_posts)

688

In [67]:
merged = pd.merge(quick_posts, descs, on=['tumblog_id'], how='inner')
len(merged)

688

In [61]:
merged.columns

Index(['tumblog_id', 'top_post', 'bottom_post', 'activity_time_epoch',
       'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_description',
       'tumblr_blog_url', 'tumblr_blog_theme', 'is_group_blog', 'is_primary',
       'is_private', 'created_time_epoch', 'updated_time_epoch', 'timezone',
       'language', 'blog_classifier', 'generated_date',
       'parsed_blog_description', 'segments', 'restr_segments_25',
       'segments_25_nopunct', 'age', 'gender', 'sexual orientation',
       'pronouns', 'personality type', 'ethnicity/nationality',
       'relationship status', 'sexuality/gender'],
      dtype='object')

In [71]:
test = merged.loc[:, ['tumblog_id', 'segments_25_nopunct', 'age', 'sexuality/gender', 'top_post', 'bottom_post']]
test[test['sexuality/gender']==True]

Unnamed: 0,tumblog_id,segments_25_nopunct,age,sexuality/gender,top_post,bottom_post
1,3222343,"[druscilla ryan, peterick writing, ao3, faqs, instagram bipolar, 29, tumblr mom]",True,True,": trans lifeline : 877 - 565 - 8860 depression hotline : 1 - 630 - 482 - 9696 suicide hotline : 1 - 800 - 784 - 8433 lifeline : 1 - 800 - 273 - 8255 trevor project : 1 - 866 - 488 - 7386 sexuality support : 1 - 800 - 246 - 7743 eating hotline : 1 - 847 - 831 - 3438 rape and sexual assault : 1 - 800 - 656 - 4673 grief support : 1 - 650 - 321 - 5272 runaway : 1 - 800 - 843 - 5200 , 1 - 800 - 843 - 5678 , 1 - 800 - 621 - 4000 exhale : after abortion hotline / pro - voice : 1 - 866 - 439 - 4253",
25,29655833,"[23 fl, bi]",True,True,"@ddoubledogdareya replied to your post “ @ddoubledogdareya replied to your post “ @ddoubledogdareya replied to … ” yes , cody charles ? this is being filmed .",@ddoubledogdareya you do ? ? ?
30,33131726,"[18, nsfw, pansexual bisexual, 420 friendly, ask me anything, love sex]",True,True,"photo courtesy : peter bick ncaa di editorial coverage is proudly sponsored by adidas . visit adidasswimming.com for more information on our sponsor . for all the latest coverage , check out our event coverage page . click here for live results the 2017 ncaa d1 men ’s swimming and diving championships have officially arrived in indianapolis , in ! the championship meet [ … ] the post quick links to 2017 ncaa d1 day 3 prelims live stream , heat sheets , results , and more ! appeared first on information overload news . from information overload news http://www.informationoverload.news/quick-links-to-2017-ncaa-d1-day-3-prelims-live-stream-heat-sheets-results-and-more/","georgia senior chase kalisz swam the fastest 400 i m in history at 3:33.42 , breaking his old record he set in 2014 at 3:34.50 . kalisz took last year off for the olympics , and was dethroned by will licon in 2015 , but before all of that kalisz won two straight 400 i m titles in 2013 and 2014 . [ … ] the post chase kalisz demolishes american record in 400 i m appeared first on information overload news . from information overload news http://www.informationoverload.news/chase-kalisz-demolishes-american-record-in-400-im/"
40,45793977,"[катя, 26, bi]",True,True,: respect girls with a chubby tummy respect girls with stretch marks respect girls with big thighs respect girls with hairy arms respect girls and their clothing of choice respect girls and their privacy respect girls and their confidence respect girls and their rights respect girls who are nt fully transitioned yet respect girls with scars respect girls who like girls respect girls who like both guys and girls respect girls who are asexual respect girls . do nt treat them as objects .,: lacquette : : caps sᴍᴀʟʟ ᴄᴀᴘs all caps nice nice nice nice nice
43,49542520,"[22, she her, bi, i do post nsfw, i dont bite, nevada]",True,True,"owo angst starters more angst starters even more angst starters trigger warnings apply ! ( mental illness , drugs and alcohol use , self - destructive behaviours , and vomiting ) dying sentence starters because cindy loves you all 💗 send me a “ ✵ ” and my character will try to break your heart headcanons ! bold what applies ! kiss kiss fall in love ? or hate . whatever floats your goat . and the old time fave . send me a ¯\_(ツ)_/¯ for my muse to find yours in their living room eating a bowl of cereal . enjoy~ kira / lucina / blossom / keahi","@mechanicalmechanic from here “ if you do n’t have a reason to be sitting alone , i ’ll join you ! ” whether or not she liked it , there was no time to respond before he had taken a seat next to her . “ you look like there ’s a lot bothering you . do you need to get it out ? it might be easier to smile afterwards . ”"
44,49602827,"[meet marshall lee abadeer, he is 28 years old]",True,True,: blankslate - chalkoutline : : do not fix your dark circles let the world know you re tired of its shit and ready to kill a man you ’re * it ’s * btw . i am a man . oooooooooooooh my gooooooooooooooooooooooood ooooooooh my god . oh my god . ooooooooooh . my god oh my god,: i ’m suffering from ema deprivation . isayama please .
48,53279678,"[memes pink and aesthetic, snap chat moody princess, f 21 bisexual]",True,True,": true . and apparently it ’s frowned upon to tell customers to just drink regular fucking coffee . who knew ? well , be careful . the next thing they ’ll ask you to do is start singing when someone drops a dollar in the tip jar .",": uh , there has been a bit of a drought lately . oh no . that wo n’t do . do n’t you have a wing man ?"
51,55873069,[they them],False,True,"i - think - i - like - this - host : destroyedmentality : i - think - i - like - this - host : @destroyedmentality - bakura sneered , angered by such a response . surely , he looked so much like the host he inhabited that even this man could recognise his younger self . “ how could you forget an old childhood friend so ? ” “ … um … what ? ” he frowned . “ where is your mom , little one ? i think you ’ve wandered too far away . ” ryou was really confused . how could a child be his childhood friend ? he moved far away from his childhood home ages ago . and he preferred not to think about that place much . was this man trying to upset him ? because it was truly working . “ you do n’t remember your mother dying , ryou ? ” he glared at him . “ it was just you and me , do n’t you remember ? ” he caught for a moment . “ despite your interesting nosiness … i was alone after certain abysmal events in my life . ” he glared .",# would anybody like to play ? #
61,63575671,"[lacy, 16, infp, taurus, bi adventure]",True,True,i just have no muse for any of my drafts right now,: alex : takes @dominusmontis ‘s shitty mug & smashes it on the floor . . . . rude .
64,63963756,"[patrick 25 he, him, dude, find funny, want to talk about, that makes me feel]",True,True,•wrap my legs around you •lay my head on your chest •run my hands through your hair •play with your hands •fall asleep in your arms •sing you a song •give you small cute kisses •talk about my day •trace my fingers on you,: i like to pretend i do n’t give a fuck but honestly i do n’t think i ’ve ever not given a fuck in my whole life


In [49]:
descs[descs['tumblog_id']==tids[6]]['segments_25_nopunct']

16319    [24, htx, gelli, sagittarius, heart is forever taken]
Name: segments_25_nopunct, dtype: object

In [53]:
len(wts)

688

In [52]:
posts.columns

Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'body_toks', 'body_str', 'body_toks_no_titles',
       'body_str_no_titles'],
      dtype='object')

In [36]:
descs.columns

Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'segments', 'restr_segments_25', 'segments_25_nopunct', 'age', 'gender',
       'sexual orientation', 'pronouns', 'personality type',
       'ethnicity/nationality', 'relationship status', 'sexuality/gender'],
      dtype='object')

# Predict category mentions

In [3]:
# Load descriptions
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_100posts.pkl')
print(descs.columns)
print(len(descs))

tids = sorted(descs['tumblog_id'].tolist())

Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description',
       'segments', 'restr_segments_25', 'segments_25_nopunct', 'age', 'gender',
       'sexual orientation', 'pronouns', 'personality type',
       'ethnicity/nationality', 'relationship status', 'sexuality/gender'],
      dtype='object')
6902


In [46]:
# Load text posts
posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/textposts_recent100_100posts.pkl')
print(posts.columns)
len(posts)

Index(['post_id', 'activity_time_epoch', 'tumblog_id', 'post_title',
       'post_short_url', 'post_type', 'post_caption', 'post_format',
       'post_note_count', 'created_time_epoch', 'updated_time_epoch',
       'is_submission', 'source_title', 'source_url', 'post_classifier',
       'blog_classifier', 'accepts_answers', 'reblogged_from_post_id',
       'reblogged_from_metadata', 'root_post_id', 'body', 'mentions',
       'post_tags', 'body_toks', 'body_str', 'body_toks_no_titles',
       'body_str_no_titles'],
      dtype='object')


690200

## Prepare posts

In [47]:
# Text posts to word indices (Keras way)
# texts = [' '.join(posts[posts['tumblog_id']==tid]['body_str']) for tid in tids] # concatenated posts
texts = [' '.join(posts[posts['tumblog_id']==tid]['body_str_no_titles']) for tid in tids] # concatenated posts
len(texts)

6902

In [48]:
MAX_VOCAB_SIZE = 100000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE,
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique words')

Found 485136 unique words


In [49]:
MAX_SEQUENCE_LENGTH = 20000
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
len(data)

6902

In [50]:
vocab = list(word_index.keys())[:MAX_VOCAB_SIZE] # lower indices are words kept
len(vocab)

100000

## Prepare description categories (labels)

In [51]:
cats = descs.columns.tolist()[-8:]
labels = list(zip(*[descs[cat] for cat in cats]))
labels = np.array(labels, dtype=int)
print(labels.shape)
print(cats)

(6902, 8)
['age', 'gender', 'sexual orientation', 'pronouns', 'personality type', 'ethnicity/nationality', 'relationship status', 'sexuality/gender']


## CNN model in Keras

In [52]:
def set_metrics(preds, actual):
    """ Returns set measures of precision, recall and f1 """
    
    precisions = []
    recalls = []
    
    for pred, act in zip(preds, actual):
        pred_labels = set(np.flatnonzero(pred)) # binary encoding to indices
        actual_labels = set(np.flatnonzero(act))
        correct = pred_labels.intersection(actual_labels)
        
        # precision
        if len(pred_labels) == 0:
            if len(actual_labels) == 0:
                prec = 1.0 # Case where no labels predicted and actually are no labels
            else: 
                prec = 0.0
        else:
            prec = len(correct)/len(pred_labels)
        precisions.append(prec)
        
        # recall
        if len(actual_labels) == 0:
            if len(pred_labels) == 0:
                rec = 1.0 # Case where no labels predicted and actually are no labels
            else:
                rec = 0.0
        else:
            rec = len(correct)/len(actual_labels)
        recalls.append(rec)
        
    total_prec = np.mean(precisions)
    total_rec = np.mean(recalls)
    total_f1 = 2 * total_prec * total_rec / (total_prec + total_rec)
    
    print(f'Precision: {total_prec}\nRecall: {total_rec}\nF1: {total_f1}')

In [53]:
def kappas(preds, cats):
    # tn, fp, fn, tp
    for cat, i in zip(cats, list(range(preds.shape[1]))):
        pred_col = preds.T[i]
        actual_col = y_dev.T[i]
        print(f'{cat}:\t{cohen_kappa_score(pred_col, actual_col)}\t{confusion_matrix(pred_col, actual_col).ravel()}')

In [12]:
# Load vocab embeddings
vocab_embed = np.load('/usr0/home/mamille2/tumblr/data/recent100_100posts_embeds.npy')

In [54]:
# Shuffle, split into train/dev/test
test_size = int(0.1 * len(data))
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=test_size)
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=test_size)
print(x_train.shape)
print(y_train.shape)
print(x_dev.shape)
print(y_dev.shape)
print(x_test.shape)
print(y_test.shape)

(5522, 20000)
(5522, 8)
(690, 20000)
(690, 8)
(690, 20000)
(690, 8)


### Train model (with attention)

In [None]:
class AttLayer(Layer):
    """ Modified from https://richliao.github.io/supervised/classification/2016/12/26/textclassifier-HATN/ """
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(** kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        #self.W = self.init((input_shape[-1],1))
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))

        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [55]:
# Prepare model (with attention)

EMBEDDING_DIM = 300
embedding_layer = Embedding(len(vocab),
                            EMBEDDING_DIM,
                            weights = [vocab_embed],
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable=False
                           )

model = Sequential()

model.add(embedding_layer)
# model.add(Conv1D(1024, kernel_size=5, activation='relu'))
model.add(Conv1D(64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Dropout(0.1))
model.add(Conv1D(16, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.1))
model.add(Flatten())
# model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='sigmoid')) # final classification layer

model.compile(loss='binary_crossentropy', optimizer='adam')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, verbose=0),
]
model.fit(x_train, y_train,
#          batch_size=16, epochs=20, validation_data=(x_dev, y_dev))
         batch_size=16, epochs=20, validation_data=(x_dev, y_dev), callbacks=callbacks)

preds = model.predict(x_dev, batch_size=16)

preds[preds>=0.5] = 1
preds[preds<0.5] = 0

print()
set_metrics(preds, y_dev)
print()
kappas(preds, cats)

Train on 5522 samples, validate on 690 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20

Precision: 0.49613526570048316
Recall: 0.49384057971014494
F1: 0.4949852632546726

age:	0.007754000532137373	[456 215  12   7]
gender:	-0.005613732052250908	[636  52   2   0]
sexual orientation:	-0.00834394003676997	[628  59   3   0]
pronouns:	0.0028169014084508115	[571 114   4   1]
personality type:	-0.005272651588733135	[669  19   2   0]
ethnicity/nationality:	-0.019556714471968606	[656  25   9   0]
relationship status:	0.0	[675  15   0   0]
sexuality/gender:	0.008506069865389265	[514 171   3   2]


### Train model (no attention)

In [55]:
# Prepare model (no attention)

EMBEDDING_DIM = 300
embedding_layer = Embedding(len(vocab),
                            EMBEDDING_DIM,
                            weights = [vocab_embed],
                            input_length = MAX_SEQUENCE_LENGTH,
                            trainable=False
                           )

model = Sequential()

model.add(embedding_layer)
# model.add(Conv1D(1024, kernel_size=5, activation='relu'))
model.add(Conv1D(64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=5))
model.add(Dropout(0.1))
model.add(Conv1D(16, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.1))
model.add(Flatten())
# model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='sigmoid')) # final classification layer

model.compile(loss='binary_crossentropy', optimizer='adam')

callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, verbose=0),
]
model.fit(x_train, y_train,
#          batch_size=16, epochs=20, validation_data=(x_dev, y_dev))
         batch_size=16, epochs=20, validation_data=(x_dev, y_dev), callbacks=callbacks)

preds = model.predict(x_dev, batch_size=16)

preds[preds>=0.5] = 1
preds[preds<0.5] = 0

print()
set_metrics(preds, y_dev)
print()
kappas(preds, cats)

Train on 5522 samples, validate on 690 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20

Precision: 0.49613526570048316
Recall: 0.49384057971014494
F1: 0.4949852632546726

age:	0.007754000532137373	[456 215  12   7]
gender:	-0.005613732052250908	[636  52   2   0]
sexual orientation:	-0.00834394003676997	[628  59   3   0]
pronouns:	0.0028169014084508115	[571 114   4   1]
personality type:	-0.005272651588733135	[669  19   2   0]
ethnicity/nationality:	-0.019556714471968606	[656  25   9   0]
relationship status:	0.0	[675  15   0   0]
sexuality/gender:	0.008506069865389265	[514 171   3   2]


In [56]:
model.save('/usr0/home/mamille2/tumblr/data/100posts_cnn_no_titles.h5')

### Load pretrained model

In [13]:
# Load model
model = load_model('/usr0/home/mamille2/tumblr/data/100posts_cnn.h5')

In [14]:
preds = model.predict(x_dev, batch_size=16)

preds[preds>=0.5] = 1
preds[preds<0.5] = 0

## Non-neural baseline

In [57]:
# Shuffle, split into train/dev/test over full sequences (no padding)

test_size = int(0.1 * len(sequences))
texts_train, texts_test, y_train, y_test = train_test_split(texts, labels, test_size=test_size)
texts_train, texts_dev, y_train, y_dev = train_test_split(texts_train, y_train, test_size=test_size)
print(len(x_train))

# Feature extraction
vec = TfidfVectorizer(max_features=100000, tokenizer=lambda x: x.split())
vec.fit(texts_train)
x_train = vec.transform(texts_train)
x_dev = vec.transform(texts_dev)
x_train.shape
x_dev.shape

5522


(690, 100000)

In [60]:
# Unigram features
clf = OneVsRestClassifier(svm.LinearSVC(verbose=2))
# clf = OneVsRestClassifier(LogisticRegression(verbose=2))
clf.fit(x_train, y_train)

preds = clf.predict(x_dev)
preds.shape

print()
set_metrics(preds, y_dev)
print()
kappas(preds, cats)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]
Precision: 0.5344202898550725
Recall: 0.5055314009661835
F1: 0.5195745922040605

age:	0.09397429477549646	[416 199  37  38]
gender:	0.0	[635  55   0   0]
sexual orientation:	0.0207885854556612	[615  73   1   1]
pronouns:	0.06148507980569051	[587  90   8   5]
personality type:	0.0	[661  29   0   0]
ethnicity/nationality:	0.0	[666  24   0   0]
relationship status:	0.0	[679  11   0   0]
sexuality/gender:	0.16526138279932556	[497 147  18  28]


In [61]:
def most_informative_features(vectorizer, classifier, labelid, n=10):
    feature_names = vectorizer.get_feature_names()
    topn = reversed(sorted(zip(classifier.coef_[labelid], feature_names))[-n:])

    for coef, feat in topn:
        print(feat, coef)

In [62]:
# Examine most informative features
for i, cat in enumerate(cats):
    print(cat)
    most_informative_features(vec, clf, i, n=20)
    print()

age
easter 1.52536390878
black 1.36050168318
啊 1.34087567881
liam 1.32234139492
but 1.31854853129
max 1.25950581859
ravus 1.25137064739
weight 1.24420209388
annie 1.2163911664
want 1.21460289523
shinichi 1.20965362139
dark 1.20817086625
➝ 1.19514376516
📣 1.18233474098
jesus 1.17119043649
control 1.16955390096
about 1.16922464803
lol 1.16399787059
1 1.15642802337
•i 1.134635959

gender
mkd 1.33034085095
kells 1.2590726688
allen 1.22674371765
reblogger 1.22065562904
desmin 1.1767936148
trans 1.14952233216
screencaps 1.10514626296
deleted 1.09435768534
blink 1.06948860756
autumnwwe 1.05044017595
snapchat 1.03449321941
danielle 0.969714321913
alastor 0.939369232548
edmée 0.928377704265
brody 0.916965618642
child 0.88411860204
anette 0.880370037032
darkfireplaceiplier 0.842477794764
cisco 0.842067575014
yellplease 0.840492459911

sexual orientation
gay 2.41869242637
lesbian 1.56997171775
cailean 1.37257762252
aces 1.28188699002
morrigan 1.26138202893
reblogger 1.26050667426
📣 1.20133779164


In [86]:
# See certain terms in context
# wd = '👀'
wd = 'weight'
match_rows = posts[posts['body_toks'].map(lambda x: wd in x)]
matches = [t for t in posts['body_toks'].tolist() if wd in t]
print(f"Number of posts with the word: {len(matches)}")

# Number of users 
print(f"Number of users with the word: {len(match_rows['tumblog_id'].unique())}")
print()

# kwic
for i,m in enumerate(matches):
    print(i)
    ind = m.index(wd)
    beg = max(0, ind-20)
    end = min(len(m), ind+20)
    print(' '.join(m[beg:end]))
    print()

Number of posts with the word: 7930
Number of users with the word: 2226

0
boredcave : losing weight slowly is better than not losing weight .

1
so i better start now • because if i resist , in a week or two i ’ll see the weight dropping down • so i wo n’t ruin my current progress • so i will eventually be able to

2
to replace . stay away . drink at least a glass of water every hour . it ’s better for weight loss to sip throughout the day than to chug a full glass ( except before eating , in which

3
- kelly : when you ’re going through all of this shit you ca n’t only focus on your goal weight the little in between victories are the most important •the pants you used to love finally fit again •you

4
i will leave those bad foods alone . i will choose tea and water instead . i can lose the weight .

5
cravings motivation -so many cute outfits you ’ve always wanted to wear -your eyes literally get bigger as you lose weight -being lifted by a significant other -sitting in someone ’s la

1072
i ’m looking for a weight loss buddy to lose 10 pounds with ! height / weight and matching stats does n’t matter but i

1073
i ’m looking for a weight loss buddy to lose 10 pounds with ! height / weight and matching stats does n’t matter but i

1074
i ’m looking for a weight loss buddy to lose 10 pounds with ! height / weight and matching stats does n’t matter but i

1075
i ’m looking for a weight loss buddy to lose 10 pounds with ! height / weight and matching stats does n’t matter but i

1076
miss - thin : think about it . when you reach your goal weight , you will be the thinspo . no more scrolling through the tag for hours , all you got

1077
my brother is losing weight and getting good grades now that he has my dad . i ’m going to turn everything around and

1078
my brother is losing weight and getting good grades now that he has my dad . i ’m going to turn everything around and

1079
my brother is losing weight and getting good grades now that he has my dad . i ’m going to t

2198
chosen for him .   “ you know it doll . ” he shook his head slightly , adjusting her weight and pushing forward . the pair ventured down the sidewalk , the bustle of city life a dull roar

2199
chosen for him .   “ you know it doll . ” he shook his head slightly , adjusting her weight and pushing forward . the pair ventured down the sidewalk , the bustle of city life a dull roar

2200
chosen for him .   “ you know it doll . ” he shook his head slightly , adjusting her weight and pushing forward . the pair ventured down the sidewalk , the bustle of city life a dull roar

2201
chosen for him .   “ you know it doll . ” he shook his head slightly , adjusting her weight and pushing forward . the pair ventured down the sidewalk , the bustle of city life a dull roar

2202
chosen for him .   “ you know it doll . ” he shook his head slightly , adjusting her weight and pushing forward . the pair ventured down the sidewalk , the bustle of city life a dull roar

2203
chosen for him .   “ you 

3198
lenoredauphine : “ things only weight on you if you let them . you lack the required sincerity and tact— crack open a book then

3199
you shifted sheepishly as the access for air became more difficult for you to grasp . a large amount of weight felt as if it was suffocating you which interrupted your sleep .    rubbing your eyes , you weakly

3200
home . * * * thomas plopped down on his bed with a sigh . the mattress squeaked at his weight and it cradled him for a bit . it had been a hard day for him . he had

3201
ago it was . you ’re mine to love . ” he leaned in to kiss him , feeling the weight lift off his chest although the thoughts still lingered in the back of his mind . he knew that

3202
singing , his voice seeming to pull him out of his state . he quietened down , placing his entire weight on the boy next to him , too tired to even think . everything felt heavy , especially his

3203
kiantaylcr : it was like a weight was lifted from his chest , hearing that he wanted the


4198
” she said playfully ,   “ i ’ve been waiting hours   for you . ” she shifts her weight , shrugging the strap of her leather backpack slightly higher onto one shoulder . in the twenty minutes it

4199
teach me , too ? “ thank you , sconosciuto , ” maeve said impatiently , still struggling against his weight ( his heavy , overbearing weight ) , hoping to stand and bring him to his feet as well

4200
hate me ? a great burden fell upon her shoulders , then , and maeve ’s expression fell under the weight of it . after all , odessa had left her . she ’d said goodbye and walked away ,

4201
educate teens on   issues that teens might go through like    * ziggy being self - conscious about his weight to the point where he just- stops eating and sportacus tells him that how his body feels is more

4202
the fat that hangs over my bra . the bulge of my stomach when i ’m sitting down . the weight of the extra fat hanging off my arms . the fatness of my face . the shaking of my

4203
about it

is the only restrictive eating disorder and acknowledge that osfed exists . anorexia is the only eating disorder with a weight criterion and by calling any sort of restriction anorexia , you ’re invalidating those diagnosed with osfed which is

5198
dancingskelatons : i hate being short and having anorexia at the same time . like i have to lose more weight to look as skinny as the taller girls . and short girls get chubbier quicker .

5199
i did nt . i do nt eat enough to gain so i m okay i m either the same weight as sunday , 1 up ( most likely water weight bc idk ? ) or bc my weight likes

5200
but for the time being , i m pretty sure my weights just going to fluctuate between my new lowest weight , and my 1st lw so hm

5201
strong man / of a boy yet to start his first fight ; your shoulders do not sulk with the weight of worlds , oh hero , oh golden - winner . he pulls down the tapestries / burns the

5202
out . there ’s an odd self - preservation to a simple cleansing . still , uns

bluethinly : is anyone else really excited to see how their face gets more defined with weight loss ?

6198
coffee - and - frost : the best part about weight loss is feeling it , finally , after being uncomfortable for so long . it ’s running your hands

6199
it ) but you begin to restrict what you eat throughout the day , just a little to maintain your weight ( that you hate more as the days go by . ) then the real problems start when your

6200
i am about to eat on my ana blogs ? 4 . will this help me get to my goal weight or set me back ? 5 . do i want a life thin and beautiful person or do i

6201
fairyframes : i wish i could just stay at home by myself until i reach my goal weight . i do n’t want to be in public when i ’m this fat .

6202
llleighsmith : anyone else feel like their spirit is ancient and they ’ve been carrying the weight of its heartbreak for an eternity

6203
something skinny beanpole brother is a size medium t - shirt . my 71 year old 5 foot 8 average weight dad i

6997
reason he needed , that and the possibility to help people afflicted with the red lyrium . he shifted his weight from one foot to the other . his eyes moved to the other figure , looking him over .

6998
of unfamiliar faces , feeling oddly misplaced to say the least . however , comfort was found in the dead weight of her cello case , her thumb scrolling across ever so often to calm her nerves .   “

6999
. i think i might go with gay ? i ’m still not sure i think either reading could hold weight . imo what i ’ve noticed is that he ’s always being told he ’s into a girl never

7000
her up ,   “ you want one of these ? ”    she offered the bag , shifting her weight onto one elbow and holding it out to him , shaking it a little as if to entice him

7001
”    he defended , rolling his eyes . it was the most efficient way to stay full and lose weight quickly – cucumber slices periodically all day instead of meals .   “ nachos are good , i will

7002
on one knee and his chin in his palm

In [82]:
pd.set_option('display.max_colwidth', -1)

In [83]:
# Look at descriptions of people with the word
match_ids = match_rows['tumblog_id'].unique()
match_descs = descs[descs['tumblog_id'].isin(match_ids)].loc[:, ['segments_25_nopunct']]
match_descs

Unnamed: 0,segments_25_nopunct
22663,"[adri, 16, intj taetally, 2 taes notpjm]"
128288,"[halsey, hal zi]"
135784,"[whoop there it is, 25, rn, tx]"
168698,"[meagan, 17, chicago]"
192982,"[i believe we re more, much more, like]"
213614,[castiel is my sweetheart]
259196,"[the picture is not me, content may be triggering, some things may be nsfw, i don t advocate anything]"
283014,[but you can call me vier]
325290,"[call me hanzo, all roleplay]"
325808,"[male, pansexual]"


In [77]:
match_rows

Unnamed: 0,post_id,activity_time_epoch,tumblog_id,post_title,post_short_url,post_type,post_caption,post_format,post_note_count,created_time_epoch,...,reblogged_from_post_id,reblogged_from_metadata,root_post_id,body,mentions,post_tags,body_toks,body_str,body_toks_no_titles,body_str_no_titles
1069793,156718884470,1486052554000,310775800,HOME 57,https://tmblr.co/Z45R4g2HzB6Ps,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(clothing)}","[home, 57, by, umay, -, cdxc, featuring, blank...",home 57 by umay - cdxc featuring blankets ❤ li...,"[home, 57, by, umay, -, cdxc, featuring, blank...",home 57 by umay - cdxc featuring blankets ❤ li...
1069788,156718883300,1486052552000,310775800,Yoins XVI: FLORAL PANTS,https://tmblr.co/Z45R4g2HzB67a,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(clothing)}","[yoins, xvi, :, floral, pants, by, paradiselem...",yoins xvi : floral pants by paradiselemonade f...,"[yoins, xvi, :, floral, pants, by, paradiselem...",yoins xvi : floral pants by paradiselemonade f...
1069783,156718882295,1486052550000,310775800,Wild at heart,https://tmblr.co/Z45R4g2HzB5tt,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(Audrey 3+1),(VI...","[wild, at, heart, by, xcuteniallx, featuring, ...",wild at heart by xcuteniallx featuring gray t ...,"[wild, at, heart, by, xcuteniallx, featuring, ...",wild at heart by xcuteniallx featuring gray t ...
1069776,156718881410,1486052548000,310775800,10.SHEIN,https://tmblr.co/Z45R4g2HzB5g2,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(Chanel),(clothi...","[10.shein, by, amina, -, plava, featuring, cha...",10.shein by amina - plava featuring chain hand...,"[10.shein, by, amina, -, plava, featuring, cha...",10.shein by amina - plava featuring chain hand...
1069772,156718880995,1486052547000,310775800,ROSEGAL #35,https://tmblr.co/Z45R4g2HzB5ZZ,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(Christian Loubo...","[rosegal, #, 35, by, nizaba, -, haskic, featur...",rosegal # 35 by nizaba - haskic featuring a ch...,"[rosegal, #, 35, by, nizaba, -, haskic, featur...",rosegal # 35 by nizaba - haskic featuring a ch...
1069768,156718879530,1486052545000,310775800,What's Your Sign: Cosmic Jewelry,https://tmblr.co/Z45R4g2HzB5Cg,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(H&M),(Reem Acra...","[what, ’s, your, sign, :, cosmic, jewelry, by,...",what ’s your sign : cosmic jewelry by dora04 f...,"[what, ’s, your, sign, :, cosmic, jewelry, by,...",what ’s your sign : cosmic jewelry by dora04 f...
1069764,156718878805,1486052543000,310775800,MEN 58,https://tmblr.co/Z45R4g2HzB51L,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(MAC Cosmetics),...","[men, 58, by, umay, -, cdxc, featuring, marc, ...",men 58 by umay - cdxc featuring marc jacobs ❤ ...,"[men, 58, by, umay, -, cdxc, featuring, marc, ...",men 58 by umay - cdxc featuring marc jacobs ❤ ...
1069756,156718876535,1486052539000,310775800,Pink perfection !,https://tmblr.co/Z45R4g2HzB4Tt,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(vintage),(cloth...","[pink, perfection, !, by, selmica11, featuring...",pink perfection ! by selmica11 featuring a sco...,"[pink, perfection, !, by, selmica11, featuring...",pink perfection ! by selmica11 featuring a sco...
1069751,156718876485,1486052538000,310775800,Outfit #19,https://tmblr.co/Z45R4g2HzB4T5,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(See by Chloé),(...","[outfit, #, 19, by, perplexidadesilencio, feat...",outfit # 19 by perplexidadesilencio featuring ...,"[outfit, #, 19, by, perplexidadesilencio, feat...",outfit # 19 by perplexidadesilencio featuring ...
1069745,156718875075,1486052536000,310775800,Cosmic jewelry,https://tmblr.co/Z45R4g2HzB473,text,,html,0,1.48605e+12,...,,[],-1,"<div style=""width:600px;margin:0 auto""><div st...",{},"{(polyvore),(fashion),(style),(Alex Perry),(Gu...","[cosmic, jewelry, by, elarmariodelcamaleon, fe...",cosmic jewelry by elarmariodelcamaleon featuri...,"[cosmic, jewelry, by, elarmariodelcamaleon, fe...",cosmic jewelry by elarmariodelcamaleon featuri...


In [36]:
selected_rows = posts[posts['body_toks'].map(lambda x: wd in x)]
selected_rows

Unnamed: 0,post_id,activity_time_epoch,tumblog_id,post_title,post_short_url,post_type,post_caption,post_format,post_note_count,created_time_epoch,...,blog_classifier,accepts_answers,reblogged_from_post_id,reblogged_from_metadata,root_post_id,body,mentions,post_tags,body_toks,body_str
3117793,158377449742,1489459086000.0,330521275,,https://tmblr.co/Z1Q6ve2JW21aE,text,,html,7,1489460000000.0,...,safe,False,158377000000.0,"[root_post_id#158376150132,root_post_blog_id#3...",158376000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},"{(they're great just.),(also probably gonna sa...","[koogane, :, nothing, -, matt, :, wait, ,, did...","koogane : nothing - matt : wait , did you lite..."
3117017,158377338677,1489458892000.0,330521275,,https://tmblr.co/Z1Q6ve2JW1cSr,text,,html,5,1489460000000.0,...,safe,False,158377000000.0,"[root_post_id#158376150132,root_post_blog_id#3...",158376000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},{(nothing matt)},"[koogane, :, nothing, -, matt, :, good, to, k,...",koogane : nothing - matt : good to k now i ’m ...
3111318,158376430452,1489457293000.0,330521275,,https://tmblr.co/Z1Q6ve2JV_8jq,text,,html,1,1489460000000.0,...,safe,False,158376000000.0,"[root_post_id#158376150132,root_post_blog_id#3...",158376000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},{(nothing matt)},"[koogane, :, @nothing, -, matt, liked, your, p...",koogane : @nothing - matt liked your post : by...
3780443,158331115167,1489365019000.0,330521275,,https://tmblr.co/Z1Q6ve2JTHHQV,text,,html,13,1489370000000.0,...,safe,False,158331000000.0,"[root_post_id#158330190922,root_post_blog_id#3...",158330000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},{(trashpaladin)},"[koogane, :, trashpaladin, :, duh, ., should, ...",koogane : trashpaladin : duh . should you real...
3777947,158330685212,1489364270000.0,330521275,,https://tmblr.co/Z1Q6ve2JTFeSS,text,,html,9,1489360000000.0,...,safe,False,158331000000.0,"[root_post_id#158330190922,root_post_blog_id#3...",158330000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},"{(what're you working on?),(trashpaladin)}","[koogane, :, trashpaladin, :, i, knowwww, but,...",koogane : trashpaladin : i knowwww but i need ...
3776747,158330495907,1489363941000.0,330521275,,https://tmblr.co/Z1Q6ve2JTEwEZ,text,,html,5,1489360000000.0,...,safe,False,158330000000.0,"[root_post_id#158330190922,root_post_blog_id#3...",158330000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},{(trashpaladin)},"[koogane, :, trashpaladin, :, mmmm, okay, but,...",koogane : trashpaladin : mmmm okay but be care...
3775463,158330280617,1489363563000.0,330521275,,https://tmblr.co/Z1Q6ve2JTE5gf,text,,html,1,1489360000000.0,...,safe,False,158330000000.0,"[root_post_id#158330190922,root_post_blog_id#3...",158330000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},{(trashpaladin)},"[koogane, :, @trashpaladin, replied, to, your,...",koogane : @trashpaladin replied to your post :...
2791401,158218474502,1489122836000.0,330521275,,https://tmblr.co/Z1Q6ve2JMZbG6,text,,html,4,1489120000000.0,...,safe,False,158218000000.0,"[root_post_id#158217778173,root_post_blog_id#3...",158218000000.0,"<p><a href=""http://koogane.tumblr.com/post/158...",{},{(not like i'm doing anything except lay aroun...,"[koogane, :, bigtopz, :, koogane, :, rest, in,...","koogane : bigtopz : koogane : rest in pieces ,..."
3603304,158217947627,1489121832000.0,330521275,,https://tmblr.co/Z1Q6ve2JMXadh,text,,html,2,1489121832000.0,...,safe,False,158218000000.0,"[root_post_id#158217778173,root_post_blog_id#3...",158217778173.0,"<p><span><a href=""http://koogane.tumblr.com/po...",{},"{(????? now i'm curious.),(bigtopz)}","[koogane, :, rest, in, pieces, ,, man, ., i, d...","koogane : rest in pieces , man . i did it and ..."
3602446,158217807347,1489121556000.0,330521275,,https://tmblr.co/Z1Q6ve2JMX2Np,text,,html,0,1489121556000.0,...,safe,False,158218000000.0,"[root_post_id#158217778173,root_post_blog_id#3...",158217778173.0,"<div class=""hide_overflow""><span><a class=""use...",{},"{(a what?),(bigtopz)}","[koogane, replied, to, your, post, :, i, ca, n...",koogane replied to your post : i ca n’t fuckin...


## 1-time

In [None]:
# Load word embeddings (from Tumblr halfday)
wd_embed = ft.load_model('/usr0/home/mamille2/tumblr/data/halfday_ft.bin')

In [None]:
# Build lookup table
vocab_embed = np.empty((len(vocab),300))
for i, wd in enumerate(vocab):
    vocab_embed[i,:] = wd_embed[wd]
    
vocab_embed.shape

In [None]:
# Save vocab embeddings
np.save('/usr0/home/mamille2/tumblr/data/recent100_100posts_embeds.npy', vocab_embed)

In [None]:
cutoff = 20000
len([l for l in lens if l <= cutoff])/len(lens)

In [None]:
# Examine sequence lengths
lens = [len(s) for s in sequences]

print(np.mean(lens))
print(np.median(lens))
print(max(lens))

In [None]:
# Text posts to word indices (Graham's way)
post_inds = []

w2i = defaultdict(lambda: len(w2i))
UNK = w2i["<unk>"] # 0 index

for tid in tids:
    toks = [t for p in posts[posts['tumblog_id']==tid]['body_toks'].tolist() for t in p]
    inds = [w2i[t] for t in toks]
    post_inds.append(inds) 
    
len(post_inds)

In [None]:
# Vocab size
len(w2i)

# Sample instances marked for certain identity categories

In [None]:
# Load labeled data
split = {}
s = 'train1000'
split[s] = pd.read_csv(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.csv', index_col=0)
print(split[s].columns)

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
# Sample from each column
# for c in list(split[s].columns)[2:-1]:
for c in ['non-English', 'age', 'name', 'location', 'interests', 'adult content',
         'sexual orientation', 'gender', 'pronouns', 'fandoms', 'link to external content',
         'occupation', 'personality type', 'astrological sign', 'ethnicity/nationality',
         'relationship status', 'mental health', 'personal description/commentary', 'other/notes']:
    print(c)
    display(split[s][split[s][c]==1].sample(n=5).loc[:, ['restr_segments_25', c]])
    print()

# Look for specific category values

In [None]:
# Load blog descriptions
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/blog_descriptions_recent100.pkl')
print(descs.columns)
len(descs)

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
# search_term = r'bun\W'
# search_term = r'bun pronoun'
# search_term = r'\brl\b'
search_term = r'\b[0-9]w[0-9]\b'
selected = descs[descs['parsed_blog_description'].map(lambda x: True if re.search(search_term,x) else False)]
print(len(selected))
selected.loc[:,['parsed_blog_description']]

# Word and character ngrams for identity category mention prediction

## Change CSV to pickle

In [None]:
# Load labeled data
split = {'dev200': None}
for s in split:
    split[s] = pd.read_csv(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.csv', index_col=0)
    print(split[s].columns)

In [None]:
def split_rm_punct(segments):
    """ Return segments split on punctuation, punctuation removed """
    
    new_segs = []
    
    for seg in segments:
        new_seg = ' '.join(re.split(r'\W', seg))
        new_seg = re.sub(r'\W', ' ', new_seg)
        new_seg = re.sub(r'\s+', ' ', new_seg).strip()
        new_segs.append(new_seg)
        
    return new_segs

In [None]:
# String representation to list
# split[s]['restr_segments_25'] = split[s]['restr_segments_25'].map(lambda x: x[2:-2].split("', '"))
split[s]['restr_segments_25'] = split[s]['restr_segments_25'].map(lambda x: ast.literal_eval(x))
split[s]['restr_segments_25']

In [None]:
split[s]['segments_25_nopunct'] = list(map(split_rm_punct, tqdm(split[s]['restr_segments_25'].tolist())))

In [None]:
# NaNs -> 0
for c in ['gender', 'sexual orientation', 'pronouns']:
    split[s][c] = split[s][c].fillna(0)

In [None]:
split[s]['sexuality/gender'] = [max(tup) for tup in zip(split[s]['sexual orientation'], split[s]['gender'], split[s]['pronouns'])]
len(split[s][split[s]['sexuality/gender'] == 1])

In [None]:
split[s].to_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')

## Load data

In [None]:
# Load labeled data
split = {'train1000': None, 'dev200': None}
for s in split:
    split[s] = pd.read_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')
    print(split[s].columns)

## Unigrams and bag of character ngrams

In [None]:
vec_dict = {'unigrams': TfidfVectorizer(), 'char 1-4grams': TfidfVectorizer(analyzer='char', ngram_range=(1,4))}
clf_dict = {'NB': MultinomialNB(), 'SVM': svm.SVC()}
outcome_classes = ['sexual orientation', 'pronouns', 'gender', 'sexuality/gender']
bow = {s: {} for s in split}
labels = {s: {} for s in split}
outlines = []

# Get features
# Fit
for vec_name in vec_dict:
    data = [' '.join(segs) for segs in split['train1000']['segments_25_nopunct'].tolist()]
    vec_dict[vec_name].fit(data)

for s in split:
    for vec_name, vec in vec_dict.items():
        data = [' '.join(segs) for segs in split[s]['segments_25_nopunct'].tolist()]
        bow[s][vec_name] = vec.transform(data)
    
    # Get labels
    for l in outcome_classes:
        labels[s][l] = split[s][l].values

# Training
for vec_name in vec_dict:
    for l in outcome_classes:
        for clf_name, clf in clf_dict.items():
            clf.fit(bow['train1000'][vec_name], labels['train1000'][l])
            
            # Testing
            for s in split:
                preds = clf.predict(bow[s][vec_name])
                pos = preds==1 # positive guesses
                true_pos = labels[s][l]==1 # true positives
                matches = sum([all(tup) for tup in zip(pos, true_pos)])
                prec = f'{matches}/{sum(pos)} ({matches/sum(pos):.1%})'
                rec = f'{matches}/{sum(true_pos)} ({matches/sum(true_pos):.1%})'
            
                outlines.append([vec_name, clf_name, s, l, prec, rec])
        
pd.DataFrame(outlines, columns=['features', 'classifier', 'dataset', 'predicted class', 'precision', 'recall'])

## 1-time

In [None]:
test = ['•draw for life•', '•a student•', '•18•']
split_rm_punct(test)

# Pattern matching for mentions of identity categories

In [None]:
# Load labeled data
split = {'train1000': None, 'dev200': None}
for s in split:
    split[s] = pd.read_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')
    print(split[s].columns)

In [None]:
# Load US states
fpath = '/usr0/home/mamille2/tumblr/data/states.csv'
states = [s.lower() for s in pd.read_csv(fpath)['State'].tolist()]

# Load nationalities
fpath = '/usr0/home/mamille2/tumblr/data/nationalities.txt'
with open(fpath) as f:
    nats = [nat.lower() for nat in f.read().splitlines() if (len(nat) > 3 and not nat in states)]
    
print(len(nats))

# Load ethnicities
fpath = '/usr0/home/mamille2/tumblr/data/ethnicities.txt'
outlist = states + ['coast']
with open(fpath) as f:
    eths = [e.split()[0].lower() for e in f.read().splitlines() if (len(e.split()[0]) > 4 and not e.split()[0].lower() in outlist)]
    
print(len(eths))

In [None]:
# Regex patterns
terms = {
        'age': [r'(?:[^-+\w]|^)([1-6]{1}[0-9]{1})[^-+0-9]|^([1-6]{1}[0-9]{1})$',
               r'twelve',
               r'thirteen',
               r'fourteen',
               r'fifteen',
               r'sixteen',
               r'seventeen',
               r'eighteen',
               r'nineteen',
               r'twenty',
               r'thirty',
               r'forty',
               r'fifty',
               r'sixty'],
#         'location': [],
        'gender': [r'male\b', r'female', 
                    r'trans', r'ftm', r'mtf', r'cis',
                    r'girl\b', r'boy\b', r'\bman\b', r'guy\b', r'woman', r'gu+rl', r'gii+rl',
                    r'non-binary', r'nonbinary', r'nb', r'agender', r'neutrois',
                    r'\bmom\b', r'\bdad\b', r'wife', r'husband', r'\bbrother\b', r'\bson\b', r'\bsister\b',
                    r'bigender', r'lgbt'],
        'sexual orientation': 
                     [r'gay', r'straight', r'lesbian', r'\bhomo',
                       r'bisexual', r'\bbi\b', r'pansexual', r'\bpan\b',
                       r'lgbt', r'queer',
                       r'\bace\b', r'\basexual', r'aro-ace', r'aro/ace',
                     ],
         'pronouns': [
             r'(?:\W|\b)she(?:\W|\b)', r'(?:\W|\b)her(?:\W|\b)',
             r'(?:\W|\b)he(?:\W|\b)', r'(?:\W|\b)him(?:\W|\b)',
             r'(?:\W|\b)they(?:\W|\b)', r'(?:\W|\b)them(?:\W|\b)',
             r'pronouns'
                ],
        'personality type': [
            r'(?:i|e|a)(?:s|n)(?:t|f)(?:j|p)',
            r'introvert',
            r'extrovert', 
            r'ambivert',
            r'\b[0-9]w[0-9]\b',
            ],
        'ethnicity/nationality': [r'\b{}\b'.format(el) for el in eths + nats] + 
                [r'latino', r'latina', r'cubana', r'cubano', r'chilena', r'chileno', r'mexicano', r'mexicana',
                r'palestinian'],
        'relationship status': [
            r'taken', r'married', r'single', r'engaged', r'husband', r'spouse', r'wife', r'newlywed',
            r'in a rl', r'in rl', r'in a relationship',
        ]
}
terms['sexuality/gender'] = terms['gender'] + terms['sexual orientation'] + terms['pronouns']

excl_terms = {
    'age': ['nsfw 18', '18 nsfw', '18 only', 'only 18'],
}

In [None]:
# Combine terms in regex
terms_re = {}
for cat in terms:
    terms_re[cat] = r'|'.join(terms[cat])

In [None]:
def has_category(cat, segments):
    ans = False
    
    if not isinstance(segments, list):
        return ans
    
    ans = any(re.search(terms_re[cat], s) for s in segments)
#     for c in terms[cat]:
#         ans = any(re.search(c, s) for s in segments)
#         if ans:
#             break
            
    if cat in excl_terms:
        for c in excl_terms[cat]:
            if any(c in s for s in segments):
                ans = False
            
    return ans

## Results

In [None]:
positives = {}
negatives = {}
truecat = {}
pos_matches = {}
outlines = []

for cat in tqdm(terms):
    print(cat)
    positives[cat] = {}
    negatives[cat] = {}
    truecat[cat] = {}
    pos_matches[cat] = {}
    
    for sp in split:
    
        preds = []
        preds = split[sp]['segments_25_nopunct'].map(lambda x: has_category(cat, x))

        # Get precision and recall
        positives[cat][sp] = preds[preds==True]
        negatives[cat][sp] = preds[preds==False]
        pos_matches[cat][sp] = set(positives[cat][sp].index).intersection(split[sp][split[sp][cat]==1].index)
        truecat[cat][sp] = split[sp][split[sp][cat]==1]

        if len(pos_matches[cat][sp]) > 0:
            prec = len(pos_matches[cat][sp])/len(positives[cat][sp])
        else:    
            prec = 0
            
        prec_str = f'{len(pos_matches[cat][sp])}/{len(positives[cat][sp])} ({prec:.1%})'
            
        rec = len(pos_matches[cat][sp])/len(truecat[cat][sp])
        rec_str = f'{len(pos_matches[cat][sp])}/{len(truecat[cat][sp])} ({rec:.1%})'
        f1 = 2 * prec * rec / (prec + rec)

        outlines.append([sp, cat, prec_str, rec_str, f1])
    
pd.DataFrame(outlines, columns=['dataset', 'predicted class', 'precision', 'recall', 'f1'])

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
# Examine misclassified
cat = 'ethnicity/nationality'
sp = 'train1000'

print('False positives:')
false_positives = set(positives[cat][sp].index).intersection(split[sp][split[sp][cat]!=1].index)
display(split[sp].loc[false_positives, ['segments_25_nopunct', cat]])

print('False negatives:')
false_negatives = set(negatives[cat][sp].index).intersection(split[sp][split[sp][cat]==1].index)
display(split[sp].loc[false_negatives, ['segments_25_nopunct', cat]])

In [None]:
row = 6137916
for term in split[sp].loc[row, 'segments_25_nopunct']:
    for t in eths:
        if re.search(t, term):
            print(term)
            print(t)

In [None]:
for cat in ['sexual orientation', 'gender', 'pronouns', 'sexuality/gender']:
    print(cat)
    print(has_category(cat, split['dev200'].loc[4539145, 'segments_25_nopunct']))

## Apply to corpus of descriptions

In [None]:
# Load blog descriptions
descs = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_recent100_restr25.pkl')
print(descs.columns)
len(descs)

In [None]:
# Annotate for identity categories
for cat in tqdm(terms):
    print(cat)
    descs[cat] = descs['segments_25_nopunct'].map(lambda x: has_category(cat, x))

## Modifications

In [None]:
sp = 'train500'
# sp = 'dev100'
incorrect = split[sp][split[sp]['restr_segments_25'].map(lambda x: 'aromantic' in ' '.join(x))]
# incorrect = split[sp][split[sp]['restr_segments_25'].map(lambda x: 'poly' in ' '.join(x))]
incorrect
# mask = split['train500']['restr_segments_25'].map(lambda x: 'poly' in ' '.join(x) if isinstance(x, list))
# split['train500'][mask]

In [None]:
# Corrections
sp = 'train500'
cat = 'pronouns'
val = 1
# sp = 'dev100'

# for i in incorrect.index:
for i in [3047905]:
    split[sp].loc[i, cat] = val
    
len(split[sp])

In [None]:
split['train500']['gender'] = split['train500']['gender'].fillna(0)

In [None]:
# Convert from string to list
split['dev100']['restr_segments_25'] = split['dev100']['restr_segments_25'].map(lambda x: x[2:-2].split("', '") if isinstance(x, str) else x)
split['dev100']['restr_segments_25']

In [None]:
# Remove mistake settings
split['train500'] = split['train500'][split['train500']['restr_segments_25'].map(lambda x: not isinstance(x, float))]
len(split['train500'])

In [None]:
# Remove mistake settings
split['dev100'] = split['dev100'][split['dev100']['restr_segments_25'].map(lambda x: not isinstance(x, float))]
len(split['dev100'])

In [None]:
split = {}
s = 'train1000'
split[s] = pd.read_csv(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.csv', index_col=0)
len(split[s])

In [None]:
# Convert from string to list
split[s]['restr_segments_25'] = split[s]['restr_segments_25'].map(lambda x: x[2:-2].split("', '") if isinstance(x, str) else x)
split[s]['restr_segments_25']

In [None]:
split[s].to_pickle(f'/usr0/home/mamille2/tumblr/data/list_descriptions_{s}.pkl')

In [None]:
split['dev100'].to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_dev100.pkl')
split['dev100'].to_csv('/usr0/home/mamille2/tumblr/data/list_descriptions_dev100.csv')
split['train500'].to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_train500.pkl')
split['train500'].to_csv('/usr0/home/mamille2/tumblr/data/list_descriptions_train500.csv')

## 1-time

In [None]:
def split_rm_punct(segments):
    """ Return segments split on punctuation, punctuation removed """
    
    new_segs = []
    
    for seg in segments:
        new_seg = ' '.join(re.split(r'\W', seg))
        new_seg = re.sub(r'\W', ' ', new_seg)
        new_seg = re.sub(r'\s+', ' ', new_seg).strip()
        new_segs.append(new_seg)
        
    return new_segs

In [None]:
descs['segments_25_nopunct'] = list(map(split_rm_punct, tqdm(descs['restr_segments_25'].tolist())))

In [None]:
descs.to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_recent100_restr25.pkl')

In [None]:
preds = []
preds = split['dev200']['restr_segments_25'].map(lambda x: has_category('sexuality/gender', x))
preds[preds==True]

In [None]:
split['dev200'].columns

In [None]:
split['dev200'].rename(columns={'personal description/ commentary': 'personal description/commentary',
                               'ethnicity/ nationality': 'ethnicity/nationality'}, inplace=True)

In [None]:
split['dev200'].to_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions_dev200.pkl')

In [None]:
has_category('sexuality/gender', split['dev200'].loc[4539145,'restr_segments_25'])

In [None]:
has_category('gender', ['male', '28'])

In [None]:
has_category('gender', ['girl', '28'])

In [None]:
has_category('sexual orientation', ['pan as fuck', '28'])

In [None]:
has_category('pronouns', ['she/her', 'them', 'he'])

In [None]:
has_category('pronouns', ['banshee'])

In [None]:
has_category('pronouns', ['he they'])

# Qualitatively examine description segments

In [None]:
# Load descriptions
list_desc_data = pd.read_pickle('/usr0/home/mamille2/tumblr/data/list_descriptions.pkl')
print(len(list_desc_data))
print(list_desc_data.columns)

In [None]:
pd.set_option('display.max_colwidth', 999)

In [None]:
samp = list_desc_data.sample(30)
samp

# Examine Brown clustering of description segments

In [None]:
with open('/usr0/home/mamille2/brown-cluster/desc_segments_20-c50-p1.out/paths') as f:
# with open('/usr0/home/mamille2/brown-cluster/desc_segments_20_freq-c50-p1.out/paths') as f:
    lines = f.read().splitlines()
    
len(lines)

In [None]:
outlines = []

for l in lines:
    l_split = l.split('\t')
    if len(l_split) == 3:
        outlines.append(l_split)
#         clu['all'][l_split[0]].append(l_split[1])
    
# print(len(clu['all']))
# clu['all'].keys()

clu = pd.DataFrame(outlines, columns=['cluster', 'word', 'freq'])
clu

In [None]:
clu['freq'] = clu['freq'].astype(int)

In [None]:
clu.sort_values(['cluster', 'freq'], inplace=True, ascending=False)
clu

In [None]:
pd.set_option('display.max_rows', 999)

In [None]:
clu

In [None]:
for val in clu['cluster'].unique():
    rows = clu[clu['cluster']==val]
    print(rows.head(20))
    print()

In [None]:
clu.to_csv('/usr0/home/mamille2/tumblr/results/desc_segments_brown_clusters.csv', index=False)

# Reduce dimensionality of description embeddings

In [None]:
# desc_embs = np.load('/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_avg.npy')
# desc_embs = np.load('/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy')
desc_embs = np.load('/usr0/home/mamille2/tumblr/data/desc_recent5_avg.npy')
desc_embs.shape

In [None]:
# Get labels (top prob clusters)--just load saved probabilities
# probs = np.load('/usr0/home/mamille2/tumblr/data/gmm_50_desc_avg_probs.npy')
# probs = np.load('/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc_avg_probs.npy')
probs = np.load('/usr0/home/mamille2/tumblr/data/recent5_gmm_50_desc_avg_probs.npy')
probs.shape

In [None]:
clusters_assgn = np.argsort(probs, axis=1)[:,-1] 
clusters_assgn.shape

## PCA

In [None]:
pca = PCA(n_components=2)
reduced = pca.fit_transform(desc_embs)
print(reduced.shape)
print(pca.explained_variance_ratio_)

## t-SNE

In [None]:
# Reduce dimensions to 50 first
pca = PCA(n_components=50)
pca_reduced = pca.fit_transform(desc_embs)
print(pca_reduced.shape)
print(pca.explained_variance_ratio_)

In [None]:
inds = np.random.choice(len(pca_reduced), int(1e4))
samp = pca_reduced[inds]

tsne = TSNE(n_components=2, verbose=2)
# reduced = tsne.fit_transform(desc_embs)
reduced = tsne.fit_transform(samp)
print(reduced.shape)

## Graph clusters of reduced dimensions

In [None]:
# If sampled, need to same cluster assignments
clusters_assgn = clusters_assgn[inds]
len(clusters_assgn)

In [None]:
%matplotlib inline

fig = plt.figure(figsize=(15,10))
scatter = plt.scatter(reduced[:,0], reduced[:,1], c=clusters_assgn, s=10)
plt.colorbar(scatter)
# plt.axis([-1,2.5,-4,1.5])
plt.axis([-3,10,-2,5])
# plt.title("PCA of cotrained description embeddings")
plt.title("PCA of description embeddings")
# plt.title("t-SNE of description embeddings (10k)")
# fig.savefig('/usr0/home/mamille2/tumblr/results/pca_cotrain.png', dpi=100)
fig.savefig('/usr0/home/mamille2/tumblr/results/pca_desc_recent5.png', dpi=100)
# fig.savefig('/usr0/home/mamille2/tumblr/results/tsne_cotrain.png', dpi=100)
# fig.savefig('/usr0/home/mamille2/tumblr/results/tsne_desc.png', dpi=100)
fig.show()

In [None]:
clu_ctr = Counter(clusters_assgn)
clu_ctr

# Run GMM clustering on blog descriptions

In [None]:
# Load data
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

In [None]:
X = desc_emb[:500000,:]
clf = GaussianMixture(n_components=50, verbose=2, warm_start=True)
clf.fit(X)

In [None]:
outpath = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'

with open(outpath, 'wb') as f:
    pickle.dump(clf, f)

## Try to continue training a model

In [None]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [None]:
X = desc_emb[:500000,:]
clf.fit(X)

# Examine trained GMM

In [None]:
# Load data
# desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_embeddings_avg.npy'
# desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_avg.npy'
desc_emb_path = '/usr0/home/mamille2/tumblr/data/desc_recent5_embeddings_sum.npy'
desc_emb = np.load(desc_emb_path)
desc_emb.shape

In [None]:
# Load model
# path = '/usr0/home/mamille2/tumblr/data/gmm_20_desc.pkl'
# path = '/usr0/home/mamille2/tumblr/data/gmm_50_desc.pkl'
# path = '/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc.pkl'
path = '/usr0/home/mamille2/tumblr/data/gmm_cotrain_50_desc_sum.pkl'

with open(path, 'rb') as f:
    clf = pickle.load(f)

In [None]:
# Load descriptions
# path = '/usr0/home/mamille2/tumblr/data/en_blog_descriptions.pkl'
path = '/usr0/home/mamille2/tumblr/data/desc_recent5.pkl'
desc_df = pd.read_pickle(path)

# descs = desc_df['parsed_blog_description'].tolist()
desc_toks = desc_df['tokenized_blog_description'].tolist()

In [None]:
# clf.bic(desc_emb[:500000,:]) # -615M for 20 comps, -652M for 50 comps
clf.bic(desc_emb)

In [None]:
clf.lower_bound_

In [None]:
# Get highest weights
wted_comps = np.argsort(clf.weights_)[::-1]
wted_comps

## Examine datapoints with highest probabilities assigned for each cluster; examine cluster assignments

In [None]:
# probs = clf.predict_proba(desc_emb[:500000,:])
probs = clf.predict_proba(desc_emb)
probs.shape

In [None]:
top_probs = np.argsort(probs, axis=0)[::-1]
top_probs.shape

In [None]:
def top_descs(probs, descs, k, order, vocab_file=None):
    """ Prints top k descriptions for each component"""
    
    top_probs = np.argsort(probs, axis=0)[::-1]
    
    if vocab_file: # dict [n_words]: [vocab]
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    
    for i in order:
        print("Component {}".format(i))
        col = top_probs[:,i]
#     for i, c in enumerate(top_probs.T):
        
        for el in col[:k]: 
            if vocab_file:
                print('\t' + ' '.join(d if d in vocab[100000] else '<unk>' for d in descs[el])) # for tokenized
            else:
                print('\t' + ' '.join(d if d in vocab[100000] else '<unk>' for d in descs[el])) # for tokenized
#             print('\t' + descs[el])
            
        print()

In [None]:
# Top descriptions from halfday co-training, sum
top_descs(probs, desc_toks, 20, wted_comps, '/usr0/home/mamille2/tumblr/data/halfday_top5_vocab100000.pkl')

In [None]:
# Top descriptions from just descriptions (50 components)
top_descs(probs, descs, 20, wted_comps)

In [None]:
# Top descriptions from halfday co-training, averages
top_descs(probs, descs, 20, wted_comps)

## Find closest words in embedding space to cluster means
Doesn't really mean anything, as are averaging embeddings across all words in a post and 'dmitry' is closest to each cluster mean

In [None]:
path = '/usr0/home/mamille2/tumblr/data/desc_ftvecs100000.pkl'

with open(path, 'rb') as f:
    wd_embs = pickle.load(f)
    
len(wd_embs[100000])

In [None]:
closests = []
dist = euclidean

for m in tqdm(clf.means_):
    closest_dist = np.infty
    closest_wd = None
    
    for wd, emb in wd_embs[100000].items():
        if dist(m,emb) < closest_dist:
            closest_wd = wd
            
    closests.append(closest_wd)
    
closests

# Sample blog descriptions for analysis

In [None]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

In [None]:
pd.set_option('display.max_colwidth', 999)

In [None]:
s = data.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

## Blog descriptions from blogs that have text posts in halfday

In [None]:
text_posts = pd.read_pickle('/usr0/home/mamille2/tumblr/data/halfday_text.pkl')
print(len(text_posts))
text_posts.columns

In [None]:
# Blogs that also have text descriptions
tumblogs_allposts = text_posts['tumblog_id'].unique()
len(tumblogs_allposts)

In [None]:
count_series = text_posts.groupby(['tumblog_id']).size()

In [None]:
tumblogs_2posts = count_series[count_series >= 2].index

In [None]:
tumblogs_5posts = count_series[count_series >= 5].index

In [None]:
tumblogs_10posts = count_series[count_series >= 10].index

In [None]:
data_text = data[data['tumblog_id'].isin(tumblogs_allposts)]
len(data_text)

In [None]:
data_text2 = data[data['tumblog_id'].isin(tumblogs_2posts)]
len(data_text2)

In [None]:
data_text = data[data['tumblog_id'].isin(tumblogs_5posts)]
len(data_text)

In [None]:
data_text10 = data[data['tumblog_id'].isin(tumblogs_10posts)]
len(data_text10)

In [None]:
# Sample from those who have at least 10 text posts in halfday

s = data_text10.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

In [None]:
# Sample from those who have at least 2 text posts in halfday

s = data_text2.sample(n=10)
s.loc[:, ['tumblog_id', 'tumblr_blog_name', 'tumblr_blog_title', 'tumblr_blog_url', 'timezone', 'tumblr_blog_description', 'parsed_blog_description']]

# LSA on blog descriptions

## Get blog descriptions

In [None]:
# Load data
# data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_nan_blog_descriptions.csv')
data = pd.read_csv('/usr0/home/mamille2/tumblr/data/en_blog_descriptions.csv')
print(len(data))
data.columns

In [None]:
blog_descs = data['parsed_blog_description'].values
blog_descs.shape

## Get tfidf matrix

In [None]:
tfidf = TfidfVectorizer(max_features=100000)
tfidf_mat = tfidf.fit_transform(blog_descs)
tfidf_mat.shape

## Do SVD

In [None]:
svd = TruncatedSVD(n_components=300)
svd_mat = svd.fit_transform(tfidf_mat)
svd_mat.shape

In [None]:
svd.explained_variance_ratio_.sum() 
# 17% with 100 components over full vocab 
# 22% with 100 components over top 100k words
# 34% with 300 components over top 100k words

In [None]:
# Words x components matrix
svd.components_.shape

In [None]:
# word features
feats = tfidf.get_feature_names()
len(feats)

## Get ranked word features by component

In [None]:
top = np.argsort(svd.components_)[:100]
top.shape

In [None]:
top_sub = top[:, :100]
top_sub.shape

In [None]:
feats2names = np.vectorize(lambda x: feats[x])
top_feats = feats2names(top_sub)
top_feats

In [None]:
for i, factor in enumerate(top_feats):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

In [None]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topwords.npy', top_feats)

## Get ranked documents by component

In [None]:
top_docs_idx = np.argsort(svd_mat.T) # Select 10 highest components
top_docs_idx.shape

In [None]:
top_docs_idx = top_docs_idx[:100]
top_docs_idx.shape

In [None]:
top_sub = top_docs_idx[:, :100]
top_sub.shape

In [None]:
idx2docs = np.vectorize(lambda x: blog_descs[x])
top_docs = idx2docs(top_sub)
top_docs

In [None]:
for i, factor in enumerate(top_docs):
    print('Factor {}'.format(i))
    pprint(factor)
    print()

In [None]:
np.save('/usr0/home/mamille2/tumblr/data/lsa_descriptions_topdocs.npy', top_docs)