In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split

from IPython.display import display

data_dirpath = '/usr2/mamille2/tumblr/data'

# Qualitatively examine non-list annotated descriptions

In [7]:
# Load annotated descriptions
descs = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_recent100_100posts.pkl'))
print(len(descs))
print(descs.columns)
print()

# Basic stats
cats = ['age', 'location', 'gender', 'sexual orientation', 'pronouns', 
        'personality type', 'ethnicity/nationality', 'relationship status', 'roleplay',
        'fandoms', 'interests', 'weight',
        'sexuality/gender']

outlines = []

for col in cats:
    annotated = sum(descs[col])
    outlines.append([col, annotated, annotated/len(descs) * 100])
#     print(f"{col}: {annotated}\t{annotated/len(descs): .1%}")

# No annotations
any_annote = sum([any(line) for line in list(zip(*[descs[col] for col in cats]))])
no_annote = len(descs) - any_annote
# print()
# print(f'# with no annotations: {no_annote}\t{no_annote/len(descs): .1%}')
outlines.append(['none', no_annote, no_annote/len(descs) * 100])

table = pd.DataFrame(outlines, columns=['category', 'n_instances', 'percentage of descriptions'])\
        .sort_values('percentage of descriptions', ascending=False).reset_index(drop=True)
table

20266
Index(['tumblog_id', 'activity_time_epoch', 'tumblr_blog_name',
       'tumblr_blog_title', 'tumblr_blog_description', 'tumblr_blog_url',
       'tumblr_blog_theme', 'is_group_blog', 'is_primary', 'is_private',
       'created_time_epoch', 'updated_time_epoch', 'timezone', 'language',
       'blog_classifier', 'generated_date', 'parsed_blog_description', 'age',
       'gender', 'sexual orientation', 'pronouns', 'personality type',
       'ethnicity/nationality', 'relationship status', 'sexuality/gender',
       'age_terms', 'gender_terms', 'sexual orientation_terms',
       'pronouns_terms', 'personality type_terms',
       'ethnicity/nationality_terms', 'relationship status_terms',
       'sexuality/gender_terms', 'location_terms', 'location',
       'roleplay_terms', 'roleplay', 'fandoms_terms', 'fandoms',
       'interests_terms', 'interests', 'weight_terms', 'weight'],
      dtype='object')



Unnamed: 0,category,n_instances,percentage of descriptions
0,none,9038,44.596862
1,sexuality/gender,4722,23.300109
2,age,3552,17.526892
3,interests,3345,16.505477
4,pronouns,2489,12.281654
5,roleplay,2040,10.066121
6,gender,2038,10.056252
7,fandoms,1837,9.064443
8,location,1360,6.710747
9,sexual orientation,1309,6.459094


## Evaluate quality of annotations

In [3]:
pd.set_option('display.max_colwidth', -1)

In [8]:
for col in sorted(cats):
    display(descs[descs[col]==True].sample(30).loc[:, ['parsed_blog_description', col, f'{col}_terms']])

Unnamed: 0,parsed_blog_description,age,age_terms
3873753,"Quant (/kwänt/, Dutch, meaning ""Rogue."") Voshell. 26. Far from home. Lvl 19 | Human | Thief",True,[19]
4193718,"ADRIANA CLARINGTON; You may know me, you may not know me. That all depends on what you do in your free time. If it's tuning to ESPN, then you're probably used to my mug. Hopefully you know me as #1 Cowboys Fan and, current lover of the one and only Dak Prescott. That's my man right there. For those of you who aren't into the world of sports, I'll do a quick introduction. The name's Adriana Clarington and I'm twenty-six years old. I work for ESPN as a sideline reporter, but in a year or two you can bet your ass I'll be the next Erin Andrews . Texan. That right there is all you need to know. If you want to to make me happy, bring me a Chipotle bowl with a glass of Pinot Noir. The whole bottle is preferable.",True,[twenty]
1102765,They/Them - 20 - California ~DEPRESSION HOE~,True,[20]
2309717,"Mel•19•INTJ•'Strayian Sometimes NSFW so tread lightly Sold my soul to fund Big Finish purchases I have too many interests to keep track of Not a shipper anymore, just loves fictional chatacters This blog is a mess (currently in process of cleaning up) (Icon by @sanjism)",True,[19]
4133171,23 | US | Straight | Taken | (tough) Lee,True,[23]
1226874,"I am unaccompanied, I own the depths of my existence ” Can you say the same for yourself? . Angelica Marie Rogers Van-Kamp Twenty-seven. Businesswoman and avid reader. Proud owner of XS, Omnia, Saint James Chateaux & Acqualina Resort. +",True,[Twenty]
2949980,18 year old Vampire child with a penchant for lust and cute things 🍼 Also follow my Goth account @xdeadsmeex,True,[18]
3943001,"M | 22 | Sarasota, FL | 18+, minors will be blocked | This is where I keep my thoughts - mostly private ones.",True,[22]
4053005,"-ESG | 15 | ""He""; ""him""; ""his"". | Italy",True,[15]
3808982,18 // he/them // i really love my partner...and kitties. don't forget kitties.,True,[18]


Unnamed: 0,parsed_blog_description,ethnicity/nationality,ethnicity/nationality_terms
2290585,Skydiving San Diego is North America’s longest operating Skydive business offering the most incredible views of Southern California’s topography.,True,[Southern]
3769897,*TRIGGER WARNING** disclaimer; I do not promote drug use at all. 🇻🇬Australian/ 18/ tauras Just another person with a habit to abuse drugs. Ice princess,True,[Australian]
1413134,18 year old Ashkenazi & Sephardi Jewish Shitposter. Founder of Kahanblr & Member of Toryblr.,True,[Ashkenazi]
1216164,"Howdy fellas. I uh...like to draw, I'm an English student, and an anti-SJW. That's literally it. Take a glance at my Deviantart",True,[English]
1619690,"female/19/bi/white - i truly believe pearl jam's ""jeremy"" is the highest form of art - links",True,[white]
3126822,"Autumn . 23. Leo/Virgo. Fae, forest, green witch. British as all get-out. Learning, growing, evolving. Aesthetically messy and typically flighty. Do no harm, but take no sh*t.",True,[British]
2724101,🌟I'm a star. he taught me that.🌟 Greetings I am a therian of the species european wild cat and red fox and I have the ears to prove it. I don't really care for the construct of gender so call me what you want. Please enjoy this cluster fuck that I call a tumblr.,True,[european]
3194458,Berwald. Swedish. Taken by his Romanian.,True,"[Swedish, Romanian]"
3704406,"Hello! I question why you're here but welcome you all the same! I'm the futuristicallycoldfox(because I couldn't resist the tumblr generated irl) but you can call me Frenchfryem. Or French fry. Or fry. Or Em. Or any variation of that or my irl. Honesty, anything is fine so long I know it's me. OH! AND I AM ACE! (ah, how lovely it is to say so)",True,[French]
3060501,"Watch anime English Dubbed, English Subbed",True,"[English, English]"


Unnamed: 0,parsed_blog_description,fandoms,fandoms_terms
3361439,"My name is Em. I reside in the Twin Cities, and I'm not a huge fan of having responsibility.",True,[fan]
3264485,indie rp blog multi-verse & ship friendlymun & muse: 20+ background artwork by me. d o n ' t s t e a l.,True,[verse]
3489915,"Hello, I am Hamilton trash. You can call me Mylee! Or Aiglentina, the name I was born with. I just like the name Mylee. Warning: I swear a lot, I can't help it for some reason. Another warning, this is now a mix of Harry Potter and Hamilton. Player of Cards against Hamilton Proud bisexual Tumblr mom is matt10nt This blog is safe for everyone except homo/heterophobes and nazis and trump supporters Please send in an ask! It makes my day. Sorry for long description",True,"[Hamilton, Harry Potter, Hamilton, Hamilton]"
4149020,"I'll put myself back into the narrative.. Welcome. I am Elizabeth Schuyler, wife of Mr. Alexander Hamilton who is never satisfied. Feel free to ask anything you wish. (INDIE ROLEPLAY FOR ELIZA FROM HAMILTON 18+ Read about, Rules and verses before interacting.)",True,"[Hamilton, HAMILTON, verse]"
2963274,"/ Irina / Female / Introvert / Pisces / Loves FF (new to the series), Horror (Movies, Games, Books) / Listens To Starset, RED, Hollywood Undead, Imagine Dragons, Papa Roach / Interested In Serial Killers, Creepypastas, Medical History, Plague Doctors, History, Gaming, Video Games, Anime, Manga, Space, Astronomy / I Post Creations I Make On Polyvore And Reblogs / My Acc. On Polyvore Is: Perfex",True,[series]
1055285,"Mariana. Female. Bored. Equality. This is my general/humour blog. My asthetic ones are: time-does-not-care and cannotdefinebeauty. The 100. X-men. NCIS and other fandoms that i cannot remember, there are too many.",True,[fandom]
3092392,canon-divergent shiro yoshiwara | adekan | mun & muse 18+ | triggering content,True,[canon]
332072,"My name is Nicole K. Literally, if you were to describe me, I am your typical Asian girl. This is a personal tumblr. Extremely rarely you may find original posts. Personal desc: Perfectionist. Origami. K-Pop. Anime. Band geek. Drawing. Vocaloid. Puzzles. Games. Dark thrillers. A love for all things cute. Ayup, I'm about as typical as you can get :3",True,[K-Pop]
3481775,"🌸Lauren 🌻23 🏡Nottingham, United Kingdom ❤Taken ♓Pisces 💉Type One Diabetic 🌱Nature 🔮Wiccan 👭Bisexual 👻Horror Movie Fan 🐈Cat Momma 👽Slightly Weird",True,[Fan]
2981378,We all know somebody who knows somebody who's doing great I know some people who know people who are flying straight ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Selective Indie RP (Mutlimuse/Mulitship/ MultiVerse),True,[MultiVerse]


Unnamed: 0,parsed_blog_description,gender,gender_terms
2516839,"[5'7|Taurus|Hufflepuff|ISFJ] I am a fully""functional""fan girl who is into so many fandoms it's ridiculous should probably go interact with humans but not today satan, not today. [Fandoms] ·Supernatural ·Criminal Minds ·Teen Wolf ·The Vampire Diaries ·The Originals ·Fantastic Beast and Where to Find Them ·Doctor Strange ·Sherlock BBC ·Voltron Legendary Defender ·Gotham ·Arrow ·Miraculous: Tales of Ladybug & Cat Noir ·Etcetera",True,[girl]
2567430,Holy fuck I'm gay I especially gay for Lady Gaga... and basically any other woman... I need help,True,"[Lady, woman]"
3266605,♡ bisexual ♡ hypersexual ♡ anti-cishet ♡ he/him ♡ ♡ i love my boyfriends min yoongi and jung hoseok and my girlfriend leadermon ♡ ♡ please read my about! ♡ ♡ prev. yoongispisskink ♡,True,[cis]
2714470,"Wanda Maximoff. scarlet witch. sister.an avenger.Oh, but why did God give us fragile hearts and such brutal bones to house them in? I want to be all t e n d e r n e s s but my hands are much too h a r s h.",True,[sister]
2846642,— ☾ ✰ ° › she wears strength and darkness equally well. The girl has always been half goddess half hell.,True,[girl]
1685036,"Cameron, 19, nonbinary/trans, @whatsnewbussycat",True,"[nonbinary, trans]"
3374537,"A giant jumble of fangirl-ness, quotes, and a bunch of other things.",True,[girl]
3610526,| Zelle | Minor | Nonbinary | | Fae/Faer or They/Them | Links,True,[Nonbinary]
2093538,"Lonely and neutrois ~ Z/Gale/Frisk/King/ etc. ~ they/them ~ current special interests: Pokemon, Steven Universe, Nanatsu no Taizai ~ deviantart: napstahappenings",True,[neutrois]
3401237,"“this above all: to thine own self be true, and it must follow, as the night the day, thou canst not then be false to any man.”",True,[man]


Unnamed: 0,parsed_blog_description,interests,interests_terms
4384349,"""Beyond this place of wrath and tears, looms but the Horror of the shade, and yet the menace of the years, finds and shall find me unafraid."" slightly dark Credence Barebone rp, 18+ Asks always open",True,[Horror]
1877195,"Politics, memes, and too much Shakira",True,[memes]
1852716,final fantasy multi muse blog/ the main muse is the lioness. if you would like to talk with another muse please let me know (face claim for seon is dowra from mobinogi. art doesn't belong to me),True,[art]
4021401,"Singer/songwriter. Harry Potter is life. I play piano, too.",True,[write]
1043335,"Hodgepodge blog of animals, personal posts, funny things. Jewish conversion student.",True,[animals]
3336078,Hi ^-^! Im Chebba or Cheebs for short 💕💕. Meyer and Briggs say I'm a INFP which basically means I'm a Unicorn Princess 💕. I'm happily taken by the cutest prince 🤴🏻promote lots of self love 💕I am here to spread positivity and to make some friends too so please feel free to message me! I love pink positivity posts 💕I am also a huge Star Wars geek and I am obsessed with the Walking Dead Game. I will tag any gaming posts under #CheebsPlaysGames. I hope you enjoy my cute little blog 💕💕F,True,"[gaming, Games]"
3454778,"Amanda | Sweden | Coffee freak | I'm a happy virus and EXO's Chanyeol is my husband | I belong to the fandoms of EXO, GOT7, BTS, NCT, PENTAGON & MONSTA X",True,[Coffee]
3825414,Abigail | 18 | Transwoman/Lesbian | Writer,True,[Write]
3338494,Ally ~v bi~smol~annoying~obsessed with drag queens and phan~also bands and sparkles~those are pretty good too~she/her ~ plz message me im lonley and need friendos. ~ (╭☞ ✿ ᗜ ✿ )╭☞,True,[bands]
3382802,im ren and I'm 19 and welcome to my fan art blog,True,[art]


Unnamed: 0,parsed_blog_description,location,location_terms
1558264,Jewish lesbian 19 ATL,True,[ATL]
3289840,"Welcome to London, England where you get to decide what journey in life you take. From the beautiful waters, to the London Eye and of course Buckingham Palace, there’s all sorts of life altering choices and adventures that you can get lost in. Which road will you take? London;s Calling Roleplay is a small appless multi muse based roleplay focusing on character development. Applications are accepted daily and we have a small welcoming group willing to form connections and play in a safe environment. mobile navigation Established Nov 2016 Currently: Accepting daily",True,"[London, London, London]"
3784875,Sarah // 17 // UK // gay af,True,[UK]
1867610,Lesbian...22...Iowa...Snapchat: erb66n,True,[Iowa]
3961617,"renee young/paquette. las vegas resident, canada born. the maury povich of wwe. one of the stars of total divas, dog mom to blue, and pretty likeable? [rpg blog]",True,[canada]
1343436,"established in 1837 , wilson, oregon is the heart and soul of fayette county. as time has passed, wilson has grown, but the strong connections within the community have never weakened. day to day visitors, commuters, and college students come and go, but the sense of safety and camaraderie that wilson provides convinces many to kick their shoes off and stay a while. in wilson, there is no such thing as an unwelcome stranger. mn.",True,[oregon]
2382606,"Hello my names shaneka and my blog sucks. She/her. Christchurch, New Zealand, come kill me.",True,[New Zealand]
1796332,NFL Ø (@NFL_Ø) Nevada Football Lines : Sports Fantasy Casino Game Betting,True,[Nevada]
1089892,Casey•Capricorn•California,True,[California]
2230702,"Hi, I'm Aryn. I'm 17. Bisexual. Ohio.",True,[Ohio]


Unnamed: 0,parsed_blog_description,personality type,personality type_terms
315463,home ask || sub about ~ brittany 19 ○ f ○ virgo (she/her),True,[virgo]
232494,INFP,True,[INFP]
2257947,Maddy| 17| She\Her| Soft for OT4|A Harrie and a neutral who likes to say away from drama,True,[neutral]
1222163,Somewhere in South Florida. NSFW. 2A supporter. LEO and military friendly. I'm hip.,True,[LEO]
2684545,"my email is raidmyparade@yahoo-com .It's losechester, not 10sechester btw. She/her. Gemini. I don't write fanfic, but if someone has an idea for a story, I am a Fairly well writer, so if you submit your idea (if you want help, or just don't write yourself) I can write it. I love asks. If you wanna be friends, just send a message, or ask, and say anything... Literally anything. I've made tons of friends this way. I love to be tagged in things, so if you think of me, don't feel like you're gonna bother me or anything. Pacific time zone. Idk if you wanna know anything else, just ask. :)",True,[Gemini]
4239585,fred arthur weasley ii seventeen / seventh year slytherin halfblood,True,[slytherin]
3639018,INFP ||| Star Wars + Hamilton + Turn + any random fandoms ||| Most of my posts are queued ||| ASK ME QUESTIONS and I also take fic requests! ||| AO3: @The_Jedi_Temple_Archives,True,[INFP]
2968103,welcome to my blog!! 18 | INTP | Atheist | ♂ | gay | classical liberal | Minarchist | ♎ (not affiliated with icyarguments),True,[INTP]
1321272,16/polish/entp,True,[entp]
1849328,"Dannii. 26. Aries. ISFP. Strong, independent trash who don't need no can. NSFW dumpster fire majoring in Erumike & Eruri with a minor in literally every other SNK ship and many gay sports animu.",True,"[Aries, ISFP]"


Unnamed: 0,parsed_blog_description,pronouns,pronouns_terms
2774815,raye. she//her. hapa. half aesthetic queen half fandom trash. dosdudettes on polyvore and rayeofmanyrains on pinterest,True,"[she/, /her.]"
1854359,[Al/Female(she/her)/Scorpio(nov4)/Mexican/ace]only reblogs [Kin:Shay & Paul] (spanish:Al-pensamientoslocos],True,"[(she/, her)]"
3623370,USED FOR RPG. NOT HER,True,[HER]
3271282,"His opponent was a slender young woman with delicate, almost elfin features that were emphasized by her cropped black hair and elegantly pointed ears.",True,[her]
2179541,"❝ Use him. Fuck him and let him rock your world. But do not stay the night, don’t stay in bed and cuddle. After you cum, roll off him and walk out the door. ❞ || indie smut rp || mun && muses 20+ || NSFW NOT TAGGED.",True,"[him., him, him, him]"
2162096,"Hello! Feel free to call me Will, or any of my kin type's names. I use he/him pronouns Kin About BYF",True,"[he/, him, pronouns]"
3795190,"hajime tenga : he's thinking ""hold my hand, hold it tight whether the weather is cold tonight. i promise it will be alright "" she's on his mind day and night, thinks he takes her for granted but to her surprise, he needs her more than she needs him. won't fight, no just walks away . won't tell his secrets just keeps them safe . that's why she's, she's not just another face. --kiznaiver 02.",True,"[he', she', he, her, her, he, her, she, him., them, she', she']"
1332331,"~~~~~~~~she her, 5'3"", i need more friends~~~~~~~~~~~",True,"[~she, her,]"
1042785,"◤ ✕* º aziza ""azi or ziza"" sera saliba. 20. pre-med. sigma pi lambda. she looks so pure, so angelic, but be full of care, the most beautiful flowers, can be the most toxic. she appears to be an angel, a saint, but be careful, even satan was once an angel.",True,"[she, she]"
3127702,"| call me knock out | fictionkin | latinx-mexican | he/them/zyr | | transformers,, and the occasional fixation | |im an emotional wreck | | FAQ+Ask | Write a thing |",True,"[he/, them/]"


Unnamed: 0,parsed_blog_description,relationship status,relationship status_terms
3807349,"31, Mom, Wife. Lover of: Coffee, Rain, Gardening, Pumpkins, Autumn, Christmas.",True,[Wife]
2695919,"28 years old, she/her, bisexual, married, Hufflepuff, INFJ, Varric Tethras fangirl (I'm not sorry). Multifandom blog and teaching myself how to art. I try to maintain a discourse free and postive blog. I ship all the ships but Varric/Cassandra is otp. Fandoms: Dragon Age, Mass Effect, Horizon: Zero Dawn, Sense8, Orphan Black, and some sprinklings of other things. Everything is tagged. Let me know if you need something tagged specifically.",True,[married]
3571765,Laura. 21. Taken. Hunting and fishing addict. Flyers hockey/semi-retired ice hockey defenseman. Chevy runs deep. Vulture Culture. Love my 97 Silverado. Find me in the woods or on the lake,True,[Taken]
1368207,"✘ Vera ✘ Gay ✘ 19 ✘ Single✘ They/Them Hello, it's nice to meet you all, as it says above, my name is Vera. I've been on tumblr for more years than I'd prefer to count",True,[Single]
181453,Megan. Eighteen. Happily taken💗💗,True,[taken]
3795524,Family and Country 1st. Intergalactical Ground Commander Crew Earth 🌏 Force One. Cosmonauts. Deployed on Mission 2 Earth. We call Home. Married/Gay. Kamikazes ⭕️🏳️‍🌈🇵🇱🇨🇦🇺🇸🇩🇪 🇮🇹Yes. We Are Hotter Than U!,True,[Married]
3491147,❛ i've been waiting all year to get the hell up out of here and throw away my fears . i'm so faded off of all the things that i've taken and maybe i'm not really drunk ; maybe i'm really good at faking . ❜,True,[taken]
4135003,"✨ Yo! I'm Kai, your local sapphic witchy alien. They/she. 20 years young. Single as a pringle. Coffee fuels my soulless body. Sapphic positivity. Witchy stuff. Kpop shitposting. BLACKPINK, hella Rosé biased. My qpp Kier is an asshole but I love them. Feel free to message me anytime! Instagram & kik @ smolkaibean ✨",True,[Single]
3894224,29 single horny lesbian looking for a sexy fuckbudy lesbian who loves dirty talking n even will phonesex with me,True,[single]
2744554,Hi! My name is Sharon. 20. Taken,True,[Taken]


Unnamed: 0,parsed_blog_description,roleplay,roleplay_terms
3044657,indie rp: Arthur from the Tick; any iterations icon by andrexx on deviantart mainblog to womencanbeanythingthesedays,True,[rp]
2002804,Independent Roleplay Blog for Sinbad (Magi: The Labyrinth of Magic) penned by Kaz circa December 2015,True,"[Roleplay, penned]"
3821274,Marty + 21 + your friend + USA-EST literate + para/script/one liners crossover friendly + oc friendly A multi-muse roleplay blog by someone desperately trying to stop gathering muses. M!A Statuses: none [links for mobile],True,"[oc, muse, roleplay, muse, M!A]"
955467,"melancholy: a feeling of pensive sadness, typically with no obvious cause ☾ rp blog. mun & muses +18. semi-selective. please read guidelines.",True,"[rp, muse, semi-selective]"
3574523,Indie. Highly Selective .Beth Greene Of AMC's The Walking Dead Penned By Arya,True,"[Selective, Penned]"
3644672,NOT ACCEPTING NEW THREADS FOR THE MOMENT Independent roleplayer for Elmer Fudd from Looney Tunes.,True,[roleplay]
3899657,"O FORTUNA, OF GOLDEN ICHOR AND SILVER LIPS ! O LADY OF FAME AND FORTUNE ; HOW LUCKY MUST I BE TO HAVE YOUR SUNSHINE SMILE SHINE ON ME !( indie demigod oc. written by amelia. est. march 04, 2017. )",True,[oc]
3542220,27 year old journalist with the Daily Prophet. Ravenclaw alum. Private RP Blog for Perierunt Stellarum.,True,[RP]
3556874,indie roleplay blog/ SFW,True,[roleplay]
3880684,sweet dee reynolds rp blog ✌️,True,[rp]


Unnamed: 0,parsed_blog_description,sexual orientation,sexual orientation_terms
2745615,"sarah . twenty one. bisexual space princess. I like height differences, age gaps, Disney movies and the Skywalker's. prev. reylukeanakins",True,[bisexual]
2505303,ri / they / pan,True,[pan]
3921684,18+ bi baby,True,[bi]
294525,Holly Peters. Twenty Three. Bisexual. Guard.,True,[Bisexual]
3046112,Masters degree in the art of being gay and a doctorate in overthinking,True,[gay]
2539749,ethan || 15 || they/them || space ace || i'm a furry so don't follow if you hate that lmao ||,True,[ace]
3795359,Just a 22 yr old bi nerd currently blogging about sanvers and life,True,[bi]
2242541,19. TN. Bi.,True,[Bi]
3634623,33. Police Officer. Divorced. One daughter. Openly Gay. Missy Peregrym. { FOLLOW ME TO VIVARP },True,[Gay]
2584988,✨Phan✨ ✨Supernatural✨ ✨Sherlock✨ ✨(she/her)✨ ✨Lesbian✨,True,[Lesbian]


Unnamed: 0,parsed_blog_description,sexuality/gender,sexuality/gender_terms
2192510,"There's a bad man in everyone, no matter who we are. [ semi-indie oc blog ] [ wip ]",True,[man]
1667272,"This was going to be an art blog at one point but now it's just me yelling about how gay I am and how we should be nice to eachother (not that there's anything wrong with that). Not a legal adult so please don't hit on me, any pronouns will do, and I'd rather not have anything to do with terfs",True,"[gay, pronouns]"
3808982,18 // he/them // i really love my partner...and kitties. don't forget kitties.,True,"[he/, them]"
3565489,Disney lover and the lost princess • German • positivity only • Aladdin • beauty and the beast • makeup addict • winterkind • currently in recovery • aro ace,True,"[princess, ace]"
3201026,"Quien eres tu? -Ya no lo sé señor, he cambiado tantas veces que ya no lo sé. Alicia 💁",True,[he]
2945069,"homosexually hajime, the reality show",True,[homo]
1714849,boy I can't wait til i actually do stuff with my life,True,[boy]
3757246,"Hey~ Rey here! I am 18, complete fandom trash, and I am so behind on many of them so please don't spoil things for me! I am heading off to Italy soon, so if I seem dead sometimes, just know I have a horridly busy life outside of Tumblr...it's not a good busy life, but at least I am almost out.",True,[them]
2527869,"Male/Fl/Taurus/Single/Hispanic |Electronic Engineer, and future Aerospace Engineer| -memes, drugs, science, and porn.- when did things turn this way? I miss you on certain days S...ah",True,[Male]
3018463,Max | They/he moved to @jadecocoon2,True,"[They/, he]"


Unnamed: 0,parsed_blog_description,weight,weight_terms
3927602,"Hi! I'm Ashlee, my previous blog @suicideandpizza was terminated / Trigger Warning / I have 2 other blogs, my personal blog is @fuckyouthisusernameisnttaken and my supernatural fan blog @hellhoundsandangelsandpieohmy / She,Her / 18 / US / Bi / SW 110lbs / CW 101lbs / GW1 100lbs / UGW2 90lbs / UGW3 80lbs",True,"[SW, CW, GW1]"
3604542,grunge. thinspo. asks open. NOTE: I DO NOT SEND PICS SW: 118 lbs CW: 105.4 lbs GW 1: 100 lbs UGW: 90 lbs LW: 102 lbs,True,"[SW, lbs, CW, lbs, GW, lbs, lbs, LW, lbs]"
3663268,"Female. 19. 5'5"" HW 145 SW 127.6 lbs CW 118.8 GW1: 115 UGW: 105 I do not promote destructive behavior",True,"[HW, SW, lbs, CW, GW1]"
3806645,"CANDY SHE'S SWEET LIKE CANDY IN MY VEINS BABY, I'M DYING FOR ANOTHER TASTE AND EVERY NIGHT MY MIND IS RUNNING AROUND HER THUNDER IS GETTING LOUDER AND LOUDER BABY YOU'RE LIKE LIGHTNING IN A BOTTLE I CAN'T LET YOU GO NOW THAT I GOT IT ALL I NEED IS TO BE STRUCK BY YOUR ELECTRIC LOVE BABY, YOU'RE ELECTRIC LOVE, ELECTRIC LOVE DROWNING, YOU MAKE MY HEART BEAT LIKE THE RAIN SURROUND ME, HOLD ME DEEP BENEATH YOUR WEIGHT AND EVERY NIGHT MY MIND IS RUNNING AROUND HER THUNDER IS GETTING LOUDER AND LOUDER BABY YOU'RE LIKE LIGHTNING IN A BOTTLE I CAN'T LET YOU GO NOW THAT I GOT IT ALL I NEED IS TO BE STRUCK BY YOUR ELECTRIC LOVE BABY, YOU'RE ELECTRIC LOVE, ELECTRIC LOVE RUSHING THROUGH ME I FEEL YOUR ENERGY RUSHING THROUGH ME",True,[WEIGHT]
3629220,Trigger warning for eating disorders and mental illness in general.,True,[eating disorders]
2775979,5'4 SW:116.0 CW: 118.8 GW: 100.0,True,"[SW, CW, GW]"
2509540,18 NY•BOS SW: 140 CW: 129 GW: 120 UGW: 110,True,"[SW, CW, GW]"
1601211,"26 - UK - INFP-T - Lesbian - they/them - Survivor - Recovery Warrior. My asks are always on if you need me. Living with, Anorexia (in recovery), bipolar II, bpd traits, GAD and cPTSD.",True,[Anorexia]
2517103,"The Quick And Easy Way To Get Rip of Belly Fat In Only 30 days, Best Diet plan for women Over 50",True,[Fat]
3719011,"15 yo bisexual girl from London 🇬🇧diagnosed with anorexia in 2012 😞relapsing. Just want to gain some control and lose some weight, using this as my inspiration ✨if you don't like what you see just block! I'm not doing this for anyone apart from myself CW:95 GW1: 90 GW2:85 GW3: 80 GW4: 75 UGW: 70",True,"[anorexia, weight, CW, GW1, GW2, GW3]"


In [12]:
len(cats)

13

# Evaluate errors in comparison to hand-annotated

In [18]:
# Load data
# data = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_1000sample_train.pkl'))
data = pd.read_pickle(os.path.join(data_dirpath, 'blog_descriptions_1000sample_test.pkl'))

# Normalize to booleans
with open(os.path.join(data_dirpath, 'identity_categories.txt')) as f:
    cats = f.read().splitlines()
    
for col in cats:
    data[col] = [a==1 for a in data[col]]

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', -1)

cat = 'fandoms'

print('False positives:')
false_positives = data[(data[cat]==False) & (data[f'{cat}_pred']==True)]
display(data.loc[false_positives.index, ['tumblog_id', 'parsed_blog_description', f'{cat}_terms']])

# print('False negatives:')
# false_negatives = data[(data[cat]==True) & (data[f'{cat}_pred']==False)]
# display(data.loc[false_negatives.index, ['tumblog_id', 'parsed_blog_description', f'{cat}_terms']])

False positives:


ValueError: max() arg is an empty sequence

Unnamed: 0,tumblog_id,parsed_blog_description,fandoms_terms
408,337981527,Celeste Creator and CEO of Celestialbodiez. Former WWE Diva's champion. Currently still breaking hearts…and SOME faces. Multiship | Multiverse | NSFW Friendly | 20+ | Semi Selective,"[WWE, Multiship, Multiverse]"
453,240787984,shaladin shippers will be blocked,"[shaladin, shipper]"
289,334343712,Indie . Pri . Selective . Canon Divergant. Penned by Aku . EST 2/2/17,[Canon]
347,331412837,"A Forum Managed by UConn's First-Year Writing Course on Adaptation and the Canon, Spring 2017",[Canon]
228,311957840,"Let me tell you what I wish I'd known, when I was young and dreamed of glory. You have no control, who lives who died, who tells your story. And I know that we can win, I know that greatness lies in you. Just remember from here on in: History has it's eyes on you. Multimuse||Multifandom||Canon/OC Roleplay Blog Written by Eliza Be Warned there are Doppelgangers on this blog","[fandom, Canon]"
453,240787984,shaladin shippers will be blocked,"[shaladin, shipper]"
289,334343712,Indie . Pri . Selective . Canon Divergant. Penned by Aku . EST 2/2/17,[Canon]
347,331412837,"A Forum Managed by UConn's First-Year Writing Course on Adaptation and the Canon, Spring 2017",[Canon]
228,311957840,"Let me tell you what I wish I'd known, when I was young and dreamed of glory. You have no control, who lives who died, who tells your story. And I know that we can win, I know that greatness lies in you. Just remember from here on in: History has it's eyes on you. Multimuse||Multifandom||Canon/OC Roleplay Blog Written by Eliza Be Warned there are Doppelgangers on this blog","[fandom, Canon]"
898,266332716,Call me Jazz. She/Her. 17 // CA. A fan of memes and the duo from Ohio.,[fan]


# Split 1000 annotated into train/test

In [19]:
data = pd.read_csv(os.path.join(data_dirpath, 'blog_descriptions_1000sample.csv'))
print(len(data))

train, test = train_test_split(data, test_size=0.2, random_state=7)

train.to_pickle(os.path.join(data_dirpath, 'blog_descriptions_1000sample_train.pkl'))
test.to_pickle(os.path.join(data_dirpath, 'blog_descriptions_1000sample_test.pkl'))

1000
