# Check AO3 tags for "character descriptor tags"

## Stats on additional tags

In [1]:
# Load metadata
import pandas as pd

fandoms = [
    'allmarvel',
'supernatural',
'harrypotter',
'dcu',
'sherlock',
'teenwolf',
'starwars',
'drwho',
'tolkien',
'dragonage',
]

metadata = {}

for f in fandoms:
    metadata_fpath = f'/data/fanfiction_ao3/{f}/complete_en_1k-50k/metadata.csv'
    metadata[f] = pd.read_csv(metadata_fpath)

n_fics_with_tags = 0
total_n_fics = 0
for fandom in fandoms:
    n_fics_with_tags += metadata[fandom]['additional tags'].map(lambda x: len(x) > 2).sum()
    total_n_fics += len(metadata[fandom])
print(f'{n_fics_with_tags/total_n_fics: .1%} ({n_fics_with_tags} / {total_n_fics})')

## Tags with character names

In [34]:
# Most popular terms in tags overall
import re

all_fic_tags = []
for fandom in fandoms:
    all_fic_tags += metadata[fandom]['additional tags'].map(lambda x: eval(x)).tolist()
tags = [re.split(r' |!', tag.lower()) for fic_tags in all_fic_tags for tag in fic_tags]
tag_terms = [term for tag in tags for term in tag]
tag_term_ctr = Counter(tag_terms)
tag_term_ctr.most_common(100)

[('dean', 58462),
 ('-', 58024),
 ('fluff', 39884),
 ('alternate', 39428),
 ('sex', 39377),
 ('universe', 38153),
 ('angst', 34580),
 ('castiel', 33090),
 ('of', 32518),
 ('winchester', 32304),
 ('and', 30487),
 ('a', 28615),
 ('sam', 27608),
 ('the', 26748),
 ('freeform', 24977),
 ('is', 23835),
 ('first', 17066),
 ('to', 15477),
 ('smut', 15227),
 ('hurt/comfort', 14998),
 ('in', 14931),
 ('harry', 14543),
 ('character', 14388),
 ('romance', 14271),
 ('sexual', 13007),
 ('canon', 12893),
 ('love', 12885),
 ('anal', 12817),
 ('i', 12630),
 ('plot', 11831),
 ('relationship', 11411),
 ('au', 11108),
 ('bottom', 10913),
 ('humor', 10360),
 ('not', 10188),
 ('death', 9961),
 ('with', 9896),
 ('time', 9643),
 ('fic', 9610),
 ('potter', 9233),
 ('cas', 9123),
 ('top', 9074),
 ('content', 8809),
 ('ending', 8313),
 ('established', 8054),
 ('kink', 8016),
 ('pov', 7997),
 ('explicit', 7573),
 ('this', 7550),
 ('one', 7517),
 ('hurt', 7419),
 ('&', 7381),
 ('happy', 7316),
 ('what', 7268),
 ('

In [18]:
# How many fics have at least one character tag?
n_fics_char_tags = {}
n_fics = {}
for fandom in fandoms:
    n_fics_char_tags[fandom] = metadata[fandom]['additional tags'].map(lambda x: any([tag for tag in eval(x) if any([part in tag.split() for part in name_parts[fandom]])])).sum()
    n_fics[fandom] = len(metadata[fandom])
total_n_fics_char_tags = sum(n_fics_char_tags.values())
total_n_fics = sum(n_fics.values())
print(total_n_fics_char_tags)
print(total_n_fics)
print(total_n_fics_char_tags/total_n_fics)

20657
186952
0.11049360263597073


In [38]:
# Load character names, get character tags
from collections import Counter

fandoms = ['harrypotter',
           'supernatural']
chars = {}
name_parts = {}
char_tags = {}
for fandom in fandoms:
    char_fpath = f'/data/fanfiction_ao3/{fandom}/canonical_characters.txt'
    with open(char_fpath) as f:
        chars[fandom] = f.read().splitlines()
    canonical_character_name_parts = set([part for name in chars[fandom] for part in name.split()])
    exclude = set(['The'])
    canonical_character_name_parts -= exclude
    name_parts[fandom] = set([c.lower().replace('"', '') for c in canonical_character_name_parts if len(c) > 1])

    # Filter to tags that mention characters
    char_tags[fandom] = [tag for tag in tags if any([part in tag for part in name_parts[fandom]])]
    print(len(char_tags[fandom]))

185829
162814


In [39]:
# Most popular terms with char tags
import nltk
stopwords = nltk.corpus.stopwords.words('english')

all_char_tags = sum([char_tags[fandom] for fandom in fandoms], [])
all_name_parts = set().union(*name_parts.values())
char_tag_terms = [term for tag in all_char_tags for term in tag if not term in stopwords and not term in all_name_parts]
char_tags_terms_ctr = Counter(char_tag_terms)
char_tags_terms_ctr.most_common(50)

[('bottom', 15116),
 ('top', 12183),
 ('-', 10713),
 ('death', 8145),
 ('freeform', 7282),
 ('protective', 7117),
 ('pov', 5957),
 ('alpha', 4668),
 ('omega', 3950),
 ('mentions', 3564),
 ('human', 3511),
 ('castiel/dean', 3422),
 ('cas', 3299),
 ('book', 3153),
 ('minor', 3143),
 ('bunker', 3120),
 ('bisexual', 2769),
 ('sub', 2768),
 ("winchester's", 2695),
 ('dom', 2621),
 ('year', 2535),
 ('female', 2532),
 ('good', 2424),
 ('episode:', 2292),
 ('universe', 2241),
 ('alternate', 2229),
 ('parenting', 2193),
 ('little', 2131),
 ('sex', 2104),
 ('a+', 2064),
 ('&', 2002),
 ('(supernatural)', 1999),
 ('angel', 1997),
 ('love', 1930),
 ('dark', 1901),
 ('loves', 1900),
 ('jealous', 1889),
 ('brother', 1815),
 ('past', 1782),
 ('established', 1762),
 ('big', 1734),
 ('sick', 1703),
 ('first', 1657),
 ('lives', 1614),
 ('feelings', 1604),
 ('young', 1599),
 ('au', 1598),
 ('next', 1592),
 ('auror', 1588),
 ('friendship', 1561)]

In [32]:
# Look for specifically ! character tags
tags_exclamation = [term for term in tag_terms if '!' in term]
tags_exclamation_ctr = Counter(tags_exclamation)
tags_exclamation_ctr.most_common(30)

[('bottom!dean', 718),
 ('top!dean', 428),
 ('demon!dean', 385),
 ('human!cas', 365),
 ('top!cas', 339),
 ('bottom!cas', 263),
 ('sub!dean', 254),
 ('bottom!sam', 233),
 ('top!sam', 212),
 ('dom!cas', 169),
 ('hurt!dean', 165),
 ('wing!kink', 146),
 ('alpha!dean', 144),
 ('dom!dean', 138),
 ('omega!dean', 129),
 ('human!castiel', 127),
 ('end!verse', 127),
 ('hurt!sam', 120),
 ('dom!sam', 116),
 ('fallen!cas', 107),
 ('mechanic!dean', 106),
 ('bottom!castiel', 99),
 ('top!castiel', 98),
 ('jealous!dean', 97),
 ('alpha!sam', 89),
 ('daddy!dean', 87),
 ('wee!chesters', 86),
 ('protective!dean', 85),
 ('girl!dean', 80),
 ('shipper!sam', 79)]