# Assemble high-LGBTQ/low-LGBTQ fandom sets

In [7]:
fandom_sets = {}
fandom_sets['hi_lgbtq'] = [
    'homestuck',
    'startrek',
    'dragonage',
    'buffy',
#     'jojo',
#     'pokemon',
#     'danganronpa',
#     'glee',
#     'fire_emblem',
#     'hannibal',
    #############
#     'stargate',
]

fandom_sets['lo_lgbtq'] = [
#     'walking_dead',
#     'shadowhunter',
    'song_ice_fire',
    'teenwolf',
    'naruto',
    'tolkien',
#     'percy_jackson',
    'harrypotter',
#     'attack_on_titan',
    'dcu',
    ##############
#     'drwho',
#     'supernatural',
#     'once',
]

# Assemble all_metadata
import os
import pandas as pd
from tqdm.notebook import tqdm

all_metadata = pd.DataFrame()
for fset in fandom_sets:
    for fandom in tqdm(fandom_sets[fset]):
        tqdm.write(fandom)
        data_dirpath = f'/data/fanfiction_ao3/{fandom}/complete_en_1k-50k/'
        all_metadata_fpath = os.path.join(data_dirpath, 'metadata.csv')
        fandom_all_metadata = pd.read_csv(all_metadata_fpath, parse_dates=['published'])
        if fandom == 'dcu': # remove supergirl fics
             fandom_all_metadata = fandom_all_metadata[fandom_all_metadata['fandom'].map(lambda x: not 'Supergirl' in x)]
        fandom_all_metadata.rename(columns={'fandom': 'annotated_fandom'}, inplace=True)
        fandom_all_metadata['fandom'] = [fandom] * len(fandom_all_metadata)
        fandom_all_metadata['dataset'] = [fset] * len(fandom_all_metadata)
        all_metadata = pd.concat([all_metadata, fandom_all_metadata])
all_metadata.drop_duplicates('fic_id', inplace=True) # don't want to double-count fics who appear in multiple fandoms
# all_metadata.reset_index(drop=True, inplace=True)
all_metadata.set_index('fic_id', inplace=True)
# all_metadata

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

homestuck
startrek
dragonage
buffy



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

song_ice_fire
teenwolf
naruto
tolkien
harrypotter
dcu



# Restrict time periods, uniformly sample across fandoms

In [8]:
# Latest fics/fandom
import datetime
beg_date = datetime.datetime(2014,6,26)
event_date = datetime.datetime(2015,6,26)
end_date = datetime.datetime(2016,6,26)

metadata = all_metadata.loc[(all_metadata['published'] >= beg_date) & (all_metadata['published'] <= end_date)]

# Find minimum #fics to sample/fandom
fandom_info = metadata.groupby('fandom').agg({'fandom': 'size', 'dataset': lambda x: x.tolist()[0]})
fandom_info = fandom_info.rename(columns={'fandom': 'fic_count'})
print(fandom_info['fic_count'].min())

# Fic counts
import plotly.express as px
fig = px.bar(fandom_info, x=fandom_info.index, y='fic_count', color='dataset', title='Fic counts')
fig.show()

2645


In [9]:
# Uniform sample across fandoms
fandom_sample_size = 2645
metadata = metadata.groupby('fandom').apply(lambda s: s.sample(fandom_sample_size, random_state=9))
metadata.index = metadata.index.droplevel('fandom')
print(metadata.fandom.value_counts())
print(len(metadata))

# Fic counts over time by fandom
fig = px.histogram(metadata, x='published', title='Fic counts over time by fandom', color='fandom')
fig.show()

teenwolf         2645
dcu              2645
buffy            2645
tolkien          2645
dragonage        2645
harrypotter      2645
startrek         2645
naruto           2645
homestuck        2645
song_ice_fire    2645
Name: fandom, dtype: int64
26450


# Calculate logistic regression for tag use before/after an event
Dependent variable (y): whether an individual fic has a certain type of tag (such as trans)  
Independent variable (X): whether the fic was published before or after the event  
Covariates (X): 
* numeric date, 
* whether a fic is hi-LGBTQ or lo-LGBTQ fandom, 
* the specific fandom

In [10]:
# Check which facs have certain tags
import re

def matching_tag(pattern, tags):
    contains_tag = False
    matching_tag = None
    for tag in tags:
        if re.search(pattern, tag):
            contains_tag = True
            matching_tag = tag
            break
    return contains_tag, matching_tag

# Convert specific columns to lists
def to_list_type(col):
    # Would love to just do eval(x) but had some escaped quote marks that prevented that
    return col.str.replace('\\"', '"').str.strip('[]').str.split(', ').map(lambda x: [t[1:-1] for t in x])

metadata['tags'] = to_list_type(metadata['additional tags'])
metadata['character_tags'] = to_list_type(metadata['character'])
metadata['relationship_tags'] = to_list_type(metadata['relationship'])
metadata['category_tags'] = to_list_type(metadata['category'])
relationship_types = [
    'M/M',
    'F/F',
    'F/M',
    'Gen',
    'Multi',
    'Other',
]

# Search for tags
from tqdm.notebook import tqdm

patterns = {
    'trans': re.compile(r'\b(trans|transfemale|transmale|transman|transwoman|mtf|ftm|non-binary|nonbinary|genderqueer|enby|nb)\b', flags=re.IGNORECASE),
#     'trans_character': re.compile(r'\b(trans|transfemale|transmale|transman|transwoman|non-binary|nonbinary|genderqueer|enby|nb)\b.*character', flags=re.IGNORECASE),
#     'gay': re.compile(r'\bgay\b', flags=re.IGNORECASE),
#     'american': re.compile(r'\bamerican\b', flags=re.IGNORECASE),
#     'queerphobia': re.compile(r'\b((homo|trans|queer)phobia)\b', flags=re.IGNORECASE),
    'wedding-marriage': re.compile(r'\b(wedding|marriage|married)\b', flags=re.IGNORECASE),
}
# patterns['any_tag'] = re.compile(r'.+') # if don't have tags, only have an empty string ['']
patterns['gay_marriage'] = re.compile(r'\b(same-sex|gay) (marriage|wedding)\b|(marriage equality)', flags=re.IGNORECASE) # if don't have tags, only have an empty string ['']

for tag_label in patterns:
    print(tag_label)
    metadata[f'contains_{tag_label}'], metadata[tag_label] = list(zip(*[matching_tag(patterns[tag_label], tags) for tags in tqdm([sum(t, []) for t in zip(metadata['character_tags'], metadata['tags'])], total=len(metadata))]))
    print(metadata[tag_label].count())
    print()
    
# Calculate category probabilities
for rel_type in relationship_types:
    print(rel_type)
    metadata[f'contains_{rel_type}'] = [rel_type in cats for cats in tqdm(metadata['category_tags'], total=len(metadata))]
    print(metadata[f'contains_{rel_type}'].sum())
    print()

# Combinations of metadata like 'wedding-marriage' + relationship type
mm_ff = metadata['contains_M/M'] | metadata['contains_F/F']
metadata['contains_wedding-marriage_M/M-F/F'] = (metadata['contains_wedding-marriage'] & mm_ff) | metadata['contains_gay_marriage']
# print(metadata['contains_wedding-marriage'].sum())
# print(metadata['wedding-marriage_M/M-F/F'].sum())

metadata['contains_wedding-marriage_F/M'] = metadata['contains_wedding-marriage'] & metadata['contains_F/M']
# print(metadata['wedding-marriage_F/M'].sum())

selected_cols = [
    'wedding-marriage_M/M-F/F',
    'wedding-marriage_F/M',
]

labels = list(patterns.keys()) + relationship_types + selected_cols
labels.remove('gay_marriage')
labels

trans


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


238

wedding-marriage


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


610

gay_marriage


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


16

M/M


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


11243

F/F


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


2015

F/M


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


9361

Gen


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


4842

Multi


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


1152

Other


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=26450.0), HTML(value='')))


528



['trans',
 'wedding-marriage',
 'M/M',
 'F/F',
 'F/M',
 'Gen',
 'Multi',
 'Other',
 'wedding-marriage_M/M-F/F',
 'wedding-marriage_F/M']

In [11]:
metadata

Unnamed: 0_level_0,title,author,author_key,rating,category,annotated_fandom,relationship,character,additional tags,language,...,contains_gay_marriage,gay_marriage,contains_M/M,contains_F/F,contains_F/M,contains_Gen,contains_Multi,contains_Other,contains_wedding-marriage_M/M-F/F,contains_wedding-marriage_F/M
fic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4436162,Sand Castles,sabershadowkat,sabershadowkat,"[""General Audiences""]","[""F/M""]","[""Buffy the Vampire Slayer""]","[""Spike/Buffy Summers""]","[""Buffy Summers"", ""Spike""]",[],English,...,False,,False,False,True,False,False,False,False,False
4452977,Willow's Spell,dmarsh14,dmarsh14,"[""Explicit""]","[""F/F""]","[""Buffy the Vampire Slayer""]","[""Tara Maclay/Willow Rosenberg""]","[""Willow Rosenberg"", ""Tara Maclay""]","[""Belly Kink"", ""Inflation"", ""Weight Gain""]",English,...,False,,False,True,False,False,False,False,False,False
3763297,In The Dark,andacus,andacus,"[""Mature""]","[""F/M""]","[""Supernatural"", ""Buffy the Vampire Slayer""]","[""Faith Lehane/Dean Winchester""]","[""Dean Winchester"", ""Faith Lehane"", ""Sam Winch...","[""Crossover"", ""dean and faith are so disfuncti...",English,...,False,,False,False,True,False,False,False,False,False
5130194,An Uncanny Love Story,Ashley_Winchester_77,Ashley_Winchester_77,"[""Explicit""]","[""F/M""]","[""Buffy the Vampire Slayer"", ""Harry Potter RPF""]","[""Spike/Hermione Granger""]","[""Luna Lovegood"", ""Harry Potter"", ""Hermione Gr...","[""Smut"", ""Love"", ""Angst"", ""Drama""]",English,...,False,,False,False,True,False,False,False,False,False
4933939,SDPD,DonSample,DonSample,"[""Teen And Up Audiences""]",[],"[""Buffy the Vampire Slayer"", ""Angel: the Series""]",[],"[""Kate Lockley"", ""Detective Clark"", ""Detective...",[],English,...,False,,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4037311,Immo,MirandaTam,MirandaTam,"[""General Audiences""]","[""Gen""]","[""TOLKIEN J. R. R. - Works"", ""The Lord of the ...",[],"[""Arwen"", ""Elladan"", ""Elrohir"", ""Erestor"", ""Li...","[""Genderqueer Character"", ""Genderqueer"", ""Bige...",English,...,False,,False,False,False,True,False,False,False,False
5326919,A Dwarven Advent Calendar,PericulaLudus,PericulaLudus,"[""General Audiences""]","[""Gen""]","[""The Hobbit - All Media Types""]",[],"[""Thorin Oakenshield"", ""D\u00eds"", ""Frerin"", ""...","[""Christmas"", ""Advent Calendar"", ""Family Fluff...",English,...,False,,False,False,False,True,False,False,False,False
3759347,"Guardian, The",HASA_Archivist,HASA_Archivist,"[""General Audiences""]",[],"[""The Lord of the Rings - J. R. R. Tolkien""]",[],"[""Arwen"", ""Maglor""]","[""Fourth Age"", ""General"", ""Canon - Engaging ga...",English,...,False,,False,False,False,False,False,False,False,False
4542345,Disagreement,memorywolf,memorywolf,"[""General Audiences""]","[""M/M""]","[""The Hobbit - All Media Types"", ""The Lord of ...","[""Elrond/Lindir""]",[],"[""Anger"", ""Hurt/Comfort""]",English,...,False,,True,False,False,False,False,False,False,False


In [31]:
# Prepare dataset for logistic regression
metadata['after_event'] = metadata['published'].map(lambda x: x >= event_date).astype(int)
metadata['days_from_beg'] = metadata['published'].map(lambda x: x - beg_date).dt.days
metadata['fandom_cat'] = metadata['fandom'].astype('category').cat.codes
metadata['dataset_cat'] = metadata['dataset'].astype('category').cat.codes

tag = 'trans'
selected = metadata.loc[:, ['after_event', 'days_from_beg', 'fandom_cat', 'dataset_cat', f'contains_{tag}']]
selected

Unnamed: 0_level_0,after_event,days_from_beg,fandom_cat,dataset_cat,contains_trans
fic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4436162,1,396,0,0,False
4452977,1,398,0,0,False
3763297,0,295,0,0,False
5130194,1,494,0,0,False
4933939,1,465,0,0,False
...,...,...,...,...,...
4037311,0,338,9,1,True
5326919,1,523,9,1,False
3759347,0,294,9,1,False
4542345,1,410,9,1,False


In [33]:
print(sum(selected['contains_trans']))
print(len(selected))

238
26450


In [32]:
# Calculate regresions
import statsmodels.api as sm

logit = sm.Logit(selected['contains_trans'], selected[['after_event', 'days_from_beg', 'fandom_cat', 'dataset_cat']])
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.073899
         Iterations 9


0,1,2,3
Dep. Variable:,contains_trans,No. Observations:,26450.0
Model:,Logit,Df Residuals:,26446.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 28 Jan 2021",Pseudo R-squ.:,-0.4393
Time:,11:11:28,Log-Likelihood:,-1954.6
converged:,True,LL-Null:,-1358.1
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
after_event,2.7974,0.207,13.518,0.000,2.392,3.203
days_from_beg,-0.0124,0.000,-27.272,0.000,-0.013,-0.011
fandom_cat,-0.3641,0.018,-19.703,0.000,-0.400,-0.328
dataset_cat,-1.2046,0.116,-10.350,0.000,-1.433,-0.977
