In [1]:
import ast
import os

import pandas as pd

# Cluster based on additional tags, see top divergences

In [43]:
# Read the training-split metadata table; the first CSV column becomes the row index.
metadata_fpath = '/usr0/home/mamille2/erebor/fanfiction-project/data/ao3/harrypotter/au_dataset/metadata_train.csv'
metadata = pd.read_csv(metadata_fpath, index_col=0)

# Row count and index name, one per line, followed by the column listing as the cell's value.
print(len(metadata), metadata.index.name, sep='\n')
metadata.columns

12570
fic_id


Index(['title', 'author', 'author_key', 'rating', 'category', 'fandom',
       'relationship', 'character', 'additional tags', 'language', 'published',
       'status', 'status date', 'words', 'comments', 'kudos', 'bookmarks',
       'hits', 'chapter_count', 'series', 'seriespart', 'seriesid', 'summary',
       'preface_notes', 'afterword_notes', 'selected_tags'],
      dtype='object')

In [45]:
from collections import Counter

# Get vectors for tags

# Each 'additional tags' cell is parsed as a Python list literal of tag strings.
# ast.literal_eval only accepts literals, so (unlike eval) text read from the
# CSV can never be executed as code.
all_tags = [
    tag.lower()
    for raw_tags in metadata['additional tags']
    for tag in ast.literal_eval(raw_tags)
]
tag_ctr = Counter(all_tags)
print(len(all_tags))

# Keep only tags that occur at least `threshold` times, most frequent first.
threshold = 5
top_tag_ctr = [(tag, count) for (tag, count) in tag_ctr.most_common() if count >= threshold]
print(len(top_tag_ctr))
top_tags = [tag for (tag, _) in top_tag_ctr]

97159
2344


In [46]:
from tqdm import tqdm_notebook as tqdm

# Parse every fic's tag list exactly once. Parsing inside the loop below would
# repeat the same work for each entry of top_tag_ctr.
# ast.literal_eval only accepts literals, so CSV text is never executed as code.
fic_tag_sets = metadata['additional tags'].map(
    lambda raw_tags: {t.lower() for t in ast.literal_eval(raw_tags)}
)

# Add one boolean column per frequent tag: True where the fic carries that tag.
# NOTE(review): a tag whose text equals an existing metadata column name would
# overwrite that column -- confirm none collide.
for tag, count in tqdm(top_tag_ctr):
    metadata[tag] = fic_tag_sets.map(lambda tags: tag in tags)

HBox(children=(IntProgress(value=0, max=2344), HTML(value='')))




In [48]:
import numpy as np
# tag_vectors = metadata.set_index('fic_id').loc[:, top_tags].to_dict()
# tag_vectors = metadata.set_index('fic_id').loc[:, top_tags].to_dict(orient='index')
# Dense fic-by-tag matrix built from the per-tag columns, in top_tags order.
tag_vectors = np.array(metadata[top_tags].values)
print(tag_vectors.shape)
# Peek at the first two rows.
tag_vectors[:2]

(12570, 2344)


array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

## Run PCA

In [49]:
from sklearn.decomposition import PCA
# With svd_solver='full', a fractional n_components asks scikit-learn to keep
# as many components as needed to exceed that share of the variance (here 70%).
pca = PCA(svd_solver='full', n_components=0.7)

In [50]:
# Fit the PCA on the tag matrix and project every fic onto the kept components.
reduced = pca.fit_transform(tag_vectors)

# Variance of the first ten components, then the number of components kept.
print(pca.explained_variance_[:10], pca.n_components_, sep='\n')

[0.18700258 0.12319356 0.11022768 0.09911675 0.09382891 0.06295342
 0.05988378 0.05430083 0.04819833 0.04617006]
368


In [91]:
# Look at factor loadings on tags
def feats_for_factors(feature_names, pca, n_factors=20, n_feats=40):
    """Return the highest-loading feature names for the leading PCA factors.

    Args:
        feature_names: Sequence of names, one per column of ``pca.components_``.
        pca: Object exposing a 2-D ``components_`` array laid out as
            (components, features), e.g. a fitted ``sklearn.decomposition.PCA``.
        n_factors: Number of leading components to report.
        n_feats: Number of features to report per component. Values larger
            than the number of features return all of them; ``n_feats <= 0``
            returns zero columns.

    Returns:
        2-D array with one row per reported component. Row ``i`` lists the
        feature names of component ``i`` ordered from the largest signed
        loading downwards.
    """
    names = np.asarray(feature_names)
    # Ascending argsort along the feature axis: the largest loadings come last.
    order = np.argsort(pca.components_, axis=1)[:n_factors]
    # Compute the slice start explicitly. ``order[:, -n_feats:]`` would return
    # every column when n_feats == 0, because ``-0`` is just ``0``.
    start = max(order.shape[1] - n_feats, 0)
    top = order[:, start:][:, ::-1]
    # Integer-array indexing instead of np.vectorize, which raises on size-0 input.
    return names[top]

In [92]:
# Top 5 tags for each reported factor. Compute once and print the stored
# result instead of calling feats_for_factors a second time with the same arguments.
factor_feats = feats_for_factors(top_tags, pca, n_feats=5)
print(factor_feats)

[['alternate universe' 'romance' 'angst' 'explicit language'
  'sexual content']
 ['fluff' 'romance' 'angst' 'humor' 'established relationship']
 ['angst' 'alternate universe - canon divergence' 'hurt/comfort'
  'tragedy' 'drama']
 ['alternate universe' 'fluff' 'angst'
  'alternate universe - canon divergence' 'fluff and angst']
 ['alternate universe - canon divergence' 'romance' 'humor' 'fluff'
  'established relationship']
 ['sexual content' 'explicit language' 'plot what plot/porn without plot'
  'anal sex' 'fluff']
 ['humor' 'parody' 'angst' 'crack' 'explicit language']
 ['anal sex' 'established relationship' 'oral sex' 'smut'
  'plot what plot/porn without plot']
 ['au' 'crossover' 'one shot' 'anal sex'
  'plot what plot/porn without plot']
 ['hurt/comfort' 'friendship' 'established relationship' 'au' 'anal sex']
 ['alternate universe - modern setting' 'au'
  'alternate universe - non-magical' 'hurt/comfort'
  'alternate universe - muggle']
 ['friendship' 'drama' 'one shot' 'alter

In [93]:
from IPython.core.debugger import set_trace

# Look at factor loadings on examples
def examples_for_factors(data, reduced, n_factors=20, n_examples=20):
    """Return the index labels of the top-scoring rows for the leading factors.

    Args:
        data: Object with an ``index`` (e.g. a pandas DataFrame) whose entries
            line up positionally with the rows of ``reduced``.
        reduced: 2-D array of scores laid out as (rows, factors).
        n_factors: Number of leading factors (columns of ``reduced``) to report.
        n_examples: Number of rows to report per factor. Values larger than
            the number of rows return all of them; ``n_examples <= 0`` returns
            zero columns.

    Returns:
        2-D array with one row per reported factor (n_factors x n_examples
        when enough data exists). Row ``i`` lists the index labels of the
        rows scoring highest on factor ``i``, best first.
    """
    labels = np.asarray(data.index)
    # Ascending argsort down each column: the highest-scoring rows come last.
    order = np.argsort(reduced, axis=0)[:, :n_factors]
    # Compute the slice start explicitly. ``order[-n_examples:]`` would return
    # every row when n_examples == 0, because ``-0`` is just ``0``.
    start = max(order.shape[0] - n_examples, 0)
    top = order[start:][::-1].T
    # Integer-array indexing instead of np.vectorize, which raises on size-0 input.
    return labels[top]

In [94]:
# Index labels of the 10 highest-scoring fics on each of the first 10 factors.
top_factor_examples = examples_for_factors(
    metadata,
    reduced,
    n_examples=10,
    n_factors=10,
)
top_factor_examples.shape

(10, 10)

## Load fic divergence distances

In [95]:
from IPython.core.debugger import set_trace

# Directory holding one tab-separated "<fic_id>.txt.vocab.dists" file per fic.
dist_dirpath = '/usr0/home/jfiacco/Research/fanfic/language_model/vocab_dists/train/'

# One {word: distance} dict per factor, in the same order as top_factor_examples.
factor_word_dists = []
for factor_examples in top_factor_examples:
    word_dists = {}
    for fic_id in factor_examples:
        dist_fpath = os.path.join(dist_dirpath, f"{fic_id}.txt.vocab.dists")
        dist = pd.read_csv(
            dist_fpath,
            sep='\t',
            header=None,
            index_col=0,
            names=['word', 'distance', 'frequency'],
        )
        # Keep only rows whose 'frequency' value is at least 3.
        is_frequent = dist['frequency'] >= 3
        dist = dist.loc[is_frequent]
        # NOTE(review): update() lets a later fic overwrite an earlier fic's
        # distance for the same word -- confirm last-one-wins is intended.
        word_dists.update(dist['distance'].to_dict())
    factor_word_dists.append(word_dists)

len(factor_word_dists)

10

In [96]:
from pprint import pprint

# For every factor: its top tags, its example fic ids, those fics' raw tag
# strings, and the ten words with the largest distance values.
for i, factor_word_dist in enumerate(factor_word_dists):
    example_ids = top_factor_examples[i]
    print(factor_feats[i])
    print(example_ids)
    pprint([metadata.loc[fic_id, 'additional tags'] for fic_id in example_ids])
    sorted_dist = sorted(factor_word_dist.items(), key=lambda x: x[1], reverse=True)
    pprint(sorted_dist[:10])
    print()

['alternate universe' 'romance' 'angst' 'explicit language'
 'sexual content']
[10095980  9892973 10026635 10095860 10050935 10026620 10162880  9894131
 10180253 10063406]
["['Explicit Language', 'Chan', 'Heterosexual Sex', 'Slash sex', 'Mpreg', 'Out "
 "of Character', 'Self-Harm', 'Sexual Content', 'Spoilers', 'Threesome', "
 "'Action/Adventure', 'Alternate Universe', 'Angst', 'Tragedy', 'Crossover', "
 "'Drama', 'First Time', 'Horror', 'Supernatural - Freeform', 'Hurt/Comfort', "
 "'Mystery', 'Suspense', 'Pre-Slash', 'Romance']",
 "['Explicit Language', 'BDSM', 'Slash sex', 'Drug Use', 'Mpreg', 'Out of "
 "Character', 'Self-Harm', 'Sexual Content', 'Spoilers', 'Threesome', "
 "'Voyeurism', 'Alternate Universe', 'Angst', 'Tragedy', 'Bonding', 'Drama', "
 "'First Time', 'Horror', 'Supernatural - Freeform', 'Hurt/Comfort', "
 "'Pre-Slash', 'Romance']",
 "['Slash', 'Explicit Language', 'Slash sex', 'Incest', 'Mpreg', 'Out of "
 "Character', 'Sexual Content', 'Action/Adventure', 'Alternat