# Run baseline

In [2]:
import pickle
import os

unigram_path_dir = '/usr0/home/prashang/DirectedStudy/ACL_preprocessing/Vectors/'

features_chapters = {}

# Load chapter features
for fold in ['train', 'dev']:
    with open(os.path.join(unigram_path_dir, f'oneHotFeatures_{fold}.txt'), 'rb') as f:
        features_chapters[fold] = pickle.load(f)
    print(len(features_chapters[fold]))

22677
2544


## Get tags to predict

In [7]:
import pandas as pd

metadata = pd.read_csv('/usr2/scratch/fanfic/ao3_harrypotter_text/stories.csv')
# metadata.fic_id.dtype
print(len(metadata))
print(metadata.columns)

fic_ids = {}

for fold in ['train', 'dev']:
    fic_ids[fold] = sorted(set([int(name.split('_')[0]) for name in list(features_chapters[fold].keys())]))
    print(len(fic_ids['train']))

179407
Index(['fic_id', 'title', 'author', 'author_key', 'rating', 'category',
       'fandom', 'relationship', 'character', 'additional tags', 'language',
       'published', 'status', 'status date', 'words', 'comments', 'kudos',
       'bookmarks', 'hits', 'chapter_count', 'series', 'seriespart',
       'seriesid', 'summary', 'preface_notes', 'afterword_notes'],
      dtype='object')
4771
4771


In [8]:
# Get tags for folds
tags = {}
metadata_split = {}

for fold in ['train', 'dev']:
    metadata_split[fold] = metadata.loc[metadata['fic_id'].isin(fic_ids[fold]), ['fic_id', 'additional tags']]

    tags[fold] = metadata_split[fold].set_index('fic_id').to_dict()['additional tags']
    print(len(tags[fold].keys()))

    tags[fold] = {key: [tag.lower() for tag in eval(val)] for key,val in tags[fold].items()}

4771
597


In [9]:
# Get top 100 tags

from collections import Counter

tag_ctr = Counter([tag for l in tags['train'].values() for tag in l])
tag_vocab = {}
tag_vocab[100] = [a for a,b in tag_ctr.most_common(100)]
tag_vocab[100]

['fluff',
 'angst',
 'alternate universe - canon divergence',
 'hurt/comfort',
 'slow burn',
 'romance',
 'anal sex',
 'alternate universe',
 'smut',
 'humor',
 'au',
 'fluff and angst',
 'alternate universe - modern setting',
 'established relationship',
 'hogwarts eighth year',
 'hogwarts',
 "marauders' era",
 'time travel',
 'hp: ewe',
 'anal fingering',
 'drarry',
 'happy ending',
 'friendship',
 'friends to lovers',
 'mpreg',
 'crossover',
 'first time',
 'first kiss',
 'pining',
 'oral sex',
 'rimming',
 'angst with a happy ending',
 'wolfstar',
 'alternate universe - hogwarts',
 'blow jobs',
 'other additional tags to be added',
 'kissing',
 'emotional hurt/comfort',
 'magic',
 'implied/referenced child abuse',
 'one shot',
 'quidditch',
 'child abuse',
 'alternate universe - harry potter setting',
 'dumbledore bashing',
 'slash',
 'female harry potter',
 'plot what plot/porn without plot',
 'mutual pining',
 'post-hogwarts',
 'dirty talk',
 'alternate universe - soulmates',
 'p

In [15]:
# How many fics don't have top 100 tags?
notags = [l for l in tags['train'].values() if not any([t in l for t in tag_vocab[100]])]
print(len(notags))
print(len(tags['train']))
print(len(notags)/len(tags['train']))

1100
4771
0.23055963110459024


In [11]:
from tqdm import tqdm_notebook as tqdm
from IPython.core.debugger import set_trace

features = {'train': {}, 'dev': {}, 'test': {}}

# Concatenate chapter features into fics
for fold in ['train', 'dev']:
    for fic_id in tqdm(fic_ids[fold]): # would be faster to iterate through feature_chapters instead
        chapter_names = sorted([ch for ch in features_chapters[fold].keys() if str(fic_id)==ch.split('_')[0]])
        fic_features = [0] * len(features_chapters[fold][chapter_names[0]])
        for chapter_name in chapter_names:
            chapter_features = features_chapters[fold][chapter_name]
            fic_features = [a or b for a,b in zip(fic_features,chapter_features)]

        features[fold][fic_id] = fic_features

HBox(children=(IntProgress(value=0, max=4771), HTML(value='')))




HBox(children=(IntProgress(value=0, max=597), HTML(value='')))




In [34]:
tags['train'][fic_ids['train'][0]]

['quidditch',
 'pro quidditch',
 'ginny weasley plays pro quidditch',
 'chronic injury',
 'chudley cannons',
 "let's all just cross our fingers and hope for the best",
 'au',
 'au where they never dated']

In [12]:
tag_indicator = {'train': {}, 'dev': {}, 'test': {}}

for fold in ['train', 'dev']:
    tag_indicator[fold]  = {fic_id: [1 if tag in tags[fold][fic_id] else 0 for tag in tag_vocab[100]] for fic_id in fic_ids[fold]}
    print(any(tag_indicator[fold][fic_ids[fold][0]]))

True
True


In [13]:
# Assemble input features, output
import numpy as np

X = {}
y = {}

for fold in ['train', 'dev']:
    X[fold] = np.array([features[fold][fic_id] for fic_id in fic_ids[fold]])
    print(X[fold].shape)

    y[fold] = np.array([tag_indicator[fold][fic_id] for fic_id in fic_ids[fold]])
    print(y[fold].shape)

(4771, 10000)
(4771, 100)
(597, 10000)
(597, 100)


## Train classifier, predict

In [37]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

clf = OneVsRestClassifier(SVC(kernel='linear', verbose=2))
# clf = OneVsRestClassifier(LogisticRegression(n_jobs=10, verbose=2))
clf.fit(X['train'], y['train'])

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=2),
          n_jobs=1)

In [38]:
preds = clf.predict(X['dev'])
print(preds.shape)

(597, 100)


In [31]:
# make sure are multilabel
for i in range(len(preds)):
    pos_preds = preds[i].tolist().count(1)
    if pos_preds > 1:
        print(i)

208
272
277
324
346
399
472
505
509
535
545
557
573


In [39]:
# Evaluate
from sklearn.metrics import precision_score, recall_score, f1_score
# print(precision_score(y['dev'], preds, average='weighted'))
# print(recall_score(y['dev'], preds, average='weighted'))
# print(f1_score(y['dev'], preds, average='weighted'))
print(precision_score(y['dev'], preds, average='macro'))
print(recall_score(y['dev'], preds, average='macro'))
print(f1_score(y['dev'], preds, average='macro'))

0.139729924242
0.0249751083879
0.0392986177847


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
