# Get fic_ids to run the pipeline

In [1]:
# Load existing data (5+ paragraphs with at least one pairing together)

import pandas as pd

# Read the pickled relationship-prediction feature table.
features_pkl_path = '/usr2/mamille2/fanfiction-project/data/features/relationship_prediction.pkl'
data = pd.read_pickle(features_pkl_path)
len(data)  # bare mid-cell expression: evaluated, but not displayed by the notebook
print(data.columns)

# Distinct values of the 'pairing' column; shown as the cell's output.
pairings = data['pairing'].unique()
pairings

Index(['fic_id', 'pairing', 'pairing_embedding_embs',
       'pairing_embedding_unigrams', 'relationship', 'selected_relationships',
       'is_romantic'],
      dtype='object')


array([('draco', 'harry'), ('hermione', 'ron'), ('ginny', 'harry'),
       ('draco', 'hermione'), ('harry', 'hermione'), ('harry', 'ron')], dtype=object)

In [2]:
# Per-pairing flag compared against `is_romantic` below to derive `is_canon`
# (presumably: True where the pairing is a romantic couple in canon -- confirm).
canon_relationship_map = {
    ('draco', 'harry'): False,
    ('hermione', 'ron'): True,
    ('ginny', 'harry'): True,
    ('draco', 'hermione'): False,
    ('harry', 'hermione'): False,
    ('harry', 'ron'): False,
}

# Per-pairing flag copied into the `is_mm` column below
# (presumably: True for male/male pairings -- confirm).
relationship_type_map = {
    ('draco', 'harry'): True,
    ('hermione', 'ron'): False,
    ('ginny', 'harry'): False,
    ('draco', 'hermione'): False,
    ('harry', 'hermione'): False,
    ('harry', 'ron'): True,
}

# A row is canon exactly when its romantic flag agrees with the pairing's canon flag:
# both true, or both false.
data['is_canon'] = [
    bool(romantic_flag) == canon_relationship_map[pair]
    for romantic_flag, pair in zip(data['is_romantic'], data['pairing'])
]

# Direct dictionary lookup per row; an unknown pairing raises KeyError.
data['is_mm'] = data['pairing'].map(relationship_type_map.__getitem__)

In [3]:
# Write the table (now carrying `is_canon` and `is_mm`) back to disk.
# This is the same path the table was loaded from, so the file is overwritten in place.
relationship_pkl_path = '/usr2/mamille2/fanfiction-project/data/features/relationship_prediction.pkl'
data.to_pickle(relationship_pkl_path)

In [4]:
# Distinct fic ids in the table, in ascending order.
fic_ids = list(data['fic_id'].unique())
fic_ids.sort()
len(fic_ids)

44647

In [7]:
# Create dict of fic_id: chapter_fnames
# `os` and `data_dirpath` are set here so this cell runs on a fresh kernel
# (Restart & Run All) without relying on a later cell having been executed first.
import os

# Directory whose entries are listed below and grouped by fic id in the next step.
data_dirpath = '/usr2/scratch/fanfic/ao3_harrypotter_text/stories/'

# Note: os.listdir returns entries in arbitrary order.
fnames = os.listdir(data_dirpath)
len(fnames)

565584

In [8]:
from collections import defaultdict
# Imported here so this cell runs on a fresh kernel; previously `tqdm` was only
# imported by a later cell.
from tqdm import tqdm_notebook as tqdm

# Map fic_id -> list of file names belonging to that fic.
# The fic id is the integer prefix of the file name, before the first underscore;
# a file name whose prefix is not an integer raises ValueError.
fic_chapters = defaultdict(list)
for fname in tqdm(fnames):
    fic_id = int(fname.split('_')[0])
    fic_chapters[fic_id].append(fname)

len(fic_chapters)

HBox(children=(IntProgress(value=0, max=565584), HTML(value='')))

179407

In [10]:
# Copy those fic_ids to new directory for pipeline
import os
import re
from tqdm import tqdm_notebook as tqdm

data_dirpath = '/usr2/scratch/fanfic/ao3_harrypotter_text/stories/'
# data_dirpath = '/usr2/mamille2/fanfiction-project/data/ao3/harrypotter/fics_paras
out_dirpath = '/usr2/mamille2/fanfiction-project/data/ao3/harrypotter/emnlp_dataset'


def _chapter_sort_key(fname):
    """Natural-sort key: runs of digits in `fname` compare as integers, not text."""
    # re.split with a capturing group alternates text / digit-run tokens, so the
    # odd positions are always the digit runs.
    return [int(token) if idx % 2 else token
            for idx, token in enumerate(re.split(r'(\d+)', fname))]


def combine_chapters(chapter_paths, out_path):
    """Concatenate chapter files into one file, keeping only the first header line.

    Args:
        chapter_paths: paths of the chapter files, in the order they should appear.
            The first line of every file is treated as its header.
        out_path: path of the combined file to write (overwritten if it exists).
    """
    with open(out_path, 'w') as out:
        for chapter_idx, chapter_path in enumerate(chapter_paths):
            with open(chapter_path) as chapter_file:
                if chapter_idx > 0:
                    # Drop the repeated header of every chapter after the first.
                    next(chapter_file, None)
                for line in chapter_file:
                    # Every row must end with a newline, otherwise the last row of
                    # one chapter is glued to the first row of the next.
                    out.write(line if line.endswith('\n') else line + '\n')


os.makedirs(out_dirpath, exist_ok=True)

missing_fic_ids = []
for fic_id in tqdm(fic_ids):
    # os.listdir order is arbitrary, so order the chapter files deterministically.
    # NOTE(review): assumes the part of the file name after the fic id orders the
    # chapters -- confirm against the naming scheme of the chapter files.
    # `.get` avoids inserting empty entries into the defaultdict for unknown ids.
    chapter_fnames = sorted(fic_chapters.get(fic_id, []), key=_chapter_sort_key)
    if not chapter_fnames:
        # No chapter files for this fic: record it instead of failing with an
        # IndexError after an empty output file has already been created.
        missing_fic_ids.append(fic_id)
        continue

    # Combine into 1 fic text
    combine_chapters(
        [os.path.join(data_dirpath, fname) for fname in chapter_fnames],
        os.path.join(out_dirpath, f'{fic_id}.csv'),
    )

if missing_fic_ids:
    print(f'{len(missing_fic_ids)} fic_ids had no chapter files and were skipped')

HBox(children=(IntProgress(value=0, max=44647), HTML(value='')))

In [16]:
# Sample equal split among pairings
# Fixed seed so the shuffle, and therefore the sample saved from it, is reproducible.
SAMPLE_SEED = 42
ROWS_PER_PAIRING = 1000

# Shuffle all rows, then keep the first ROWS_PER_PAIRING rows of each pairing
# (a pairing with fewer rows contributes all of its rows).
sample = (
    data.sample(frac=1, random_state=SAMPLE_SEED)
    .groupby('pairing')
    .head(ROWS_PER_PAIRING)
)
len(sample)

6000

In [17]:
# Persist the per-pairing sample as its own pickle.
sample_pkl_path = '/usr2/mamille2/fanfiction-project/data/features/relationship_prediction_sample6k.pkl'
sample.to_pickle(sample_pkl_path)

In [18]:
# Distinct fic ids appearing in the sample, in ascending order.
sampled_fic_ids = list(sample['fic_id'].unique())
sampled_fic_ids.sort()
len(sampled_fic_ids)

5594

In [21]:
# Show the first ten sampled fic ids (the list was built with sorted(), so these are the smallest).
sampled_fic_ids[:10]

[119, 1260, 1261, 2674, 3896, 5223, 7350, 8151, 8154, 9773]

In [24]:
import shutil

# Source and destination directories are loop-invariant, so they are set once here.
# NOTE(review): the combine step in this notebook writes to `.../emnlp_dataset`,
# while this cell reads from `.../emnlp_dataset/fics` -- confirm the combined
# files were moved into `fics/` before running this cell.
from_dirpath = '/usr2/mamille2/fanfiction-project/data/ao3/harrypotter/emnlp_dataset/fics'
to_dirpath = '/usr2/mamille2/fanfiction-project/data/ao3/harrypotter/emnlp_dataset_6k/fics'

# shutil.copy does not create missing directories.
os.makedirs(to_dirpath, exist_ok=True)

# Copy each sampled fic's combined file into the 6k dataset directory.
for fic_id in tqdm(sampled_fic_ids):
    fic_fname = f'{fic_id}.csv'
    shutil.copy(os.path.join(from_dirpath, fic_fname), os.path.join(to_dirpath, fic_fname))

HBox(children=(IntProgress(value=0, max=5594), HTML(value='')))