In [1]:
!pip install transformers
!pip install datasets

!pip install -U PyYAML

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 63.1 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 48.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [2]:
import pandas as pd
import yaml
import pathlib
import os

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


We want to get the fixed train/test split along with the success and genre labels of its books. We then want to keep track of the success distribution, as well as the genre distribution within each success label, in the test split.

                          success                     failure
                      /  /      .\. \             /  /      .\. \ 
                     /   |      .|.  \           /   |      .|.  \
                  crime drama   ... comedy     crime drama  ... comedy

We then want to split the training data into a train split and a val split by randomly moving some percentage of the books from the train split into the val split, while keeping the distribution of success and genre labels the same as that of the test split.

In [None]:
# YAML file holding the fixed train/test split; its 'train' and 'test' entries (book file names) are read further down.
yamlpath = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/train_test_split_goodreads.yaml'
class GoodreadsSplit:
  """Column-wise accumulator for the books of one data split."""

  def __init__(self):
    # Three parallel lists: position i in each list describes the same book.
    self.book_ids, self.success, self.genres = [], [], []

  def append_to_lists(self, bid, s, g):
    """Record one book: `bid` is its id, `s` its success label, `g` its genre."""
    for column, value in ((self.book_ids, bid), (self.success, s), (self.genres, g)):
      column.append(value)

  def to_df(self):
    """Return the recorded books as a DataFrame with columns book_ids, success, genre."""
    columns = {}
    columns['book_ids'] = self.book_ids
    columns['success'] = self.success
    columns['genre'] = self.genres
    return pd.DataFrame(columns)

In [None]:
# One empty accumulator per split listed in the YAML file.
train_split, test_split = GoodreadsSplit(), GoodreadsSplit()

In [None]:
# Root directory of the raw-text books. get_labels_from_doc searches it recursively and reads the
# success/failure label and the genre from the two directory levels above each book file.
dataset_base = pathlib.Path('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text')

In [None]:
def get_labels_from_doc(doc, base=None):
  """Locate a book file under the dataset root and derive its labels from its location.

  The file is expected to sit at ``<root>/.../<genre>/<success|failure>/<doc>``.

  Args:
    doc: File name of the book; it is passed to ``Path.rglob`` as the pattern.
    base: Directory to search. Defaults to the notebook-level ``dataset_base``.

  Returns:
    A tuple ``(classStr, genreStr)``: the name of the directory containing the
    file, and the name of the directory one level above that.

  Raises:
    IndexError: If no file matching ``doc`` exists under the search root.
  """
  root = dataset_base if base is None else pathlib.Path(base)
  # Sorting makes the choice reproducible when a book is stored under several
  # genres; otherwise the winner depends on filesystem listing order.
  matches = sorted(root.rglob(doc))
  if not matches:
    # Same exception type the bare ``[0]`` lookup used to raise, but with a usable message.
    raise IndexError(f'{doc!r} not found under {root}')
  label_dir = matches[0].parent
  # Path components instead of splitting on '/' keeps this independent of the path separator.
  return label_dir.name, label_dir.parent.name

# Read the fixed train/test split and label every listed book from its location on disk.
# NOTE(review): yaml.full_load can construct more than plain data; if this file only holds
# lists of file names, yaml.safe_load would be the safer loader — confirm before switching.
with open(yamlpath) as file:
  documents = yaml.full_load(file)
  for split_name, split in (('train', train_split), ('test', test_split)):
    for doc in documents[split_name]:
      split.append_to_lists(doc, *get_labels_from_doc(doc))

In [None]:
# Number of books per split as listed in the YAML file.
print(f'train: {len(train_split.book_ids)} \ntest: {len(test_split.book_ids)}')

train: 694 
test: 290


In [None]:
# Total number of books listed in the YAML split, computed from the data
# rather than retyped from the counts printed above.
len(train_split.book_ids) + len(test_split.book_ids)

984

In [None]:
from glob import glob

# Every .txt file in the dataset tree whose immediate parent folder is a success/failure folder.
result = [
    txt_path
    for dirpath, _subdirs, _files in os.walk(dataset_base)
    for txt_path in glob(os.path.join(dirpath, '*.txt'))
    if pathlib.PurePath(txt_path).parent.name in ['success', 'failure']
]

In [None]:
len(result)

1003

The reason for the discrepancy between the number of books in the YAML file (984) and the number of book files found in the folders (1003) is that some books are stored twice because they belong to multiple genres.
For simplicity's sake, and because this only affects 19 books, we ignore the fact that a book can have two genres (the labelling code simply uses the first copy it finds).

In [None]:
# Materialise both accumulators as DataFrames (columns: book_ids, success, genre).
train_df, test_df = train_split.to_df(), test_split.to_df()

In [None]:
test_df

Unnamed: 0,book_ids,success,genre
0,13724_the+frontiersmen.txt,success,Short_stories
1,4765_loyalties.txt,failure,Drama
2,23588_a+filbert+is+a+nut.txt,failure,Short_stories
3,18057_flower+of+the+dusk.txt,success,Love_stories
4,109_renascence+and+other+poems.txt,success,Poetry
...,...,...,...
285,24873_lucy+maud+montgomery+short+stories+1896+...,success,Short_stories
286,27391_the+mouse+and+the+christmas+cake.txt,failure,Poetry
287,3612_john+bull's+other+island.txt,success,Drama
288,28164_the+big+bow+mystery.txt,failure,Detective_and_mystery_stories


In [None]:
success_labels = ['failure', 'success']
genres = ['Detective_and_mystery_stories', 'Drama', 'Fiction', 'Historical_fiction', 'Love_stories', 'Poetry', 'Science_fiction', 'Short_stories']

# Share of the whole test split that falls into each (success label, genre) cell.
# The denominator comes from the data instead of a hard-coded 290, so the shares
# stay correct if the test split ever changes.
num_test_books = len(test_df)
if num_test_books == 0:
  raise ValueError('test split is empty; cannot compute its label distribution')

test_distros = {'failure': {}, 'success': {}}
for s_label in success_labels:
  for genre in genres:
    cnt = test_df[(test_df.success == s_label) & (test_df.genre == genre)].shape[0]
    test_distros[s_label][genre] = cnt / num_test_books

In [None]:
# for s_label in test_distros:
#   for genre in test_distros[s_label]:
#     test_distros[s_label][genre] /= 290

In [None]:
test_distros

{'failure': {'Detective_and_mystery_stories': 0.07241379310344828,
  'Drama': 0.02413793103448276,
  'Fiction': 0.034482758620689655,
  'Historical_fiction': 0.017241379310344827,
  'Love_stories': 0.013793103448275862,
  'Poetry': 0.03103448275862069,
  'Science_fiction': 0.05517241379310345,
  'Short_stories': 0.10689655172413794},
 'success': {'Detective_and_mystery_stories': 0.03793103448275862,
  'Drama': 0.07931034482758621,
  'Fiction': 0.07931034482758621,
  'Historical_fiction': 0.05862068965517241,
  'Love_stories': 0.05517241379310345,
  'Poetry': 0.15172413793103448,
  'Science_fiction': 0.034482758620689655,
  'Short_stories': 0.1482758620689655}}

In [None]:
# Target number of validation books per (success label, genre) cell; filled in further down.
val_distros = {'failure': {}, 'success': {}}

In [None]:
# Share of the original training books to move into the validation split.
VAL_FRACTION = 0.2
# Despite its name, this is the target size of the *validation* split
# (the name is kept because a later cell reads it).
total_test_samples = train_df.shape[0] * VAL_FRACTION

In [None]:
# Scale each test-split share to a whole number of validation books.
for s_label, genre_shares in test_distros.items():
  val_distros[s_label].update(
      (genre, round(total_test_samples * share))
      for genre, share in genre_shares.items()
  )

In [None]:
val_distros

{'failure': {'Detective_and_mystery_stories': 10,
  'Drama': 3,
  'Fiction': 5,
  'Historical_fiction': 2,
  'Love_stories': 2,
  'Poetry': 4,
  'Science_fiction': 8,
  'Short_stories': 15},
 'success': {'Detective_and_mystery_stories': 5,
  'Drama': 11,
  'Fiction': 11,
  'Historical_fiction': 8,
  'Love_stories': 8,
  'Poetry': 21,
  'Science_fiction': 5,
  'Short_stories': 21}}

In [None]:
val_idx = pd.Int64Index([])

In [None]:
val_idx

Int64Index([], dtype='int64')

In [None]:
# Draw the validation books cell by cell so that the validation split follows the
# per-(success label, genre) targets in val_distros.
RANDOM_SEED = 42  # fixed so the split is reproducible across kernel restarts
for s_label in success_labels:
  for genre in genres:
    num_samples = val_distros[s_label][genre]
    candidates = train_df[(train_df.success == s_label) & (train_df.genre == genre)]
    # sample() raises ValueError if a cell holds fewer than num_samples books.
    picked = candidates.sample(num_samples, random_state=RANDOM_SEED).index
    val_idx = val_idx.union(picked)

In [None]:
# Rows selected for validation; taken while they are still part of train_df.
val_df = train_df.loc[val_idx]

In [None]:
# Remove the validation rows from the training frame. This rebinds train_df, so re-running
# the sampling cell after this one would draw from the already reduced frame.
train_df = train_df.drop(val_idx)

In [None]:
# Book file names per split, in the order train / val / test.
listdoc = {
    split_name: frame['book_ids'].tolist()
    for split_name, frame in (('train', train_df), ('val', val_df), ('test', test_df))
}

In [None]:
# Persist the three lists of book file names as a single YAML document.
split_yaml_path = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_preprocessed/trimmed_and_nered/train_test_val_80_20_split_goodreads.yaml'
with open(split_yaml_path, 'w') as out_file:
    yaml.dump(listdoc, out_file)

# Unused Code

In [None]:
# All book file names listed in the YAML split (train first, then test).
yaml_books = [*train_split.book_ids, *test_split.book_ids]

In [None]:
# Convert to a set: removes duplicate names and is used for membership tests in the scan below.
yaml_books = set(yaml_books)

In [None]:
# Book files on disk (inside a success/failure folder) whose file name is not listed in the YAML split.
missing_books = [
    txt_path
    for dirpath, _subdirs, _files in os.walk(dataset_base)
    for txt_path in glob(os.path.join(dirpath, '*.txt'))
    if pathlib.PurePath(txt_path).parent.name in ['success', 'failure']
    and os.path.basename(txt_path) not in yaml_books
]

In [None]:
missing_books

[]

In [None]:
def get_labels_from_doc(doc, base=None):
  """Diagnostic lookup: derive a book's labels from its location and report duplicates.

  Behaves like a plain label lookup, but additionally prints the list of all
  matching paths whenever ``doc`` is found in more than one place.

  Args:
    doc: File name of the book; it is passed to ``Path.rglob`` as the pattern.
    base: Directory to search. Defaults to the notebook-level ``dataset_base``.

  Returns:
    A tuple ``(classStr, genreStr)``: the name of the directory containing the
    first match, and the name of the directory one level above that.

  Raises:
    IndexError: If no file matching ``doc`` exists under the search root.
  """
  root = dataset_base if base is None else pathlib.Path(base)
  # Sorted so both the report and the chosen copy are independent of filesystem listing order.
  path = sorted(root.rglob(doc))
  if not path:
    # Same exception type the bare ``[0]`` lookup used to raise, but with a usable message.
    raise IndexError(f'{doc!r} not found under {root}')
  if len(path) > 1:
    print(path)
  label_dir = path[0].parent
  # Path components instead of splitting on '/' keeps this independent of the path separator.
  return label_dir.name, label_dir.parent.name

# Walk both splits again purely for get_labels_from_doc's side effect:
# it prints every book that is found in more than one place.
with open(yamlpath) as file:
  documents = yaml.full_load(file)
  for split_name in ('train', 'test'):
    for doc in documents[split_name]:
      classStr, genreStr = get_labels_from_doc(doc)

[PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/Historical_fiction/success/13707_twice+told+tales.txt'), PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/Short_stories/success/13707_twice+told+tales.txt')]
[PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/Historical_fiction/success/513_the+snow+image+and+other+stories.txt'), PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/Short_stories/success/513_the+snow+image+and+other+stories.txt')]
[PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/Drama/failure/16659_translations+of+shakuntala+and+other+works.txt'), PosixPath('/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_text/Poetry/failure/16659_translations+of+shakuntala+and