In [1]:
import pandas as pd
from nltk import tokenize

# Truncate the Data

In [2]:
data = pd.read_csv('data/proper_labels.csv')
data.drop(columns=['Unnamed: 0.1'], inplace=True)

In [9]:
truncated_data = pd.DataFrame(columns=data.columns[1:-1])

In [10]:
truncated_data

Unnamed: 0,name,author,genre,text


In [11]:
def camel_case_split(str):
    words = [[str[0]]]
    for c in str[1:]:
        if words[-1][-1].islower() and c.isupper():
            words.append(list(c))
        else:
            words[-1].append(c)

    return " ".join(
        (
            ". ".join(["".join(word) for word in words])
            .replace(".", ". ")
            .replace(",", ", ")
            .replace("?", "? ")
            .replace("!", "! ")
            .replace(";", "; ")
        ).split()
    )

for i,r in data.iterrows():
    if r['text length'] > 510:
        truncated_text = ""
        for s in tokenize.sent_tokenize(r['text']):
            combined = truncated_text+s+" "
            if len(camel_case_split(combined).split()) < 510:
                truncated_text = truncated_text+s+" "
            else:
                row_to_add = [r['name'], r['author'], r['genre'], truncated_text]
                truncated_data.loc[len(truncated_data)] = row_to_add
                truncated_text = ""
        if truncated_text:
            row_to_add = [r['name'], r["author"], r["genre"], truncated_text]
            truncated_data.loc[len(truncated_data)] = row_to_add        
    else:
        row_to_add = [r["name"], r["author"], r["genre"], r["text"]]
        truncated_data.loc[len(truncated_data)] = row_to_add

In [14]:
truncated_data['text'] = truncated_data['text'].map(camel_case_split)
truncated_data['text length'] = truncated_data['text'].map(lambda x: len(x.split(" ")))

In [16]:
truncated_data.to_csv('data/truncated_data.csv')

# Extend data labels

In [17]:
all_labels = list(set(pd.read_csv('data/excerpts.csv').genre))

In [22]:
extended_labels = pd.DataFrame(columns=(['name','author','text']+all_labels))

In [23]:
extended_labels

Unnamed: 0,name,author,text,Business-Finance-Law,Religion,Natural-History,Humour,Computing,Medical,Health,...,Romance,Entertainment,Teen-Young-Adult,Science-Geography,Personal-Development,Biography,Science-Fiction-Fantasy-Horror,Poetry-Drama,History-Archaeology,Society-Social-Sciences


In [24]:
for i,r in truncated_data.iterrows():
    all_genres = {key:"False" for key in all_labels}
    genres = r['genre'].split(",")
    for genre in genres:
        all_genres[genre] = "True"
    row_to_add = [r['name'], r['author'], r["text"]]
    row_to_add.extend(list(all_genres.values()))
    extended_labels.loc[len(extended_labels)] = row_to_add

In [25]:
extended_labels.head()

Unnamed: 0,name,author,text,Business-Finance-Law,Religion,Natural-History,Humour,Computing,Medical,Health,...,Romance,Entertainment,Teen-Young-Adult,Science-Geography,Personal-Development,Biography,Science-Fiction-Fantasy-Horror,Poetry-Drama,History-Archaeology,Society-Social-Sciences
0,The Man Who Loved China: The Fantastic Story o...,Simon Winchester,"Without haste, without fear, we conquer the world",False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
1,The Collected Poems,Sylvia Plath,"I lean to you, numb as a fossil. Tell me I'm h...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,The Collected Poems,Sylvia Plath,"Mother of otherness, Eat me. --from ""Poem for ...",False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,The Collected Poems,Sylvia Plath,She stopped fitting me so closely and seemed o...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,The Collected Poems,Sylvia Plath,In the month of red leaves I climb to a bed of...,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [34]:
extended_labels.to_csv('data/post_processed.csv',encoding='utf-8')

# Transformers Preprocessing

In [47]:
from datasets import DatasetDict, load_dataset, Dataset, load_from_disk

In [None]:
pd.read_csv('data/post_processed.csv')

In [41]:
dataset = Dataset.from_pandas(pd.read_csv('data/post_processed.csv'))

In [42]:
train_testvalid = dataset.train_test_split(0.4)
# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [43]:
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'name', 'author', 'text', 'Business-Finance-Law', 'Religion', 'Natural-History', 'Humour', 'Computing', 'Medical', 'Health', 'Mind-Body-Spirit', 'Crime-Thriller', 'Technology-Engineering', 'Romance', 'Entertainment', 'Teen-Young-Adult', 'Science-Geography', 'Personal-Development', 'Biography', 'Science-Fiction-Fantasy-Horror', 'Poetry-Drama', 'History-Archaeology', 'Society-Social-Sciences'],
        num_rows: 8526
    })
    test: Dataset({
        features: ['index', 'name', 'author', 'text', 'Business-Finance-Law', 'Religion', 'Natural-History', 'Humour', 'Computing', 'Medical', 'Health', 'Mind-Body-Spirit', 'Crime-Thriller', 'Technology-Engineering', 'Romance', 'Entertainment', 'Teen-Young-Adult', 'Science-Geography', 'Personal-Development', 'Biography', 'Science-Fiction-Fantasy-Horror', 'Poetry-Drama', 'History-Archaeology', 'Society-Social-Sciences'],
        num_rows: 2842
    })
    valid: Dataset({
        features

In [44]:
train_test_valid_dataset.save_to_disk('data/neural_data_post_process')

Saving the dataset (0/1 shards):   0%|          | 0/8526 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2842 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2842 [00:00<?, ? examples/s]