In [292]:
import re
import pandas as pd
import string
import pickle
from textblob import TextBlob
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
import spacy
from nltk.corpus import stopwords

In [293]:
def import_book(num, encoding='utf-8'):
    """Load one book and split it into normalised chapter strings.

    The file ``books/book{num}.txt`` is read in full and split on the literal
    markers ``CHAPTER`` and ``EPILOGUE``. The split runs before lower-casing,
    so only upper-case markers act as boundaries. Each resulting segment has
    its runs of whitespace collapsed to single spaces and is lower-cased.

    Parameters
    ----------
    num : int or str
        Book number substituted into the file name.
    encoding : str, optional
        Text encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's default locale encoding.

    Returns
    -------
    list of str
        One string per segment, in file order. The first entry is whatever
        precedes the first marker (possibly an empty string).
    """
    with open('books/book{}.txt'.format(num), 'r', encoding=encoding) as book:
        text = book.read()
    raw_chapters = re.split(r'CHAPTER|EPILOGUE', text)
    return [' '.join(chapter.split()).lower() for chapter in raw_chapters]

def clean_chapter(text, chapter_name):
    """Strip punctuation, digit-bearing tokens and the chapter title from ``text``.

    Steps, in order:

    1. remove every character in ``string.punctuation``;
    2. remove every run of word characters that contains a digit;
    3. remove the characters ``‘ ’ “ ” … ▼ —``;
    4. remove every occurrence of ``prologue`` / ``epilogue`` (lower-case);
    5. remove the lower-cased, stripped ``chapter_name`` wherever it occurs
       within the first 100 characters of the result;
    6. collapse whitespace runs to single spaces and trim both ends.

    Parameters
    ----------
    text : str
        Chapter text. The literal patterns above are lower-case, so the text
        is expected to be lower-cased already.
    chapter_name : str
        Chapter title. It is matched as literal text, never as a regular
        expression, so titles containing regex metacharacters are safe.

    Returns
    -------
    str
        The cleaned chapter text.
    """
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…▼—]', '', text)
    text = re.sub('prologue|epilogue', '', text)

    # Only the opening 100 characters are searched, so the same words
    # appearing later in the chapter body are left alone.
    beginning, end = text[:100], text[100:]
    title = chapter_name.lower().strip()
    # Literal replacement: a title such as "a.b" or "what (" must not be
    # interpreted as a regex pattern.
    beginning = beginning.replace(title, '')

    return ' '.join((beginning + end).split())

def clean_titles(text):
    """Remove digits, ASCII punctuation and the words Prologue/Epilogue from a title.

    Whitespace is left untouched, so a title such as ``"1 An Empty Road"``
    comes back with its leading space (``" An Empty Road"``).

    Parameters
    ----------
    text : str
        Raw chapter title.

    Returns
    -------
    str
        The title without digits, without any ``string.punctuation``
        character and without the capitalised words ``Epilogue`` and
        ``Prologue``.
    """
    # Digits are dropped one character at a time, so no digit-bearing token
    # can remain afterwards.
    text = ''.join(ch for ch in text if not ch.isdigit())
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Case-sensitive literal removal, in this order.
    text = text.replace('Epilogue', '').replace('Prologue', '')
    return text

# Words that must be kept exactly as written instead of being singularized.
# NOTE(review): these look like names/terms that merely end in "s" - confirm.
plural_exceptions = ['aes', 'lews', 'talmanes', 'pips']
def singularize_chapter(text, exceptions = plural_exceptions):
    """Return ``text`` with each word replaced by its singular form.

    The text is wrapped in a ``TextBlob`` and its ``words`` are processed one
    by one: a word found in ``exceptions`` is kept as is, every other word is
    replaced by the result of its ``singularize()`` method. The words are then
    re-joined with single spaces.

    Parameters
    ----------
    text : str
        Text to process.
    exceptions : container of str, optional
        Words to leave untouched. Defaults to ``plural_exceptions``.

    Returns
    -------
    str
        The space-joined processed words.
    """
    singular_words = []
    for token in TextBlob(text).words:
        if token in exceptions:
            singular_words.append(token)
        else:
            singular_words.append(token.singularize())
    return ' '.join(singular_words)



In [294]:
# Stop-word list: NLTK's English stop words with ASCII punctuation stripped
# (clean_chapter strips the same punctuation from the chapter text),
# de-duplicated, plus a handful of extra words.
sw = list({
    word.translate(str.maketrans('', '', string.punctuation))
    for word in stopwords.words('english')
})
sw += ['said', 'did', 'like', 'woman', 'man']

def remove_stopwords(text, stop_words = sw):
    """Return ``text`` without the words listed in ``stop_words``.

    The text is split on whitespace, words present in ``stop_words`` are
    dropped (exact, case-sensitive match) and the remaining words are joined
    with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``sw`` list.

    Returns
    -------
    str
        The filtered text.
    """
    # Membership is tested once per word of the text, so use a set rather
    # than scanning a list every time.
    if isinstance(stop_words, (set, frozenset)):
        blocked = stop_words
    else:
        blocked = set(stop_words)
    return ' '.join(word for word in text.split() if word not in blocked)

In [295]:
# Read books 1-14 and gather their chapter strings into one flat list,
# keeping the books in numeric order.
all_text = []
for num in range(1, 15):
    book = import_book(num)
    # Book 4 only: discard the first split segment.
    # NOTE(review): presumably text before the first real chapter - confirm.
    book = book[1:] if num == 4 else book
    print('Book', num, len(book), "chapters")
    all_text += book
    

Book 1 54 chapters
Book 2 51 chapters
Book 3 57 chapters
Book 4 58 chapters
Book 5 57 chapters
Book 6 57 chapters
Book 7 42 chapters
Book 8 32 chapters
Book 9 36 chapters
Book 10 32 chapters
Book 11 39 chapters
Book 12 52 chapters
Book 13 59 chapters
Book 14 51 chapters


In [296]:
# One row per chapter string, in the order the books were read.
text_df = pd.DataFrame({'text': all_text})
text_df

Unnamed: 0,text
0,prologue dragonmount the palace still shook oc...
1,"1 an empty road the wheel of time turns, and a..."
2,2 strangers when rand and mat carried the firs...
3,3 the peddler clusters of pots clattered and b...
4,4 the gleeman the door of the inn banged shut ...
...,...
672,46 to awaken rand broke free from the darkness...
673,"47 she fought it off, but barely. the forsaken..."
674,48 so many dead. hundreds of thousands of men ...
675,49 beneath that was only emptiness. in the wol...


In [297]:
# Load only the chapter title and book title columns from the chapter index.
chapter_names = pd.read_csv('data/chapters.csv', usecols = ['chapter_title', 'book_title'])

In [298]:
# Clean each title with clean_titles, then add a running chapter number
# equal to the row index plus one.
chapter_names['chapter_title'] = chapter_names['chapter_title'].apply(clean_titles)
chapter_names['cumulative_chapter_number'] = 1 + chapter_names.index
chapter_names

Unnamed: 0,chapter_title,book_title,cumulative_chapter_number
0,Dragonmount,The Eye of the world,1
1,An Empty Road,The Eye of the world,2
2,Strangers,The Eye of the world,3
3,The Peddler,The Eye of the world,4
4,The Gleeman,The Eye of the world,5
...,...,...,...
672,To Awaken,A Memory of Light,673
673,Watching the Flow Writhe,A Memory of Light,674
674,A Brilliant Lance,A Memory of Light,675
675,Light and Shadow,A Memory of Light,676


In [299]:
# Put titles and chapter text side by side (aligned on the row index), then
# clean each chapter's text using its own title.
allbooks = pd.concat([chapter_names, text_df], axis=1)
allbooks['text'] = [
    clean_chapter(chapter_text, title)
    for chapter_text, title in zip(allbooks['text'], allbooks['chapter_title'])
]
allbooks

Unnamed: 0,chapter_title,book_title,cumulative_chapter_number,text
0,Dragonmount,The Eye of the world,1,the palace still shook occasionally as the ear...
1,An Empty Road,The Eye of the world,2,the wheel of time turns and ages come and pass...
2,Strangers,The Eye of the world,3,when rand and mat carried the first barrels th...
3,The Peddler,The Eye of the world,4,clusters of pots clattered and banged as s wag...
4,The Gleeman,The Eye of the world,5,the door of the inn banged shut behind the whi...
...,...,...,...,...
672,To Awaken,A Memory of Light,673,rand broke free from the darkness and entered ...
673,Watching the Flow Writhe,A Memory of Light,674,she fought it off but barely the forsaken lean...
674,A Brilliant Lance,A Memory of Light,675,so many dead hundreds of thousands of men and ...
675,Light and Shadow,A Memory of Light,676,beneath that was only emptiness in the wolf dr...


In [300]:
# Persist the cleaned chapters (titles, book titles, numbering, text).
allbooks.to_pickle('data/allbooks.pkl')

Singularized

In [301]:
# Variant of the cleaned chapters with every word singularized; the original
# frame is left untouched. A progress bar is shown while the column is processed.
allbooks_singularized = allbooks.copy()
allbooks_singularized['text'] = allbooks_singularized['text'].progress_apply(singularize_chapter)

  0%|          | 0/677 [00:00<?, ?it/s]

In [302]:
# Persist the singularized variant.
allbooks_singularized.to_pickle('data/allbooks_singularized.pkl')

No stop words

In [303]:
# Variant of the cleaned chapters with stop words removed; the original frame
# is left untouched. A progress bar is shown while the column is processed.
allbooks_nosw = allbooks.copy()
allbooks_nosw['text'] = allbooks_nosw['text'].progress_apply(remove_stopwords)

  0%|          | 0/677 [00:00<?, ?it/s]

In [304]:
# Persist the stop-word-free variant.
allbooks_nosw.to_pickle('data/allbooks_nosw.pkl')

No stop words, singularized

In [305]:
# Stop words are removed first (allbooks_nosw), then the remaining words are
# singularized. A progress bar is shown while the column is processed.
allbooks_sing_nosw = allbooks_nosw.copy()
allbooks_sing_nosw['text'] = allbooks_sing_nosw['text'].progress_apply(singularize_chapter)

  0%|          | 0/677 [00:00<?, ?it/s]

In [308]:
# Persist the stop-word-free, singularized variant.
allbooks_sing_nosw.to_pickle('data/allbooks_sing_nosw.pkl')

## Extracting list of characters for network graphing

In [264]:
# Fetch the wiki page (requires network access) and keep the first table whose
# text matches '917,726'.
# NOTE(review): that string appears to be the largest "Word Count" value in the
# table, so this selector breaks if the wiki figure changes - confirm.
characters = pd.read_html('https://wot.fandom.com/wiki/Statistical_analysis', match='917,726')[0]

In [265]:
# Order characters from largest to smallest word count.
characters = characters.sort_values('Word Count', ascending = False)

In [266]:
# Display the sorted table for inspection.
characters

Unnamed: 0,Character,# of POVs,"% of Total POVs (1,379)",Word Count,"% of Word Count (4,373,157)",Average Word Count per POV
6,Rand al'Thor,236,17.1139%,917726,20.9854%,3888.67
7,Perrin Aybara,154,11.1675%,540762,12.3655%,3511.44
5,Egwene al'Vere,130,9.4271%,529812,12.1151%,4075.48
19,Matrim Cauthon,116,8.4119%,486811,11.1318%,4196.65
24,Elayne Trakand,83,6.0189%,355856,8.1373%,4287.42
...,...,...,...,...,...,...
86,Abaldar Yulan,1,0.0725%,117,0.0027%,117.00
57,Luan Norwelyn,1,0.0725%,109,0.0025%,109.00
56,Dyelin Taravin,1,0.0725%,102,0.0023%,102.00
58,Ellorien Traemane,1,0.0725%,83,0.0019%,83.00


In [284]:
# Character names as a plain Python list, in the table's current row order.
char_list = characters['Character'].tolist()

In [285]:
# Reduce each name to its first whitespace-separated token, lower-cased and
# with apostrophes removed (e.g. "Rand al'Thor" -> "rand").
char_list = [char.split()[0].lower().replace("\'",'') for char in char_list]

In [287]:
# Drop entries that should not appear in the final list. list.remove deletes
# only the first match and raises ValueError if the entry is absent.
# NOTE(review): presumably these are long first names replaced by short forms
# in the next cell, plus non-character rows such as 'narrator'/'quote' - confirm.
for char in ['matrim', 'galadedrid', 'thomdril', 'lews', 'narrator', 'mazrim', 'quote', 'geofram', 'davram',
            'rodel', 'seanchan']:
    char_list.remove(char)

In [288]:
# Add the names to use instead.
# NOTE(review): presumably the forms by which these characters are referred to
# in the text (short names / surnames) - confirm.
for char in ['mat', 'galad', 'thom', 'taim', 'bornhald', 'bashere', 'ituralde']:
    char_list.append(char)

In [289]:
# Alphabetical order; sorted() builds a new list and rebinds the name.
char_list=sorted(char_list)

In [290]:
# Persist the final character list. Note the file is written to the working
# directory, not to data/ like the DataFrame pickles.
with open('char_list.pkl', 'wb') as f:
    pickle.dump(char_list, f)