In [41]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', None)
import os
from tqdm.notebook import tqdm
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import glob



In [32]:
metadata = pd.read_csv('../data/metadata/metadata.csv')

In [33]:
#Select detective/mystery fiction and tales in English
kw = ['fiction','tale']
subj = ['detective', 'mystery','mysteries']
metadata = metadata[metadata['language']=='[\'en\']']
metadata = metadata[metadata['subjects'].apply(lambda x: any([k in x.lower() for k in kw]))]
metadata = metadata[metadata['subjects'].apply(lambda x: any([k in x.lower() for k in subj]))]

In [34]:
#How many titles found?
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1093 entries, 62 to 65332
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1093 non-null   object 
 1   title              1093 non-null   object 
 2   author             1086 non-null   object 
 3   authoryearofbirth  875 non-null    float64
 4   authoryearofdeath  847 non-null    float64
 5   language           1093 non-null   object 
 6   downloads          1093 non-null   int64  
 7   subjects           1093 non-null   object 
 8   type               1093 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 85.4+ KB


In [35]:
ids = list(metadata['id'])
ids[0]

'PG10057'

In [36]:
#List the ids and path for these titles
txts = [[id, '../data/text\\' + id + '_text.txt'] for id in ids]
txts

[['PG10057', '../data/text\\PG10057_text.txt'],
 ['PG10059', '../data/text\\PG10059_text.txt'],
 ['PG10067', '../data/text\\PG10067_text.txt'],
 ['PG10082', '../data/text\\PG10082_text.txt'],
 ['PG10110', '../data/text\\PG10110_text.txt'],
 ['PG10359', '../data/text\\PG10359_text.txt'],
 ['PG10377', '../data/text\\PG10377_text.txt'],
 ['PG10468', '../data/text\\PG10468_text.txt'],
 ['PG10474', '../data/text\\PG10474_text.txt'],
 ['PG10476', '../data/text\\PG10476_text.txt'],
 ['PG10521', '../data/text\\PG10521_text.txt'],
 ['PG10584', '../data/text\\PG10584_text.txt'],
 ['PG10624', '../data/text\\PG10624_text.txt'],
 ['PG10659', '../data/text\\PG10659_text.txt'],
 ['PG10709', '../data/text\\PG10709_text.txt'],
 ['PG10718', '../data/text\\PG10718_text.txt'],
 ['PG10724', '../data/text\\PG10724_text.txt'],
 ['PG10869', '../data/text\\PG10869_text.txt'],
 ['PG108', '../data/text\\PG108_text.txt'],
 ['PG10948', '../data/text\\PG10948_text.txt'],
 ['PG11127', '../data/text\\PG11127_text.txt

In [76]:
#note any files that don't exist (duplicates or files that were not utf8) and remove them from the metadata df
docs = []

for idx, filename in tqdm(txts):
    try:
        with open(filename, 'r', encoding = 'utf8') as txt:
            # added encoding = 'utf8' to prevent a continuation byte error
            #txt = f
            #pdf = pdftotext.PDF(f)
            txt = '\n\n'.join(txt).lower()
            txt = re.sub('-\s+', '', txt)
            txt = re.sub('--',' ', txt)
            txt = ' '.join(re.findall('[a-z\-]{2,}', txt))
            txt = txt[0:20000]
            docs.append(txt)
    except FileNotFoundError:
        print('WARNING: file not found ', filename)
        metadata = metadata[metadata['id']!=idx]

  and should_run_async(code)
  txt = re.sub('-\s+', '', txt)
  txt = ' '.join(re.findall('[a-z\-]{2,}', txt))


  0%|          | 0/1093 [00:00<?, ?it/s]



In [77]:
docs[0]

  and should_run_async(code)


'proofreaders the secret of the tower by anthony hope author of the prisoner of zenda rupert of hentzau etc contents doctor mary paying guest ii the general remembers iii mr saffron at home iv professional etiquette familiar implement vi odd story of captain duggle vii gentlemanly stranger viii captain alec raises his voice ix doctor mary ultimatum that magical word morocco xi the car behind the trees xii the secret of the tower xiii right of conquest xiv the scepter in the grave xv normal case xvi dead majesty xvii the chief mourners xviii the gold and the treasure chapter doctor mary paying guest just in time wasn it asked mary arkroyd two days before the the ceremony mercifully it had all been kept very quiet because it was only three months since poor gilly was killed forget whether you ever met gilly my half-brother you know only once in collingham gardens he had an exeat and dashed in one saturday morning when we were just finishing our work don you remember yes think do but sinc

In [78]:
#Does everything match?
print(len(metadata) == len(docs))
print(len(metadata))

True
1052


  and should_run_async(code)


In [79]:
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in tqdm(range(len(docs))):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

#take out the lemmatizer for now
#lemmatizer = WordNetLemmatizer()
#docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


  0%|          | 0/1052 [00:00<?, ?it/s]

In [80]:
docs[0]

  and should_run_async(code)


['proofreaders',
 'the',
 'secret',
 'of',
 'the',
 'tower',
 'by',
 'anthony',
 'hope',
 'author',
 'of',
 'the',
 'prisoner',
 'of',
 'zenda',
 'rupert',
 'of',
 'hentzau',
 'etc',
 'contents',
 'doctor',
 'mary',
 'paying',
 'guest',
 'ii',
 'the',
 'general',
 'remembers',
 'iii',
 'mr',
 'saffron',
 'at',
 'home',
 'iv',
 'professional',
 'etiquette',
 'familiar',
 'implement',
 'vi',
 'odd',
 'story',
 'of',
 'captain',
 'duggle',
 'vii',
 'gentlemanly',
 'stranger',
 'viii',
 'captain',
 'alec',
 'raises',
 'his',
 'voice',
 'ix',
 'doctor',
 'mary',
 'ultimatum',
 'that',
 'magical',
 'word',
 'morocco',
 'xi',
 'the',
 'car',
 'behind',
 'the',
 'trees',
 'xii',
 'the',
 'secret',
 'of',
 'the',
 'tower',
 'xiii',
 'right',
 'of',
 'conquest',
 'xiv',
 'the',
 'scepter',
 'in',
 'the',
 'grave',
 'xv',
 'normal',
 'case',
 'xvi',
 'dead',
 'majesty',
 'xvii',
 'the',
 'chief',
 'mourners',
 'xviii',
 'the',
 'gold',
 'and',
 'the',
 'treasure',
 'chapter',
 'doctor',
 'mary',


In [81]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

  and should_run_async(code)


In [82]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


In [83]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8215
Number of documents: 1052


  and should_run_async(code)


In [84]:
# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

  and should_run_async(code)


In [85]:
contents = []

for [idx,txt], doc in tqdm(zip(txts, docs)):
    bow = dictionary.doc2bow(doc)
    topics = model.get_document_topics(bow)
    for res in topics:
        topic, pct = res
        contents.append({'txt': txt, 'topic': topic, 'percent': pct})

topics = pd.DataFrame(contents)

  and should_run_async(code)


0it [00:00, ?it/s]

In [86]:
vis = gensimvis.prepare(model, corpus, dictionary, mds = 'mmds', sort_topics=False)
#added mds = 'mmds' to prevent a complex number error
pyLDAvis.save_html(vis, 'lda.html')

  and should_run_async(code)


In [88]:
for i in range(10):
    for j in range(5):

        file = topics[topics.topic == i].sort_values('percent', ascending = False).head().iloc[j, 0]

        cmd = f'cp {file} representative_docs/topic_{i + 1}/.'

        os.system(cmd)

  and should_run_async(code)


In [89]:
topics

  and should_run_async(code)


Unnamed: 0,txt,topic,percent
0,../data/text\PG10057_text.txt,3,0.25093
1,../data/text\PG10057_text.txt,7,0.054103
2,../data/text\PG10057_text.txt,8,0.359421
3,../data/text\PG10057_text.txt,9,0.335291
4,../data/text\PG10059_text.txt,3,0.204621
5,../data/text\PG10059_text.txt,7,0.682388
6,../data/text\PG10059_text.txt,9,0.112746
7,../data/text\PG10067_text.txt,0,0.300141
8,../data/text\PG10067_text.txt,5,0.225549
9,../data/text\PG10067_text.txt,9,0.473983


In [65]:
topics[topics['topic']==0].sort_values('percent', ascending = False).head().iloc[0,0]

  and should_run_async(code)


'../data/text\\PG5162_text.txt'

In [90]:
topics

  and should_run_async(code)


Unnamed: 0,txt,topic,percent
0,../data/text\PG10057_text.txt,3,0.25093
1,../data/text\PG10057_text.txt,7,0.054103
2,../data/text\PG10057_text.txt,8,0.359421
3,../data/text\PG10057_text.txt,9,0.335291
4,../data/text\PG10059_text.txt,3,0.204621
5,../data/text\PG10059_text.txt,7,0.682388
6,../data/text\PG10059_text.txt,9,0.112746
7,../data/text\PG10067_text.txt,0,0.300141
8,../data/text\PG10067_text.txt,5,0.225549
9,../data/text\PG10067_text.txt,9,0.473983
