# This notebook reviews:

#### Title selection by author
#### Topics important to selected title
#### Keywords important to selected topics
#### Titles that most closely match selected title
#### Metadata and subject list for title matches
<br>
<br>
    

## Imports

In [124]:
#Gensim
import gensim
import gensim.corpora as corpora
#from gensim.utils import simple_preprocess
#from gensim.models import CoherenceModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from sklearn.metrics.pairwise import euclidean_distances

## Load data frames and model
The LDA model was trained with gensim and generates 20 topics.

In [60]:
# Book-level metadata table
metadata = pd.read_csv('../data/model_data/dm_lda_04/metadata_lda04.csv')
# Topic importances, one row per book
topics_summary = pd.read_csv('../data/model_data/dm_lda_04/topics_summary.csv')
# Previously saved gensim LDA model
model = gensim.models.ldamodel.LdaModel.load('../data/model_data/dm_lda_04/model.lda04')

In [122]:
# Long-form table of the 10 highest-weighted keywords of every topic:
# one row per (topic, keyword) pair with the keyword's weight in that topic.
contents = [
    {'topic': topic_id, 'keyword': term, 'importance': weight}
    for topic_id in range(model.num_topics)
    for term, weight in model.show_topic(topic_id, 10)
]
keywords = pd.DataFrame(contents)


## A sample of topic importances by book

In [61]:
# Preview the first rows of the per-book topic importances
topics_summary.head()
#metadata.head()

Unnamed: 0,pg_id,doc_index,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,PG10057,0,0.002099,0.000328,0.075983,0.111052,0.006879,0.00576,0.027011,0.001206,...,0.021455,0.025291,0.130951,0.050395,0.009008,0.290606,0.043932,0.069184,0.06799,0.005143
1,PG10067,1,0.001172,0.006686,0.045212,0.063337,0.022029,0.000235,0.029929,0.010021,...,0.036915,0.105833,0.140186,0.0462,0.001156,0.248974,0.015719,0.046573,0.084675,0.005638
2,PG10082,2,0.002074,0.002601,0.031155,0.07446,0.021008,0.001451,0.015176,0.0,...,0.039292,0.055678,0.079571,0.036918,0.000277,0.264973,0.024837,0.053421,0.067036,0.004741
3,PG10110,3,0.0,0.000666,0.063976,0.100419,0.016767,0.00156,0.021734,0.000379,...,0.023847,0.063837,0.095575,0.060068,0.003135,0.261102,0.057441,0.052551,0.050909,0.015607
4,PG10377,4,0.000597,0.001484,0.025551,0.046569,0.022708,0.007099,0.006495,0.001405,...,0.006505,0.03267,0.147046,0.03005,0.005407,0.291932,0.031601,0.190689,0.066801,0.002084


## Define functions

In [71]:
def booklist_topics(booklist):
    """Sum the rows of `topics_summary` that belong to the given books.

    Reads the notebook-level `topics_summary` frame and keeps the rows whose
    `pg_id` is contained in `booklist`. The sum runs over every column, not
    only the topic columns, so the caller is expected to slice off the
    non-topic entries.

    Parameters
    ----------
    booklist : list-like
        `pg_id` values of the books to aggregate.

    Returns
    -------
    pandas.Series
        Column-wise sum of the selected rows, indexed by column name.
    """
    in_booklist = topics_summary['pg_id'].isin(booklist)
    selected_rows = topics_summary[in_booklist]
    return selected_rows.sum()

In [155]:
# Find the titles whose topic mix is closest (euclidean distance) to one book
def show_title_matches(X_index, n_matches=5):
    """Return metadata for the titles whose topic weights are nearest to one book.

    Reads the notebook-level `topics_summary` and `metadata` frames.

    Parameters
    ----------
    X_index : int
        Row position (not label) of the reference book in `topics_summary`.
    n_matches : int, default 5
        Number of nearest rows to keep. The reference book itself is normally
        among them, because its distance to itself is zero.

    Returns
    -------
    pandas.DataFrame
        Rows of `metadata` whose `id` matches one of the nearest books, with
        the columns id, title, author, downloads and subjects. Rows come back
        in `metadata` order, not ranked by distance.
    """
    # Everything from column position 2 onward is treated as a topic weight
    # (presumably the columns before that are pg_id and doc_index -- verify
    # against the CSV if its layout changes).
    topic_matrix = topics_summary.iloc[:, 2:].to_numpy(dtype=float)

    # One batched call instead of one sklearn call per row.
    reference = topic_matrix[[X_index]]
    distances = euclidean_distances(reference, topic_matrix)[0]

    # Positions of the closest rows; a stable sort keeps ties in row order.
    nearest_rows = np.argsort(distances, kind='stable')[:n_matches]

    # Look the ids up by row position, the same way X_index is interpreted,
    # rather than going through the `doc_index` column, which would only be
    # correct while that column happens to equal the row position.
    pg_id_list = topics_summary.iloc[nearest_rows]['pg_id'].to_list()
    return metadata[metadata['id'].isin(pg_id_list)][['id','title','author','downloads','subjects']]

In [153]:
def _parse_subjects(raw):
    """Split the text form of a subject collection into its subjects, in written order."""
    import ast  # local import: only this helper needs it

    try:
        node = ast.parse(raw.strip(), mode='eval').body
    except (SyntaxError, ValueError):
        node = None
    # A well-formed literal such as {'a', "b's"} is read element by element, which
    # keeps the written order and handles both quote styles and inner apostrophes.
    if isinstance(node, (ast.Set, ast.List, ast.Tuple)) and all(
        isinstance(elt, ast.Constant) and isinstance(elt.value, str)
        for elt in node.elts
    ):
        return [elt.value for elt in node.elts]
    # Anything else falls back to plain text splitting on the "', " separator.
    stripped = raw.replace('{', '').replace('}', '')
    return [item.replace("'", "") for item in stripped.split("', ")]


# Expand multiple subjects per title into a long form list
def show_title_subjects(meta_list):
    """Expand the subjects of each title into one row per (title, subject).

    Parameters
    ----------
    meta_list : pandas.DataFrame
        Columns are read by position: 0 = book id, 1 = title, 4 = subjects,
        where subjects is a string such as "{'Fiction', 'Police -- Fiction'}".

    Returns
    -------
    pandas.DataFrame
        Columns pg_id, title, subject. Rows whose subjects value is not a
        string (for example a missing value) contribute no rows. An empty
        input yields an empty frame that still has the three columns.
    """
    contents = []
    # Plain tuples give true positional access; integer lookups on a
    # label-indexed row Series are deprecated in recent pandas.
    for row in meta_list.itertuples(index=False, name=None):
        pg_id, title, subj = row[0], row[1], row[4]
        if not isinstance(subj, str):
            continue
        for item in _parse_subjects(subj):
            contents.append({'pg_id': pg_id, 'title': title, 'subject': item})
    return pd.DataFrame(contents, columns=['pg_id', 'title', 'subject'])

## Titles by author

In [76]:
@interact
def show_author_books(author = list(metadata['author'].dropna().sort_values().unique())):
    """List the titles in `metadata` written by the author picked in the dropdown.

    The dropdown options are the sorted unique values of the `author` column.
    Missing authors are left out of the options because a missing value never
    compares equal to anything, so that entry could only return an empty table.

    Returns the id, title, authoryearofbirth, downloads and subjects columns
    of the matching rows.
    """
    return metadata[metadata['author']==author][['id','title','authoryearofbirth','downloads','subjects']]

interactive(children=(Dropdown(description='author', options=('Adams, O. S.', 'Adams, Samuel Hopkins', 'Ainswo…

## Topics important to selected title

In [171]:
# Book(s) whose topic mix is inspected
booklist = ['PG2031']
# Drop the first two entries of the summed row (presumably pg_id and doc_index,
# per the preview above) so that only the topic weights remain.
topics = pd.DataFrame(booklist_topics(booklist).iloc[2:], columns = ['importance'])
topics.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
topic_15,0.31571
topic_11,0.084347
topic_12,0.080243
topic_3,0.075901
topic_2,0.071252
topic_9,0.068211
topic_13,0.066681
topic_17,0.052701
topic_18,0.05087
topic_6,0.029921


## Keywords important to selected topics

In [121]:
@interact
def show_topic_keywords(topic = list(keywords['topic'].sort_values().unique())):
    """Return the rows of `keywords` that belong to the topic picked in the dropdown."""
    is_selected_topic = keywords['topic'] == topic
    return keywords[is_selected_topic]
# TODO: add word cloud and num_words selection

interactive(children=(Dropdown(description='topic', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

## Titles that most closely match selected title

In [172]:
# Row position in topics_summary of the title to find matches for
X_index = 119
meta_list = show_title_matches(X_index)
# Show the matches without the long subjects column
meta_list.loc[:, ['id','title','author','downloads']]

Unnamed: 0,id,title,author,downloads
106,PG19369,The Triumphs of Eugène Valmont,"Barr, Robert",92
119,PG2031,The Lock and Key Library: The Most Interesting...,,110
432,PG45622,The Curved Blades,"Wells, Carolyn",21
445,PG46462,Recollections of a Policeman,"Russell, William",20
684,PG9300,"Jennie Baxter, Journalist","Barr, Robert",24


## Expand the subjects per title

In [173]:
# One row per (title, subject) for the matched titles
subjects = show_title_subjects(meta_list)
# Render the table with the title column set to a width of 500px
subjects.style.set_properties(subset=['title'], width='500px')

Unnamed: 0,pg_id,title,subject
0,PG19369,The Triumphs of Eugène Valmont,"Humorous stories, English"
1,PG19369,The Triumphs of Eugène Valmont,French -- England -- London -- Fiction
2,PG19369,The Triumphs of Eugène Valmont,Private investigators -- England -- London -- Fiction
3,PG19369,The Triumphs of Eugène Valmont,"Detective and mystery stories, English"
4,PG2031,The Lock and Key Library: The Most Interesting Stories of All Nations: Real Life,Detective and mystery stories
5,PG2031,The Lock and Key Library: The Most Interesting Stories of All Nations: Real Life,Parapsychology
6,PG2031,The Lock and Key Library: The Most Interesting Stories of All Nations: Real Life,Fiction
7,PG45622,The Curved Blades,Detective and mystery stories
8,PG45622,The Curved Blades,Fiction
9,PG46462,Recollections of a Policeman,Police -- Fiction


In [31]:
# Top 20 (term id, probability) pairs for topic 0
model.get_topic_terms(0, topn=20)

[(2828, 0.11076207),
 (4148, 0.098153025),
 (3522, 0.07361885),
 (3538, 0.056315064),
 (6945, 0.049061053),
 (6602, 0.037216607),
 (7836, 0.035317667),
 (578, 0.03328636),
 (2592, 0.029514303),
 (2270, 0.025690861),
 (653, 0.021255221),
 (3528, 0.019889649),
 (2045, 0.01943372),
 (5954, 0.016984068),
 (2241, 0.016026124),
 (908, 0.01256019),
 (4138, 0.012291809),
 (5989, 0.011526088),
 (3529, 0.009798709),
 (884, 0.009622333)]

In [34]:
# Topics associated with term id 884 (the last term listed for topic 0 above), using a low probability cutoff
model.get_term_topics(884, minimum_probability = .0001)

[(0, 0.009580477)]

In [29]:
# Topics rendered as weighted keyword strings, 20 words per topic
model.show_topics(num_words=20)

[(0,
  '0.111*"manager" + 0.098*"island" + 0.074*"hedge" + 0.056*"traveller" + 0.049*"conspiracy" + 0.037*"cliff" + 0.035*"tent" + 0.033*"stage" + 0.030*"ch_teau" + 0.026*"theatre" + 0.021*"bicycle" + 0.020*"stall" + 0.019*"idol" + 0.017*"contradiction" + 0.016*"masterpiece" + 0.013*"goose" + 0.012*"compose" + 0.012*"stepmother" + 0.010*"tracing" + 0.010*"visage"'),
 (8,
  '0.107*"machine" + 0.058*"agent" + 0.037*"franc" + 0.035*"bracelet" + 0.034*"government" + 0.031*"caf" + 0.028*"watchman" + 0.022*"ami" + 0.021*"deceive" + 0.019*"aeroplane" + 0.016*"uproar" + 0.015*"pauper" + 0.014*"dryly" + 0.014*"compensation" + 0.014*"factory" + 0.013*"smuggler" + 0.012*"wager" + 0.012*"deputy" + 0.011*"spendthrift" + 0.011*"ruby"'),
 (7,
  '0.096*"cell" + 0.078*"poison" + 0.047*"pin" + 0.044*"laboratory" + 0.041*"pirate" + 0.036*"explosion" + 0.032*"powder" + 0.029*"needle" + 0.028*"tube" + 0.025*"apparatus" + 0.021*"demon" + 0.021*"leap" + 0.020*"jeweller" + 0.020*"nest" + 0.018*"cage" + 0.017*

In [30]:
# Top 20 (word, probability) pairs for topic 2
model.show_topic(2, topn=20)

[('work', 0.026939858),
 ('fellow', 0.02650584),
 ('lot', 0.020576492),
 ('captain', 0.01899662),
 ('country', 0.018378364),
 ('people', 0.018180672),
 ('game', 0.017511971),
 ('bit', 0.015837248),
 ('city', 0.014247753),
 ('world', 0.012712098),
 ('company', 0.0124462405),
 ('town', 0.011849247),
 ('party', 0.011248954),
 ('dollar', 0.010929919),
 ('job', 0.010686318),
 ('luck', 0.009808063),
 ('chap', 0.009206914),
 ('club', 0.008768029),
 ('guess', 0.008693911),
 ('deck', 0.008483334)]

In [6]:
# List the attributes and methods available on the loaded model (exploration aid)
dir(model)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__ignoreds',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__numpys',
 '__recursive_saveloads',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__scipys',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_apply',
 '_load_specials',
 '_save_specials',
 '_smart_save',
 'add_lifecycle_event',
 'alpha',
 'bound',
 'callbacks',
 'chunksize',
 'clear',
 'decay',
 'diff',
 'dispatcher',
 'distributed',
 'do_estep',
 'do_mstep',
 'dtype',
 'eta',
 'eval_every',
 'expElogbeta',
 'gamma_threshold',
 'get_document_topics',
 'get_term_topics',
 'get_topic_terms',
 'get_topics',
 'id2word',
 'inference',
 'init_dir_prior',
 'iterations',
 'lifecycle_events',
 'load',
 'log_perplexity',
 'minimum_phi_value',
 'minimum_probability',


https://www.youtube.com/watch?v=oP3c1h8v2ZQ

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    NOTE(review): the defaults are bound when this `def` runs, and `lda_model`,
    `corpus` and `data` are not defined in the visible part of this notebook --
    confirm they exist before executing the cell.

    Parameters
    ----------
    ldamodel : topic model
        Indexing it with `corpus` must yield, per document, a sequence of
        (topic id, proportion) pairs. It must also provide `show_topic`.
    corpus : iterable
        Documents in the representation `ldamodel` accepts.
    texts : sequence
        Original texts, in the same order as `corpus`.

    Returns
    -------
    pandas.DataFrame
        Columns Dominant_Topic, Perc_Contribution, Topic_Keywords, followed by
        one column holding `texts`. There is one row per document. A document
        for which the model reports no topics gets missing values, so the rows
        stay aligned with `texts`.
    """
    records = []
    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # Keep a placeholder row; skipping it would shift every later
            # document against `texts` in the final concat.
            records.append((float('nan'), float('nan'), None))
            continue
        # Only the highest-proportion topic is needed, so take the maximum
        # instead of sorting. On ties the first pair wins, as a stable
        # descending sort would give.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Build the frame once from the collected records; DataFrame.append no
    # longer exists in current pandas, and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Build the dominant-topic table, one row per document.
# NOTE(review): `optimal_model`, `corpus` and `data` are not defined in the
# visible part of this notebook -- confirm where they come from before running.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format: turn the index into a regular column and give every column a readable name
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first 10 rows
df_dominant_topic.head(10)