In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio.plotly as csp

In [None]:
# Load the complete Gutenberg metadata CSV into a DataFrame.
metadata_csv = '../data/metadata/metadata.csv'
metadata = pd.read_csv(metadata_csv)

In [None]:
# Non-null count per column, as a quick look at how complete each field is.
metadata.count()

In [None]:
# Select English-language detective/mystery fiction and tales.
kw = ['fiction', 'tale']                      # form terms: at least one must appear
subj = ['detective', 'mystery', 'mysteries']  # genre terms: at least one must appear


def _mentions_any(text, terms):
    """Return True if the lower-cased `text` contains any of `terms` as a substring."""
    lowered = text.lower()
    return any(term in lowered for term in terms)


# The language column is compared against the literal string "['en']".
english = metadata[metadata['language'] == "['en']"]
# NOTE(review): .lower() assumes every English row holds a string in 'subjects';
# a missing value would raise here -- confirm the column has no NaNs.
has_form_term = english['subjects'].apply(_mentions_any, terms=kw)
has_genre_term = english['subjects'].apply(_mentions_any, terms=subj)
mystery = english[has_form_term & has_genre_term]

In [None]:
# Mystery titles with more than 50 downloads, most-downloaded first.
popular = (
    mystery
    .loc[mystery['downloads'] > 50]
    .sort_values(by='downloads', ascending=False)
)
popular

In [None]:
# Number of popular mystery titles per author, largest count first.
popular_titles_by_author = popular.groupby('author')['title'].count()
popular_titles_by_author.sort_values(ascending=False)

In [None]:
# Mystery titles with 50 downloads or fewer, most-downloaded first.
not_popular = (
    mystery
    .loc[mystery['downloads'] <= 50]
    .sort_values(by='downloads', ascending=False)
)
not_popular

In [None]:
# Number of less-downloaded mystery titles per author, largest count first.
not_popular_titles_by_author = not_popular.groupby('author')['title'].count()
not_popular_titles_by_author.sort_values(ascending=False)

In [None]:
# Earliest and latest author birth years among the mystery titles, and the
# rows that carry them. Only strictly positive years are considered when
# computing the extremes.
mystery_birth_years = mystery['authoryearofbirth']
mystery_positive_years = mystery_birth_years[mystery_birth_years > 0]
min_mystery = mystery_positive_years.min()
max_mystery = mystery_positive_years.max()
mystery[mystery_birth_years.isin([min_mystery, max_mystery])]

In [None]:
# Earliest and latest author birth years across the whole catalogue, and the
# rows that carry them. Only strictly positive years are considered when
# computing the extremes.
meta_birth_years = metadata['authoryearofbirth']
meta_positive_years = meta_birth_years[meta_birth_years > 0]
min_meta = meta_positive_years.min()
max_meta = meta_positive_years.max()
metadata[meta_birth_years.isin([min_meta, max_meta])]

In [None]:
# All catalogue entries with more than 1000 downloads, most-downloaded first.
popular_meta = (
    metadata
    .loc[metadata['downloads'] > 1000]
    .sort_values(by='downloads', ascending=False)
)
popular_meta.head(10)

In [None]:
# Replace the titles of two specific rows, addressed by index label.
# NOTE(review): the labels are hard-coded and tied to the current CSV's row
# order; presumably these rows carry titles too long for the chart -- confirm
# the labels still point at the intended works whenever the data is refreshed.
title_overrides = {
    894: 'A Modest Proposal',
    63656: 'The Importance of Being Earnest',
}
for row_label, replacement_title in title_overrides.items():
    popular_meta.at[row_label, 'title'] = replacement_title
popular_meta.head(10)

In [None]:
# All catalogue entries with 1000 downloads or fewer, most-downloaded first.
not_popular_meta = (
    metadata
    .loc[metadata['downloads'] <= 1000]
    .sort_values(by='downloads', ascending=False)
)
not_popular_meta

In [None]:
# Total downloads per mystery author, largest first, as a two-column frame
# ('author', 'downloads') with a fresh 0..n-1 index.
# The column is selected before aggregating: GroupBy.sum() takes no column
# argument, so the previous .sum('downloads') was passing the name as the
# `numeric_only` flag and only worked because the string is truthy.
mystery_auth = (
    mystery
    .groupby('author')['downloads']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
mystery_auth

In [None]:
# Total downloads per author across the whole catalogue, largest first, as a
# two-column frame ('author', 'downloads') with a fresh 0..n-1 index.
# The column is selected before aggregating: GroupBy.sum() takes no column
# argument, so the previous .sum('downloads') was passing the name as the
# `numeric_only` flag and only worked because the string is truthy.
metadata_auth = (
    metadata
    .groupby('author')['downloads']
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
metadata_auth

In [None]:
# Horizontal bar chart: the first ten rows of mystery_auth (authors by total downloads).
# `rgb` is also read by the later figure cells, so it keeps its name.
rgb = 'rgb(203,213,232)'
top_mystery_authors = mystery_auth.head(10)
fig1 = px.bar(
    top_mystery_authors,
    x='downloads',
    y='author',
    orientation='h',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Antique,
    # An empty label removes the y-axis title; the author names label the bars.
    labels={'author': '', 'downloads': 'Number of downloads'},
)
# One flat fill colour with a matching outline replaces the palette colour.
fig1.update_traces(
    marker=dict(color=rgb, line=dict(color=rgb, width=1.5)),
    opacity=0.6,
)
# Plot and paper backgrounds are fully transparent (alpha 0).
fig1.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)
fig1.show()

In [None]:
# Horizontal bar chart: the first ten rows of metadata_auth (all authors by total downloads).
# `rgb` is the colour string defined in the cell that builds fig1.
top_authors = metadata_auth.head(10)
fig2 = px.bar(
    top_authors,
    x='downloads',
    y='author',
    orientation='h',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Antique,
    # An empty label removes the y-axis title; the author names label the bars.
    labels={'author': '', 'downloads': 'Number of downloads'},
)
# One flat fill colour with a matching outline replaces the palette colour.
fig2.update_traces(
    marker=dict(color=rgb, line=dict(color=rgb, width=1.5)),
    opacity=0.6,
)
# Plot and paper backgrounds are fully transparent (alpha 0).
fig2.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)
fig2.show()

In [None]:
# Horizontal bar chart: the first ten rows of popular_meta (all titles by downloads).
# `rgb` is the colour string defined in the cell that builds fig1.
top_titles = popular_meta.head(10)
fig3 = px.bar(
    top_titles,
    x='downloads',
    y='title',
    orientation='h',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Antique,
    # An empty label removes the y-axis title; the book titles label the bars.
    labels={'title': '', 'downloads': 'Number of downloads'},
)
# One flat fill colour with a matching outline replaces the palette colour.
fig3.update_traces(
    marker=dict(color=rgb, line=dict(color=rgb, width=1.5)),
    opacity=0.6,
)
# Plot and paper backgrounds are fully transparent (alpha 0).
fig3.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)
fig3.show()

In [None]:
# Horizontal bar chart: the first ten rows of popular (mystery titles by downloads).
# `rgb` is the colour string defined in the cell that builds fig1.
top_mystery_titles = popular.head(10)
fig4 = px.bar(
    top_mystery_titles,
    x='downloads',
    y='title',
    orientation='h',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Antique,
    # An empty label removes the y-axis title; the book titles label the bars.
    labels={'title': '', 'downloads': 'Number of downloads'},
)
# One flat fill colour with a matching outline replaces the palette colour.
fig4.update_traces(
    marker=dict(color=rgb, line=dict(color=rgb, width=1.5)),
    opacity=0.6,
)
# Plot and paper backgrounds are fully transparent (alpha 0).
fig4.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)
fig4.show()

In [None]:
# Read the tab-delimited words file, which has no header row, into a
# single column named 'words'.
data_words = pd.read_csv(
    '../data/model_data/dm_lda_05/data_words.txt',
    sep="\t",
    header=None,
    names=['words'],
)
data_words.head()

In [None]:
# Plain Python list of the 'words' column values, one entry per row.
word_list = data_words['words'].tolist()

In [None]:
# Count per entry: number of commas plus one, i.e. the number of
# comma-separated items in the string.
# NOTE(review): presumably each entry is a comma-separated list of tokens -- confirm.
word_count = [entry.count(',') + 1 for entry in word_list]
word_counts = pd.DataFrame(word_count, columns=['word_count'])

In [None]:
# Attach the word counts to data_words, aligned on the row index.
# assign() overwrites 'word_count' if it already exists, so re-running this
# cell is harmless. The previous index merge of data_words with word_counts
# produced 'word_count_x'/'word_count_y' on a second run, which broke the
# bucketing cell that reads 'word_count'.
data_words = data_words.assign(word_count=word_counts['word_count'])

In [None]:
# Bucket each row's word count into bins of 25,000 words.
# 'sort' is the zero-based bin number; 'category' is the bin's lower bound
# in thousands with a 'K' suffix (bin 0 -> '0K', bin 1 -> '25K', ...).
words_per_bin = 25000
bin_number = data_words['word_count'] // words_per_bin
data_words['category'] = (bin_number * 25).astype(int).astype(str) + 'K'
data_words['sort'] = bin_number.astype(int)

In [None]:
# Show the frame to inspect its current columns.
data_words

In [None]:
# Number of titles per word-count category, as a two-column frame.
category_counts = data_words['category'].value_counts()
title_word_counts = category_counts.reset_index()
title_word_counts.columns = ['word_count', 'num_titles']

# Lookup table pairing each category label with its numeric bin number,
# ordered by bin number. `sort` is read by the merge cell that follows.
sort = (
    data_words
    .groupby(['category', 'sort'])
    .count()
    .sort_values('sort')
    .reset_index()
    .loc[:, ['category', 'sort']]
)
sort

In [None]:
# Add the numeric bin number ('sort') to each category row by matching the
# category label. Only the two base columns are carried into the merge, so
# re-running this cell (even after the later cell that keeps 'sort') gives
# the same result instead of suffixed 'sort_x'/'sort_y' columns.
title_word_counts = title_word_counts[['word_count', 'num_titles']].merge(
    sort, left_on='word_count', right_on='category'
)

In [None]:
# Keep the plotting columns plus the numeric bin number, ordered by bin
# number, with a fresh 0..n-1 index. (A bare `title_word_counts` expression
# used to precede this statement; it was never displayed because it was not
# the cell's last expression, so it has been removed.)
title_word_counts = (
    title_word_counts[['word_count', 'num_titles', 'sort']]
    .sort_values('sort')
    .reset_index(drop=True)
)

In [None]:
# Show the per-category title counts that feed the bar chart below.
title_word_counts

In [None]:
# Horizontal bar chart: number of titles in each word-count category.
# `rgb` is the colour string defined in the cell that builds fig1.
fig5 = px.bar(
    title_word_counts,
    x='num_titles',
    y='word_count',
    orientation='h',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.Antique,
    labels={'word_count': 'Word count', 'num_titles': 'Number of titles'},
)
# One flat fill colour with a matching outline replaces the palette colour.
fig5.update_traces(
    marker=dict(color=rgb, line=dict(color=rgb, width=1.5)),
    opacity=0.6,
)
# Plot and paper backgrounds are fully transparent (alpha 0).
fig5.update_layout(
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)
fig5.show()