In [None]:
from IPython.display import HTML

In [None]:
# Notebook-wide CSS overrides: sets the notebook container to 80% width,
# stretches the DataTables elements to full width, right-aligns the table
# filter box and enlarges rendered text to 125%.
_notebook_css = """
<style>
    #notebook-container {
    width: 80% !important;
}
.dataTable {
    width: 100% !important;
}
.dataTables_scrollBody {
    width: 100% !important;
}
.dataTables_scrollHead {
    width: 100% !important;
}
.dataTables_filter {
   float: right !important;
}
.output_html {
    max-width: calc(100%) !important;
}
.rendered{
    font-size: 125%;
}
</style>
"""
HTML(_notebook_css)

In [None]:
# Banner image shown at the top of the notebook.
# Auto left/right margins only centre block-level boxes; an <img> is inline by
# default, so `display:block` is required for the centring to take effect.
html1 = (
    '<img src="https://images.theconversation.com/files/45159/original/rptgtpxd-1396254731.jpg" '
    'width="900" style="display:block; margin-left:auto; margin-right:auto"/>'
)
HTML(html1)

# A quick analysis of my reading habits

* What do I read the most?
* What format do I prefer?
* Are there any Genre-Sub-Genre pairs I tend to gravitate towards?
* Who is my most read author?
* How long does it take me to read books?

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plot

# Load the reading log from the CSV file that sits next to the notebook.
df = pd.read_csv('BookList.csv')

# Remove blank rows and books that were never finished/rated.
# The explicit .copy() makes the filtered frame independent of the frame it
# was sliced from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df = df[df['Rating'].notnull()].copy()

# Convert columns to the correct (or more compact) data types.
df[['Year', 'Pages']] = df[['Year', 'Pages']].apply(pd.to_numeric, downcast='integer')
# pd.to_numeric is vectorised: hand it the whole column instead of calling it
# once per value through Series.apply (where `downcast` acts on lone scalars).
df['Duration'] = pd.to_numeric(df['Duration'], downcast='integer')
df['Rating'] = pd.to_numeric(df['Rating'], downcast='float')
df['Start Date'] = pd.to_datetime(df['Start Date'])
df['Finish Date'] = pd.to_datetime(df['Finish Date'])

# Show the last rows as a quick sanity check of the cleaned frame.
df.tail()

### Genre Count

In [None]:
# Chart the 'Genre' column with the project's bar-graph helper
# (plot.gen_bar_graph is defined outside this notebook).
title = "Top 5 Most-Read Genres"
sub = "Based on total number of books I actually finished"

plot.gen_bar_graph(
    df,
    'Genre',
    title,
    sub,
    color="#d27575",
)

In [None]:
# Same helper as the genre chart, applied to the 'Sub-Genre' column;
# the extra positional argument 10 matches the "Top 10" in the title.
title = "Top 10 Most-Read Sub-Genres"
sub = "Based on total number of books I actually finished"

plot.gen_bar_graph(
    df,
    'Sub-Genre',
    title,
    sub,
    10,
    color="#529b9c",
)

In [None]:
# Debugging aid: uncomment to list the rows that still contain missing values.
# df[df.isna().any(axis=1)]

### Most Common Genre-Sub-Genre Pairing

In [None]:
# Build a combined "<Genre>-<Sub-Genre>" label for every book so that
# pairings can be charted like any other column.
df['Genre Pair'] = df['Genre'] + '-' + df['Sub-Genre']

title = "Top 10 Most Common Genre/Sub-Genre Pairings"
sub = "Based on total number of books I actually finished"

plot.gen_bar_graph(
    df,
    'Genre Pair',
    title,
    sub,
    num=10,
    color="#eac392",
)

### Most-Read Author

In [None]:
# Chart the 'Author' column with the project's bar-graph helper.
title = "Most-Read Authors"
sub = "Based on total number of books I actually finished"

plot.gen_bar_graph(
    df,
    'Author',
    title,
    sub,
    color="#9cba8f",
)

In [None]:
# Number of titles per genre, counted separately for each book format.
physical = (df[df['Format'] == 'Physical'].groupby('Genre')['Title']
            .count()
            .reset_index()
            .rename(columns={'Title':'Physical'}))
ebook = (df[df['Format'] == 'eBook'].groupby('Genre')['Title']
         .count()
         .reset_index()
         .rename(columns={'Title':'eBook'}))

# Join the two tables on the Genre key. A column-wise concat would pair the
# rows by position instead, attaching eBook counts to the wrong genre whenever
# a genre exists in only one of the two formats. The outer join keeps every
# genre; a format that is absent for a genre becomes NaN and is filled with 0.
format_df = (physical.merge(ebook, on='Genre', how='outer')
             .fillna(0)
             .set_index('Genre'))
# The NaN fill can leave float columns; convert_dtypes picks the best
# available dtype for the whole-number counts.
format_df = format_df.convert_dtypes()

In [None]:
# Draw the per-genre Physical/eBook counts with the project's stacked-bar helper.
title = "Most-Read Book Format"
sub = "Based on total number of books I actually finished"

plot.gen_stacked_bar_graph(
    format_df,
    title,
    sub,
)

### Ratings

In [None]:
# One row per genre: mean rating (rounded to 2 decimals) and the number of
# titles in that genre, computed in a single grouped aggregation.
dfr = (
    df.groupby('Genre')
      .agg(Rating=('Rating', 'mean'), Total=('Title', 'count'))
      .reset_index()
      .round(2)
)

title = "Ratings by Genre"
sub = "Average rating of finished books by genre"
plot.gen_bar_graph(
    dfr,
    'Genre',
    title,
    sub,
    avg=True,
    w_avg='Rating',
)

In [None]:
# One row per sub-genre: mean rating (rounded to 2 decimals) and title count.
dfs = df.groupby('Sub-Genre')['Rating'].mean().reset_index().round(2)
# Both groupbys use the same key on the same frame, so the groups come out in
# the same order and the counts line up row-for-row with the means.
dfs['Total'] = df.groupby('Sub-Genre')['Title'].count().reset_index(drop=True)
# Sort by ascending rating before handing the table to the horizontal bar helper.
dfs = dfs.sort_values('Rating', ascending=True).reset_index(drop=True)

title = "Ratings by Sub-Genre"
# Subtitle previously said "by genre" (copied from the genre chart).
sub = "Average rating of finished books by sub-genre"
plot.gen_hbar_graph(dfs, 'Sub-Genre', title, sub, avg=True, w_avg='Rating')

### Durations

In [None]:
# One row per genre: mean reading duration and the number of titles.
# Genres without any recorded duration are dropped; values are rounded to
# 2 decimals. The original row labels are kept (no index reset).
dft = (
    df.groupby('Genre')
      .agg(Duration=('Duration', 'mean'), Total=('Title', 'count'))
      .reset_index()
      .dropna(subset=['Duration'])
      .round(2)
)
# dft.loc[len(dft.index)] = ['Overall', dft['Duration'].mean()]

title = "Duration by Genre"
sub = "Average number of days it took to read a book by genre"
plot.gen_bar_graph(
    dft,
    'Genre',
    title,
    sub,
    avg=True,
    w_avg='Duration',
)

In [None]:
# One row per sub-genre: mean reading duration and the number of titles.
# Sub-genres without any recorded duration are dropped, values are rounded to
# 2 decimals, and the rows are ordered by ascending duration.
dfy = (
    df.groupby('Sub-Genre')
      .agg(Duration=('Duration', 'mean'), Total=('Title', 'count'))
      .reset_index()
      .dropna(subset=['Duration'])
      .round(2)
      .sort_values('Duration', ascending=True)
      .reset_index(drop=True)
)

title = "Duration by Sub-Genre"
sub = "Average number of days it took to read a book by sub-genre"
plot.gen_hbar_graph(
    dfy,
    'Sub-Genre',
    title,
    sub,
    avg=True,
    w_avg='Duration',
)

In [None]:
# Unique genres / sub-genres in order of first appearance; they fix the row
# and column order of the heatmap table.
subgenres = df['Sub-Genre'].unique()
genres = df['Genre'].unique()

# grp = number of titles for every (Genre, Sub-Genre) pair that occurs.
# (The former `isin(genres)` filter kept every row, because `genres` is built
# from this same column, so it is omitted.)
grp = df.groupby(["Genre", "Sub-Genre"], as_index=False)['Title'].count()

# Reshape the pair counts into a genres x sub-genres grid in one vectorised
# step instead of filling cells one at a time with iterrows():
#   pivot       -> rows = Genre, columns = Sub-Genre, values = title count
#   reindex     -> restore first-appearance ordering and add missing labels
#   fillna(0)   -> pairs that never occur get a count of 0
#   astype      -> keep the float dtype of the original NaN-initialised frame
#   rename_axis -> drop the axis names pivot adds, so the grid has unnamed
#                  axes like a frame built from plain index/column lists
dfh = (
    grp.pivot(index='Genre', columns='Sub-Genre', values='Title')
       .reindex(index=genres, columns=subgenres)
       .fillna(0)
       .astype(float)
       .rename_axis(index=None, columns=None)
)

In [None]:
# Render the genre x sub-genre count grid with the project's heatmap helper.
title = "Genre Pairing Heatmap"
sub = "Because who doesn't love a heatmap?"

plot.gen_heatmap(
    dfh,
    title,
    sub,
)

In [None]:
# One row per genre: mean page count (rounded to 2 decimals) and title count.
dfc = df.groupby('Genre')['Pages'].mean().round(2).reset_index()
# Both groupbys use the same key on the same frame, so the counts line up
# row-for-row with the means (this must happen before sorting).
dfc['Total'] = df.groupby('Genre')['Title'].count().reset_index(drop=True)
# Longest average first.
dfc = dfc.sort_values('Pages', ascending=False)

title = "Average Number of Pages by Genre"
# Replaces the leftover placeholder subtitle ("blah blah") with a description
# in the same style as the other charts.
sub = "Average page count of finished books by genre"
plot.gen_bar_graph(dfc, 'Genre', title, sub, avg=True, w_avg='Pages')

In [None]:
# One row per sub-genre: mean page count (rounded to 2 decimals) and title count.
dfx = df.groupby('Sub-Genre')['Pages'].mean().round(2).reset_index()
# Both groupbys use the same key on the same frame, so the counts line up
# row-for-row with the means (this must happen before sorting).
dfx['Total'] = df.groupby('Sub-Genre')['Title'].count().reset_index(drop=True)
# Sort by ascending page count before handing the table to the horizontal bar helper.
dfx = dfx.sort_values('Pages', ascending=True).reset_index(drop=True)

title = "Average Number of Pages by Sub-Genre"
# Replaces the leftover placeholder subtitle ("ipso facto") with a description
# in the same style as the other charts.
sub = "Average page count of finished books by sub-genre"
plot.gen_hbar_graph(dfx, 'Sub-Genre', title, sub, avg=True, w_avg='Pages')

## This is still a test to see if GitHub Pages is working correctly. More in-depth analysis to come soon!

In [None]:
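# NOTE(review): the commented-out lines below are not notebook code; they look
# like a CI workflow step (YAML) that frees disk space. Confirm whether they
# belong in the workflow file instead of this notebook.
# # Clean up space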
#       - name: Remove unnecessary files
#         run: |
#           rm -rf /usr/share/dotnet
#           rm -rf /opt/ghc
#           rm -rf "/usr/local/share/boost"
#           rm -rf "$AGENT_TOOLSDIRECTORY"