In [None]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sqlalchemy import create_engine

In [None]:
# Build the SQLAlchemy engine for the MySQL database (PyMySQL driver).
#
# Connection settings are taken from environment variables so that real
# credentials never have to be written into the notebook. The "XXX"
# placeholders are kept as fallbacks for variables that are not set.
#
# User and password are percent-encoded before being placed in the URL:
# characters such as "@", ":" or "/" would otherwise be read as URL
# delimiters. safe="" makes quote() encode "/" as well.
engine = create_engine(
    "mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        user=quote(os.environ.get("DB_USER", "XXX"), safe=""),
        pw=quote(os.environ.get("DB_PASSWORD", "XXX"), safe=""),
        host=os.environ.get("DB_HOST", "XXX"),
        db=os.environ.get("DB_NAME", "XXX"),
    )
)


In [None]:
# Load each table in full into its own DataFrame through the shared engine.
# One statement per table, so a failing query leaves the frames loaded
# before it available.
ratings = pd.read_sql_query(sql='SELECT * FROM rating', con=engine)
books = pd.read_sql_query(sql='SELECT * FROM book', con=engine)
# Note the naming: the table is `books_tags`, the DataFrame is `book_tags`.
book_tags = pd.read_sql_query(sql='SELECT * FROM books_tags', con=engine)
tags = pd.read_sql_query(sql='SELECT * FROM tag', con=engine)

In [None]:
# For every tag_id, count the non-null `count` entries in book_tags, attach
# that figure to the tag table and order the tags from most to least used.
# The left join keeps tags that never occur in book_tags; their `count` is
# NaN and sort_values places them last.
tags_count = (
    tags
    .merge(
        book_tags.groupby('tag_id')['count'].count().reset_index(),
        how='left',
        left_on='id',
        right_on='tag_id',
    )
    .sort_values('count', ascending=False)
)

In [None]:
# Histogram of all rating values, with a title above the plot area and the
# total number of ratings printed underneath it.
fig = go.Figure(
    data=[
        go.Histogram(
            x=ratings['rating'],
            name='bla',
            marker={'color': 'indianred'},
            opacity=0.75,
        )
    ]
)

# Both annotations use 'paper' references, so x/y are positions relative to
# the plotting area rather than data values.
annotations = [
    # Title: left-aligned, sitting just above the top edge of the plot.
    {
        'xref': 'paper',
        'yref': 'paper',
        'x': 0.0,
        'y': 1.05,
        'xanchor': 'left',
        'yanchor': 'bottom',
        'text': 'Total ratings',
        'font': {
            'family': 'Arial',
            'size': 30,
            'color': 'rgb(37,37,37)',
        },
        'showarrow': False,
    },
    # Footer: horizontally centred below the x axis, showing the row count.
    {
        'xref': 'paper',
        'yref': 'paper',
        'x': 0.5,
        'y': -0.1,
        'xanchor': 'center',
        'yanchor': 'top',
        'text': f'Total: {len(ratings)}',
        'font': {
            'family': 'Arial',
            'size': 16,
        },
        'showarrow': False,
    },
]

fig.update_layout(bargap=0.1, plot_bgcolor='white', annotations=annotations)
fig.show()

We can see that the values cluster at the higher ratings, giving us a negatively skewed distribution.

In [None]:
# Bar chart of the 100 most frequently assigned tags.
#
# tags_count is sorted by 'count' in descending order when it is built, so
# its first 100 rows are the most used tags. head() selects them by position,
# independent of the re-ordered integer index left behind by the sort.
top_tags = tags_count.head(100)

annotations = []

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=top_tags['name'],
        y=top_tags['count'],
        name='Tag assignments',
        marker_color='indianred'
    )
)

# Title Part: left-aligned, just above the plot area ('paper' coordinates).
annotations.append(
    dict(
        xref='paper',
        yref='paper',
        x=0.0,
        y=1.05,
        xanchor='left',
        yanchor='bottom',
        text='Top 100 tags by number of assignments',
        font=dict(
            family='Arial',
            size=30,
            color='rgb(37,37,37)'
        ),
        showarrow=False
    )
)

# Bottom Part: centred below the rotated tick labels.
# Series.sum() skips NaN, so a tag without any assignment (possible because
# tags_count comes from a left join) cannot make int() fail.
# NOTE(review): the "Kaggle" in this label looks like a leftover from a
# "Source: Kaggle" footer - confirm the intended wording.
annotations.append(
    dict(
        xref='paper',
        yref='paper',
        x=0.5,
        y=-0.18,
        xanchor='center',
        yanchor='top',
        text=f'Count: Kaggle {int(top_tags["count"].sum())}',
        font=dict(
            family='Arial',
            size=16
        ),
        showarrow=False
    )
)
fig.update_layout(
    bargap=0.1,
    plot_bgcolor='white',
    xaxis=dict(
        tickangle=45,
        # Initial viewport only; presumably meant to show about the first
        # 30 bars, with the rest reachable by panning - TODO confirm.
        range=(0,30)
    ),
    annotations = annotations,
)
fig.show()
# Cell output: the sum of 'count' over all tags, for comparison with the
# top-100 figure shown in the chart footer.
tags_count[['count']].sum()

Roughly 43% of all assigned tags (427K out of 999K) fall within just 100 tags. Many of those tags are also useless (owned, e-book, read-in-2016, etc.) or resemble each other, so for future iterations of the project it would be interesting to reduce these tags to a more useful set.