In [None]:
from numpy import nan
import pandas as pd

def get_lexile_int(arg):
    """Convert a raw Lexile value such as ``'1000L'`` to a number.

    Parameters
    ----------
    arg : str, int, float or None
        Raw cell value. Numbers (including the float NaN that pandas uses
        for missing cells) are returned unchanged.

    Returns
    -------
    int or float
        The parsed integer, the numeric input itself, or ``nan`` when the
        value is not text or is not purely decimal digits once surrounding
        whitespace and every ``'L'`` have been removed.
    """
    if isinstance(arg, (int, float)):
        return arg
    if not isinstance(arg, str):
        # None and other non-text values have no .replace(); treat as missing.
        return nan
    digits = arg.strip().replace('L', '')
    # isdecimal() only accepts characters that int() can parse, whereas
    # isdigit() also accepts e.g. '²', which would make int() raise ValueError.
    if digits.isdecimal():
        return int(digits)
    return nan
# Load the CommonLit text metadata from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/commonlit-texts/commonlit_texts.csv')
# Several columns are quasi-numerical text, so derive numerical equivalents.
# Take the first run of digits in the grade label so that every ordinal
# suffix ('st', 'nd', 'rd', 'th') is handled, not only 'rd' and 'th'.
df['grade_int'] = df['grade'].str.extract(pat=r'(\d+)', expand=False).astype(int)
df['lexile_int'] = df['lexile'].apply(func=get_lexile_int)
# Word count of the description; a missing description counts as zero words
# instead of raising on NaN.
df['description_length'] = df['description'].fillna('').str.split().str.len()
df.head()

In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count the distinct values in each column.
df.nunique()

In [None]:
from plotly.express import bar

# Distinct (grade label, grade number) pairs, sorted numerically so the
# x axis categories follow grade level.
grade_pairs = df[['grade', 'grade_int']].drop_duplicates(ignore_index=True)
grade_df = grade_pairs.sort_values(by='grade_int')
grade_order = grade_df['grade'].values.tolist()
bar(
    data_frame=df,
    x='grade',
    category_orders={'grade': grade_order},
)

In [None]:
# Twenty most frequent authors. The index and the counts are named explicitly
# so the frame has 'author' / 'count' columns on every pandas version
# (before pandas 2.0, value_counts() did not label its result 'count').
top_authors = (
    df['author']
    .value_counts()
    .nlargest(n=20)
    .rename_axis('author')
    .reset_index(name='count')
)
bar(data_frame=top_authors, x='author', y='count')


Weird how the top twenty is dominated by authors who are not exactly household names.

In [None]:
# Rows per genre, coloured by grade. Genres are ordered alphabetically and
# grades by their numeric level, so the colour legend and the stacking follow
# grade level rather than the order in which grades appear in the data.
genre_order = sorted(df['genre'].unique().tolist())
grade_levels = (
    df[['grade', 'grade_int']]
    .drop_duplicates()
    .sort_values(by='grade_int')['grade']
    .tolist()
)
bar(
    data_frame=df,
    x='genre',
    color='grade',
    category_orders={'genre': genre_order, 'grade': grade_levels},
)

Maybe an area-based plot such as a treemap would be better. Let's try.

In [None]:
from plotly.express import treemap

# Number of rows for each (genre, grade) pair, stored in a 'count' column.
genre_grade_counts = (
    df[['genre', 'grade']]
    .groupby(by=['genre', 'grade'])
    .size()
    .reset_index(name='count')
)
# One tile per genre, sized by its count.
treemap(
    data_frame=genre_grade_counts,
    path=['genre'],
    names='genre',
    values='count',
)

In [None]:
# Number of rows for each (genre, grade) pair, stored in a 'count' column.
grade_genre_counts = (
    df[['genre', 'grade']]
    .groupby(by=['genre', 'grade'])
    .size()
    .reset_index(name='count')
)
# One tile per grade, sized by its count.
treemap(
    data_frame=grade_genre_counts,
    path=['grade'],
    names='grade',
    values='count',
)

In [None]:
# Number of rows for each (author, genre) pair, shown as one tile per author.
# The size column is named 'count' rather than left as the default integer
# label 0, so the figure refers to it by a string name, as the genre and
# grade treemaps do.
author_genre_counts = (
    df[['author', 'genre']]
    .groupby(by=['author', 'genre'])
    .size()
    .reset_index(name='count')
)
treemap(data_frame=author_genre_counts,
        names='author', values='count', path=['author']
       )

The corpus is mostly made up of authors who each contribute only a few texts.

In [None]:
from plotly.express import scatter

# Lexile score against numeric grade, one marker per row, coloured by genre;
# hovering a marker shows the text's title.
scatter(
    data_frame=df,
    x='grade_int',
    y='lexile_int',
    color='genre',
    hover_name='title',
)

In [None]:
from plotly.express import scatter_matrix

# Pairwise scatter plots of the three derived numeric columns, coloured by genre.
numeric_columns = ['grade_int', 'lexile_int', 'description_length']
scatter_matrix(
    data_frame=df,
    dimensions=numeric_columns,
    color='genre',
    hover_name='title',
)

In [None]:
from plotly.express import violin

# Distribution of Lexile scores for each numeric grade.
violin(
    data_frame=df,
    x='grade_int',
    y='lexile_int',
    hover_name='title',
)

This graph looks weird, but it shows how the bulk of the lexile distribution rises slowly with the grade. Maybe a ridge/joy plot would be helpful here.

In [None]:
# Description word count against numeric grade, coloured by genre.
scatter(
    data_frame=df,
    x='grade_int',
    y='description_length',
    color='genre',
    hover_name='title',
)

We would like description length to be a proxy for grade level, but it doesn't seem to be.