## Libraries and Setup

In [102]:
import polars as pl
import numpy as np  # noqa: F401
import plotly.express as px  # noqa: F401
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as po
# Make 'notebook' the default plotly renderer used by every later `.show()` call.
pio.renderers.default='notebook'
# Initialise plotly's notebook mode.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding it in the notebook — confirm network access is
# acceptable wherever this notebook is viewed.
po.init_notebook_mode(connected=True)

## Read in the data

Data source: https://www.kaggle.com/datasets/uciml/student-alcohol-consumption (this notebook reads the `student-mat.csv` and `student-por.csv` files from that dataset).

In [103]:
# Load the two CSV exports from the working directory; both carry a header row.
# `student-mat.csv` feeds the math frame; `student-por.csv` is presumably the
# Portuguese-course counterpart (verify against the dataset description).
stu_math = pl.read_csv(
    "student-mat.csv",
    has_header=True,
)
stu_por = pl.read_csv(
    "student-por.csv",
    has_header=True,
)

# Preview the first five rows of each frame, one table after the other.
print(stu_math.head(5), stu_por.head(5), sep="\n")

shape: (5, 33)
┌────────┬─────┬─────┬─────────┬───┬──────────┬─────┬─────┬─────┐
│ school ┆ sex ┆ age ┆ address ┆ … ┆ absences ┆ G1  ┆ G2  ┆ G3  │
│ ---    ┆ --- ┆ --- ┆ ---     ┆   ┆ ---      ┆ --- ┆ --- ┆ --- │
│ str    ┆ str ┆ i64 ┆ str     ┆   ┆ i64      ┆ i64 ┆ i64 ┆ i64 │
╞════════╪═════╪═════╪═════════╪═══╪══════════╪═════╪═════╪═════╡
│ GP     ┆ F   ┆ 18  ┆ U       ┆ … ┆ 6        ┆ 5   ┆ 6   ┆ 6   │
│ GP     ┆ F   ┆ 17  ┆ U       ┆ … ┆ 4        ┆ 5   ┆ 5   ┆ 6   │
│ GP     ┆ F   ┆ 15  ┆ U       ┆ … ┆ 10       ┆ 7   ┆ 8   ┆ 10  │
│ GP     ┆ F   ┆ 15  ┆ U       ┆ … ┆ 2        ┆ 15  ┆ 14  ┆ 15  │
│ GP     ┆ F   ┆ 16  ┆ U       ┆ … ┆ 4        ┆ 6   ┆ 10  ┆ 10  │
└────────┴─────┴─────┴─────────┴───┴──────────┴─────┴─────┴─────┘
shape: (5, 33)
┌────────┬─────┬─────┬─────────┬───┬──────────┬─────┬─────┬─────┐
│ school ┆ sex ┆ age ┆ address ┆ … ┆ absences ┆ G1  ┆ G2  ┆ G3  │
│ ---    ┆ --- ┆ --- ┆ ---     ┆   ┆ ---      ┆ --- ┆ --- ┆ --- │
│ str    ┆ str ┆ i64 ┆ str     ┆   ┆ i64      

In [104]:
# Summary statistics for every column of the math frame.
# Left as the cell's last expression (instead of print) so Jupyter renders the
# rich table, consistent with the summary cells further down the notebook.
stu_math.describe()

shape: (9, 34)
┌────────────┬────────┬──────┬───────────┬───┬──────────┬───────────┬───────────┬──────────┐
│ statistic  ┆ school ┆ sex  ┆ age       ┆ … ┆ absences ┆ G1        ┆ G2        ┆ G3       │
│ ---        ┆ ---    ┆ ---  ┆ ---       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ---      │
│ str        ┆ str    ┆ str  ┆ f64       ┆   ┆ f64      ┆ f64       ┆ f64       ┆ f64      │
╞════════════╪════════╪══════╪═══════════╪═══╪══════════╪═══════════╪═══════════╪══════════╡
│ count      ┆ 395    ┆ 395  ┆ 395.0     ┆ … ┆ 395.0    ┆ 395.0     ┆ 395.0     ┆ 395.0    │
│ null_count ┆ 0      ┆ 0    ┆ 0.0       ┆ … ┆ 0.0      ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ mean       ┆ null   ┆ null ┆ 16.696203 ┆ … ┆ 5.708861 ┆ 10.908861 ┆ 10.713924 ┆ 10.41519 │
│ std        ┆ null   ┆ null ┆ 1.276043  ┆ … ┆ 8.003096 ┆ 3.319195  ┆ 3.761505  ┆ 4.581443 │
│ min        ┆ GP     ┆ F    ┆ 15.0      ┆ … ┆ 0.0      ┆ 3.0       ┆ 0.0       ┆ 0.0      │
│ 25%        ┆ null   ┆ null ┆ 16.0      ┆ … ┆ 0.0     

In [105]:
# Head counts per sex and per school for the math frame.
#
# Polars' group_by does not guarantee any row order in its result, so each
# aggregation is sorted by its key. That keeps the printed tables and the bar
# order in the charts built from these frames stable across kernel restarts.

# Number of students of each sex -> columns: sex, count
by_sex = (
    stu_math.group_by("sex")
    .agg(pl.col("sex").count().alias("count"))
    .sort("sex")
)

print(by_sex)

# Number of students at each school -> columns: school, count_school
by_school = (
    stu_math.group_by("school")
    .agg(pl.col("school").count().alias("count_school"))
    .sort("school")
)

print(by_school)

shape: (2, 2)
┌─────┬───────┐
│ sex ┆ count │
│ --- ┆ ---   │
│ str ┆ u32   │
╞═════╪═══════╡
│ F   ┆ 208   │
│ M   ┆ 187   │
└─────┴───────┘
shape: (2, 2)
┌────────┬──────────────┐
│ school ┆ count_school │
│ ---    ┆ ---          │
│ str    ┆ u32          │
╞════════╪══════════════╡
│ GP     ┆ 349          │
│ MS     ┆ 46           │
└────────┴──────────────┘


In [106]:
# Text labels for the numeric `studytime` codes, keyed by the code as a string
# because the column is cast to String before the replacement below.
studytime_order = {
    "1": "< 2 hours",
    "2": "2 to 5 hours",
    "3": "5 to 10 hours",
    "4": "> 10 hours",
}
# Category order for the Enum dtype: the labels in the order they are declared above.
st_num_order = list(studytime_order.values())

# Copy of the math frame whose `studytime` column holds ordered Enum labels
# instead of integer codes.
stu_math_labeled = stu_math.with_columns(
    pl.col("studytime")
    .cast(pl.String)
    .replace(studytime_order)
    .cast(pl.Enum(st_num_order))
)

In [107]:
# Bar chart of the per-school head counts held in `by_school`.
fig1 = go.Figure(
    data=[
        go.Bar(
            x=by_school["school"],
            y=by_school["count_school"],
            width=0.4,
        )
    ]
)

# Titles, a fixed 500x500 canvas and the light template.
fig1.update_layout(
    title_text="Number of Students by School",
    xaxis_title="School",
    yaxis_title="Count",
    width=500,
    height=500,
    template="plotly_white",
)

fig1.show()

In [108]:
# Bar chart of the per-sex head counts held in `by_sex`.
fig2 = go.Figure(
    data=[
        go.Bar(
            x=by_sex["sex"],
            y=by_sex["count"],
            width=0.4,
        )
    ]
)

# Titles, a fixed 650x650 canvas and the dark template.
fig2.update_layout(
    title_text="Number of Students by Sex",
    xaxis_title="Sex",
    yaxis_title="Count",
    width=650,
    height=650,
    template="plotly_dark",
)

fig2.show()

In [109]:
# Text labels for the numeric `traveltime` codes, keyed by the code as a string
# because the column is cast to String before the replacement below.
traveltime_order = {
    "1": "< 15 minutes",
    "2": "15 to 30 minutes",
    "3": "30 minutes to 1 hour",
    "4": "> 1 hour",
}
# Category order for the Enum dtype: the labels in the order they are declared above.
tt_num_order = list(traveltime_order.values())

# Mean of `failures` per travel-time bucket; sorting on the Enum column orders
# the rows by category order rather than alphabetically.
traveltime_summary = (
    stu_math.with_columns(
        pl.col("traveltime")
        .cast(pl.String)
        .replace(traveltime_order)
        .cast(pl.Enum(tt_num_order))
    )
    .group_by("traveltime")
    .agg(pl.col("failures").mean().alias("average_failures"))
    .sort("traveltime")
)

traveltime_summary

traveltime,average_failures
enum,f64
"""< 15 minutes""",0.299611
"""15 to 30 minutes""",0.35514
"""30 minutes to 1 hour""",0.478261
"""> 1 hour""",0.75


In [110]:
# Bar chart of the mean `failures` value per travel-time bucket.
fig_traveltime = px.bar(
    traveltime_summary,
    x="traveltime",
    y="average_failures",
)

# Narrow the bars, then set the titles and a fixed 800x800 canvas.
fig_traveltime.update_traces(width=0.6).update_layout(
    title="Average Number of Past Class Failures by Travel Time to School",
    xaxis_title="Travel Time to school",
    yaxis_title="Average Number of Past Class Failures Per Student",
    width=800,
    height=800,
)

fig_traveltime.show()

In [111]:
# Mean of `failures` per weekly study-time bucket.
# Starts from `stu_math_labeled`, which already carries `studytime` as an
# ordered Enum of text labels, instead of repeating the cast/replace/cast chain
# here; sorting on the Enum column orders the rows by category order.
studytime_summary = (
    stu_math_labeled.group_by("studytime")
    .agg(pl.col("failures").mean().alias("average_failures"))
    .sort("studytime")
)

studytime_summary

studytime,average_failures
enum,f64
"""< 2 hours""",0.52381
"""2 to 5 hours""",0.308081
"""5 to 10 hours""",0.230769
"""> 10 hours""",0.037037


In [112]:
# Bar chart of the mean `failures` value per weekly study-time bucket.
fig_studytime = px.bar(
    studytime_summary,
    x="studytime",
    y="average_failures",
)

# Narrow the bars, then set the titles and a fixed 800x800 canvas.
fig_studytime.update_traces(width=0.6).update_layout(
    title="Average Number of Past Class Failures by Study Time",
    xaxis_title="Weekly Study Time",
    yaxis_title="Average Number of Past Class Failures Per Student",
    width=800,
    height=800,
)

fig_studytime.show()

In [113]:
# Mean `G3` value per weekly study-time bucket.
# Starts from `stu_math_labeled`, which already carries `studytime` as an
# ordered Enum of text labels, instead of repeating the cast/replace/cast chain
# here; sorting on the Enum column orders the rows by category order.
finalmath_studytime = (
    stu_math_labeled.group_by("studytime")
    .agg(pl.col("G3").mean().alias("mean_grade"))
    .sort("studytime")
)

finalmath_studytime

studytime,mean_grade
enum,f64
"""< 2 hours""",10.047619
"""2 to 5 hours""",10.171717
"""5 to 10 hours""",11.4
"""> 10 hours""",11.259259


In [114]:
# Bar chart of the mean `G3` value per weekly study-time bucket.
fig_finalmath = px.bar(
    finalmath_studytime,
    x="studytime",
    y="mean_grade",
)

# Titles and a fixed 800x800 canvas (bar width is left at plotly's default).
fig_finalmath.update_layout(
    title="Average Final Math Grade by Study Time",
    xaxis_title="Weekly Study Time",
    yaxis_title="Average Math Grade (0 - 20)",
    width=800,
    height=800,
)

fig_finalmath.show()

In [117]:
# Box plot of the `G3` distribution within each weekly study-time bucket.
# The x-axis order comes from `st_num_order` — the same list that defines the
# Enum categories of `stu_math_labeled["studytime"]` — rather than a second
# hand-typed copy of the labels that could drift out of sync.
fig_finalmath_box = px.box(
    stu_math_labeled,
    x="studytime",
    y="G3",
    title="Final Grades (G3) by Weekly Study Time",
    labels={
        "studytime": "Weekly Study Time",
        "G3": "G3 - Final Grade (0 - 20)",
    },
    category_orders={"studytime": st_num_order},
)

# Fixed 800x800 canvas.
fig_finalmath_box.update_layout(width=800, height=800)

fig_finalmath_box.show()