## Libraries and Setup

In [1]:
import polars as pl
import numpy as np  # noqa: F401
import plotly.express as px  # noqa: F401
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as po
import polars.selectors as cs
# Render figures inline in the classic notebook front end, and load plotly.js
# from the CDN (connected=True) rather than embedding it in the notebook file.
pio.renderers.default = "notebook"
po.init_notebook_mode(connected=True)

## Read in the data

Data source: [Student Alcohol Consumption (Kaggle)](https://www.kaggle.com/datasets/uciml/student-alcohol-consumption)

In [2]:
# Load the two course-level survey files (math and Portuguese) and preview
# the first five rows of each, math first.
stu_math = pl.read_csv("student-mat.csv", has_header=True)
stu_por = pl.read_csv("student-por.csv", has_header=True)

for course_frame in (stu_math, stu_por):
    print(course_frame.head(5))

shape: (5, 33)
┌────────┬─────┬─────┬─────────┬───┬──────────┬─────┬─────┬─────┐
│ school ┆ sex ┆ age ┆ address ┆ … ┆ absences ┆ G1  ┆ G2  ┆ G3  │
│ ---    ┆ --- ┆ --- ┆ ---     ┆   ┆ ---      ┆ --- ┆ --- ┆ --- │
│ str    ┆ str ┆ i64 ┆ str     ┆   ┆ i64      ┆ i64 ┆ i64 ┆ i64 │
╞════════╪═════╪═════╪═════════╪═══╪══════════╪═════╪═════╪═════╡
│ GP     ┆ F   ┆ 18  ┆ U       ┆ … ┆ 6        ┆ 5   ┆ 6   ┆ 6   │
│ GP     ┆ F   ┆ 17  ┆ U       ┆ … ┆ 4        ┆ 5   ┆ 5   ┆ 6   │
│ GP     ┆ F   ┆ 15  ┆ U       ┆ … ┆ 10       ┆ 7   ┆ 8   ┆ 10  │
│ GP     ┆ F   ┆ 15  ┆ U       ┆ … ┆ 2        ┆ 15  ┆ 14  ┆ 15  │
│ GP     ┆ F   ┆ 16  ┆ U       ┆ … ┆ 4        ┆ 6   ┆ 10  ┆ 10  │
└────────┴─────┴─────┴─────────┴───┴──────────┴─────┴─────┴─────┘
shape: (5, 33)
┌────────┬─────┬─────┬─────────┬───┬──────────┬─────┬─────┬─────┐
│ school ┆ sex ┆ age ┆ address ┆ … ┆ absences ┆ G1  ┆ G2  ┆ G3  │
│ ---    ┆ --- ┆ --- ┆ ---     ┆   ┆ ---      ┆ --- ┆ --- ┆ --- │
│ str    ┆ str ┆ i64 ┆ str     ┆   ┆ i64      

In [3]:
# Summary statistics (count, nulls, mean, std, quantiles, ...) for every
# column of the math data set.
math_summary_stats = stu_math.describe()
print(math_summary_stats)

shape: (9, 34)
┌────────────┬────────┬──────┬───────────┬───┬──────────┬───────────┬───────────┬──────────┐
│ statistic  ┆ school ┆ sex  ┆ age       ┆ … ┆ absences ┆ G1        ┆ G2        ┆ G3       │
│ ---        ┆ ---    ┆ ---  ┆ ---       ┆   ┆ ---      ┆ ---       ┆ ---       ┆ ---      │
│ str        ┆ str    ┆ str  ┆ f64       ┆   ┆ f64      ┆ f64       ┆ f64       ┆ f64      │
╞════════════╪════════╪══════╪═══════════╪═══╪══════════╪═══════════╪═══════════╪══════════╡
│ count      ┆ 395    ┆ 395  ┆ 395.0     ┆ … ┆ 395.0    ┆ 395.0     ┆ 395.0     ┆ 395.0    │
│ null_count ┆ 0      ┆ 0    ┆ 0.0       ┆ … ┆ 0.0      ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ mean       ┆ null   ┆ null ┆ 16.696203 ┆ … ┆ 5.708861 ┆ 10.908861 ┆ 10.713924 ┆ 10.41519 │
│ std        ┆ null   ┆ null ┆ 1.276043  ┆ … ┆ 8.003096 ┆ 3.319195  ┆ 3.761505  ┆ 4.581443 │
│ min        ┆ GP     ┆ F    ┆ 15.0      ┆ … ┆ 0.0      ┆ 3.0       ┆ 0.0       ┆ 0.0      │
│ 25%        ┆ null   ┆ null ┆ 16.0      ┆ … ┆ 0.0     

In [4]:
# Head-counts of math students, first broken down by sex and then by school.
# Each aggregation counts the non-null values of its own grouping column.
by_sex = stu_math.group_by("sex").agg(count=pl.col("sex").count())
print(by_sex)

by_school = stu_math.group_by("school").agg(count_school=pl.col("school").count())
print(by_school)

shape: (2, 2)
┌─────┬───────┐
│ sex ┆ count │
│ --- ┆ ---   │
│ str ┆ u32   │
╞═════╪═══════╡
│ M   ┆ 187   │
│ F   ┆ 208   │
└─────┴───────┘
shape: (2, 2)
┌────────┬──────────────┐
│ school ┆ count_school │
│ ---    ┆ ---          │
│ str    ┆ u32          │
╞════════╪══════════════╡
│ GP     ┆ 349          │
│ MS     ┆ 46           │
└────────┴──────────────┘


## Cleaning

In [5]:
# Code -> label lookups for the ordinal survey columns.
# Keys are *strings* in every mapping because each column is cast to
# pl.String before .replace() is applied; keeping the key type equal to the
# column type avoids relying on implicit int -> str casting inside polars.
studytime_order = {'1': '< 2 hours', '2': '2 to 5 hours', '3': '5 to 10 hours', '4': '> 10 hours'}
traveltime_order = {'1': '< 15 minutes', '2': '15 to 30 minutes', '3': '30 minutes to 1 hour', '4': '> 1 hour'}
educ_order = {'0': 'None', '1': 'Up to 4th Grade', '2': '5th to 9th Grade', '3': 'secondary education', '4': 'higher education'}

# Label orderings used to build the ordered Enum dtypes. Derived from the
# mappings (dicts preserve insertion order) so the two can never drift apart.
st_num_order = list(studytime_order.values())
tt_num_order = list(traveltime_order.values())
educ_num_order = list(educ_order.values())

stu_math_cleaned = (
    stu_math
    # 1) Recode the ordinal columns to labelled, ordered Enums. studytime and
    #    traveltime are replaced in place; parental education is added as new
    #    *_cat columns, leaving the raw Medu / Fedu codes untouched.
    .with_columns(
        pl.col('studytime').cast(pl.String).replace(studytime_order).cast(pl.Enum(st_num_order)),
        pl.col('traveltime').cast(pl.String).replace(traveltime_order).cast(pl.Enum(tt_num_order)),
        pl.col('Medu').cast(pl.String).replace(educ_order).cast(pl.Enum(educ_num_order)).alias('Medu_cat'),
        pl.col('Fedu').cast(pl.String).replace(educ_order).cast(pl.Enum(educ_num_order)).alias('Fedu_cat'),
    )
    # 2) Every remaining integer column becomes a string ...
    .with_columns(cs.integer().cast(pl.String))
    # 3) ... and every string column becomes Categorical.
    #    NOTE: this includes age, absences and the grades G1-G3, so none of
    #    them is numeric in stu_math_cleaned. Use stu_math for anything that
    #    needs numeric values.
    .with_columns(cs.string().cast(pl.Categorical))
)

## Figures

In [6]:
# Bar chart: number of math students enrolled at each school.
school_bars = go.Bar(
    x=by_school['school'],
    y=by_school['count_school'],
    width=0.4,
)

fig1 = go.Figure(data=[school_bars])
fig1.update_layout(
    template='plotly_white',
    width=500,
    height=500,
    title_text='Number of Students by School',
    xaxis_title='School',
    yaxis_title='Count',
)

fig1.show()

In [7]:
# Bar chart: number of math students of each sex.
sex_bars = go.Bar(
    x=by_sex['sex'],
    y=by_sex['count'],
    width=.4,
)

fig2 = go.Figure(data=[sex_bars])
fig2.update_layout(
    template='plotly_dark',
    width=650,
    height=650,
    title_text='Number of Students by Sex',
    xaxis_title='Sex',
    yaxis_title='Count',
)

fig2.show()

In [8]:
# Mean number of past class failures for each travel-time bracket.
# traveltime is recoded to its labelled, ordered Enum first so that the final
# sort follows the bracket order rather than alphabetical order.
traveltime_summary = (
    stu_math
    .with_columns(
        pl.col('traveltime')
        .cast(pl.String)
        .replace(traveltime_order)
        .cast(pl.Enum(tt_num_order))
    )
    .group_by('traveltime')
    .agg(average_failures=pl.col('failures').mean())
    .sort('traveltime')
)

traveltime_summary

traveltime,average_failures
enum,f64
"""< 15 minutes""",0.299611
"""15 to 30 minutes""",0.35514
"""30 minutes to 1 hour""",0.478261
"""> 1 hour""",0.75


In [9]:
# Bar chart of the travel-time summary: one bar per bracket, height = mean
# number of past class failures.
fig_traveltime = px.bar(
    traveltime_summary,
    x='traveltime',
    y='average_failures',
)

(
    fig_traveltime
    .update_traces(width=0.6)
    .update_layout(
        title='Average Number of Past Class Failures by Travel Time to School',
        xaxis_title='Travel Time to school',
        yaxis_title='Average Number of Past Class Failures Per Student',
        width=800,
        height=800,
    )
    .show()
)

In [10]:
# Mean number of past class failures for each weekly study-time bracket.
# studytime is recoded to its labelled, ordered Enum first so that the final
# sort follows the bracket order rather than alphabetical order.
studytime_summary = (
    stu_math
    .with_columns(
        pl.col('studytime')
        .cast(pl.String)
        .replace(studytime_order)
        .cast(pl.Enum(st_num_order))
    )
    .group_by('studytime')
    .agg(average_failures=pl.col('failures').mean())
    .sort('studytime')
)

studytime_summary

studytime,average_failures
enum,f64
"""< 2 hours""",0.52381
"""2 to 5 hours""",0.308081
"""5 to 10 hours""",0.230769
"""> 10 hours""",0.037037


In [11]:
# Bar chart of the study-time summary: one bar per bracket, height = mean
# number of past class failures.
fig_studytime = px.bar(
    studytime_summary,
    x='studytime',
    y='average_failures',
)

(
    fig_studytime
    .update_traces(width=0.6)
    .update_layout(
        title='Average Number of Past Class Failures by Study Time',
        xaxis_title='Weekly Study Time',
        yaxis_title='Average Number of Past Class Failures Per Student',
        width=800,
        height=800,
    )
    .show()
)

In [12]:
# Mean final math grade (G3) for each weekly study-time bracket, ordered by
# bracket via the labelled Enum recoding of studytime.
finalmath_studytime = (
    stu_math
    .with_columns(
        pl.col('studytime')
        .cast(pl.String)
        .replace(studytime_order)
        .cast(pl.Enum(st_num_order))
    )
    .group_by('studytime')
    .agg(mean_grade=pl.col('G3').mean())
    .sort('studytime')
)

finalmath_studytime

studytime,mean_grade
enum,f64
"""< 2 hours""",10.047619
"""2 to 5 hours""",10.171717
"""5 to 10 hours""",11.4
"""> 10 hours""",11.259259


In [13]:
# Box plot of the final grade (G3) distribution within each study-time bracket.
#
# The plotting frame is built from the raw `stu_math`, NOT `stu_math_cleaned`:
# the cleaning step casts every integer column (G3 included) to Categorical,
# which would turn the y axis into unordered category labels instead of a
# numeric 0-20 scale and make the boxes meaningless. Only `studytime` is
# recoded here, so G3 stays numeric.
studytime_grades = stu_math.with_columns(
    pl.col('studytime').cast(pl.String).replace(studytime_order).cast(pl.Enum(st_num_order))
)

fig_finalmath_box = px.box(
    studytime_grades,
    x="studytime",
    y="G3",
    title="Final Grades (G3) by Weekly Study Time",
    labels={
        "studytime": "Weekly Study Time",
        "G3": "G3 - Final Grade (0 - 20)"
    },
    # Force the brackets into their natural order on the x axis.
    category_orders={"studytime": [
        '< 2 hours',
        '2 to 5 hours',
        '5 to 10 hours',
        '> 10 hours'
    ]}
)

fig_finalmath_box.update_layout(
    width = 800,
    height = 800
)

fig_finalmath_box.show()

In [14]:
# One-hot encode every column of the cleaned frame, then compute the pairwise
# correlation between all of the resulting indicator columns.
corr_matrix = (
    stu_math_cleaned
    .to_dummies()
    .corr()
)

# The correlation frame carries no row labels; row i corresponds to column i,
# so the column names double as row names.
corr_rownames = corr_matrix.columns

# TODO: add corr_rownames as a column and figure out how to show them on the plot.

corr_matrix

school_GP,school_MS,sex_F,sex_M,age_15,age_16,age_17,age_18,age_19,age_20,age_21,age_22,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,Pstatus_T,Medu_0,Medu_1,Medu_2,Medu_3,Medu_4,Fedu_0,Fedu_1,Fedu_2,Fedu_3,Fedu_4,Mjob_at_home,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_other,Fjob_services,…,G2_17,G2_18,G2_19,G2_4,G2_5,G2_6,G2_7,G2_8,G2_9,G3_0,G3_10,G3_11,G3_12,G3_13,G3_14,G3_15,G3_16,G3_17,G3_18,G3_19,G3_20,G3_4,G3_5,G3_6,G3_7,G3_8,G3_9,Medu_cat_5th to 9th Grade,Medu_cat_None,Medu_cat_Up to 4th Grade,Medu_cat_higher education,Medu_cat_secondary education,Fedu_cat_5th to 9th Grade,Fedu_cat_None,Fedu_cat_Up to 4th Grade,Fedu_cat_higher education,Fedu_cat_secondary education
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,-1.0,-0.012286,0.012286,0.185824,0.217038,-0.010733,-0.300658,-0.105889,-0.150056,-0.138767,0.01829,-0.279797,0.279797,0.064866,-0.064866,0.045923,-0.045923,0.03176,-0.224275,0.089789,0.009636,0.071341,0.025899,-0.106065,0.02419,0.029868,0.040109,-0.069284,0.055139,-0.042497,0.053837,0.016823,-0.060148,0.079329,0.067746,-0.106639,…,0.041107,0.018278,0.03176,0.01829,-0.175623,-0.015777,-0.019503,0.021017,-0.004206,0.011384,-0.078705,0.035916,0.017906,-0.04079,0.067063,0.024046,-0.005473,0.045089,0.064262,-0.02949,0.01829,0.01829,-0.070874,0.030838,-0.050348,-0.007909,-0.022734,0.089789,0.03176,-0.224275,0.071341,0.009636,0.02419,0.025899,-0.106065,0.040109,0.029868
-1.0,1.0,0.012286,-0.012286,-0.185824,-0.217038,0.010733,0.300658,0.105889,0.150056,0.138767,-0.01829,0.279797,-0.279797,-0.064866,0.064866,-0.045923,0.045923,-0.03176,0.224275,-0.089789,-0.009636,-0.071341,-0.025899,0.106065,-0.02419,-0.029868,-0.040109,0.069284,-0.055139,0.042497,-0.053837,-0.016823,0.060148,-0.079329,-0.067746,0.106639,…,-0.041107,-0.018278,-0.03176,-0.01829,0.175623,0.015777,0.019503,-0.021017,0.004206,-0.011384,0.078705,-0.035916,-0.017906,0.04079,-0.067063,-0.024046,0.005473,-0.045089,-0.064262,0.02949,-0.01829,-0.01829,0.070874,-0.030838,0.050348,0.007909,0.022734,-0.089789,-0.03176,0.224275,-0.071341,-0.009636,-0.02419,-0.025899,0.106065,-0.040109,-0.029868
-0.012286,0.012286,1.0,-1.0,-0.064755,-0.008802,0.075074,-0.002247,0.028909,-0.033859,-0.053133,-0.053133,-0.028504,0.028504,0.089862,-0.089862,0.023443,-0.023443,0.024544,0.013253,0.066544,0.01016,-0.085967,-0.075237,0.060261,-0.006216,-0.019336,-0.018346,0.155501,0.019817,-0.002626,-0.002748,-0.165344,0.080212,0.061306,-0.033307,-0.005083,…,0.016649,-0.038967,-0.092263,-0.053133,0.002686,0.044641,0.066483,0.002776,0.02548,0.051413,0.007434,0.066567,-0.100382,0.012745,-0.004375,-0.025237,-0.062378,-0.006612,-0.038967,-0.028706,-0.053133,0.047768,-0.026366,0.135327,-0.02512,-0.052973,0.044567,0.066544,0.024544,0.013253,-0.085967,0.01016,-0.006216,-0.075237,0.060261,-0.018346,-0.019336
0.012286,-0.012286,-1.0,1.0,0.064755,0.008802,-0.075074,0.002247,-0.028909,0.033859,0.053133,0.053133,0.028504,-0.028504,-0.089862,0.089862,-0.023443,0.023443,-0.024544,-0.013253,-0.066544,-0.01016,0.085967,0.075237,-0.060261,0.006216,0.019336,0.018346,-0.155501,-0.019817,0.002626,0.002748,0.165344,-0.080212,-0.061306,0.033307,0.005083,…,-0.016649,0.038967,0.092263,0.053133,-0.002686,-0.044641,-0.066483,-0.002776,-0.02548,-0.051413,-0.007434,-0.066567,0.100382,-0.012745,0.004375,0.025237,0.062378,0.006612,0.038967,0.028706,0.053133,-0.047768,0.026366,-0.135327,0.02512,0.052973,-0.044567,-0.066544,-0.024544,-0.013253,0.085967,-0.01016,0.006216,0.075237,-0.060261,0.018346,0.019336
0.185824,-0.185824,-0.064755,0.064755,1.0,-0.305988,-0.294015,-0.261981,-0.130183,-0.044777,-0.025786,-0.025786,-0.049027,0.049027,0.036723,-0.036723,0.071396,-0.071396,-0.044777,-0.056879,-0.062303,-0.036755,0.143254,0.051431,-0.108081,0.015479,-0.039613,0.117454,-0.07439,0.065469,-0.068671,0.051433,0.052192,-0.032795,0.127602,-0.075875,0.027176,…,-0.00212,0.018507,0.099018,-0.025786,-0.101692,0.003162,-0.037822,-0.01471,-0.044674,-0.039979,-0.029085,0.023964,0.013104,-0.033318,0.009769,0.093603,-0.01018,-0.012533,0.054875,0.109548,-0.025786,-0.025786,-0.021439,0.028937,0.005507,-0.037587,-0.068411,-0.062303,-0.044777,-0.056879,0.143254,-0.036755,0.015479,0.051431,-0.108081,0.117454,-0.039613
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.02419,-0.02419,-0.006216,0.006216,0.015479,0.00913,0.018946,-0.039481,-0.046361,0.136505,-0.032287,-0.032287,0.005086,-0.005086,-0.071458,0.071458,-0.017115,0.017115,0.072315,-0.034039,0.266711,-0.01058,-0.226546,-0.045718,-0.328023,1.0,-0.373129,-0.363137,0.012864,0.002012,0.057569,0.038239,-0.13991,0.080758,-0.05987,0.065217,0.020872,…,-0.022716,-0.016029,0.008125,0.078611,-0.039859,-0.032429,0.022009,-0.006463,0.024185,0.036602,-0.036806,0.057083,-0.021247,-0.000525,0.003075,0.048183,-0.018607,-0.079592,-0.048499,0.076981,-0.032287,-0.032287,0.040634,-0.010703,-0.060511,-0.026887,0.018416,0.266711,0.072315,-0.034039,-0.226546,-0.01058,1.0,-0.045718,-0.328023,-0.363137,-0.373129
0.025899,-0.025899,-0.075237,0.075237,0.051431,0.038341,-0.040978,-0.036514,-0.018144,-0.006241,-0.003594,-0.003594,-0.038194,0.038194,0.045438,-0.045438,-0.024278,0.024278,-0.006241,0.070173,-0.042369,-0.041256,0.02551,1.0,-0.036514,-0.045718,-0.041534,-0.040422,-0.029893,-0.021893,0.021298,-0.042369,0.071181,-0.016475,-0.015588,0.06461,-0.044599,…,-0.008077,-0.012627,-0.006241,-0.003594,-0.014173,-0.013675,-0.016904,-0.021181,-0.027158,-0.023274,0.073262,-0.026217,-0.020818,-0.020818,-0.019323,-0.021539,0.166271,-0.00886,-0.012627,-0.008077,-0.003594,-0.003594,-0.009582,-0.014173,-0.010893,-0.021181,-0.019704,-0.042369,-0.006241,0.070173,0.02551,-0.041256,-0.045718,1.0,-0.036514,-0.040422,-0.041534
-0.106065,0.106065,0.060261,-0.060261,-0.108081,-0.065051,0.023928,0.061209,0.131109,-0.044777,0.098428,0.098428,0.100978,-0.100978,-0.032155,0.032155,-0.051397,0.051397,0.027121,0.433442,0.094084,-0.079964,-0.347294,-0.036514,1.0,-0.328023,-0.298005,-0.290025,0.17077,-0.11257,0.113727,-0.033869,-0.194705,-0.004325,-0.081911,0.149939,-0.056144,…,-0.057955,-0.017862,-0.044777,-0.025786,0.061594,0.10444,-0.010001,0.191178,0.049189,0.065864,0.078284,-0.07243,0.036314,-0.033318,-0.039701,-0.086864,-0.01018,-0.063568,-0.017862,-0.057955,-0.025786,-0.025786,-0.021439,0.061594,0.131001,0.031043,-0.019766,0.094084,0.027121,0.433442,-0.347294,-0.079964,-0.328023,-0.036514,1.0,-0.290025,-0.298005
0.040109,-0.040109,-0.018346,0.018346,0.117454,-0.0171,-0.024841,0.001032,-0.094703,-0.04957,-0.028546,-0.028546,-0.005494,0.005494,0.035253,-0.035253,0.097447,-0.097447,-0.04957,-0.220883,-0.242427,-0.082545,0.478431,-0.040422,-0.290025,-0.363137,-0.329905,1.0,-0.071853,0.078636,-0.212741,0.026445,0.265215,-0.023173,0.18751,-0.257888,-0.052226,…,0.094232,0.071654,0.018417,-0.028546,0.010945,-0.01285,-0.081653,-0.124973,-0.038199,-0.02473,-0.111851,0.028753,0.010224,0.032172,0.103802,0.04223,0.003335,0.122663,0.037263,0.041436,-0.028546,-0.028546,0.013364,-0.019936,-0.00741,-0.081709,-0.041515,-0.242427,-0.04957,-0.220883,0.478431,-0.082545,-0.363137,-0.040422,-0.290025,1.0,-0.329905


In [15]:
# Heatmap of the dummy-variable correlation matrix.
#
# The colour range must span the full correlation interval [-1, 1]; with
# zmin == zmax the colour scale collapses to a single point and every cell is
# drawn without usable contrast.
# The matrix is square with rows in the same order as its columns, so the
# column names are used to label both axes.
corr_labels = corr_matrix.columns

fig_corr = px.imshow(
    corr_matrix,
    x = corr_labels,
    y = corr_labels,
    text_auto = True,
    aspect = 'auto',
    zmin = -1,
    zmax = 1
)

fig_corr.show()