In [1]:
import numpy as np
import pandas as pd
from math import nan
from plotly.io import to_html
from ridgeplot import ridgeplot
from collections import Counter
from random import shuffle, sample

# Raise pandas' row-display limit to 500 rows for every frame shown in this notebook.
pd.set_option("display.max_rows", 500)

In [2]:
# Years covered by the analysis, in ascending order (2013 through 2022 inclusive).
year_list = list(range(2013, 2023))

# Two series of ten yearly counts each, keyed 'before' and 'after'.
# NOTE(review): presumably one entry per year in year_list (same order), counting
# attendees before/after some filtering step — confirm against the data pipeline.
num_presentes = dict(
    before=[
        7173563,
        8722248,
        7746427,
        8627179,
        6731278,
        5513733,
        5095171,
        5783109,
        3389832,
        3476105,
    ],
    after=[
        5007934,
        5947914,
        5604905,
        5818264,
        4426692,
        3893729,
        3701909,
        2588681,
        2238107,
        2344823,
    ],
)

def stratify_sampling(x, number):
    """
    Draw a shuffled, proportionally stratified sample from a list of values.

    Each distinct value in ``x`` receives ``int(number / len(x) * count)``
    copies, with a minimum of one copy so that rare values are never dropped
    during allocation. If the allocation falls short of ``number`` (because
    of truncation), the shortfall is filled with values drawn at random from
    ``x``. If the one-copy minimum makes the allocation exceed ``number``,
    the shuffled result is trimmed back to ``number`` elements (so in that
    case not every distinct value can be kept).

    ARGUMENTS:
    x      = Python list
    number = integer indicating the number of samples you want

    RETURNS:
    A new list with exactly ``number`` elements, or an empty list when ``x``
    is empty. ``x`` itself is not modified.

    RAISES:
    ValueError if ``number`` is negative.
    """
    if number < 0:
        raise ValueError("number must be non-negative")
    # Nothing to sample from: avoid dividing by len(x) == 0 below.
    if len(x) == 0:
        return []

    counts = Counter(x)
    list_size = number / len(x)  # sampling fraction applied to every stratum

    smp = []
    for value, count in counts.items():
        batch = int(list_size * count)
        # Keep at least one copy of every distinct value.
        smp.extend([value] * max(batch, 1))

    # Truncation in int() can leave the allocation short; top it up at random.
    diff = number - len(smp)
    if diff > 0:
        smp.extend(sample(x, diff))

    shuffle(smp)

    # The one-copy minimum can over-allocate; trimming after the shuffle drops
    # a random subset instead of failing on a negative sample size.
    del smp[number:]

    return smp

In [3]:
# Load the dataset from a pickle file; the path is relative to the notebook's
# working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
df = pd.read_pickle('../../data/vis02_enem_data.pickle')
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,TP_FAIXA_ETARIA,ANO,NOTA_MEDIA
0,6,2013,450
1,12,2013,380
2,3,2013,440
3,11,2013,510
4,2,2013,410
...,...,...,...
41572953,12,2022,480
41572954,11,2022,610
41572955,2,2022,540
41572956,11,2022,500


## Vis02 - NOTA_MEDIA

In [4]:
# Build a table with one column per year, each holding a stratified sample of
# that year's NOTA_MEDIA values.
array_dict = {}
smp = 300000  # number of values kept per year

for year in year_list:
    sub = df[df['ANO'] == year]
    sampled = stratify_sampling(sub['NOTA_MEDIA'].tolist(), smp)
    # Pad with NaN if the sample came back short, so every year has the same
    # length and the rows below stack into a rectangular array.
    sampled.extend([nan] * (smp - len(sampled)))
    array_dict[year] = sampled

# Take the row order from year_list (instead of spelling the years out again)
# so the rows always line up with the column labels passed to DataFrame.
data = np.array([array_dict[year] for year in year_list])
graph = pd.DataFrame(columns=year_list, data=data.T)
graph

Unnamed: 0,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,440,520,550,560,530,490,570,520,700,620
1,430,420,600,530,550,450,470,450,510,500
2,550,430,640,460,390,510,410,320,520,580
3,480,450,480,510,340,410,560,430,630,550
4,460,500,540,330,460,500,610,600,490,460
...,...,...,...,...,...,...,...,...,...,...
299995,480,590,470,550,450,650,420,470,510,500
299996,470,530,440,620,420,480,390,480,590,330
299997,460,420,540,620,610,470,450,460,470,700
299998,650,540,490,560,480,460,460,530,460,500


In [5]:
# Ridgeline plot of the sampled NOTA_MEDIA values: graph has one column per
# year, so its transpose supplies one row of samples per label in year_list.
fig = ridgeplot(
    samples=graph.values.T,
    bandwidth=10,
    colorscale="viridis",
    colormode="index",
    coloralpha=0.6,
    labels=year_list,
    spacing=1
)

fig.update_layout(
    title="Distribuição das Notas Médias em Relação ao Ano",
    height=650,
    width=800,
    plot_bgcolor="rgba(255, 255, 255, 0.0)",
    xaxis_gridcolor="rgba(0, 0, 0, 0.1)",
    yaxis_gridcolor="rgba(0, 0, 0, 0.1)",
    xaxis_title="Nota Média",
    yaxis_title="Ano"
)

# Export the figure as a standalone HTML page.
# - 'cdn' (previously misspelled 'cnd', which is not a documented option) makes
#   the page load plotly.js from the CDN, so viewing it requires network access.
# - The title contains non-ASCII characters, so the file is written as UTF-8
#   explicitly rather than with the platform's default encoding.
# - to_html returns a single string, so write() is used instead of writelines().
with open('../../data/vis02.html', 'w', encoding='utf-8') as f:
    f.write(to_html(fig, include_plotlyjs='cdn', full_html=True))
print('Done!')

Done!


## Vis03 - TP_FAIXA_ETARIA

In [6]:
# Build a table with one column per year, each holding a stratified sample of
# that year's TP_FAIXA_ETARIA values.
array_dict = {}
smp = 300000  # number of values kept per year

for year in year_list:
    sub = df[df['ANO'] == year]
    sampled = stratify_sampling(sub['TP_FAIXA_ETARIA'].tolist(), smp)
    # Pad with NaN if the sample came back short, so every year has the same
    # length and the rows below stack into a rectangular array.
    sampled.extend([nan] * (smp - len(sampled)))
    array_dict[year] = sampled

# Take the row order from year_list (instead of spelling the years out again)
# so the rows always line up with the column labels passed to DataFrame.
data = np.array([array_dict[year] for year in year_list])
graph = pd.DataFrame(columns=year_list, data=data.T)
graph

Unnamed: 0,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,2,6,3,1,11,2,6,7,2,3
1,5,11,2,5,11,6,2,12,4,2
2,2,6,14,3,5,9,7,3,3,1
3,3,1,2,12,3,4,3,11,6,13
4,2,11,1,9,3,7,1,9,15,6
...,...,...,...,...,...,...,...,...,...,...
299995,7,1,3,2,5,11,11,12,4,1
299996,3,3,1,3,12,15,3,11,2,6
299997,3,7,4,14,3,6,1,3,3,11
299998,2,7,4,1,4,10,7,4,6,3


In [7]:
# Ridgeline plot of the sampled TP_FAIXA_ETARIA values: graph has one column
# per year, so its transpose supplies one row of samples per label in year_list.
# NOTE(review): bandwidth=10 is the same value used for the NOTA_MEDIA plot, but
# the TP_FAIXA_ETARIA values shown in the notebook output are small integers
# (roughly 1-15) — this may oversmooth the densities; confirm it is intended.
fig = ridgeplot(
    samples=graph.values.T,
    bandwidth=10,
    colorscale="viridis",
    colormode="index",
    coloralpha=0.6,
    labels=year_list,
    spacing=1
)

fig.update_layout(
    title="Distribuição das Faixas Etárias em Relação ao Ano",
    height=650,
    width=800,
    plot_bgcolor="rgba(255, 255, 255, 0.0)",
    xaxis_gridcolor="rgba(0, 0, 0, 0.1)",
    yaxis_gridcolor="rgba(0, 0, 0, 0.1)",
    xaxis_title="Faixa Etária",
    yaxis_title="Ano"
)

# Export the figure as a standalone HTML page.
# - 'cdn' (previously misspelled 'cnd', which is not a documented option) makes
#   the page load plotly.js from the CDN, so viewing it requires network access.
# - The title contains non-ASCII characters, so the file is written as UTF-8
#   explicitly rather than with the platform's default encoding.
# - to_html returns a single string, so write() is used instead of writelines().
with open('../../data/vis03.html', 'w', encoding='utf-8') as f:
    f.write(to_html(fig, include_plotlyjs='cdn', full_html=True))
print('Done!')

Done!
