In [11]:
import numpy as np
import pandas as pd
from math import nan
import plotly.express as px
from plotly.io import to_html
from ridgeplot import ridgeplot
from collections import Counter
import plotly.graph_objects as go
from random import shuffle, sample

# Allow up to 500 rows to be rendered before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 500)

In [None]:
# Years covered by the notebook: 2013 through 2022, inclusive.
year_list = list(range(2013, 2023))

def stratify_sampling(x, number):
    """
    Draw a shuffled sample of `x` that roughly preserves the frequency of
    each distinct value.

    Every distinct value contributes int(number / len(x) * occurrences)
    copies, and at least one copy, so rare values are never dropped at this
    stage. If that yields fewer than `number` items, the remainder is drawn
    at random (without replacement) from `x`. If it yields more, which
    happens when there are more distinct values than `number`, the shuffled
    result is cut down to `number` items.

    ARGUMENTS:
    x      = Python list
    number = integer indicating the number of samples you want

    RETURNS:
    A new list with at most `number` items; an empty list when `x` is empty
    or `number` is not positive.
    """
    # Imported here because the notebook's import cell only brings in the
    # names `shuffle` and `sample`, not the `random` module itself.
    import random

    if len(x) == 0 or number <= 0:
        return []

    counts = Counter(x)
    ratio = number / len(x)

    picked = []
    for value, occurrences in counts.items():
        # max(1, ...) keeps one representative of values whose proportional
        # share rounds down to zero.
        picked.extend([value] * max(1, int(ratio * occurrences)))

    # Top up with random draws only when the proportional shares fell short;
    # a negative shortfall would make random.sample raise ValueError.
    shortfall = number - len(picked)
    if shortfall > 0:
        picked.extend(random.sample(x, min(shortfall, len(x))))

    random.shuffle(picked)

    # Trim the overshoot caused by the one-per-value minimum.
    return picked[:number]

In [2]:
# Load the pickled DataFrame that every section below filters by year,
# then show it as the cell output.
enem_pickle = '../data/vis02_enem_data.pickle'
df = pd.read_pickle(enem_pickle)
df

Unnamed: 0,TP_FAIXA_ETARIA,TP_COR_RACA,ANO,NOTA_MEDIA
0,21,Branca,2013,450
1,31-35,Parda,2013,380
2,18,Parda,2013,440
3,26-30,Branca,2013,510
4,17,Branca,2013,410
...,...,...,...,...
41572953,31-35,Branca,2022,480
41572954,26-30,Preta,2022,610
41572955,17,Parda,2022,540
41572956,26-30,Branca,2022,500


## TP_COR_RACA

In [3]:
# Years to plot and the race categories in the order used on the x axis.
year_list = list(range(2013, 2023))
racas = [
    'Nao Declarado',
    'Branca',
    'Preta',
    'Parda',
    'Amarela',
    'Indigena',
    'Nao Possui a Informacao',
]

In [4]:
# For each year, store under array_dict:
#   x_<year>       the race categories (x axis)
#   y_<year>_norm  the per-category counts min-max scaled to [0, 1]
#   y_<year>       hover labels carrying the raw counts
array_dict = {}
for year in year_list:
    sub = df[df.ANO == year]
    # reindex() returns the counts in `racas` order and fills 0 for any
    # category that does not occur in this year. This replaces guessing from
    # the number of distinct values whether 'Nao Possui a Informacao' is
    # present, and avoids a KeyError when another category is missing.
    count = sub['TP_COR_RACA'].value_counts().reindex(racas, fill_value=0)
    raca_count = count.tolist()

    y_values = np.array(raca_count)
    array_dict[f'x_{year}']      = np.array(racas)
    array_dict[f'y_{year}_norm'] = (y_values - y_values.min()) \
                            / (y_values.max() - y_values.min())
    # The plain y key holds the hover text, not the numeric counts.
    array_dict[f'y_{year}']      = ["Num Candidatos: "+str(i) for i in raca_count]

In [21]:
# Ridge-style chart: one filled curve per year, stacked vertically with the
# first year in year_list at the top.
fig = go.Figure()
for index, year in enumerate(year_list):
    baseline = len(year_list) - index

    # Invisible baseline the next trace fills down to. It needs one y value
    # per x category: with a single y value the trace holds only one point,
    # so the 'tonexty' fill has no flat floor to close against.
    fig.add_trace(go.Scatter(
                            x=racas,
                            y=np.full(len(racas), baseline),
                            mode='lines',
                            line_color='white',
                            hoverinfo='skip'))

    # Normalised counts lifted onto this year's row; the extra 0.4 keeps the
    # curve clear of the baseline.
    fig.add_trace(go.Scatter(
                            x=array_dict[f'x_{year}'],
                            y=array_dict[f'y_{year}_norm'] + baseline + 0.4,
                            fill='tonexty',
                            text=array_dict[f'y_{year}'],
                            name=f'{year}'))

    # plotly.graph_objects' way of adding text to a figure
    fig.add_annotation(
                        x=-0.2,
                        y=baseline,
                        text=f'{year}',
                        showarrow=False,
                        yshift=7)

# here you can modify the figure and the legend titles
fig.update_layout(
                title='Distribuição de Raças Declaradas em Relação ao Ano',
                showlegend=False,
                xaxis=dict(title='Raça'),
                yaxis=dict(showticklabels=False),
                width=1000,
                height=700
                )

fig.show()

## TP_FAIXA_ETARIA

In [6]:
# Years to plot and the age bands in the order used on the x axis:
# single ages from 17 to 25, then the grouped bands.
year_list = list(range(2013, 2023))
faixas_etarias = (
    ['<17']
    + [str(age) for age in range(17, 26)]
    + ['26-30', '31-35', '36-40', '41-45', '46-50',
       '51-55', '56-60', '61-65', '66-70', '>70']
)

In [7]:
# For each year, store under array_dict:
#   x_<year>       the age bands (x axis)
#   y_<year>_norm  the per-band counts min-max scaled to [0, 1]
#   y_<year>       hover labels carrying the raw counts
array_dict = {}
for year in year_list:
    sub = df[df.ANO == year]
    # reindex() returns the counts in `faixas_etarias` order and fills 0 for
    # a band with no candidates in this year, where direct lookup would
    # raise KeyError.
    count = sub['TP_FAIXA_ETARIA'].value_counts().reindex(faixas_etarias, fill_value=0)
    faixa_count = count.tolist()

    y_values = np.array(faixa_count)
    array_dict[f'x_{year}']      = np.array(faixas_etarias)
    array_dict[f'y_{year}_norm'] = (y_values - y_values.min()) \
                            / (y_values.max() - y_values.min())
    # The plain y key holds the hover text, not the numeric counts.
    array_dict[f'y_{year}']      = ["Num Candidatos: "+str(i) for i in faixa_count]

In [8]:
# Ridge-style chart: one filled curve per year, stacked vertically with the
# first year in year_list at the top.
fig = go.Figure()
for index, year in enumerate(year_list):
    baseline = len(year_list) - index

    # Invisible baseline the next trace fills down to. It needs one y value
    # per x category: with a single y value the trace holds only one point,
    # so the 'tonexty' fill has no flat floor to close against.
    fig.add_trace(go.Scatter(
                            x=faixas_etarias,
                            y=np.full(len(faixas_etarias), baseline),
                            mode='lines',
                            line_color='white',
                            hoverinfo='skip'))

    # Normalised counts lifted onto this year's row; the extra 0.4 keeps the
    # curve clear of the baseline.
    fig.add_trace(go.Scatter(
                            x=array_dict[f'x_{year}'],
                            y=array_dict[f'y_{year}_norm'] + baseline + 0.4,
                            fill='tonexty',
                            text=array_dict[f'y_{year}'],
                            name=f'{year}'))

    # plotly.graph_objects' way of adding text to a figure
    fig.add_annotation(
                        x=-0.6,
                        y=baseline,
                        text=f'{year}',
                        showarrow=False,
                        yshift=7)

# here you can modify the figure and the legend titles
fig.update_layout(
                title='Distribuição de Faixas Etárias em Relação ao Ano',
                showlegend=False,
                xaxis=dict(title='Faixa Etária'),
                yaxis=dict(showticklabels=False),
                width=1000,
                height=600
                )

fig.show()

## NOTA_MEDIA

In [18]:
# Build one fixed-length column of sampled NOTA_MEDIA values per year.
array_dict = {}
# Target number of values per year. NOTE: this rebinds the notebook-level
# name `sample` that the import cell took from the random module.
sample = 50000

for year in year_list:
    sub = df[df.ANO == year]
    drawn = stratify_sampling(sub['NOTA_MEDIA'].tolist(), sample)
    # Pad short years with NaN so every year has exactly `sample` entries
    # and the lists stack into a rectangular array.
    drawn.extend([nan] * (sample - len(drawn)))
    array_dict[year] = drawn

# Rows follow year_list, so they always line up with the column labels
# below instead of relying on a hand-written list of year keys.
data = np.array([array_dict[year] for year in year_list])
graph = pd.DataFrame(columns=year_list, data=data.T)
graph

Unnamed: 0,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,450,490,560,540,470,510,490,450,450,490
1,420,490,460,500,530,630,520,540,690,630
2,450,460,670,470,550,440,530,510,660,630
3,450,470,570,430,540,570,640,500,610,600
4,570,520,480,650,530,550,580,400,460,450
...,...,...,...,...,...,...,...,...,...,...
299995,490,500,510,550,540,490,490,490,520,500
299996,470,450,440,580,400,490,490,560,580,430
299997,490,530,390,610,510,480,450,700,570,490
299998,530,610,410,570,420,480,360,430,620,530


In [23]:
# Ridge plot of the sampled scores: one density per year, one row of
# `samples` per entry in year_list.
ridge_options = dict(
    samples=graph.values.T,
    bandwidth=10,
    colorscale="viridis",
    colormode="index",
    coloralpha=0.6,
    labels=year_list,
    spacing=1,
)
fig = ridgeplot(**ridge_options)

# Title, size, transparent background, faint grid lines and axis titles.
faint_grid = "rgba(0, 0, 0, 0.1)"
layout_options = {
    "title": "Distribuição das Notas Médias em Relação ao Ano",
    "height": 650,
    "width": 800,
    "plot_bgcolor": "rgba(255, 255, 255, 0.0)",
    "xaxis_gridcolor": faint_grid,
    "yaxis_gridcolor": faint_grid,
    "xaxis_title": "Nota Média",
    "yaxis_title": "Ano",
}
fig.update_layout(**layout_options)

# Export the figure as a standalone HTML page.
# - include_plotlyjs='cdn' makes the page load plotly.js from the CDN; the
#   previous value 'cnd' was a typo and is not a documented option.
# - encoding='utf-8' keeps the accented titles intact regardless of the
#   platform's default text encoding.
# - write() emits the page in one call; writelines() on a string iterates
#   over it character by character.
with open('../data/vis02.html', 'w', encoding='utf-8') as f:
    f.write(to_html(fig, include_plotlyjs='cdn', full_html=True))