In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
tqdm.pandas()

In [2]:
answer_data = pd.read_csv('../MicrodadosAlunos/answer_data.csv')
code_question_mapping = pd.read_csv('../MicrodadosAlunos/questao_prova_mapping.csv')

In [3]:
answer_data['RESPOSTA'] = answer_data['RESPOSTA'].replace('.','*')
display(answer_data['RESPOSTA'].value_counts())

RESPOSTA
A    72290959
B    71676717
C    71576313
D    65906092
E    58465315
*     1392684
Name: count, dtype: int64

In [4]:
def convert_to_prob_dist(x):
    unique, counts = np.unique(x, return_counts=True)
    a_counts = counts[np.where(unique=='A')]/len(x)
    a_counts = a_counts[0] if a_counts.size else 0
    b_counts = counts[np.where(unique=='B')]/len(x)
    b_counts = b_counts[0] if b_counts.size else 0
    c_counts = counts[np.where(unique=='C')]/len(x) 
    c_counts = c_counts[0] if c_counts.size else 0
    d_counts = counts[np.where(unique=='D')]/len(x)
    d_counts = d_counts[0] if d_counts.size else 0
    e_counts = counts[np.where(unique=='E')]/len(x) 
    e_counts = e_counts[0] if e_counts.size else 0
    none_counts = counts[np.where(unique=='*')]/len(x)
    none_counts = none_counts[0] if none_counts.size else 0
    np.testing.assert_allclose(np.sum([a_counts,b_counts,c_counts,d_counts,e_counts,none_counts]), 1, rtol=1e-5, atol=0)
    return np.array([a_counts,b_counts,c_counts,d_counts,e_counts,none_counts])

In [5]:
#first create the general answer distribution dataframe
df = answer_data.groupby('CO_ITEM')['RESPOSTA'].agg(list).reset_index()
df['answer_distribution'] = df['RESPOSTA'].progress_apply(lambda x:convert_to_prob_dist(np.array(x)))
df['TX_GABARITO'] = df['CO_ITEM'].apply(lambda x:code_question_mapping[code_question_mapping['CO_ITEM']==x]['TX_GABARITO'].iloc[0])
df['correct_probability'] = df.apply(lambda row: 0.0 if row['TX_GABARITO'] not in ['A','B','C','D','E'] else row['answer_distribution'][ord(row['TX_GABARITO'])-65],axis=1)
df[['CO_ITEM','TX_GABARITO','answer_distribution','correct_probability']].to_csv('DISTRIBUTIONS/answer_dist_per_question.csv',index=False)
del df
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [00:35<00:00,  5.57it/s]


0

# Now create distributions based on categories

In [6]:
student_data = pd.read_csv('../MicrodadosAlunos/student_data.csv').drop(columns=['NU_INSCRICAO','NSE_SCORE'])

In [10]:
def create_distributions_from_column(column_name):
    print(column_name)
    answer_data = pd.read_csv('../MicrodadosAlunos/answer_data.csv')
    answer_data['RESPOSTA'] = answer_data['RESPOSTA'].replace('.','*')
    print('read data')
    answer_data = pd.merge(answer_data,student_data[['ID',column_name]],on='ID')
    answer_data = answer_data.groupby(['CO_ITEM',column_name])['RESPOSTA'].agg(list).reset_index()
    answer_data['answer_distribution'] = answer_data['RESPOSTA'].progress_apply(lambda x:convert_to_prob_dist(np.array(x)))
    print('created distributions')
    answer_data['TX_GABARITO'] = answer_data['CO_ITEM'].apply(lambda x:code_question_mapping[code_question_mapping['CO_ITEM']==x]['TX_GABARITO'].iloc[0])
    answer_data['correct_probability'] = answer_data.apply(lambda row: 0.0 if row['TX_GABARITO'] not in ['A','B','C','D','E'] else row['answer_distribution'][ord(row['TX_GABARITO'])-65],axis=1)
    answer_data[['CO_ITEM',column_name,'TX_GABARITO','answer_distribution','correct_probability']].to_csv(f'DISTRIBUTIONS/answer_dist_per_question_{column_name}.csv',index=False)
    del answer_data
    gc.collect()
    print('==================================================')

In [11]:
%%time
for column_name in student_data.columns[1:]:
    create_distributions_from_column(column_name)

SG_UF_PROVA
read data


100%|██████████████████████████████████████████████████████████████████████████████| 5255/5255 [01:05<00:00, 80.81it/s]


created distributions
TP_ESCOLA
read data


100%|████████████████████████████████████████████████████████████████████████████████| 585/585 [01:08<00:00,  8.52it/s]


created distributions
RENDA
read data


100%|██████████████████████████████████████████████████████████████████████████████| 3305/3305 [01:07<00:00, 48.71it/s]


created distributions
TP_SEXO
read data


100%|████████████████████████████████████████████████████████████████████████████████| 390/390 [01:12<00:00,  5.39it/s]


created distributions
TP_COR_RACA
read data


100%|██████████████████████████████████████████████████████████████████████████████| 1165/1165 [01:09<00:00, 16.73it/s]


created distributions
NSE_LEVEL
read data


100%|██████████████████████████████████████████████████████████████████████████████| 1560/1560 [01:00<00:00, 25.83it/s]


created distributions
CPU times: total: 13min 32s
Wall time: 24min 32s


In [12]:
nse_map = {1:'Very Low',2:'Very Low',3:'Medium',4:'Medium',5:'Medium',6:'Medium',7:'Very High',8:'Very High'}

In [15]:
#generate NSE AND RACE data
answer_data = pd.read_csv('../MicrodadosAlunos/answer_data.csv')
answer_data['RESPOSTA'] = answer_data['RESPOSTA'].replace('.','*')
print('read data')
answer_data = pd.merge(answer_data,student_data[['ID','NSE_LEVEL','TP_COR_RACA']],on='ID')
answer_data['NSE_LEVEL'] = answer_data['NSE_LEVEL'].apply(lambda nse:nse_map[nse])
answer_data = answer_data.groupby(['CO_ITEM','NSE_LEVEL','TP_COR_RACA'])['RESPOSTA'].agg(list).reset_index()
answer_data['answer_distribution'] = answer_data['RESPOSTA'].progress_apply(lambda x:convert_to_prob_dist(np.array(x)))
print('created distributions')
answer_data['TX_GABARITO'] = answer_data['CO_ITEM'].apply(lambda x:code_question_mapping[code_question_mapping['CO_ITEM']==x]['TX_GABARITO'].iloc[0])
answer_data['correct_probability'] = answer_data.apply(lambda row: 0.0 if row['TX_GABARITO'] not in ['A','B','C','D','E'] else row['answer_distribution'][ord(row['TX_GABARITO'])-65],axis=1)
answer_data.rename(columns={'NSE_LEVEL':'NSE_CLASS'},inplace=True)
answer_data[['CO_ITEM','NSE_CLASS','TP_COR_RACA','TX_GABARITO','answer_distribution','correct_probability']].to_csv(f'DISTRIBUTIONS/answer_dist_per_question_RACE_NSE.csv',index=False)
del answer_data
gc.collect()
print('==================================================')

read data


100%|█████████████████████████████████████████████████████████████████████████████| 3475/3475 [00:34<00:00, 100.95it/s]


created distributions


KeyError: "['NSE_CLASS'] not in index"

In [16]:
answer_data.rename(columns={'NSE_LEVEL':'NSE_CLASS'},inplace=True)
answer_data[['CO_ITEM','NSE_CLASS','TP_COR_RACA','TX_GABARITO','answer_distribution','correct_probability']].to_csv(f'DISTRIBUTIONS/answer_dist_per_question_RACE_NSE.csv',index=False)