In [198]:
import pandas as pd
import os,re
import numpy as np

In [199]:
# Exam CSVs to process. os.listdir returns entries in arbitrary order and also
# lists non-CSV entries (e.g. .ipynb_checkpoints), so keep only .csv names and
# sort them to make every run process the same files in the same order.
parsed = sorted(name for name in os.listdir('Data') if name.lower().endswith('.csv'))

In [200]:
# codes[year][test] -> CO_PROVA value used further down to select the matching
# rows of the ITENS_PROVA_{year}.csv microdata for that subject test.
# Only the years listed here are supported (there is no 2014 entry), so any
# other year raises KeyError at lookup time.
codes = {
    2011: {'MT': 130, 'CN': 123, 'CH': 119, 'LC': 126},
    2012: {'CN': 153, 'CH': 154, 'LC': 155, 'MT': 156},
    2013: {'CN': 188, 'CH': 187, 'LC': 189, 'MT': 190},
    2015: {'CN': 277, 'CH': 273, 'LC': 280, 'MT': 284},
    2016: {'CN': 307, 'CH': 308, 'LC': 309, 'MT': 310},
    2017: {'CN': 407, 'CH': 408, 'LC': 409, 'MT': 410},
    2018: {'CN': 463, 'CH': 464, 'LC': 465, 'MT': 466},
    2019: {'CN': 519, 'CH': 520, 'LC': 521, 'MT': 522},
    2020: {'CN': 604, 'CH': 574, 'LC': 584, 'MT': 594},
    2021: {'CN': 916, 'CH': 886, 'LC': 896, 'MT': 906},
    2022: {'CN': 1092, 'CH': 1062, 'LC': 1072, 'MT': 1082},
}

In [266]:
# Enriched exam frames, keyed by the source file name they were read from.
dfs = dict()


def match_gab_to_df(df,gab):
    """Pair every row of an exam frame with its answer-key (gabarito) row.

    Rows of ``df`` are visited in order. A row whose ``test`` is ``'redação'``
    gets an empty answer and the language ``'portuguese'``. Every other row is
    matched to the first row of ``gab`` that has the same question number and
    has not been handed out to an earlier row, so a question number that
    occurs several times is paired following the row order of ``gab``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``test`` and ``question_number``.
    gab : pandas.DataFrame
        Must contain the columns ``question``, ``test`` and ``answer``. Its row
        order decides which of several rows for one question is used first.

    Returns
    -------
    tuple[list, list]
        ``(answers, languages)`` with one entry per row of ``df``. A gabarito
        row whose ``test`` is missing (NaN) is reported as ``'portuguese'``.

    Raises
    ------
    IndexError
        If a question has no gabarito row left to hand out, either because
        none exists or because all of them were already matched.
    """
    answers=[]
    languages=[]
    # positional indices of the gabarito rows that were already handed out;
    # every gabarito row is used at most once
    consumed = set()
    gab_questions = gab['question']
    for line in df.to_dict(orient='records'):
        if line['test']=='redação':
            answers.append('')
            languages.append('portuguese')
            continue

        question_number = line['question_number']
        candidates = (gab_questions==question_number).to_numpy().nonzero()[0]
        available = [position for position in candidates if position not in consumed]
        if not available:
            raise IndexError(
                f'no unused gabarito row for question {question_number!r} '
                f'({len(candidates)} candidate row(s), all already matched)'
            )
        consumed.add(available[0])
        gabarito = gab.iloc[available[0]]

        answers.append(gabarito['answer'])
        languages.append(gabarito['test'] if not pd.isna(gabarito['test']) else 'portuguese')
    return answers,languages


# For every parsed exam file: label each question with year / number / subject
# test, attach the official answer and language from the gabarito CSV, join the
# per-item microdata and write one CSV per subject test.

# The to_csv calls below fail if the output folder is missing (fresh checkout).
os.makedirs('ProvasComMicrodados', exist_ok=True)

for prova in parsed:
    df = pd.read_csv('Data/'+prova)
    # the exam year is the first run of four digits in the file name
    year = int(re.search(r"\d\d\d\d",prova).group())
    df['question'] = df['question'].apply(lambda question: question.upper())
    df['year'] = year
    # first run of digits in the question label, NaN when the label has none
    df['question_number'] = df['question'].apply(lambda question: int(re.search(r"[\d]+",question).group()) if re.search(r"[\d]+",question) else np.nan)
    # default label; rows without a question number keep it because the
    # comparisons below are False for NaN
    df['test']='redação'
    if 'D1' in prova or 'ia1' in prova:
        if year < 2017:
            df.loc[df['question_number']<=45,'test'] = 'CH'
            df.loc[df['question_number']>45,'test'] = 'CN'
        else:
            df.loc[df['question_number']<=45,'test'] = 'LC'
            df.loc[df['question_number']>45,'test'] = 'CH'
    else:
        if year < 2017:
            df.loc[df['question_number']<=135,'test'] = 'LC'
            df.loc[df['question_number']>135,'test'] = 'MT'
        else:
            df.loc[df['question_number']<=135,'test'] = 'CN'
            df.loc[df['question_number']>135,'test'] = 'MT'
    #add gabarito data
    gab = pd.read_csv('GabaritosCsv/'+prova.replace('CAD','GAB')).sort_values(by=['question','test'])
    df['answer'],df['language'] = match_gab_to_df(df,gab)
    #we want to sort dataframe in the following order: english -> spanish -> portuguese then question orders
    #this is useful for the adding of microdata
    df['sort_index'] = df['language'].apply(lambda lang: 0 if lang == 'english' else 1 if lang=='spanish' else 2)
    df = df.sort_values(by=['sort_index','question_number']).drop(columns=['sort_index'])

    #add microdata data
    important_columns = ['CO_PROVA','NU_PARAM_A','NU_PARAM_B','NU_PARAM_C','TP_LINGUA','CO_POSICAO','CO_HABILIDADE','TX_GABARITO']
    microdata = pd.read_csv(f'microdados/ITENS_PROVA_{year}.csv',sep=';',usecols=important_columns, encoding='latin-1')
    # cast to object before relabelling: writing strings into a numeric column
    # is deprecated in recent pandas releases
    microdata['TP_LINGUA'] = microdata['TP_LINGUA'].astype(object)
    microdata.loc[microdata['TP_LINGUA'].isna(),'TP_LINGUA']='portuguese'
    microdata.loc[microdata['TP_LINGUA']==0,'TP_LINGUA']='english'
    microdata.loc[microdata['TP_LINGUA']==1,'TP_LINGUA']='spanish'

    #divide tests
    for test in df['test'].drop_duplicates().tolist():
        one_subject_test = df[df['test']==test].copy().reset_index(drop=True)
        if test == 'redação':
            # NOTE(review): unlike the files written below, this one keeps the index column
            one_subject_test.to_csv(f'ProvasComMicrodados/ENEM_{year}_{test}.csv')
            continue

        CO_PROVA = codes[year][test]
        # Filter into a separate frame: `microdata` must stay complete because
        # the next test of this file selects a different CO_PROVA from it.
        test_microdata = microdata[microdata['CO_PROVA']==CO_PROVA].copy()
        one_subject_test['CO_PROVA'] = CO_PROVA
        # suffix of the output file name; marked below when the answers cannot be reconciled
        file_tag = CO_PROVA

        # First attempt: join on question number and language. It counts as
        # failed when the merge raises or any microdata answer differs.
        try:
            output = pd.merge(one_subject_test,test_microdata,left_on=['question_number','language'],right_on=['CO_POSICAO','TP_LINGUA'])
            answers_agree = not (output['TX_GABARITO']!=output['answer']).any()
        except Exception:
            answers_agree = False

        if not answers_agree:
            # Fallback: join on the 1-based position of the question within this test.
            one_subject_test['match'] = list(one_subject_test.index+1)
            output = pd.merge(one_subject_test,test_microdata,left_on=['match'],right_on=['CO_POSICAO']).drop(columns=['match'])
            if (output['TX_GABARITO']!=output['answer']).any():
                print(f'Gabarito and answer should be the same for test {prova} {test}')
                file_tag = str(CO_PROVA) + 'MAYBEBROKENDONOTUSE'

        # NOTE(review): both merges are inner joins, so unmatched questions are
        # dropped without any warning - confirm that this is intended.
        output = output.drop(columns=['CO_PROVA_x','TP_LINGUA','TX_GABARITO','CO_POSICAO'])
        output = output.rename(columns={'CO_PROVA_y':'CO_PROVA'})
        output['question_number'] = output['question_number'].astype(int)
        output.to_csv(f'ProvasComMicrodados/ENEM_{year}_{test}_CO_PROVA_{file_tag}.csv',index=False)
    dfs[prova] = df

Gabarito and answer should be the same for test ia1_caderno3_CAD_branco_ledor_2011.csv CH
Gabarito and answer should be the same for test ia1_caderno3_CAD_branco_ledor_2012.csv CH
Gabarito and answer should be the same for test ia1_caderno3_CAD_branco_ledor_2013.csv CH
