#### Importing modules

In [46]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# 1.1 Loading Student Socioeconomic Questionnarie - Raw Dataset

In [47]:
data_path = '/Users/luisr/Desktop/Repositories/Data/saep/CSV/{}'
filename = 'saep_qst_aluno.csv'
student_data = pd.read_csv(data_path.format(filename), index_col=0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  mask |= (ar1 == a)


In [48]:
student_data.head(3)

Unnamed: 0,ID_SAEB,ID_REGIAO,ID_UF,ID_MUNICIPIO,ID_ESCOLA,ID_DEPENDENCIA_ADM,ID_LOCALIZACAO,ID_CAPITAL,ID_TURMA,ID_TURNO,...,TX_RESP_Q053,TX_RESP_Q054,TX_RESP_Q055,TX_RESP_Q056,TX_RESP_Q057,TX_RESP_Q058,TX_RESP_Q059,TX_RESP_Q060,TX_RESP_Q061,TX_RESP_Q062
0,2011,1,11,1100015,11024682,2,1,2,52401,2,...,.,.,,,,,,,,
1,2011,1,11,1100015,11024682,2,1,2,52401,2,...,A,B,,,,,,,,
2,2011,1,11,1100015,11024682,2,1,2,52401,2,...,A,A,,,,,,,,


---

# 1. DATA CLEANING

In [49]:
data = student_data.copy()
ids = data.columns[:15]
questions = data.columns[15:]

# Column 'ID_TURNO'

* Print unique values in column

In [50]:
print('Valores únicos da coluna "ID_TURNO":', data['ID_TURNO'].unique()); print('')

Valores únicos da coluna "ID_TURNO": ['2' '1' '3' ' ' 1 2 3]



* Convert string values to integer type
* Replace missing values with NAN notation

In [51]:
data['ID_TURNO'] = data['ID_TURNO'].replace(' ', np.nan).astype(float)

Obs: Column 'ID_TURNO' is the only identification column with missing values

# Question Columns

In [52]:
# * Print unique values for all columns before cleaning
set(data[questions].values.reshape(-1))

{' ', '*', '.', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L'}

In [53]:
# * Unification of missiong values notation to NAN
for mark in ['*', ' ', '.']: data.replace(mark, np.nan, inplace=True)
# * Print unique values for all columns after cleaning
set(data[questions].values.reshape(-1))

{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', nan}

# Droping empty questionnaries (question values missing completely)

In [54]:
# drop completely empty rows (in the subset of the question columns)
data.dropna(how='all', subset=questions, inplace=True)
# Print data shape after dropping rows
data.shape

(849278, 77)

# Print unique values count for each identification column

In [55]:
categorical_cols = [
    'ID_SAEB', 'ID_REGIAO', 'ID_UF',
    'ID_DEPENDENCIA_ADM', 'ID_LOCALIZACAO', 'ID_CAPITAL',
    'ID_TURNO', 'ID_SERIE', 'IN_SITUACAO_CENSO',
    'IN_PROVA_BRASIL', 'IN_PREENCHIMENTO'
]
for col in categorical_cols:
    print(col, dict(data[col].value_counts()))

ID_SAEB {2011: 849278}
ID_REGIAO {2: 432467, 1: 416811}
ID_UF {23: 201111, 15: 166843, 21: 164870, 13: 107564, 22: 66486, 11: 42266, 17: 41944, 16: 22539, 12: 20996, 14: 14659}
ID_DEPENDENCIA_ADM {3: 566409, 2: 260657, 4: 20535, 1: 1677}
ID_LOCALIZACAO {1: 700335, 2: 148943}
ID_CAPITAL {2: 636393, 1: 212885}
ID_TURNO {2.0: 435424, 1.0: 384885, 3.0: 26538}
ID_SERIE {5: 468706, 9: 360455, 12: 20117}
IN_SITUACAO_CENSO {1: 849278}
IN_PROVA_BRASIL {1: 811587, 0: 37691}
IN_PREENCHIMENTO {1: 849278}


# Droping constant and repeated columns

In [56]:
empty_questions = data.loc[:, (data.isna().mean()>0.9)].columns.tolist(); empty_questions

['TX_RESP_Q059', 'TX_RESP_Q060', 'TX_RESP_Q061', 'TX_RESP_Q062']

In [58]:
out_cols = [
    'ID_SAEB', 'IN_SITUACAO_CENSO', 'IN_PREENCHIMENTO', # Constant columns
] + empty_questions

data.drop(out_cols, inplace=True, axis=1)

# Saving Clean Dataset

In [44]:
try: 
    os.mkdir('data'); os.mkdir('data/clean')
    print('Clean data folder created!')
except: None
    
# data.to_csv('data/clean/questionnaire.csv', index=False)
print(f'Saved at: {datetime.now()}')

Saved at: 2022-05-26 22:36:31.245489


In [45]:
data.head(3)

Unnamed: 0,ID_REGIAO,ID_UF,ID_MUNICIPIO,ID_ESCOLA,ID_DEPENDENCIA_ADM,ID_LOCALIZACAO,ID_CAPITAL,ID_TURMA,ID_TURNO,ID_SERIE,...,TX_RESP_Q049,TX_RESP_Q050,TX_RESP_Q051,TX_RESP_Q052,TX_RESP_Q053,TX_RESP_Q054,TX_RESP_Q055,TX_RESP_Q056,TX_RESP_Q057,TX_RESP_Q058
1,1,11,1100015,11024682,2,1,2,52401,2.0,5,...,A,B,A,B,A,B,,,,
2,1,11,1100015,11024682,2,1,2,52401,2.0,5,...,A,,A,A,A,A,,,,
3,1,11,1100015,11024682,2,1,2,52401,2.0,5,...,A,A,A,A,A,A,,,,


---

# 1.6 Loading Students Grades Dataset

### Loading

In [59]:
grades_filename = 'saep_res_aluno.csv'
grade_data = pd.read_csv(data_path.format(grades_filename))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [60]:
data = grade_data.copy()

# 1.8 Cleaning

In [61]:
grades = data.columns[-10:-1].tolist()

#### Replace grade columns empty cells notation to "NAN" & type conversion to float

In [62]:
data[grades] = data[grades].replace(' ', np.nan).astype(float)

#### Droping rows without any grade data

In [63]:
data.dropna(how='all', subset=grades, inplace=True)

#### Extracting non repeated grade columns plus student id column for later merging

In [66]:
data = data[['ID_ALUNO', 'PROFICIENCIA_LP', 'PROFICIENCIA_MT', 'DESVIO_PADRAO_LP', 'DESVIO_PADRAO_MT', 'PESO']]

# 1.9 Saving Clean Students Grades Dataset

In [67]:
data.to_csv('data/clean/grades.csv', index=False); print(f'Saved at: {datetime.now()}')

Saved at: 2022-05-27 13:19:55.760588
