# Read tables from pdf files example notebook

### Import modules

In [28]:
import pandas as pd, numpy as np, tabula

### Load data from pdf

In [92]:
enc = 'Windows-1252'

tables = tabula.read_pdf('UFFMudancadeCurso2023.pdf', pages='all', encoding=enc)
for table in tables:
    table.columns = [col if '\r' not in col else col.split('\r')[0] for col in table.columns]

table = pd.concat(tables)

---
## Data Cleaning

Is usually necessary to perform some data cleaning with the resulting dataset.

##### Missing values per row

In [93]:
table.isna().sum(1).value_counts()

8     607
15     24
dtype: int64

##### Drop unformatted rows

In [94]:
keep_msk = table.isna().sum(1) != 15

table = table[keep_msk]

cols = ['Matematica', 'Redação', 'Nota']
for col in cols: table.loc[:, col] = table[col].replace('---', np.nan).astype(str).str.replace(',', '.').astype(float)

  for col in cols: table.loc[:, col] = table[col].replace('---', np.nan).astype(str).str.replace(',', '.').astype(float)


---
## Exploratory Data analysis

### 1. General exam result

#### Sort students by final grade

In [95]:
table.sort_values('Nota', ascending=False)[['Nome do Candidato', 'Nota']].reset_index(drop=True).head(50).tail(10)

Unnamed: 0,Nome do Candidato,Nota
40,LUCIO **************,80.0
41,JULIA **************,78.0
42,GUSTAVO ************,78.0
43,MARIANA ************,78.0
44,ALICIA *************,78.0
45,MAYARA *************,78.0
46,KEVIN **************,77.85
47,LUIS ***************,77.85
48,GABRIEL ************,77.5
49,ISABELLA ***********,77.5


#### Number of students who took exam

In [97]:
len(table)

607

## Specific disciplines grades

In [114]:
main_cols = ['Nome do Candidato', 'Matematica', 'Redação', 'Nota']

#### Best final grade among those who took the math exam

In [111]:
math = table[~table['Matematica'].isna()]; print(f'Students who took the math exam: {len(math)}')
print()
math.sort_values('Nota', ascending=False)[main_cols].head()

Students who took the math exam: 184



Unnamed: 0,Nome do Candidato,Matematica,Redação,Nota
1,LUIS ***************,75.0,90.0,77.85
0,KEVIN **************,75.0,80.0,77.85
0,GABRIEL ************,55.0,95.0,77.5
1,PEDRO **************,90.0,85.0,77.08
1,NICOLE *************,50.0,80.0,76.0


#### Best in math

In [121]:
math.sort_values('Matematica', ascending=False)[main_cols].reset_index().head(6)

Unnamed: 0,index,Nome do Candidato,Matematica,Redação,Nota
0,1,PEDRO **************,90.0,85.0,77.08
1,1,JOAO VICTOR ********,85.0,65.0,69.44
2,3,BRUNO **************,75.0,60.0,71.42
3,0,KEVIN **************,75.0,80.0,77.85
4,0,HELIO **************,75.0,80.0,70.0
5,1,LUIS ***************,75.0,90.0,77.85


#### Best in writing exam

In [124]:
math.sort_values('Redação', ascending=False)[main_cols].reset_index().head(8)

Unnamed: 0,index,Nome do Candidato,Matematica,Redação,Nota
0,2,GIOVANNA ***********,35.0,100.0,52.0
1,0,GABRIEL ************,55.0,95.0,77.5
2,0,MARIANA ************,65.0,95.0,70.0
3,1,HENRIQUE ***********,55.0,95.0,68.07
4,2,JEAN CARLOS ********,20.0,95.0,74.0
5,1,GUSTAVO ************,65.0,95.0,72.5
6,7,IGHOR **************,35.0,90.0,47.69
7,1,LUIS ***************,75.0,90.0,77.85
