<a href="https://colab.research.google.com/github/lucianocoelho-28/dio-curso-etl/blob/main/limpeza.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fundamentos de ETL com Python

### Acessar os dados no link do CENIPA
link: https://www2.fab.mil.br/cenipa/

https://dados.gov.br/dataset/ocorrencias-aeronauticas-da-aviacao-civil-brasileira

## 1. Etapa de limpeza de dados

In [None]:
#Importando as bibliotecas
import pandas as pd

In [None]:
#Criando nosso DataFrame
df = pd.read_csv("/content/ocorrencia_2010_2020.csv", sep=";", parse_dates=['ocorrencia_dia'], dayfirst=True)
df.head()

Unnamed: 0,codigo_ocorrencia,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
0,40211,40211,INCIDENTE,RIO DE JANEIRO,RJ,****,2010-01-03,12:00:00,0
1,40349,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0
2,40351,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0
3,39527,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,****,2010-01-04,17:30:00,0
4,40324,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0


In [None]:
#Localizando dados em uma coluna
df.loc[1, "ocorrencia_cidade"]

'BELÉM'

In [None]:
#Localizando dados em uma sequencia de colunas 
df.loc[1:3]

Unnamed: 0,codigo_ocorrencia,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
1,40349,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0
2,40351,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0
3,39527,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,****,2010-01-04,17:30:00,0


In [None]:
#Localizando dados em colunas diferentes
df.loc[[10,40]]

Unnamed: 0,codigo_ocorrencia,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
10,39789,39789,INCIDENTE,SÃO PEDRO DO SUL,RS,****,2010-01-10,21:30:00,0
40,39158,39158,INCIDENTE,BELÉM,PA,****,2010-01-28,16:00:00,0


In [None]:
#Localizando todos os dados de uma coluna
df.loc[:,"ocorrencia_cidade"]

0           RIO DE JANEIRO
1                    BELÉM
2           RIO DE JANEIRO
3       LUCAS DO RIO VERDE
4                  PELOTAS
               ...        
5747              CAMPINAS
5748     LAGOA DA CONFUSÃO
5749        RIO DE JANEIRO
5750             VICENTINA
5751        RIO DE JANEIRO
Name: ocorrencia_cidade, Length: 5752, dtype: object

In [None]:
#Verificando se a coluna possui dados unicos
df.codigo_ocorrencia.is_unique

True

In [None]:
#Definindo indice para o DataFrame como codigo_ocorrencia
df.set_index('codigo_ocorrencia', inplace=True)

In [None]:
#Listando nosso DataFrame alterando o indice
df.head()

Unnamed: 0_level_0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
codigo_ocorrencia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
40211,40211,INCIDENTE,RIO DE JANEIRO,RJ,****,2010-01-03,12:00:00,0
40349,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0
40351,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0
39527,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,****,2010-01-04,17:30:00,0
40324,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0


In [None]:
#Listando DataFrame com codigo de ocorrencia
df.loc[40324]

codigo_ocorrencia2                        40324
ocorrencia_classificacao              INCIDENTE
ocorrencia_cidade                       PELOTAS
ocorrencia_uf                                RS
ocorrencia_aerodromo                       SBPK
ocorrencia_dia              2010-01-05 00:00:00
ocorrencia_hora                        19:25:00
total_recomendacoes                           0
Name: 40324, dtype: object

In [None]:
#Voltar o indice como antes
df.reset_index(drop=True, inplace=True)

In [None]:
#Listando o DataFrame com indice como antes
df.head()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,****,2010-01-03,12:00:00,0
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,****,2010-01-04,17:30:00,0
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0


In [None]:
#Alterando os dados no DataFrame 
df.loc[0,'ocorrencia_aerodromo'] = ''

In [None]:
#Listando o DataFrame confirmando alteração *5m26s Realizando a limpeza de dados - Parte 2
df.head(1)

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0


In [None]:
#Alterar todos os dados de uma linha
df.loc[1] = 20

In [None]:
#Listando o DataFrame com as duas primeiras linhas
df.head(2)

In [None]:
#Alterar todos os dados de uma coluna
df.loc[:,'total_recomendacoes'] = 10

In [15]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,****,2010-01-04,17:30:00,0
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0
...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,****,2020-12-30,18:30:00,0
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,****,2020-12-31,09:00:00,0


In [33]:
#Realizando um backup de uma coluna no DataFrame
df['ocorrencia_uf_bkp'] = df.ocorrencia_uf

In [34]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0,MS


In [None]:
#Incluindo a informação GRAVE para ocorrencia_uf SP na ocorrencia_classificação
df.loc[df.ocorrencia_uf == 'SP', ['ocorrencia_classificacao']] = 'GRAVE'

In [35]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0,MS


In [36]:
#Listar todos que ocorrencia_uf for = SP
df.loc[df.ocorrencia_uf == 'SP']

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
11,40069,ACIDENTE,SÃO PAULO,SP,SBMT,2010-01-10,14:50:00,8,SP
16,39809,INCIDENTE,SÃO PAULO,SP,,2010-01-15,15:00:00,0,SP
18,39828,INCIDENTE,SANTOS,SP,,2010-01-15,17:45:00,0,SP
26,39847,INCIDENTE,SOROCABA,SP,,2010-01-20,13:10:00,0,SP
27,39768,INCIDENTE,CAMPINAS,SP,,2010-01-21,20:45:00,0,SP
...,...,...,...,...,...,...,...,...,...
5704,79739,INCIDENTE GRAVE,SÃO PAULO,SP,SBSP,2020-12-03,14:42:00,0,SP
5705,79705,INCIDENTE GRAVE,SOROCABA,SP,SDCO,2020-12-04,15:30:00,0,SP
5715,79718,INCIDENTE GRAVE,SÃO PAULO,SP,SBMT,2020-12-11,13:50:00,0,SP
5742,79800,INCIDENTE,SÃO PAULO,SP,SBMT,2020-12-28,10:15:00,0,SP


In [37]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0,MS


In [38]:
#Listando o DataFrame com as cinco primeiras linhas
df.head()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0,RS


In [39]:
#Limpar os dados no DataFrame
df.loc[df.ocorrencia_aerodromo == '****', ['ocorrencia_aerodromo']] = pd.NA

In [40]:
#Listando o DataFrame com as cinco primeiras linhas
df.head()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0,RS


###ocorrencia_uf ** ocorrencia_aerodromo ###! #### **** ***** ocorrencia_hora NULL

In [25]:
#Caso ocorra erro na limpeza do DataFrame
import warnings
import numpy as np
warnings.simplefilter(action='ignore', category=FutureWarning)
print('x' in np.arange(5))   #returns False, without Warning

False


In [41]:
#Limpar os dados no DataFrame
df.replace(['**','###!','####','****','*****','NULL'], pd.NA, inplace=True)

In [42]:
#Listar os dados NA dentro do DataFrame
df.isna().sum()

codigo_ocorrencia2             0
ocorrencia_classificacao       0
ocorrencia_cidade              0
ocorrencia_uf                  0
ocorrencia_aerodromo        2180
ocorrencia_dia                 0
ocorrencia_hora                1
total_recomendacoes            0
ocorrencia_uf_bkp              0
dtype: int64

In [43]:
#Listar os dados NULL dentro do DataFrame
df.isnull().sum()

codigo_ocorrencia2             0
ocorrencia_classificacao       0
ocorrencia_cidade              0
ocorrencia_uf                  0
ocorrencia_aerodromo        2180
ocorrencia_dia                 0
ocorrencia_hora                1
total_recomendacoes            0
ocorrencia_uf_bkp              0
dtype: int64

In [44]:
#Informar em todos os dados que possuirem nulos ou na trocar pelo valor 10
df.fillna(10, inplace=True)

In [45]:
#Listar os dados NULL dentro do DataFrame
df.isnull().sum()

codigo_ocorrencia2          0
ocorrencia_classificacao    0
ocorrencia_cidade           0
ocorrencia_uf               0
ocorrencia_aerodromo        0
ocorrencia_dia              0
ocorrencia_hora             0
total_recomendacoes         0
ocorrencia_uf_bkp           0
dtype: int64

In [46]:
#Listando o DataFrame com as cinco primeiras linhas
df.head()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,10,2010-01-04,17:30:00,0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0,RS


In [47]:
#Informar em todos os dados que possuirem 10 trocar para NA
df.replace([10], pd.NA, inplace=True)

In [48]:
#Listar os dados NULL dentro do DataFrame
df.isnull().sum()

codigo_ocorrencia2             0
ocorrencia_classificacao       0
ocorrencia_cidade              0
ocorrencia_uf                  0
ocorrencia_aerodromo        2180
ocorrencia_dia                 0
ocorrencia_hora                1
total_recomendacoes            2
ocorrencia_uf_bkp              0
dtype: int64

In [49]:
#Informar o valor 10 para a coluna total_recomendacoes que possuiam o valor anterior
df.fillna(value={'total_recomendacoes':10}, inplace=True)

In [50]:
#Listar os dados NULL dentro do DataFrame
df.isnull().sum()

codigo_ocorrencia2             0
ocorrencia_classificacao       0
ocorrencia_cidade              0
ocorrencia_uf                  0
ocorrencia_aerodromo        2180
ocorrencia_dia                 0
ocorrencia_hora                1
total_recomendacoes            0
ocorrencia_uf_bkp              0
dtype: int64

In [51]:
#Realizando um backup de uma coluna no DataFrame
df['total_recomendacoes_bkp'] = df.total_recomendacoes

In [52]:
#Listando o DataFrame com as cinco primeiras linhas
df.head()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp,total_recomendacoes_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ,0.0
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA,0.0
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ,0.0
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0.0,MT,0.0
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS,0.0


In [53]:
#Realizando a exclusão do backup de uma coluna no DataFrame
df.drop(['total_recomendacoes_bkp'], axis=1, inplace=True)

In [54]:
#Listando o DataFrame com as cinco primeiras linhas
df.head()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0.0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS


In [55]:
#Deletar todos os valores não informados
df.dropna()

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS
6,40215,INCIDENTE,COARI,AM,SBUY,2010-01-07,18:40:00,0.0,AM
...,...,...,...,...,...,...,...,...,...
5743,79824,ACIDENTE,RIO PARANAÍBA,MG,SNRP,2020-12-28,17:00:00,0.0,MG
5746,79769,INCIDENTE GRAVE,MANOEL URBANO,AC,SIMB,2020-12-29,18:30:00,0.0,AC
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0.0,SP
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0.0,RJ


In [56]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0.0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0.0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0.0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0.0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0.0,MS


In [57]:
#Deletar os dados na por coluna ocorrencia_uf
df.dropna(subset=['ocorrencia_uf'])

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0.0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0.0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0.0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0.0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0.0,MS


In [58]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0.0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0.0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0.0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0.0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0.0,MS


In [59]:
#Deletar dados duplicados no DataFrame
df.drop_duplicates(inplace=True)

In [60]:
#Listando o DataFrame
df

Unnamed: 0,codigo_ocorrencia2,ocorrencia_classificacao,ocorrencia_cidade,ocorrencia_uf,ocorrencia_aerodromo,ocorrencia_dia,ocorrencia_hora,total_recomendacoes,ocorrencia_uf_bkp
0,40211,INCIDENTE,RIO DE JANEIRO,RJ,,2010-01-03,12:00:00,0.0,RJ
1,40349,INCIDENTE,BELÉM,PA,SBBE,2010-01-03,11:05:00,0.0,PA
2,40351,INCIDENTE,RIO DE JANEIRO,RJ,SBRJ,2010-01-03,03:00:00,0.0,RJ
3,39527,ACIDENTE,LUCAS DO RIO VERDE,MT,,2010-01-04,17:30:00,0.0,MT
4,40324,INCIDENTE,PELOTAS,RS,SBPK,2010-01-05,19:25:00,0.0,RS
...,...,...,...,...,...,...,...,...,...
5747,79804,INCIDENTE,CAMPINAS,SP,SBKP,2020-12-29,19:00:00,0.0,SP
5748,79757,INCIDENTE GRAVE,LAGOA DA CONFUSÃO,TO,,2020-12-30,18:30:00,0.0,TO
5749,79802,INCIDENTE,RIO DE JANEIRO,RJ,SBGL,2020-12-30,00:54:00,0.0,RJ
5750,79756,INCIDENTE GRAVE,VICENTINA,MS,,2020-12-31,09:00:00,0.0,MS
