# 0. Índice <a name="Contents"></a>
1. [Importando bibliotecas](#import)
2. [Construindo um dataframe](#read)
3. [Identificando dados ausentes/missing](#identificando)
4. [Tratando dados ausentes/missing](#tratando)
5. [Dados duplicados](#duplicados)
6. [Mapeamento](#map)



# 1. Importando bibliotecas <a name="import"></a>

<div style="text-align: right"
     
[Voltar ao índice](#Contents)

In [1]:
import pandas as pd
import numpy as np

# 2. Construindo um dataframe <a name="read"></a>
<div style="text-align: right"
     
[Voltar ao índice](#Contents)

In [2]:
df = pd.DataFrame(
    np.random.randn(9, 4)*100,
    index=["A", "B", "C", "D", "E", "F", "G", "H", "I"],
    columns=["coluna1", "coluna2", "coluna3","coluna4"],
)
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.953582,-67.442977,-73.769612,-10.159269
B,139.9499,-50.458165,37.011801,-37.72863
C,-27.217205,-101.427183,-26.773712,175.222603
D,-159.484467,-187.087899,-38.484682,-34.360479
E,162.159503,191.139872,-61.321718,153.668198
F,78.543959,17.981604,69.4069,122.530173
G,-68.44854,29.01261,27.689917,105.278341
H,117.208783,141.097471,-165.066589,-52.618897
I,-130.318826,-47.955866,-44.402117,-79.955325


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, A to I
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   coluna1  9 non-null      float64
 1   coluna2  9 non-null      float64
 2   coluna3  9 non-null      float64
 3   coluna4  9 non-null      float64
dtypes: float64(4)
memory usage: 360.0+ bytes


## Alterando os tipos de dados

In [4]:
df['coluna1'] = df['coluna1'].astype(int)

In [5]:
df.dtypes

coluna1      int32
coluna2    float64
coluna3    float64
coluna4    float64
dtype: object

In [6]:
df['coluna3'] = df['coluna3'].astype(str)

In [7]:
df.dtypes

coluna1      int32
coluna2    float64
coluna3     object
coluna4    float64
dtype: object

In [8]:
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37,-67.442977,-73.76961185869824,-10.159269
B,139,-50.458165,37.01180093049798,-37.72863
C,-27,-101.427183,-26.77371217954196,175.222603
D,-159,-187.087899,-38.48468160945872,-34.360479
E,162,191.139872,-61.32171791422029,153.668198
F,78,17.981604,69.40689951570812,122.530173
G,-68,29.01261,27.689916965797455,105.278341
H,117,141.097471,-165.06658947606712,-52.618897
I,-130,-47.955866,-44.40211703807853,-79.955325


## Acrescentando dados faltantes na tabela

In [9]:
df.iloc[4,2]

'-61.32171791422029'

In [10]:
df.iloc[4,2] = np.nan

In [11]:
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37,-67.442977,-73.76961185869824,-10.159269
B,139,-50.458165,37.01180093049798,-37.72863
C,-27,-101.427183,-26.77371217954196,175.222603
D,-159,-187.087899,-38.48468160945872,-34.360479
E,162,191.139872,,153.668198
F,78,17.981604,69.40689951570812,122.530173
G,-68,29.01261,27.689916965797455,105.278341
H,117,141.097471,-165.06658947606712,-52.618897
I,-130,-47.955866,-44.40211703807853,-79.955325


In [12]:
df.iloc[1,0] = np.nan
df.iloc[4,0] = np.nan
df.iloc[3,0] = np.nan
df.iloc[8,0] = np.nan
df.iloc[6,0] = np.nan
df.iloc[4,3] = np.nan
df.iloc[4,3] = np.nan
df.iloc[8,3] = np.nan

In [13]:
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.0,-67.442977,-73.76961185869824,-10.159269
B,,-50.458165,37.01180093049798,-37.72863
C,-27.0,-101.427183,-26.77371217954196,175.222603
D,,-187.087899,-38.48468160945872,-34.360479
E,,191.139872,,
F,78.0,17.981604,69.40689951570812,122.530173
G,,29.01261,27.689916965797455,105.278341
H,117.0,141.097471,-165.06658947606712,-52.618897
I,,-47.955866,-44.40211703807853,


In [14]:
# Os tipos de dados podem mudar após acrescentar um dado faltante
df.dtypes

coluna1    float64
coluna2    float64
coluna3     object
coluna4    float64
dtype: object

# 3. Identificando dados ausentes <a name="identificando"></a>
<div style="text-align: right"
     
[Voltar ao índice](#Contents)

In [15]:
df.isna()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,False,False,False,False
B,True,False,False,False
C,False,False,False,False
D,True,False,False,False
E,True,False,True,True
F,False,False,False,False
G,True,False,False,False
H,False,False,False,False
I,True,False,False,True


In [16]:
# Alias do isna
df.isnull()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,False,False,False,False
B,True,False,False,False
C,False,False,False,False
D,True,False,False,False
E,True,False,True,True
F,False,False,False,False
G,True,False,False,False
H,False,False,False,False
I,True,False,False,True


In [17]:
df['coluna1'].isna()

A    False
B     True
C    False
D     True
E     True
F    False
G     True
H    False
I     True
Name: coluna1, dtype: bool

In [18]:
df[df['coluna1'].isna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
B,,-50.458165,37.01180093049798,-37.72863
D,,-187.087899,-38.48468160945872,-34.360479
E,,191.139872,,
G,,29.01261,27.689916965797455,105.278341
I,,-47.955866,-44.40211703807853,


In [19]:
df[~df['coluna1'].isna()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.0,-67.442977,-73.76961185869824,-10.159269
C,-27.0,-101.427183,-26.77371217954196,175.222603
F,78.0,17.981604,69.40689951570812,122.530173
H,117.0,141.097471,-165.06658947606712,-52.618897


In [20]:
df['coluna1']

A     37.0
B      NaN
C    -27.0
D      NaN
E      NaN
F     78.0
G      NaN
H    117.0
I      NaN
Name: coluna1, dtype: float64

In [21]:
df['coluna1'].isna()

A    False
B     True
C    False
D     True
E     True
F    False
G     True
H    False
I     True
Name: coluna1, dtype: bool

In [22]:
df['coluna1'].isna().sum()

5

In [23]:
df.isna().sum()

coluna1    5
coluna2    0
coluna3    1
coluna4    2
dtype: int64

In [24]:
df['coluna2']

A    -67.442977
B    -50.458165
C   -101.427183
D   -187.087899
E    191.139872
F     17.981604
G     29.012610
H    141.097471
I    -47.955866
Name: coluna2, dtype: float64

In [25]:
df['coluna2'].isna().sum()

0

In [32]:
percentage = (df.isnull().sum() / len(df)) * 100
percentage.round(2)

coluna1    55.56
coluna2     0.00
coluna3    11.11
coluna4    22.22
dtype: float64

# 4. Tratando dados ausentes <a name="tratando"></a>
<div style="text-align: right"
     
[Voltar ao índice](#Contents)

In [33]:
df['coluna1']

A     37.0
B      NaN
C    -27.0
D      NaN
E      NaN
F     78.0
G      NaN
H    117.0
I      NaN
Name: coluna1, dtype: float64

## Substituindo por 0

In [53]:
df['coluna1'].fillna(0)

A     37.0
B      0.0
C    -27.0
D      0.0
E      0.0
F     78.0
G      0.0
H    117.0
I      0.0
Name: coluna1, dtype: float64

## Substituindo pela média

In [57]:
df['coluna1']

A     37.0
B      NaN
C    -27.0
D      NaN
E      NaN
F     78.0
G      NaN
H    117.0
I      NaN
Name: coluna1, dtype: float64

In [59]:
(37-27+78+117)/4

51.25

In [60]:
df['coluna1'].mean()

51.25

In [62]:
med_col1 = df['coluna1'].mean()

In [63]:
df['coluna1'].fillna(med_col1)

A     37.00
B     51.25
C    -27.00
D     51.25
E     51.25
F     78.00
G     51.25
H    117.00
I     51.25
Name: coluna1, dtype: float64

## Substituindo pela mediana

In [64]:
df['coluna1']

A     37.0
B      NaN
C    -27.0
D      NaN
E      NaN
F     78.0
G      NaN
H    117.0
I      NaN
Name: coluna1, dtype: float64

In [65]:
df['coluna1'].sort_values()

C    -27.0
A     37.0
F     78.0
H    117.0
B      NaN
D      NaN
E      NaN
G      NaN
I      NaN
Name: coluna1, dtype: float64

In [66]:
(37+78)/2

57.5

In [67]:
df['coluna1'].median()

57.5

In [68]:
mediana_col1 = df['coluna1'].median()

In [69]:
df['coluna1'].fillna(mediana_col1)

A     37.0
B     57.5
C    -27.0
D     57.5
E     57.5
F     78.0
G     57.5
H    117.0
I     57.5
Name: coluna1, dtype: float64

In [81]:
df['coluna1']

A     37.0
B      NaN
C    -27.0
D      NaN
E      NaN
F     78.0
G      NaN
H    117.0
I      NaN
Name: coluna1, dtype: float64

In [78]:
df['coluna1'].fillna(method='ffill')

A     37.0
B     37.0
C    -27.0
D    -27.0
E    -27.0
F     78.0
G     78.0
H    117.0
I    117.0
Name: coluna1, dtype: float64

In [80]:
df['coluna1'].dropna()

A     37.0
C    -27.0
F     78.0
H    117.0
Name: coluna1, dtype: float64

In [82]:
# dropar todas as linhas que tenha pelo menos 1 NA
df.dropna()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.0,-67.442977,-73.76961185869824,-10.159269
C,-27.0,-101.427183,-26.77371217954196,175.222603
F,78.0,17.981604,69.40689951570812,122.530173
H,117.0,141.097471,-165.06658947606712,-52.618897


# 5. Dados duplicados <a name="duplicados"></a>
<div style="text-align: right"
     
[Voltar ao índice](#Contents)

In [91]:
df_dup = df.append(df.loc['D':'H',:]).sort_index()
df_dup

  df_dup = df.append(df.loc['D':'H',:]).sort_index()


Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.0,-67.442977,-73.76961185869824,-10.159269
B,,-50.458165,37.01180093049798,-37.72863
C,-27.0,-101.427183,-26.77371217954196,175.222603
D,,-187.087899,-38.48468160945872,-34.360479
D,,-187.087899,-38.48468160945872,-34.360479
E,,191.139872,,
E,,191.139872,,
F,78.0,17.981604,69.40689951570812,122.530173
F,78.0,17.981604,69.40689951570812,122.530173
G,,29.01261,27.689916965797455,105.278341


In [94]:
df_dup.drop_duplicates()

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.0,-67.442977,-73.76961185869824,-10.159269
B,,-50.458165,37.01180093049798,-37.72863
C,-27.0,-101.427183,-26.77371217954196,175.222603
D,,-187.087899,-38.48468160945872,-34.360479
E,,191.139872,,
F,78.0,17.981604,69.40689951570812,122.530173
G,,29.01261,27.689916965797455,105.278341
H,117.0,141.097471,-165.06658947606712,-52.618897
I,,-47.955866,-44.40211703807853,


In [96]:
df_dup.drop_duplicates(subset=['coluna1'])

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,37.0,-67.442977,-73.76961185869824,-10.159269
B,,-50.458165,37.01180093049798,-37.72863
C,-27.0,-101.427183,-26.77371217954196,175.222603
F,78.0,17.981604,69.40689951570812,122.530173
H,117.0,141.097471,-165.06658947606712,-52.618897


In [97]:
df_dup.duplicated()

A    False
B    False
C    False
D    False
D     True
E    False
E     True
F    False
F     True
G    False
G     True
H    False
H     True
I    False
dtype: bool

In [47]:
df_dup[df_dup.duplicated()]

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
D,,-64.290138,-50.891017163242445,180.396043
E,,46.159592,,
F,86.0,115.97403,99.22420792191552,-16.118702
G,,117.71731,-249.37440300775089,-79.436463
H,2.0,78.292079,-150.65952036118293,-99.009263


# 6. Mapeamentos <a name="map"></a>
<div style="text-align: right"
     
[Voltar ao índice](#Contents)

In [102]:
# 1 feminino, 0 masculino
genero = pd.Series([1,0,1,1,1,1,0,0,0,1,1,0])
genero

0     1
1     0
2     1
3     1
4     1
5     1
6     0
7     0
8     0
9     1
10    1
11    0
dtype: int64

In [103]:
genero.map({1:'Feminino', 0:'Masculino'})

0      Feminino
1     Masculino
2      Feminino
3      Feminino
4      Feminino
5      Feminino
6     Masculino
7     Masculino
8     Masculino
9      Feminino
10     Feminino
11    Masculino
dtype: object

In [50]:
genero_2 = genero.map({1:'Feminino', 0:'Masculino'})

In [51]:
genero.map({1:'Feminino', 2:'Masculino'})

0     Feminino
1          NaN
2     Feminino
3     Feminino
4     Feminino
5     Feminino
6          NaN
7          NaN
8          NaN
9     Feminino
10    Feminino
11         NaN
dtype: object

In [52]:
genero_2.map('Genero: {}'.format)

0      Genero: Feminino
1     Genero: Masculino
2      Genero: Feminino
3      Genero: Feminino
4      Genero: Feminino
5      Genero: Feminino
6     Genero: Masculino
7     Genero: Masculino
8     Genero: Masculino
9      Genero: Feminino
10     Genero: Feminino
11    Genero: Masculino
dtype: object