# Importando o DataSet

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load seaborn's 'titanic' example dataset into a DataFrame
df_titanic = sns.load_dataset(name='titanic')

> Como visto na aula anterior, uma das etapas mais importantes do **Pré-Processamento** é o que chamamos de **Imputação.**

> Nela, tratamos valores vazios e removemos dados indesejados.

- Agora, a partir do DataSet Titanic, faça a imputação dos dados de acordo com as exigências abaixo!

## Imputação

Decisões em cima dos dados vazios:
- **Média**
  - Desvio Padrão
- **Moda**
- **Mediana**
- **Remoção Total:** Remoção de todos os valores faltantes, importante para colunas inúteis.

Funções usadas:
- `.fillna()` : preenche os valores vazios com algum valor.
    - `df.fillna({'exemplo': df['exemplo'].mean()}, inplace=True)`
- `.replace()` : substitui um valor por outro.
    - `df['floor'].replace('-', np.nan)`
- `.dropna()` : remove linhas que contêm valores vazios.
    - `.dropna(subset=['exemplo'], inplace=True)`
- `.drop()` : remove uma ou mais colunas.
    - `.drop(columns=['exemplo_1', 'exemplo_2', ..., 'exemplo_n'], inplace=True)`

Obs.: o uso de `inplace=True` faz com que as alterações sejam aplicadas diretamente no DataFrame original, em vez de retornar uma cópia modificada.

In [None]:
# Preview the first five rows to see which columns and values are present
df_titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# (rows, columns) of the dataset as loaded, before any cleaning
df_titanic.shape

(891, 15)

In [None]:
# Summary statistics of the numeric columns; a 'count' below the row total reveals missing values
df_titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


1. Imputação da coluna 'age' com a mediana

In [None]:
# Count the missing 'age' entries before any imputation is applied
idades_ausentes_antes = df_titanic['age'].isna().sum()
print("Valores ausentes antes da imputação: ", idades_ausentes_antes)

Valores ausentes antes da imputação:  177


In [None]:
# Show the last 15 'age' entries; the NaN rows are the ones still missing
print(df_titanic.loc[:, ['age']].tail(15))

      age
876  20.0
877  19.0
878   NaN
879  56.0
880  25.0
881  33.0
882  22.0
883  28.0
884  25.0
885  39.0
886  27.0
887  19.0
888   NaN
889  26.0
890  32.0


In [None]:
# Median of the observed ages (NaN entries are skipped by default); used as the fill value
mediana_idade = df_titanic.loc[:, 'age'].median()
print("Mediana da coluna 'age': ", mediana_idade)

Mediana da coluna 'age':  28.0


In [None]:
# Fill the missing ages with the previously computed median.
# The call is made on the DataFrame with a {column: value} mapping instead of
# on df_titanic['age']: an inplace fillna on a selected column is chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves df_titanic
# unchanged under Copy-on-Write (the default behaviour from pandas 3.0).
df_titanic.fillna({'age': mediana_idade}, inplace=True)

In [None]:
# Re-count the missing 'age' entries to confirm the imputation filled them all
idades_ausentes_depois = df_titanic['age'].isna().sum()
print("Valores ausentes apos imputacao: ", idades_ausentes_depois)

Valores ausentes apos imputacao:  0


In [None]:
# Last 15 'age' entries after the imputation, to compare with the earlier listing
print(df_titanic.loc[:, ['age']].tail(15))

      age
876  20.0
877  19.0
878  28.0
879  56.0
880  25.0
881  33.0
882  22.0
883  28.0
884  25.0
885  39.0
886  27.0
887  19.0
888  28.0
889  26.0
890  32.0


2. Decisão de remover valores faltantes na coluna 'embark_town'

In [None]:
# Preview the first rows of the 'embark_town' column
print(df_titanic.loc[:, ['embark_town']].head())

   embark_town
0  Southampton
1    Cherbourg
2  Southampton
3  Southampton
4  Southampton


In [None]:
# Count how many rows have no 'embark_town' value
embark_town_ausentes = df_titanic['embark_town'].isna().sum()
print("quantidade de valores faltantes: ", embark_town_ausentes)

quantidade de valores faltantes:  2


In [None]:
# Remove, in place, every row whose 'embark_town' is missing
# (the following cell re-counts the missing values to confirm)
df_titanic.dropna(axis='index', how='any', subset=['embark_town'], inplace=True)

In [None]:
# Confirm that no missing 'embark_town' values remain after dropping rows
print("quantidade de valores faltantes: ", df_titanic['embark_town'].isna().sum())

quantidade de valores faltantes:  0


3. Remover as colunas 'alive', 'class', 'embarked', 'deck', 'adult_male' e 'embark_town'

In [None]:
# Drop the selected columns in place.
# errors='ignore' skips labels that are already gone, so running this cell a
# second time is a no-op instead of raising KeyError.
colunas_removidas = ['alive', 'class', 'embarked', 'deck', 'adult_male', 'embark_town']
df_titanic.drop(columns=colunas_removidas, errors='ignore', inplace=True)

In [None]:
# Show the DataFrame after the columns were removed
df_titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,who,alone
0,0,3,male,22.0,1,0,7.25,man,False
1,1,1,female,38.0,1,0,71.2833,woman,False
2,1,3,female,26.0,0,0,7.925,woman,True
3,1,1,female,35.0,1,0,53.1,woman,False
4,0,3,male,35.0,0,0,8.05,man,True


- A decisão de remover estas colunas se dá pela redundância delas (por exemplo, 'alive' repete 'survived' e 'class' repete 'pclass').

Verificação

In [None]:
# Per-column count of the missing values that remain after the clean-up
df_titanic.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
who,0
alone,0


In [None]:
# Print a summary of the DataFrame (index range, columns, non-null counts, dtypes)
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  889 non-null    int64  
 1   pclass    889 non-null    int64  
 2   sex       889 non-null    object 
 3   age       889 non-null    float64
 4   sibsp     889 non-null    int64  
 5   parch     889 non-null    int64  
 6   fare      889 non-null    float64
 7   who       889 non-null    object 
 8   alone     889 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(2)
memory usage: 63.4+ KB


In [None]:
# (rows, columns) after the clean-up steps
df_titanic.shape

(889, 9)

In [None]:
# Summary statistics of the numeric columns after the clean-up steps
df_titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,29.315152,0.524184,0.382452,32.096681
std,0.48626,0.8347,12.984932,1.103705,0.806761,49.697504
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.8958
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# View the head of the DataFrame once more
df_titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,who,alone
0,0,3,male,22.0,1,0,7.25,man,False
1,1,1,female,38.0,1,0,71.2833,woman,False
2,1,3,female,26.0,0,0,7.925,woman,True
3,1,1,female,35.0,1,0,53.1,woman,False
4,0,3,male,35.0,0,0,8.05,man,True
