## **IMPORTAÇÕES**

In [None]:
# Data handling
import pandas as pd
import numpy as np

# Pre-processing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

## **INTEGRAÇÃO DE DADOS**

In [None]:
# Read both raw CSV files and stack them row-wise into a single frame with a
# fresh 0..n-1 index; the combined data is split again further down.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
df = pd.concat([train, test], axis="index", ignore_index=True, copy=True)

## **LIMPEZA DE DADOS**

In [None]:
# Remove non-numeric columns: every object-dtype column is excluded, and
# 'pctsomecol18_24' is dropped as well.
df = (
    df
    .select_dtypes(exclude=['object'])
    .drop(columns=['pctsomecol18_24'])
)

In [None]:
# Boxplot of the target column before any filtering.
# The vector is passed explicitly as `x=`: a bare positional vector is
# interpreted differently across seaborn releases (older ones warn about
# positional variables, newer ones treat the first positional as `data`),
# so the keyword keeps the plot's meaning and orientation stable.
sns.boxplot(x=df['target_deathrate'])

In [None]:
# Compare how many rows satisfy 70 < target_deathrate < 280 (both bounds
# exclusive) against the full frame.
rate = df['target_deathrate']
in_range = rate.lt(280) & rate.gt(70)
print(df[in_range].shape)
print(df.shape)

In [None]:
# Keep only rows whose target_deathrate lies strictly between 70 and 280
# (presumably trimming the outliers seen in the earlier boxplot), then
# re-plot the distribution of the remaining values.
df = df[(df['target_deathrate'] < 280) & (df['target_deathrate'] > 70)]
# Pass the vector explicitly as `x=`: a bare positional vector is interpreted
# differently across seaborn releases, so the keyword keeps the plot stable.
sns.boxplot(x=df['target_deathrate'])

## **AMOSTRAGEM DE DADOS**

In [None]:
# Separate the target column from the remaining feature columns.
y = df['target_deathrate']
X = df.drop(columns=[y.name])

# Shuffled 80/20 hold-out split; the fixed random_state makes it repeatable.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Print the shape of each split, in the order y_train, X_train, y_test, X_test.
for split in (y_train, X_train, y_test, X_test):
    print(split.shape)

## **VALORES FALTANTES**

In [None]:
# Summarise the training features that contain missing values: the absolute
# count per column and its share of the training rows as a percentage.
train_null_counts = X_train.isnull().sum()
train_null_counts = train_null_counts[train_null_counts > 0]
pd.DataFrame(data={
    "Percentual (%)": (train_null_counts / X_train.shape[0] * 100).round(2),
    "Quantidade": train_null_counts,
})

In [None]:
# Fill missing values in these two columns with their training-set means.
# A single fillna with a per-column mapping returns a new frame, which avoids
# assigning columns on the frame produced by train_test_split (that pattern
# may emit pandas' SettingWithCopyWarning) and removes the duplicated lines.
imputed_columns = ['pctemployed16_over', 'pctprivatecoveragealone']
X_train = X_train.fillna(value=X_train[imputed_columns].mean())

In [None]:
# Summarise the test features that contain missing values: the absolute
# count per column and its share of the test rows as a percentage.
test_null_counts = X_test.isnull().sum()
test_null_counts = test_null_counts[test_null_counts > 0]
pd.DataFrame(data={
    "Percentual (%)": (test_null_counts / X_test.shape[0] * 100).round(2),
    "Quantidade": test_null_counts,
})

In [None]:
# Fill missing test values with the TRAINING-set means instead of the test
# set's own means, so that no statistic is estimated from the held-out data
# (the same rule the StandardScaler step follows by fitting on X_train only).
# Filling NaNs with a column's mean leaves that mean unchanged, so this is
# valid whether or not X_train has already been imputed; it must run while
# X_train is still a DataFrame, i.e. before the scaling step.
imputed_columns = ['pctemployed16_over', 'pctprivatecoveragealone']
X_test = X_test.fillna(value=X_train[imputed_columns].mean())

## **NORMALIZAÇÃO COLUNA ALVO**

In [None]:
# Replace both target vectors with log(1 + y).
# Note: this rebinds y_train / y_test, so re-running the cell applies the
# transform a second time.
y_train, y_test = (np.log1p(target) for target in (y_train, y_test))

In [None]:
# Histogram with a KDE overlay of the transformed training target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=y_train, kde=True, ax=ax)

In [None]:
# Histogram with a KDE overlay of the transformed test target.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=y_test, kde=True, ax=ax)

## **PADRONIZAÇÃO DOS DADOS**


### $z_{score} = \frac{x - x_{media}}{x_{std}}$

In [None]:
# Standardise the features with a z-score. The scaler's statistics are
# learned from the training split only and then applied to both splits.
# After this cell X_train / X_test hold the transformed output rather than
# the original DataFrames.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## **PERSISTINDO DADOS**

In [None]:
# Rebuild a labelled frame from the scaled training matrix, append the
# transformed target as the last column, write it to CSV, and display it.
train_feature_names = df.columns.drop('target_deathrate')
train = pd.DataFrame(X_train, columns=train_feature_names).assign(
    target_deathrate=y_train.to_list()
)
train.to_csv('../data/train_process.csv', index=False)
train

In [None]:
# Rebuild a labelled frame from the scaled test matrix, append the
# transformed target as the last column, write it to CSV, and display it.
test_feature_names = df.columns.drop('target_deathrate')
test = pd.DataFrame(X_test, columns=test_feature_names).assign(
    target_deathrate=y_test.to_list()
)
test.to_csv('../data/test_process.csv', index=False)
test

## **MATRIZ DE GRÁFICOS DE DISPERSÃO**

In [None]:
# Pairwise plot matrix of the processed training frame; kind='reg' adds a
# regression line to the scatter panels, drawn here in red.
regression_line_style = {'color': 'red'}
sns.pairplot(train, kind='reg', plot_kws={'line_kws': regression_line_style})