# Preparação dos dados

#### 1 - Carga dos dados no dataframe

In [1]:
import pandas as pd

# Realizando parse de data em coluna única no momento da carga dos dados no dataframe
df = pd.read_csv("../../Data/Raw/household_power_consumption.txt", sep=";", parse_dates=[['Date','Time']]) 

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


#### 2 - Verificações

In [2]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 8 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   Date_Time              2075259 non-null  datetime64[ns]
 1   Global_active_power    2075259 non-null  object        
 2   Global_reactive_power  2075259 non-null  object        
 3   Voltage                2075259 non-null  object        
 4   Global_intensity       2075259 non-null  object        
 5   Sub_metering_1         2075259 non-null  object        
 6   Sub_metering_2         2075259 non-null  object        
 7   Sub_metering_3         2049280 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(6)
memory usage: 126.7+ MB


> Podemos verificar que Sub_metering_3 contém menos valores não nulos e tem tipo diferente das outras. Pela descrição do problema todas deveriam ser numéricas

In [3]:
df.describe(include='all')

  df.describe(include='all')


Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2075259,2075259,2075259.0,2075259,2075259.0,2075259.0,2075259.0,2049280.0
unique,2075259,6534,896.0,5168,377.0,153.0,145.0,
top,2006-12-16 17:24:00,?,0.0,?,1.0,0.0,0.0,
freq,1,25979,472786.0,25979,169406.0,1840611.0,1408274.0,
first,2006-12-16 17:24:00,,,,,,,
last,2010-12-11 23:59:00,,,,,,,
mean,,,,,,,,6.458447
std,,,,,,,,8.437154
min,,,,,,,,0.0
25%,,,,,,,,0.0


> Descrição mostra **NaN** para colunas que deveriam ser numéricas, Ex.: Global_active_power, Voltage, etc., possível existência de dados com tipos diferentes

In [4]:
df.dtypes

Date_Time                datetime64[ns]
Global_active_power              object
Global_reactive_power            object
Voltage                          object
Global_intensity                 object
Sub_metering_1                   object
Sub_metering_2                   object
Sub_metering_3                  float64
dtype: object

> Verificação de tipos mostra object para as mesmas colunas, o que reforça a suposição

In [5]:
# Tentativa de converter uma das colunas em float
pd.to_numeric(df['Voltage'])


ValueError: Unable to parse string "?" at position 6839

> Erro dada a impossibilidade de converter a string '?' em número. Verificando o arquivo conseguimos ver que os valores faltantes estão marcado dessa forma.
![missing values](missing_values.png)

#### 3 - Ajustes

In [6]:
# Elimina as linhas com "?" definidos como dados faltantes
for col in df.columns:
    df = df[df[col] != "?"]

In [7]:
# Converte todas as colunas em float, menos a primeira que é do tipo datetime
for col in df.columns[1:]:
    df[col] = pd.to_numeric(df[col], downcast="float")

In [8]:
df.dtypes

Date_Time                datetime64[ns]
Global_active_power             float32
Global_reactive_power           float32
Voltage                         float32
Global_intensity                float32
Sub_metering_1                  float32
Sub_metering_2                  float32
Sub_metering_3                  float32
dtype: object

In [9]:
df.describe(include='all')

  df.describe(include='all')


Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2049280,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0,2049280.0
unique,2049280,,,,,,,
top,2006-12-16 17:24:00,,,,,,,
freq,1,,,,,,,
first,2006-12-16 17:24:00,,,,,,,
last,2010-12-11 23:59:00,,,,,,,
mean,,1.091615,0.1237145,240.8398,4.627758,1.121923,1.29852,6.458447
std,,1.057294,0.112722,3.239987,4.444396,6.153031,5.822026,8.437154
min,,0.076,0.0,223.2,0.2,0.0,0.0,0.0
25%,,0.308,0.048,238.99,1.4,0.0,0.0,0.0


#### 4 - Enriquecendo os dados com novas categorias (feature engenieering)

> Criando novas colunas de ano e mes. Podem ser importantes para padrões de consumos por mês ou ano.

In [10]:
df['Month'] =df['Date_Time'].dt.month

df['Year'] = df['Date_Time'].dt.year

df.head(3)

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Month,Year
0,2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0,12,2006
1,2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0,12,2006
2,2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0,12,2006


> Criando coluna para as estações do ano. Partiremos da premissa que as medições são realizadas no hemisfério norte e que só há mudanças de estação no início dos meses. Podem ser importantes para padrões de consumos por estação.
> 
> - Dez -> Fev = Inverno (1)<br>
> - Mar -> Mai = Primavera (2)<br> 
> - Jun -> Ago = Verão (3)<br>
> - Set -> Nov = Outono (4)<br>

In [11]:
seasons = [1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 1]

month_to_season = dict(zip(range(1,13), seasons))

df['Season']= df['Month'].map(month_to_season) 

df.head(3)

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Month,Year,Season
0,2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0,12,2006,1
1,2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0,12,2006,1
2,2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0,12,2006,1


> Como não existe diferença de importância entre as estações do ano, vamos gerar uma coluna para cada uma. Isso evita de que as distâncias calculadas pelos algoritmos de clusterização não deem mais importância a uma do que a outra.
> Neste caso a columa Season foi gerada por um label encode e as novas serão one hot encode.

In [12]:
df['Winter'] = df['Season'].apply(lambda x: 1 if x == 1 else 0)
df['Spring'] = df['Season'].apply(lambda x: 1 if x == 2 else 0)
df['Summer'] = df['Season'].apply(lambda x: 1 if x == 3 else 0)
df['Autunum'] = df['Season'].apply(lambda x: 1 if x == 4 else 0)

df.head(3)

Unnamed: 0,Date_Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Month,Year,Season,Winter,Spring,Summer,Autunum
0,2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0,12,2006,1,1,0,0,0
1,2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0,12,2006,1,1,0,0,0
2,2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0,12,2006,1,1,0,0,0


In [13]:
# Exclusão da coluna Date_Time e Season que não são mais necessárias

df.drop(['Season'], inplace=True, axis=1)
df.drop(['Date_Time'], inplace=True, axis=1)

In [14]:
df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Month,Year,Winter,Spring,Summer,Autunum
0,4.216,0.418,234.839996,18.4,0.0,1.0,17.0,12,2006,1,0,0,0
1,5.36,0.436,233.630005,23.0,0.0,1.0,16.0,12,2006,1,0,0,0
2,5.374,0.498,233.289993,23.0,0.0,2.0,17.0,12,2006,1,0,0,0
3,5.388,0.502,233.740005,23.0,0.0,1.0,17.0,12,2006,1,0,0,0
4,3.666,0.528,235.679993,15.8,0.0,1.0,17.0,12,2006,1,0,0,0


> Analisando a parte elétrica. Foram informadas as potências ativas e reativas totais, mas não a aparente total. Pelo triângulo de potências a potência aparente total pode ser descoberta, mas para a nossa análise não fará a diferença. Um consumidor residencial paga pela potência ativa consumida. A potência reativa tem mais a ver com a qualidade do consumo da potência total e só seria importante se o consumidor fosse industrial. No Brasil, por lei, a quantidade de energia reativa retornada à geração não pode ser maior que 8%. Vamos ignorá-la neste caso.

![Triangulo das Potências](triangulo_potencias.png)  fórmula ![Fórmula das Potências](formula_potencias.png)

Analogia

![Analogia](analogia_chopp.png) 

#### 5 - Salvando o dataframe preparado para a análise exploratória

In [15]:
df.to_pickle("../../Data/Processed/df_prepared.pkl")