# Preliminary Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objs as go

In [None]:
# dataset imported from kaggle at: https://www.kaggle.com/datasets/uciml/electric-power-consumption-data-set?resource=download
file_path = 'data/household_power_consumption.zip'

# Fields in the source file are separated by ';'.
df = pd.read_csv(file_path, delimiter=';')

# 'Date' and 'Time' are still text at this point. Sorting the raw strings would
# order day/month/year dates lexicographically (e.g. '1/1/2007' before
# '16/12/2006'), so each sort column is parsed before comparison. The stored
# columns themselves are left untouched; only the sort key is parsed.
_SORT_FORMATS = {'Date': "%d/%m/%Y", 'Time': "%H:%M:%S"}
df.sort_values(
    by=['Date', 'Time'],
    key=lambda column: pd.to_datetime(column, format=_SORT_FORMATS[column.name]),
    inplace=True,
)
print(df.shape)
df.head()

## Part 1: Data Quality Checks

#### Null/Missing Values

In [None]:
# Show every row that is incomplete: either a cell holds the '?' placeholder
# or a cell is NaN.
has_placeholder = (df == '?').any(axis=1)
has_nan = df.isna().any(axis=1)
df[has_placeholder | has_nan]

In [None]:
# Remove rows containing NaN, then rows where any cell holds the '?' placeholder.
df.dropna(inplace=True)
# The placeholder test is done column-wise in one vectorised comparison rather
# than calling a Python function once per row. `.copy()` makes the result an
# independent frame, so the in-place edits in later cells do not operate on a
# slice of the unfiltered data.
df = df[~df.eq('?').any(axis=1)].copy()

#### Data Types

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

In [None]:
# Combine the separate date and time text columns into one timestamp column,
# then discard the two text columns.
df['Date_time'] = pd.to_datetime(
    df['Date'] + ' ' + df['Time'],
    format="%d/%m/%Y %H:%M:%S",
)
df.drop(columns=['Date', 'Time'], inplace=True)

# Everything that is left besides the timestamp is cast to float, one column
# at a time; each column name is printed as it is converted.
measurement_columns = [name for name in df.columns if name != 'Date_time']
for name in measurement_columns:
    print(name)
    df[name] = df[name].astype(float)

#### Duplicates

In [None]:
# Display rows that repeat an earlier row in every column.
is_repeat = df.duplicated()
df[is_repeat]

## Part 2: Identifying Data Characteristics

In [None]:
# Display summary statistics for the frame's columns.
df.describe()

In [None]:
# One histogram row per column, excluding the timestamp column.
plotted_columns = [name for name in df.columns if name != 'Date_time']

# The titles are passed as a plain list with exactly one entry per subplot, so
# each title lines up with the histogram drawn in that row.
fig = sp.make_subplots(
    rows=len(plotted_columns),
    cols=1,
    subplot_titles=plotted_columns,
)

for row_number, name in enumerate(plotted_columns, start=1):
    fig.add_trace(go.Histogram(x=df[name], name=name), row=row_number, col=1)

fig.update_layout(
    height=1000,
    width=800,
    title_text="Distributions",
    showlegend=False,
)
fig.show()

In [None]:
# Tag every row with the calendar year of its timestamp and count the non-null
# values of the first column within each year.
with_year = df.assign(year=df['Date_time'].dt.year)
temp = with_year.groupby('year')[with_year.columns[0]].count()

# Bar chart: one bar per year, bar height = number of rows counted above.
bar_chart = go.Figure()
bar_chart.add_trace(go.Bar(x=temp.index, y=temp))
bar_chart.update_layout(
    title_text='Data Points Per Year',
    xaxis_title='Year',
    yaxis_title='Count',
)
bar_chart.show()

In [None]:

# Voltage over time, one stacked subplot per calendar year from 2006 to 2010.
fig = sp.make_subplots(rows=5, cols=1)
fig.update_layout(height=1000, width=1000)

year_of_row = df['Date_time'].dt.year
for row_number, year in enumerate(range(2006, 2011), start=1):
    # Rows for this year, in time order so the line is drawn left to right.
    yearly = df[year_of_row == year].sort_values(by='Date_time')
    fig.add_trace(
        go.Scatter(x=yearly['Date_time'], y=yearly['Voltage'], mode='lines', name=year),
        row=row_number,
        col=1,
    )

fig.update_layout(title_text="Voltage", showlegend=False)
fig.show()

In [None]:
# Voltage over time for 2010, one stacked subplot per calendar month.
fig = sp.make_subplots(rows=12, cols=1)
fig.update_layout(height=1000, width=1000)

in_2010 = df['Date_time'].dt.year == 2010
month_of_row = df['Date_time'].dt.month
for row_number, month in enumerate(range(1, 13), start=1):
    # Rows for this month of 2010, in time order.
    monthly = df[in_2010 & (month_of_row == month)].sort_values(by='Date_time')
    fig.add_trace(
        go.Scatter(x=monthly['Date_time'], y=monthly['Voltage'], mode='lines', name=month),
        row=row_number,
        col=1,
    )

fig.update_layout(title_text="Voltage", showlegend=False)
fig.show()

In [None]:
# Select the October 2010 rows in time order; later cells analyse and export
# this sample.
timestamps = df['Date_time']
is_october_2010 = (timestamps.dt.year == 2010) & (timestamps.dt.month == 10)
temp_df = df[is_october_2010].sort_values(by='Date_time')
temp_df


In [None]:
# Discrete Fourier transform of the sampled voltage and its power spectrum.
dt = 1  # sample spacing: one reading per minute
n = len(temp_df)
f = temp_df['Voltage']  # the mean is not subtracted from the signal

fhat = np.fft.fft(f, n)            # complex Fourier coefficients
PSD = fhat * np.conj(fhat) / n     # power per frequency bin (stored as complex)
freq = (1/(dt*n)) * np.arange(n)   # frequency value associated with each bin
# Indices of the first half of the spectrum, skipping bin 0.
L = np.arange(1, n//2, dtype='int')

# Two stacked panels: the voltage signal on top, its power spectrum below.
# The figure is assembled here but not displayed by this cell.
fig = sp.make_subplots(rows=2, cols=1)
fig.add_trace(
    go.Scatter(x=temp_df['Date_time'], y=f, mode='lines', name='Actual Voltage'),
    row=1,
    col=1,
)

line_chart = go.Scatter(x=freq[L], y=np.real(PSD[L]), mode='lines', name='Power Spectrum')
fig.add_trace(line_chart, row=2, col=1)

In [None]:
# Filter the voltage signal in the frequency domain: keep only the Fourier
# coefficients whose power exceeds a threshold, then transform back.
psd_threshold = 4000
# PSD is stored as a complex array; compare its real part against the threshold.
indices = np.real(PSD) > psd_threshold
PSD_clean = PSD * indices
# The filtered coefficients get their own name so the unfiltered `fhat` stays
# available and this cell can be re-run with a different threshold.
fhat_clean = indices * fhat
ffilt = np.fft.ifft(fhat_clean)

fig = sp.make_subplots(rows=4, cols=1)
fig.update_layout(height=800)

# Row 1: the original signal.
line_chart1 = go.Scatter(x=temp_df['Date_time'], y=f, mode='lines', name='Actual Voltage')
fig.add_trace(line_chart1, row=1, col=1)

# Row 2: the power spectrum (first half, bin 0 excluded via L).
line_chart2 = go.Scatter(x=freq[L], y=np.real(PSD[L]), mode='lines', name='Power Spectrum')
fig.add_trace(line_chart2, row=2, col=1)

# Row 3: the filtered signal; only the real part of the inverse transform is plotted.
line_chart3 = go.Scatter(x=temp_df['Date_time'], y=np.real(ffilt), mode='lines', name='Denoisified Voltage')
fig.add_trace(line_chart3, row=3, col=1)

# Row 4: original and filtered signals overlaid for comparison.
fig.add_trace(line_chart1, row=4, col=1)
fig.add_trace(line_chart3, row=4, col=1)

fig.show()

In [None]:
# Write the sampled frame to disk as CSV.
output_path = 'data/sampled_household_power_consumption.csv'
temp_df.to_csv(output_path)