In [None]:
"""
Course: dpat4
Lab: 04
Part: 1
"""

import time
from contextlib import contextmanager

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Nested timing registry: TIMER[lib][task] -> time used for that task.
TIMER: dict = {}


def record(lib: str, task: str, time_used: float) -> None:
    """
    Store the time a task took with a given library in the TIMER registry.

    A previous value for the same (lib, task) pair is overwritten.

    :param lib: name of the library used (numpy, pandas)
    :param task: task label (task1)
    :param time_used: time used to complete the task
    :return: None
    """
    # setdefault creates the per-library dict the first time a lib is seen
    TIMER.setdefault(lib, {})[task] = time_used


@contextmanager
def timer(lib: str, task: str):
    """
    Context manager that times the wrapped block and stores the result.

    The elapsed time is rounded to 6 decimals, handed to ``record`` and printed.
    This happens in a ``finally`` clause, so it also runs when the block raises.

    :param lib: name of the library used (numpy, pandas)
    :param task: task label (task1)
    :return: generator consumed by ``contextmanager``; yields nothing
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock, which may be adjusted while the block is running
    start = time.perf_counter()
    try:
        yield
    finally:
        time_used = round(time.perf_counter() - start, 6)
        record(lib, task, time_used)

        print(f'task: {task}\nlib: {lib}\ntime used: {time_used}')


# Location of the dataset file read by the task 0 cells.
# NOTE(review): despite the name, the value is a relative file path, not a URL.
URL = "household_power_consumption.csv"

# Marks the end of the setup cell in the notebook output
print('Setup complete...')

## Task No. 0
Desc: Імпортування датасету

In [None]:
# task: 0
# lib: pandas

# Load the semicolon-separated file, treating '?' as a missing value, then
# drop every row that contains a missing value. Both steps are timed together.
with timer('pandas', 'task0'):
    pd_data = pd.read_csv(URL, sep=';', na_values='?')
    # in-place: pd_data itself is modified and reused by the later cells
    pd_data.dropna(inplace=True)

# preview of the first rows
pd_data.head()

In [None]:
# task: 0
# lib: numpy

with timer('numpy', 'task0'):
    # Structured dtype: two fixed-width text fields followed by seven floats
    text_fields = [('Date', 'U10'), ('Time', 'U8')]
    float_names = [
        'Global_active_power',
        'Global_reactive_power',
        'Voltage',
        'Global_intensity',
        'Sub_metering_1',
        'Sub_metering_2',
        'Sub_metering_3',
    ]
    dtypes = text_fields + [(name, 'f8') for name in float_names]

    # '?' entries are treated as missing and filled with NaN
    np_data = np.genfromtxt(
        URL,
        dtype=dtypes,
        delimiter=';',
        names=True,
        encoding=None,
        missing_values='?',
        filling_values=np.nan,
    )

    # float columns only (everything after Date and Time)
    np_columns = [col for col, _ in dtypes[2:]]

    # keep a row only when none of its float columns is NaN
    mask = ~np.any([np.isnan(np_data[col]) for col in np_columns], axis=0)
    np_data = np_data[mask]

np_data

In [None]:
# Sanity check: collect the rows that still hold a NaN in any float column.
# The task 0 numpy cell filters such rows out, so this is expected to be empty.
nan_mask = np.logical_or.reduce([np.isnan(np_data[col]) for col in np_columns])
nan = np_data[nan_mask]

nan

## Task No. 1
Desc: Обрати всі домогосподарства, у яких загальна активна споживана потужність перевищує **5 кВт**.

In [None]:
# task: 1
# lib: pandas

with timer('pandas', 'task1'):
    # boolean mask: global active power strictly above 5
    pd_power_mask = pd_data['Global_active_power'].gt(5)
    pd_active_power = pd_data.loc[pd_power_mask]

pd_active_power

In [None]:
# task: 1
# lib: numpy

with timer('numpy', 'task1'):
    # boolean mask: global active power strictly above 5
    np_power_mask = np.greater(np_data['Global_active_power'], 5)
    np_active_power = np_data[np_power_mask]

np_active_power

## Task No. 2
Обрати всі домогосподарства, у яких вольтаж перевищує 235 В.

In [None]:
# task: 2
# lib: pandas

with timer('pandas', 'task2'):
    # boolean mask: voltage strictly above 235
    pd_voltage_mask = pd_data['Voltage'].gt(235)
    pd_voltage = pd_data.loc[pd_voltage_mask]

pd_voltage

In [None]:
# task: 2
# lib: numpy

with timer('numpy', 'task2'):
    # boolean mask: voltage strictly above 235
    np_voltage_mask = np.greater(np_data['Voltage'], 235)
    np_voltage = np_data[np_voltage_mask]

np_voltage

## Task No. 3
Обрати всі домогосподарства, у яких сила струму лежить в межах 19-20 А, для них виявити ті, у яких пральна машина та холодильник споживають більше, ніж бойлер та кондиціонер.

In [None]:
# task: 3
# lib: pandas

with timer('pandas', 'task3'):
    # Global_intensity within [19, 20]; between() includes both bounds by default
    pd_in_range = pd_data['Global_intensity'].between(19, 20)
    pd_global_intensity = pd_data.loc[pd_in_range]

    # rows where Sub_metering_1 + Sub_metering_2 exceeds Sub_metering_3
    # NOTE(review): the task text compares "washing machine and fridge" with
    # "boiler and AC" - confirm that groups 1+2 vs group 3 is the intended mapping.
    pd_sub_1_plus_2 = pd_global_intensity['Sub_metering_1'].add(
        pd_global_intensity['Sub_metering_2']
    )
    pd_specific_consumption = pd_global_intensity.loc[
        pd_sub_1_plus_2.gt(pd_global_intensity['Sub_metering_3'])
    ]

pd_specific_consumption

In [None]:
# task: 3
# lib: numpy

with timer('numpy', 'task3'):
    # rows whose Global_intensity lies within [19, 20], both bounds included
    intensity = np_data['Global_intensity']
    current_range = np_data[np.logical_and(intensity >= 19, intensity <= 20)]

    # combined reading of groups 1 and 2, and the reading of group 3
    # NOTE(review): the task text compares "washing machine and fridge" with
    # "boiler and AC" - confirm that groups 1+2 vs group 3 is the intended mapping.
    sub_metering_1_2 = np.add(current_range['Sub_metering_1'], current_range['Sub_metering_2'])
    sub_metering_3 = current_range['Sub_metering_3']

    # keep the rows where sub_metering_1 + sub_metering_2 > sub_metering_3
    specific_consumption = current_range[np.greater(sub_metering_1_2, sub_metering_3)]

specific_consumption

## Task No. 4
Обрати випадковим чином 500000 домогосподарств (без повторів елементів вибірки) для них обчислити середні величини усіх 3-х груп споживання електричної енергії

In [None]:
# task: 4
# lib: pandas

with timer('pandas', 'task4'):

    # 500000 random rows; DataFrame.sample draws without replacement by default,
    # random_state seeds the generator so the draw is reproducible
    pd_random_households = pd_data.sample(n=500000, random_state=1).reset_index(drop=True)

    # Mean of the three sub-metering groups only: this is what the task asks for
    # and what the numpy cell computes, so both timings cover the same work.
    pd_mean_consumption = pd_random_households[
        ['Sub_metering_1',
         'Sub_metering_2',
         'Sub_metering_3']
    ].mean()

pd_mean_consumption

In [None]:
# task: 4
# lib: numpy

with timer('numpy', 'task4'):
    # Seeded generator so that re-running the cell draws the same rows
    rng = np.random.default_rng(1)

    # replace=False: the task requires a sample without repeated elements
    np_random_idx = rng.choice(np_data.shape[0], size=500000, replace=False)
    np_random_households = np_data[np_random_idx]

    # mean of each of the three sub-metering groups, keyed by column name
    mean_fields = ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
    np_mean_consumption = {col: np.mean(np_random_households[col]) for col in mean_fields}

np_mean_consumption

## Task No. 5 (4.1)
Desc: Обрати ті домогосподарства, які після 18-00 споживають понад **6 кВт** за годину, в середньому, серед відібраних визначити ті, у яких основне споживання електроенергії у вказаний проміжок часу припадає на пральну машину, сушарку, холодильник та освітлення (група 2 є найбільшою), а потім обрати кожен третій результат із першої половини та кожен четвертий результат із другої половини.

In [None]:
# task: 5 (4.1)
# lib: pandas

with timer('pandas', 'task5'):

    time_threshold = pd.to_datetime('18:00:00').time()

    # assign() returns a new frame, so pd_random_households keeps its string
    # Time column and this cell can be re-run without the conversion failing
    pd_households_timed = pd_random_households.assign(
        Time=pd.to_datetime(pd_random_households['Time'], format='%H:%M:%S').dt.time
    )

    # rows at or after time_threshold with Global_active_power above 6
    pd_evening_consumption = pd_households_timed[
        (pd_households_timed['Time'] >= time_threshold) &
        (pd_households_timed['Global_active_power'] > 6)
    ]

    # group 2 is the largest: Sub_metering_2 exceeds both Sub_metering_1
    # and Sub_metering_3 (compared against their row-wise maximum)
    pd_main_consumption = pd_evening_consumption[
        pd_evening_consumption['Sub_metering_2'] >
        pd_evening_consumption[
            ['Sub_metering_1', 'Sub_metering_3']
        ].max(axis=1)
    ]

    # split point between the first and the second half of the selection
    half = len(pd_main_consumption) // 2

    # every third row of the first half (starting with its first row)
    pd_first_half = pd_main_consumption.iloc[:half].iloc[::3]

    # every fourth row of the second half (starting with its first row)
    pd_second_half = pd_main_consumption.iloc[half:].iloc[::4]

    # first-half picks followed by second-half picks
    pd_final_selection = pd.concat([pd_first_half, pd_second_half])

pd_final_selection

In [None]:
# task: 5 (4.1)
# lib: numpy

with timer('numpy', 'task5'):

    # Time is compared as text against this 'HH:MM:SS' threshold
    np_time_threshold: str = '18:00:00'

    # rows at or after the threshold with Global_active_power above 6
    at_or_after = np_random_households['Time'] >= np_time_threshold
    above_6 = np_random_households['Global_active_power'] > 6
    np_evening_consumption = np_random_households[at_or_after & above_6]

    # group 2 is the largest: Sub_metering_2 exceeds both other groups
    sub_1 = np_evening_consumption['Sub_metering_1']
    sub_2 = np_evening_consumption['Sub_metering_2']
    sub_3 = np_evening_consumption['Sub_metering_3']
    np_main_consumption = np_evening_consumption[(sub_2 > sub_1) & (sub_2 > sub_3)]

    # every third row of the first half, every fourth row of the second half
    midpoint = len(np_main_consumption) // 2
    np_first_half = np_main_consumption[:midpoint][::3]
    np_second_half = np_main_consumption[midpoint:][::4]

    # first-half picks followed by second-half picks
    np_final_selection = np.concatenate((np_first_half, np_second_half))

np_final_selection

In [None]:
# Show the collected timings: {lib: {task: time used}} as filled in by record()
TIMER

In [None]:

def _plot_timings(ax, timings: dict) -> None:
    """Draw one line per library on ``ax``: task labels on x, recorded time on y."""
    for lib_name, tasks in timings.items():
        # materialise the dict views as lists before handing them to matplotlib
        ax.plot(list(tasks.keys()), list(tasks.values()), marker='x', label=lib_name)
    ax.set_ylabel('Time, s')
    ax.legend()
    ax.grid(True)


# main plot
fig_main, ax_main = plt.subplots(figsize=(8, 6))
_plot_timings(ax_main, TIMER)
ax_main.set_title("Elapsed Time per Task")
ax_main.set_xlabel("Task")
fig_main.tight_layout()

# zoomed in plot for small numbers
fig_zoom, ax_zoom = plt.subplots(figsize=(6, 4))
_plot_timings(ax_zoom, TIMER)

# smallest recorded time across all libs is the Y bottom limit
# (default keeps the cell from crashing when nothing has been recorded yet)
bottom_threshold = min(
    (value for lib in TIMER.values() for value in lib.values()),
    default=0.0,
)

# show a window of 0.5 s above the fastest task
ax_zoom.set_ylim(bottom_threshold, bottom_threshold + 0.5)
ax_zoom.set_xlabel("Tasks")
ax_zoom.set_title("Elapsed Time per Task (Zoom for Small Points)")

# tight_layout is applied per figure; calling it once only affects the last one
fig_zoom.tight_layout()

In [None]:
# Correlation between the three global measurements.
# The heatmap is drawn from the 3x3 correlation matrix: annotating the raw
# frame would mean one annotated cell per row and column, which is not feasible.
df = pd_data[['Global_intensity', 'Global_active_power', 'Global_reactive_power']]
ax = sns.heatmap(df.corr(), annot=True)
ax.set_title("Correlation Heatmap")

plt.show()

In [None]:
# Kernel density estimate of Global_intensity.
# displot is figure-level, so the returned object is a grid rather than an Axes;
# the original variable name is kept.
ax = sns.displot(pd_data['Global_intensity'], kind="kde")  # Adjust for other columns

# Decorate the axes that displot has just made current
kde_axes = plt.gca()
kde_axes.set_title("Distribution of Global Intensity")
kde_axes.set_xlabel("Global Intensity")
kde_axes.set_ylabel("Density")
kde_axes.grid(True)
plt.show()

In [None]:
# Mean reading of each sub-metering group over the whole cleaned dataset
sub_metering_means = pd_data[['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].mean()

# explicit x/y: group names on the x axis, their means as bar heights
sns.barplot(x=sub_metering_means.index, y=sub_metering_means.values)

# the title describes what is actually plotted (this is a bar plot, not a heatmap)
plt.title("Mean Sub-metering Consumption")
plt.show()