# Частина 2

Аналіз датасету Individual Household Electric Power Consumption Dataset.
Здійснюється очищення даних, формування вибірок та аналіз часових витрат.

In [1]:
# Imports and definition of the data loading-and-cleaning function
import pandas as pd
import numpy as np
import timeit
import os

def load_and_clean(path):
    """Read the semicolon-separated consumption file and return a tidy frame.

    ``?`` entries are read as missing values and every row containing a
    missing value is dropped. The ``Date`` and ``Time`` text columns are
    merged into one day-first ``DateTime`` column (placed last) and removed;
    all remaining columns are converted to ``float``.

    Args:
        path: Any source accepted by ``pandas.read_csv``.

    Returns:
        The cleaned ``DataFrame``. Surviving rows keep their original index
        labels (the index is not reset).
    """
    raw = pd.read_csv(path, sep=";", na_values="?", low_memory=False)
    complete = raw.dropna()

    # Attach the combined timestamp first, then discard its two text sources.
    stamped = complete.assign(
        DateTime=pd.to_datetime(
            complete["Date"] + " " + complete["Time"], dayfirst=True
        )
    ).drop(columns=["Date", "Time"])

    # Everything except the timestamp column is cast to float.
    float_casts = {name: float for name in stamped.columns if name != "DateTime"}
    return stamped.astype(float_casts)

In [2]:
# Invocation: load the dataset from a path relative to the working directory
# and show the first rows of the cleaned frame.
df = load_and_clean("household_power_consumption.txt")
df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006-12-16 17:24:00
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00


Завдання 1: Вибірка записів із загальною активною потужністю понад 5 кВт

In [3]:
# Function definition
def select_power_above_5(df):
    """Return the rows of ``df`` whose ``Global_active_power`` is strictly above 5."""
    exceeds_five = df["Global_active_power"].gt(5)
    return df.loc[exceeds_five]

In [4]:
# Invocation: print the total time (seconds) taken by 5 runs of the selection,
# then show the first rows of the selection itself.
print(timeit.timeit(lambda: select_power_above_5(df), number=5))
select_power_above_5(df).head()

0.02237389999208972


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00
11,5.412,0.47,232.78,23.2,0.0,1.0,17.0,2006-12-16 17:35:00
12,5.224,0.478,232.99,22.4,0.0,1.0,16.0,2006-12-16 17:36:00


Завдання 2: Вибірка записів за силою струму 19–20 А

In [5]:
# Function definition
def select_current_range(df):
    """Pick rows with ``Global_intensity`` in [19, 20] where meters 1+2 beat meter 3.

    Two filters are applied in sequence:

    1. ``Global_intensity`` lies between 19 and 20, both ends included.
    2. ``Sub_metering_1 + Sub_metering_2`` is strictly greater than
       ``Sub_metering_3``.

    Returns:
        The rows of ``df`` passing both filters, with their original index.
    """
    within_band = df["Global_intensity"].between(19, 20)
    candidates = df.loc[within_band]

    first_two_meters = candidates["Sub_metering_2"].add(candidates["Sub_metering_1"])
    outweighs_third = first_two_meters.gt(candidates["Sub_metering_3"])

    return candidates.loc[outweighs_third]

In [6]:
# Invocation: print the total time (seconds) taken by 5 runs of the selection,
# then show the first rows of the selection itself.
print(timeit.timeit(lambda: select_current_range(df), number=5))
select_current_range(df).head()

0.038218700035940856


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
45,4.464,0.136,234.66,19.0,0.0,37.0,16.0,2006-12-16 18:09:00
460,4.582,0.258,238.08,19.6,0.0,13.0,0.0,2006-12-17 01:04:00
464,4.618,0.104,239.61,19.6,0.0,27.0,0.0,2006-12-17 01:08:00
475,4.636,0.14,237.37,19.4,0.0,36.0,0.0,2006-12-17 01:19:00
476,4.634,0.152,237.17,19.4,0.0,35.0,0.0,2006-12-17 01:20:00


Завдання 3: Випадкова вибірка 500000 записів

In [7]:
# Function definition
def random_sample_stats(df, n=500000, random_state=None):
    """Average the three sub-metering columns over a random sample of rows.

    Args:
        df: Frame containing ``Sub_metering_1``, ``Sub_metering_2`` and
            ``Sub_metering_3``.
        n: Number of rows to draw without replacement. Defaults to 500000.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` gives a fresh sample per call.

    Returns:
        A ``Series`` with the mean of each sub-metering column over the
        sampled rows, indexed by column name.

    Raises:
        ValueError: If ``n`` exceeds the number of rows in ``df`` (sampling
            is done without replacement).
    """
    meters = df[["Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]]

    # Only these three columns are averaged, so sample them alone instead of
    # copying every column of the frame.
    drawn = meters.sample(n=n, replace=False, random_state=random_state)

    return drawn.mean()

In [8]:
# Invocation: print the total time (seconds) taken by 3 runs, then show the
# means from one more run (each run draws its own random sample).
print(timeit.timeit(lambda: random_sample_stats(df), number=3))
random_sample_stats(df)

0.5025116999750026


Sub_metering_1    1.120012
Sub_metering_2    1.306420
Sub_metering_3    6.444960
dtype: float64

Завдання 4: Аналіз споживання після 18:00

In [9]:
# Function definition
def evening_high_consumption(df):
    """Thin out evening, high-power rows in which meter 2 is the largest.

    Rows are kept when all of the following hold:

    * the hour of ``DateTime`` is 18 or later;
    * ``Global_active_power`` is strictly above 6;
    * ``Sub_metering_2`` is strictly greater than both ``Sub_metering_1``
      and ``Sub_metering_3``.

    The surviving rows are split at their midpoint (integer division): every
    3rd row is taken from the first part and every 4th row from the second.

    Returns:
        The two thinned parts concatenated in order, original index kept.
    """
    after_six_pm = df["DateTime"].dt.hour.ge(18)
    above_six = df["Global_active_power"].gt(6)
    candidates = df.loc[after_six_pm & above_six]

    # Row-wise maximum of the other two meters; meter 2 must beat it.
    other_meters_peak = candidates[["Sub_metering_1", "Sub_metering_3"]].max(axis=1)
    dominated = candidates.loc[candidates["Sub_metering_2"].gt(other_meters_peak)]

    midpoint = len(dominated) // 2
    every_third = dominated.iloc[:midpoint:3]
    every_fourth = dominated.iloc[midpoint::4]

    return pd.concat([every_third, every_fourth])

In [10]:
# Invocation: print the total time (seconds) taken by 3 runs of the analysis,
# then show the first rows of its result.
print(timeit.timeit(lambda: evening_high_consumption(df), number=3))
evening_high_consumption(df).head()

0.25406680000014603


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
41,6.052,0.192,232.93,26.2,0.0,37.0,17.0,2006-12-16 18:05:00
44,6.308,0.116,232.25,27.0,0.0,36.0,17.0,2006-12-16 18:08:00
17494,6.386,0.374,236.63,27.0,1.0,36.0,17.0,2006-12-28 20:58:00
17498,8.088,0.262,235.5,34.4,1.0,72.0,17.0,2006-12-28 21:02:00
17501,7.23,0.152,235.22,30.6,1.0,73.0,17.0,2006-12-28 21:05:00


Завдання 5: Нормалізація даних

In [14]:
# Function definition
def normalize(df):
    """Min-max scale every numeric column of ``df`` to the range [0, 1].

    Non-numeric columns are left out of the result. A column whose minimum
    equals its maximum is mapped to 0.0 throughout instead of turning into
    NaN through a 0/0 division.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new ``DataFrame`` holding only the numeric columns, rescaled
        column by column.
    """
    numeric = df.select_dtypes(include=np.number)
    lowest = numeric.min()
    span = numeric.max() - lowest

    # Zero span means a constant column: divide by 1 so it becomes 0.0.
    span = span.where(span != 0, 1)

    return (numeric - lowest) / span

# Scale the numeric columns of the loaded frame and show the first rows.
normalized_df = normalize(df)
normalized_df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,0.374796,0.300719,0.37609,0.377593,0.0,0.0125,0.548387
1,0.478363,0.313669,0.336995,0.473029,0.0,0.0125,0.516129
2,0.479631,0.358273,0.32601,0.473029,0.0,0.025,0.548387
3,0.480898,0.361151,0.340549,0.473029,0.0,0.0125,0.548387
4,0.325005,0.379856,0.403231,0.323651,0.0,0.0125,0.548387


Завдання 6: Стандартизація даних

In [15]:
# Function definition
def standardize(df):
    """Z-score every numeric column of ``df`` (subtract mean, divide by std).

    Non-numeric columns are left out of the result. The standard deviation
    is the pandas default (sample standard deviation). A column whose
    standard deviation is exactly zero is mapped to 0.0 throughout instead
    of turning into NaN through a 0/0 division.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new ``DataFrame`` holding only the numeric columns, standardized
        column by column.
    """
    numeric = df.select_dtypes(include=np.number)
    spread = numeric.std()

    # Zero spread means a constant column: divide by 1 so it becomes 0.0.
    spread = spread.where(spread != 0, 1)

    return (numeric - numeric.mean()) / spread

# Standardize the numeric columns of the loaded frame and show the first rows.
standardized_df = standardize(df)
standardized_df.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2.955076,2.61072,-1.851816,3.098788,-0.182337,-0.051274,1.24942
1,4.037084,2.770405,-2.225274,4.133799,-0.182337,-0.051274,1.130897
2,4.050325,3.320431,-2.330213,4.133799,-0.182337,0.120487,1.24942
3,4.063566,3.355916,-2.191323,4.133799,-0.182337,-0.051274,1.24942
4,2.434881,3.586572,-1.592555,2.513781,-0.182337,-0.051274,1.24942


Завдання 7: Обчислення коефіцієнтів кореляції

In [16]:
# Function definition
def correlations(df):
    """Correlate ``Global_active_power`` with ``Voltage`` in two ways.

    Args:
        df: Frame containing the ``Global_active_power`` and ``Voltage``
            columns.

    Returns:
        A ``(pearson, spearman)`` tuple: the Pearson coefficient followed by
        the Spearman rank coefficient of the two columns.
    """
    power = df["Global_active_power"]
    voltage = df["Voltage"]

    pearson = power.corr(voltage, method="pearson")
    spearman = power.corr(voltage, method="spearman")

    return pearson, spearman

Завдання 8: Категоризація часу доби та one-hot кодування

In [17]:
# Function definition
def add_time_category(df):
    """Label each row with a part of the day derived from its ``DateTime`` hour.

    Hours fall into left-closed, right-open bins: [0, 6) is ``Night``,
    [6, 12) is ``Morning``, [12, 18) is ``Afternoon`` and [18, 24) is
    ``Evening``.

    Note that ``df`` is modified in place: the ``Time_category`` column is
    written onto the frame that was passed in.

    Returns:
        The same ``df`` object, now carrying the ``Time_category`` column.
    """
    hour_edges = [0, 6, 12, 18, 24]
    part_names = ["Night", "Morning", "Afternoon", "Evening"]

    hour_of_day = df["DateTime"].dt.hour
    df["Time_category"] = pd.cut(
        hour_of_day, bins=hour_edges, labels=part_names, right=False
    )
    return df

# Add the Time_category column (bucketed from each row's DateTime hour).
df = add_time_category(df)

# One-hot encode Time_category: each label becomes its own indicator column.
df_encoded = pd.get_dummies(df, columns=["Time_category"])
df_encoded.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime,Time_category_Night,Time_category_Morning,Time_category_Afternoon,Time_category_Evening
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006-12-16 17:24:00,False,False,True,False
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00,False,False,True,False
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00,False,False,True,False
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00,False,False,True,False
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00,False,False,True,False
