In [None]:
import pandas as pd
import numpy as np
import timeit
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Read the semicolon-delimited measurements file in a single pass
# (low_memory=False stops pandas from inferring dtypes chunk by chunk).
df = pd.read_csv("household_power_consumption.txt", sep=";", low_memory=False)

df.head()

In [None]:
# Measurement columns that must end up as floats
cols = [
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]

# Turn every "?" placeholder into NaN
df.replace("?", np.nan, inplace=True)

# Cast all measurement columns to float in one vectorised step
df[cols] = df[cols].astype(float)

# Drop the rows that contain missing values
df.dropna(inplace=True)

df.info()

In [None]:
def power_over_5(df):
    """Return the rows of ``df`` whose ``Global_active_power`` is strictly above 5."""
    is_high_power = df["Global_active_power"].gt(5)
    return df.loc[is_high_power]
# Total wall-clock seconds spent on three consecutive runs of the filter
timeit.Timer(lambda: power_over_5(df)).timeit(number=3)

In [None]:
def complex_filter(df):
    """Select rows with intensity in [19, 20] where sub-meters 2 and 3 together exceed sub-meter 1.

    The intensity band is applied first; the sub-metering comparison is then
    evaluated only on the rows that survived that first cut.
    """
    in_band = df["Global_intensity"].between(19, 20)  # inclusive on both ends
    candidates = df.loc[in_band]

    combined_2_and_3 = candidates["Sub_metering_2"] + candidates["Sub_metering_3"]
    return candidates.loc[combined_2_and_3 > candidates["Sub_metering_1"]]

# Total wall-clock seconds spent on three consecutive runs of the filter
timeit.Timer(lambda: complex_filter(df)).timeit(number=3)

In [None]:
def random_sample_mean(df, n=500000, random_state=None):
    """Average the three sub-metering columns over a random sample of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Sub_metering_1``, ``Sub_metering_2`` and
        ``Sub_metering_3``.
    n : int, default 500000
        Number of rows to draw without replacement.
    random_state : int, numpy Generator/RandomState or None, default None
        Seed forwarded to ``DataFrame.sample``; pass a fixed value to make the
        sample (and therefore the means) reproducible between runs.

    Returns
    -------
    dict
        ``{"Sub1_mean": ..., "Sub2_mean": ..., "Sub3_mean": ...}``.

    Raises
    ------
    ValueError
        If ``n`` is larger than the number of rows in ``df`` (raised by
        ``DataFrame.sample`` because sampling is without replacement).
    """
    metering_cols = ["Sub_metering_1", "Sub_metering_2", "Sub_metering_3"]

    # Only the three averaged columns are sampled, so the remaining columns
    # are never copied; the rows drawn for a given seed are unaffected.
    sample = df[metering_cols].sample(n=n, replace=False, random_state=random_state)

    return {
        "Sub1_mean": sample["Sub_metering_1"].mean(),
        "Sub2_mean": sample["Sub_metering_2"].mean(),
        "Sub3_mean": sample["Sub_metering_3"].mean()
    }

# Total wall-clock seconds spent on three consecutive sampling runs
timeit.Timer(lambda: random_sample_mean(df)).timeit(number=3)

In [None]:
def evening_filter(df):
    """Pick a thinned-out selection of high-load evening rows.

    Steps:
      1. keep rows whose ``Time`` hour is 18 or later and whose
         ``Global_active_power`` is above 6;
      2. of those, keep rows where ``Sub_metering_2`` is strictly greater than
         both ``Sub_metering_1`` and ``Sub_metering_3``;
      3. split the remainder into two halves by position and take every 3rd
         row of the first half and every 4th row of the second half.

    The input frame is not modified: the hour is derived from a locally parsed
    copy of ``Time``, so the caller's ``Time`` column (and the one in the
    returned rows) keeps whatever dtype it had on entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Time`` (``HH:MM:SS`` strings or datetimes),
        ``Global_active_power`` and the three ``Sub_metering_*`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows, first-half picks followed by second-half picks.
    """
    # Parse into a local Series rather than assigning back to df["Time"]:
    # writing to the argument would silently change the shared frame for every
    # later cell. The explicit format avoids slow per-element format guessing.
    hours = pd.to_datetime(df["Time"], format="%H:%M:%S").dt.hour

    subset = df[(hours >= 18) & (df["Global_active_power"] > 6)]

    # Sub-meter 2 must dominate the other two sub-meters
    subset = subset[
        subset["Sub_metering_2"] >
        subset[["Sub_metering_1", "Sub_metering_3"]].max(axis=1)
    ]

    midpoint = len(subset) // 2
    first_half = subset.iloc[:midpoint]
    second_half = subset.iloc[midpoint:]

    return pd.concat([
        first_half.iloc[::3],
        second_half.iloc[::4]
    ])

# Wall-clock seconds for a single run of the evening filter
timeit.Timer(lambda: evening_filter(df)).timeit(number=1)

In [None]:
# Min-max normalisation of the measurement columns on a copy of the frame,
# leaving df itself untouched.
df_normalized = df.copy()

scaler = MinMaxScaler().fit(df[cols])
df_normalized[cols] = scaler.transform(df[cols])

df_normalized.head()

In [None]:
# Standardisation of the measurement columns on a copy of the frame,
# leaving df itself untouched.
df_standardized = df.copy()

standard_scaler = StandardScaler().fit(df[cols])
df_standardized[cols] = standard_scaler.transform(df[cols])

df_standardized.head()

In [None]:
# Pearson and Spearman correlation between active power and voltage,
# computed by the same call with only the method name changing.
pearson, spearman = (
    df["Global_active_power"].corr(df["Voltage"], method=method)
    for method in ("pearson", "spearman")
)

print("Pearson:", pearson)
print("Spearman:", spearman)

In [None]:
# Dates in this dataset are written day-first (dd/mm/yyyy, per the UCI data
# description). Passing the format explicitly prevents ambiguous values such
# as "5/1/2007" from being read month-first, which would put rows in the
# wrong month, and avoids per-value format guessing.
df["Month"] = pd.to_datetime(df["Date"], format="%d/%m/%Y").dt.month

# One-hot encode the month: one indicator column per month value present
df_encoded = pd.get_dummies(df, columns=["Month"])

df_encoded.head()