In [1]:
import numpy as np
import os
import urllib.request
import zipfile
import timeit
import pandas as pd

In [2]:
def setup_folders(folder='lab4'):
    """Create ``folder`` (including missing parent directories) if needed.

    Calling the function repeatedly is safe: an already existing directory
    is left untouched.

    Args:
        folder: Path of the directory to create. Defaults to ``'lab4'``.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, so nothing can
    # create the directory between the check and the creation.
    os.makedirs(folder, exist_ok=True)

def download_and_unzip_data(url, folder, file_name):
    """Fetch a zip archive into ``folder`` when absent and unpack it there.

    The archive is downloaded only if ``folder/file_name`` does not exist.
    It is extracted right after a download, and also on later calls when
    any archive member is missing from ``folder``.

    Args:
        url: Location of the archive, opened with ``urllib.request.urlopen``.
        folder: Existing directory that receives the archive and its contents.
        file_name: Name under which the archive is stored inside ``folder``.

    Raises:
        urllib.error.URLError: If the download fails.
        zipfile.BadZipFile: If the stored archive is not a valid zip file.
    """
    file_path = os.path.join(folder, file_name)
    downloaded_now = False

    if not os.path.exists(file_path):
        print(f"Downloading {file_name}...")
        # Download into a temporary sibling and rename it only on success.
        # Otherwise an interrupted transfer would leave a truncated archive
        # that every later run mistakes for a finished download.
        partial_path = file_path + '.part'
        try:
            with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as out_file:
                # Stream in 1 MiB chunks instead of buffering the whole
                # archive in memory.
                for chunk in iter(lambda: response.read(1024 * 1024), b''):
                    out_file.write(chunk)
            os.replace(partial_path, file_path)
        finally:
            # Only present if the transfer failed before the rename.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        downloaded_now = True
        print(f"{file_name} downloaded!")

    #unzip
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        already_unpacked = all(
            os.path.exists(os.path.join(folder, member))
            for member in zip_ref.namelist()
        )
        # Re-extract when members are missing so a previously failed or
        # deleted extraction is repaired without forcing a new download.
        if downloaded_now or not already_unpacked:
            zip_ref.extractall(folder)
            print(f"Extractes {file_name} to {folder}")

# Where the dataset comes from and where it is stored locally.
folder = 'lab4'
file_name = 'household_power_consumption.zip'
download_url = (
    "https://archive.ics.uci.edu/static/public/235/"
    "individual+household+electric+power+consumption.zip"
)

# The target directory has to exist before the archive is fetched into it.
setup_folders(folder)
download_and_unzip_data(url=download_url, folder=folder, file_name=file_name)

In [3]:
def process_pandas_data(file_path):
    """Read the ';'-separated measurements file into a cleaned DataFrame.

    '?' entries are parsed as missing values and every row containing a
    missing value is discarded. The 'Date' and 'Time' text columns are
    merged into one 'DateTime' column (appended last) and then removed;
    the seven measurement columns are cast to float.

    Args:
        file_path: Path of the delimited text file to read.

    Returns:
        pandas.DataFrame with the measurement columns followed by 'DateTime'.
        Row labels of the surviving rows are kept as read from the file.
    """
    measurement_columns = [
        'Global_active_power',
        'Global_reactive_power',
        'Voltage',
        'Global_intensity',
        'Sub_metering_1',
        'Sub_metering_2',
        'Sub_metering_3',
    ]

    raw = pd.read_csv(file_path, sep=";", na_values=['?'])
    complete = raw.dropna()

    # Day-first dates and clock times are stored as separate text columns.
    timestamps = pd.to_datetime(
        complete['Date'] + ' ' + complete['Time'],
        format='%d/%m/%Y %H:%M:%S',
    )

    stamped = complete.assign(DateTime=timestamps).drop(columns=['Date', 'Time'])
    return stamped.astype({column: float for column in measurement_columns})

# Parse the text file that was extracted from the archive into `folder`.
file_path = os.path.join(folder, 'household_power_consumption.txt')
df = process_pandas_data(file_path=file_path)
print("Pandas dataframe processed successfuly")
# Bare last expression: the notebook renders the first rows as the cell output.
df.head()

Pandas dataframe processed successfuly


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006-12-16 17:24:00
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00


In [4]:
# Only the row selection is timed; printing and rendering happen afterwards.
start_time = timeit.default_timer()
high_power = df.loc[df['Global_active_power'].gt(5)]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time
print(f"Час виконання: {total_time_pd} секунд")
print("Домогосподарства з загальною активною потужністю більше 5 кВт:")
# Show the first matching rows as the cell result.
high_power.head()

Час виконання: 0.026722899987362325 секунд
Домогосподарства з загальною активною потужністю більше 5 кВт:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00
11,5.412,0.47,232.78,23.2,0.0,1.0,17.0,2006-12-16 17:35:00
12,5.224,0.478,232.99,22.4,0.0,1.0,16.0,2006-12-16 17:36:00


In [5]:
# Only the row selection is timed; printing and rendering happen afterwards.
start_time = timeit.default_timer()
high_voltage = df.loc[df['Voltage'].gt(235)]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time
print(f"Час виконання: {total_time_pd} секунд")
print("Домогосподарства з напругою більше 235 В:")
# Show the first matching rows as the cell result.
high_voltage.head()

Час виконання: 0.2918151000048965 секунд
Домогосподарства з напругою більше 235 В:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00
5,3.52,0.522,235.02,15.0,0.0,2.0,17.0,2006-12-16 17:29:00
6,3.702,0.52,235.09,15.8,0.0,1.0,17.0,2006-12-16 17:30:00
7,3.7,0.52,235.22,15.8,0.0,1.0,17.0,2006-12-16 17:31:00
14,4.054,0.422,235.24,17.6,0.0,1.0,17.0,2006-12-16 17:38:00


In [8]:
start_time = timeit.default_timer()
# Step 1: rows whose current lies in the closed interval [19, 20].
current_range = df.loc[df['Global_intensity'].between(19, 20)]
# Step 2: of those, rows where sub-metering 2 reads higher than sub-metering 3.
condition_met = current_range.loc[
    current_range['Sub_metering_2'].gt(current_range['Sub_metering_3'])
]
end_time = timeit.default_timer()

total_time_pd = end_time - start_time
print(f"Час виконання: {total_time_pd} секунд")
print("Домогосподарства з силою струму 19-20 А, де пральна машина та холодильник споживають більше:")
# Show the first matching rows as the cell result.
condition_met.head()

Час виконання: 0.054012399981729686 секунд
Домогосподарства з силою струму 19-20 А, де пральна машина та холодильник споживають більше:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
45,4.464,0.136,234.66,19.0,0.0,37.0,16.0,2006-12-16 18:09:00
460,4.582,0.258,238.08,19.6,0.0,13.0,0.0,2006-12-17 01:04:00
464,4.618,0.104,239.61,19.6,0.0,27.0,0.0,2006-12-17 01:08:00
475,4.636,0.14,237.37,19.4,0.0,36.0,0.0,2006-12-17 01:19:00
476,4.634,0.152,237.17,19.4,0.0,35.0,0.0,2006-12-17 01:20:00


In [9]:
start_time = timeit.default_timer()
# Draw 500000 distinct rows. The fixed random_state makes the sample, and
# therefore the printed means, identical on every Restart & Run All; without
# it each run reports slightly different numbers.
random_sample = df.sample(n=500000, replace=False, random_state=42)
# Column-wise mean of the three sub-metering groups over the sampled rows.
mean_consumption = random_sample[['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].mean()
end_time = timeit.default_timer()
total_time_pd = end_time - start_time
print(f"Час виконання: {total_time_pd} секунд")
print("Середні величини усіх трьох груп споживання електроенергії:")
# Bare last expression: rendered as the cell output.
mean_consumption

Час виконання: 0.39404149999609217 секунд
Середні величини усіх трьох груп споживання електроенергії:


Sub_metering_1    1.123476
Sub_metering_2    1.303394
Sub_metering_3    6.457390
dtype: float64

In [11]:
start_time = timeit.default_timer()
# Readings taken from 18:00 onwards with more than 6 kW of active power.
evening_high_usage = df[(df['DateTime'].dt.hour >= 18) & (df['Global_active_power'] > 6)]
# Group 2 counts as the largest only if it exceeds BOTH other sub-metering
# groups; comparing against group 1 alone lets rows through where group 3
# dominates.
group2_largest = evening_high_usage[
    (evening_high_usage['Sub_metering_2'] > evening_high_usage['Sub_metering_1'])
    & (evening_high_usage['Sub_metering_2'] > evening_high_usage['Sub_metering_3'])
]
midpoint = len(group2_largest) // 2
# Every 3rd row of the first half ...
first_half_selection = group2_largest.iloc[:midpoint:3]
# ... and every 4th row of the second half. The slice has to START at the
# midpoint; slicing [:midpoint:4] would sample the first half a second time
# and duplicate rows in the combined result.
second_half_selection = group2_largest.iloc[midpoint::4]
end_time = timeit.default_timer()
total_time_pd = end_time - start_time
print(f"Час виконання: {total_time_pd} секунд")
# Concatenation is deliberately outside the timed span.
combined_df = pd.concat([first_half_selection, second_half_selection])
print("Домогосподарства, які після 18:00 споживають понад 6 кВт за хвилину в середньому:")
# Bare last expression: rendered as the cell output.
combined_df.head()

Час виконання: 0.356082400016021 секунд
Домогосподарства, які після 18:00 споживають понад 6 кВт за хвилину в середньому:


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,DateTime
41,6.052,0.192,232.93,26.2,0.0,37.0,17.0,2006-12-16 18:05:00
44,6.308,0.116,232.25,27.0,0.0,36.0,17.0,2006-12-16 18:08:00
17494,6.386,0.374,236.63,27.0,1.0,36.0,17.0,2006-12-28 20:58:00
17498,8.088,0.262,235.5,34.4,1.0,72.0,17.0,2006-12-28 21:02:00
17501,7.23,0.152,235.22,30.6,1.0,73.0,17.0,2006-12-28 21:05:00
