In [1]:
import pandas as pd
import numpy as np
import time 

In [2]:
df = pd.read_csv('household_power_consumption.txt', delimiter=';', na_values='?', low_memory=False) 

In [3]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Time'] = pd.to_timedelta(df['Time'])
df['Global_active_power'] = pd.to_numeric(df['Global_active_power'], errors='coerce')
df['Global_reactive_power'] = pd.to_numeric(df['Global_reactive_power'], errors='coerce')
df['Voltage'] = pd.to_numeric(df['Voltage'], errors='coerce')
df['Global_intensity'] = pd.to_numeric(df['Global_intensity'], errors='coerce')
df['Sub_metering_1'] = pd.to_numeric(df['Sub_metering_1'], errors='coerce')
df['Sub_metering_2'] = pd.to_numeric(df['Sub_metering_2'], errors='coerce')
df['Sub_metering_3'] = pd.to_numeric(df['Sub_metering_3'], errors='coerce') 

In [4]:
df.dropna(inplace=True) 

In [5]:
df.isnull().values.any() 

False

In [6]:
data_array = df.to_numpy() 

# Task 1 

### Pandas

In [7]:
start_time = time.time()
df_high_power = df[df['Global_active_power'] > 5]
df_time = time.time() - start_time 
df_high_power 

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
1,2006-12-16,0 days 17:25:00,5.360,0.436,233.63,23.0,0.0,1.0,16.0
2,2006-12-16,0 days 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,2006-12-16,0 days 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
11,2006-12-16,0 days 17:35:00,5.412,0.470,232.78,23.2,0.0,1.0,17.0
12,2006-12-16,0 days 17:36:00,5.224,0.478,232.99,22.4,0.0,1.0,16.0
...,...,...,...,...,...,...,...,...,...
2069356,2010-11-22,0 days 18:40:00,5.408,0.150,231.50,23.6,48.0,0.0,0.0
2069357,2010-11-22,0 days 18:41:00,5.528,0.144,232.48,24.6,53.0,0.0,0.0
2071586,2010-11-24,0 days 07:50:00,5.172,0.050,235.18,22.0,0.0,38.0,17.0
2071587,2010-11-24,0 days 07:51:00,5.750,0.000,234.40,24.6,0.0,39.0,17.0


### Numpy 

In [8]:
start_time = time.time()
np_high_power = data_array[data_array[:, 2] > 5]
np_time = time.time() - start_time 
np_high_power 

array([[Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 17:25:00'),
        5.36, ..., 0.0, 1.0, 16.0],
       [Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 17:26:00'),
        5.374, ..., 0.0, 2.0, 17.0],
       [Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 17:27:00'),
        5.388, ..., 0.0, 1.0, 17.0],
       ...,
       [Timestamp('2010-11-24 00:00:00'), Timedelta('0 days 07:50:00'),
        5.172, ..., 0.0, 38.0, 17.0],
       [Timestamp('2010-11-24 00:00:00'), Timedelta('0 days 07:51:00'),
        5.75, ..., 0.0, 39.0, 17.0],
       [Timestamp('2010-11-25 00:00:00'), Timedelta('0 days 07:21:00'),
        5.074, ..., 1.0, 2.0, 18.0]], dtype=object)

In [9]:
print(df_time, np_time)

0.0899817943572998 0.07520008087158203


# Task 2

### Pandas

In [10]:
start_time = time.time()
df_high_voltage = df[df['Voltage'] > 235]
df_time_voltage = time.time() - start_time 
df_high_voltage 

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
4,2006-12-16,0 days 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
5,2006-12-16,0 days 17:29:00,3.520,0.522,235.02,15.0,0.0,2.0,17.0
6,2006-12-16,0 days 17:30:00,3.702,0.520,235.09,15.8,0.0,1.0,17.0
7,2006-12-16,0 days 17:31:00,3.700,0.520,235.22,15.8,0.0,1.0,17.0
14,2006-12-16,0 days 17:38:00,4.054,0.422,235.24,17.6,0.0,1.0,17.0
...,...,...,...,...,...,...,...,...,...
2075254,2010-11-26,0 days 20:58:00,0.946,0.000,240.43,4.0,0.0,0.0,0.0
2075255,2010-11-26,0 days 20:59:00,0.944,0.000,240.00,4.0,0.0,0.0,0.0
2075256,2010-11-26,0 days 21:00:00,0.938,0.000,239.82,3.8,0.0,0.0,0.0
2075257,2010-11-26,0 days 21:01:00,0.934,0.000,239.70,3.8,0.0,0.0,0.0


### Numpy

In [11]:
start_time = time.time()
np_high_voltage = data_array[data_array[:, 4] > 235]
np_time_voltage = time.time() - start_time 
np_high_voltage 

array([[Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 17:28:00'),
        3.666, ..., 0.0, 1.0, 17.0],
       [Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 17:29:00'),
        3.52, ..., 0.0, 2.0, 17.0],
       [Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 17:30:00'),
        3.702, ..., 0.0, 1.0, 17.0],
       ...,
       [Timestamp('2010-11-26 00:00:00'), Timedelta('0 days 21:00:00'),
        0.938, ..., 0.0, 0.0, 0.0],
       [Timestamp('2010-11-26 00:00:00'), Timedelta('0 days 21:01:00'),
        0.934, ..., 0.0, 0.0, 0.0],
       [Timestamp('2010-11-26 00:00:00'), Timedelta('0 days 21:02:00'),
        0.932, ..., 0.0, 0.0, 0.0]], dtype=object)

In [12]:
print(df_time_voltage, np_time_voltage)

0.15698742866516113 0.2693796157836914


# Task 3

### Pandas

In [13]:
start_time = time.time()
df_current = df[(df['Global_intensity'] >= 19) & (df['Global_intensity'] <= 20)]
df_current = df_current[df_current['Sub_metering_2'] > df_current['Sub_metering_3']]
df_time_current = time.time() - start_time 
df_current 

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
45,2006-12-16,0 days 18:09:00,4.464,0.136,234.66,19.0,0.0,37.0,16.0
460,2006-12-17,0 days 01:04:00,4.582,0.258,238.08,19.6,0.0,13.0,0.0
464,2006-12-17,0 days 01:08:00,4.618,0.104,239.61,19.6,0.0,27.0,0.0
475,2006-12-17,0 days 01:19:00,4.636,0.140,237.37,19.4,0.0,36.0,0.0
476,2006-12-17,0 days 01:20:00,4.634,0.152,237.17,19.4,0.0,35.0,0.0
...,...,...,...,...,...,...,...,...,...
2071589,2010-11-24,0 days 07:53:00,4.666,0.000,235.72,19.8,0.0,39.0,17.0
2071590,2010-11-24,0 days 07:54:00,4.694,0.000,236.78,19.8,0.0,39.0,18.0
2071591,2010-11-24,0 days 07:55:00,4.602,0.000,237.08,19.4,0.0,40.0,17.0
2071592,2010-11-24,0 days 07:56:00,4.536,0.000,237.03,19.0,0.0,39.0,17.0


### Numpy 

In [14]:
start_time = time.time()
np_current = data_array[(data_array[:, 5] >= 19) & (data_array[:, 5] <= 20)]
np_current = np_current[np_current[:, 7] > np_current[:, 8]]
np_time_current = time.time() - start_time 
np_current 

array([[Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 18:09:00'),
        4.464, ..., 0.0, 37.0, 16.0],
       [Timestamp('2006-12-17 00:00:00'), Timedelta('0 days 01:04:00'),
        4.582, ..., 0.0, 13.0, 0.0],
       [Timestamp('2006-12-17 00:00:00'), Timedelta('0 days 01:08:00'),
        4.618, ..., 0.0, 27.0, 0.0],
       ...,
       [Timestamp('2010-11-24 00:00:00'), Timedelta('0 days 07:55:00'),
        4.602, ..., 0.0, 40.0, 17.0],
       [Timestamp('2010-11-24 00:00:00'), Timedelta('0 days 07:56:00'),
        4.536, ..., 0.0, 39.0, 17.0],
       [Timestamp('2010-11-24 00:00:00'), Timedelta('0 days 07:57:00'),
        4.626, ..., 0.0, 39.0, 17.0]], dtype=object)

In [15]:
print(df_time_current, np_time_current)

0.019698619842529297 0.17140865325927734


# Task 4

### Pandas

In [16]:
start_time = time.time()
df_sample = df.sample(n=500000, random_state=42)
df_avg_consumption = df_sample[['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].mean()
df_time_sample = time.time() - start_time 
df_avg_consumption 

Sub_metering_1    1.119258
Sub_metering_2    1.308912
Sub_metering_3    6.452950
dtype: float64

### Numpy

In [17]:
start_time = time.time()
np_sample = np.random.choice(np.arange(data_array.shape[0]), size=500000, replace=False)
np_avg_consumption = np.mean(data_array[np_sample, 6:9], axis=0)
np_time_sample = time.time() - start_time 
np_avg_consumption 

array([1.11331, 1.285534, 6.439592], dtype=object)

In [18]:
print(df_time_sample, np_time_sample)

0.20702457427978516 0.48879528045654297


# Task 5 

### Pandas 

In [19]:
start_time = time.time()
df_evening = df[(df['Time'] >= pd.to_timedelta('18:00:00')) & (df['Global_active_power'] > 6)]
df_group2_dominant = df_evening[df_evening['Sub_metering_2'] > df_evening[['Sub_metering_1', 'Sub_metering_3']].max(axis=1)]
df_result = pd.concat([df_group2_dominant.iloc[::3][:len(df_group2_dominant)//2], df_group2_dominant.iloc[::4][len(df_group2_dominant)//2:]])
df_time_evening = time.time() - start_time 
df_result 

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
41,2006-12-16,0 days 18:05:00,6.052,0.192,232.93,26.2,0.0,37.0,17.0
44,2006-12-16,0 days 18:08:00,6.308,0.116,232.25,27.0,0.0,36.0,17.0
17494,2006-12-28,0 days 20:58:00,6.386,0.374,236.63,27.0,1.0,36.0,17.0
17498,2006-12-28,0 days 21:02:00,8.088,0.262,235.50,34.4,1.0,72.0,17.0
17501,2006-12-28,0 days 21:05:00,7.230,0.152,235.22,30.6,1.0,73.0,17.0
...,...,...,...,...,...,...,...,...,...
2066471,2010-11-20,0 days 18:35:00,6.784,0.364,228.38,30.0,21.0,35.0,16.0
2066474,2010-11-20,0 days 18:38:00,6.302,0.376,229.65,27.8,15.0,34.0,17.0
2066477,2010-11-20,0 days 18:41:00,6.282,0.360,229.21,27.8,14.0,35.0,16.0
2066480,2010-11-20,0 days 18:44:00,6.374,0.392,228.52,28.2,15.0,36.0,16.0


### Numpy 

In [20]:
start_time = time.time()
np_evening = data_array[(data_array[:, 1] >= np.timedelta64(18, 'h')) & (data_array[:, 2] > 6)]
np_group2_dominant = np_evening[np_evening[:, 7] > np.maximum(np_evening[:, 6], np_evening[:, 8])]
np_result = np.concatenate([np_group2_dominant[::3][:len(np_group2_dominant)//2], np_group2_dominant[::4][len(np_group2_dominant)//2:]])
np_time_evening = time.time() - start_time 
np_result 

array([[Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 18:05:00'),
        6.052, ..., 0.0, 37.0, 17.0],
       [Timestamp('2006-12-16 00:00:00'), Timedelta('0 days 18:08:00'),
        6.308, ..., 0.0, 36.0, 17.0],
       [Timestamp('2006-12-28 00:00:00'), Timedelta('0 days 20:58:00'),
        6.386, ..., 1.0, 36.0, 17.0],
       ...,
       [Timestamp('2010-11-20 00:00:00'), Timedelta('0 days 18:41:00'),
        6.282, ..., 14.0, 35.0, 16.0],
       [Timestamp('2010-11-20 00:00:00'), Timedelta('0 days 18:44:00'),
        6.374, ..., 15.0, 36.0, 16.0],
       [Timestamp('2010-11-20 00:00:00'), Timedelta('0 days 18:49:00'),
        6.21, ..., 21.0, 34.0, 17.0]], dtype=object)

In [21]:
print(df_time_evening, np_time_evening)

0.029779434204101562 4.797719717025757
