In [124]:
import requests, zipfile
import pandas as pd
from io import BytesIO
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip'

# get a zip file from UCI machine learning respository 
request = requests.get(url)
file = zipfile.ZipFile(BytesIO(request.content))

# reads txt file from zip, as byte code 
with file.open('household_power_consumption.txt') as f:
    txt = f.readlines()

    # decodes bytes to string, splaces return and newline characters with ''
txt_decoded = [row.decode("utf-8").replace('\r\n', '') for row in txt]

# extracts column names 
cols = txt_decoded[0].split(';')

# create a data frame
df = pd.DataFrame(columns=cols, data = [row.split(';') for row in txt_decoded[1:100000]])

# combines date and time col to a date time col
df['Datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])

# sets date time as index
df.set_index('Datetime', inplace=True)
df.drop(['Date', 'Time'], axis=1, inplace=True)

# coerrce all the string columns to float
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0




+    global_active_power: The total active power consumed by the household (kilowatts).
+    global_reactive_power: The total reactive power consumed by the household (kilowatts).
+    voltage: Average voltage (volts).
+    global_intensity: Average current intensity (amps).
+    sub_metering_1: Active energy for kitchen (watt-hours of active energy).
+    sub_metering_2: Active energy for laundry (watt-hours of active energy).
+    sub_metering_3: Active energy for climate control systems (watt-hours of active energy).


In [125]:
# groups by daily
daily_groups = df.resample('D') 
# aggregates by some
daily_data = daily_groups.sum()
daily_data.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16,1209.176,34.922,93552.53,5180.8,0.0,546.0,4926.0
2006-12-17,3390.46,226.006,345725.32,14398.6,2033.0,4187.0,13341.0
2006-12-18,2203.826,161.792,347373.64,9247.2,1063.0,2621.0,14018.0
2006-12-19,1666.194,150.942,348479.01,7094.0,839.0,7602.0,6197.0
2006-12-20,2225.748,160.998,348923.61,9313.0,0.0,2648.0,14063.0


In [128]:
dates = daily_data.index
min_date = min(dates)
max_date = max(dates)
horrizon = min_date
print(min_date, max_date)

def get_data_ranges(dates, d=7):
    min_date = min(dates)
    max_date = max(dates)
    horrizon = min_date
    while horrizon + pd.Timedelta(days=d*2) <= max_date:
        x_date_range = pd.date_range(horrizon, periods=d)
        y_date_range = pd.date_range(horrizon + pd.Timedelta(days=d) , periods=d)
        horrizon =  horrizon +  pd.Timedelta(days=d)
        yield x_date_range, y_date_range

date_ranges = list(get_data_ranges(daily_data.index, d=7))
date_ranges[0]

2006-12-16 00:00:00 2007-12-02 00:00:00


(DatetimeIndex(['2006-12-16', '2006-12-17', '2006-12-18', '2006-12-19',
                '2006-12-20', '2006-12-21', '2006-12-22'],
               dtype='datetime64[ns]', freq='D'),
 DatetimeIndex(['2006-12-23', '2006-12-24', '2006-12-25', '2006-12-26',
                '2006-12-27', '2006-12-28', '2006-12-29'],
               dtype='datetime64[ns]', freq='D'))

(Timestamp('2006-12-16 00:00:00', freq='D'),
 Timestamp('2006-12-23 00:00:00', freq='D'))