# import packages:

In [5]:
import pandas as pd
import datetime
import numpy as np

# download dataset and unzip file:

In [7]:
# Location of the zipped raw dataset on the UCI repository:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip"

# Download the data and unzip it using Python
import urllib.request
import zipfile
import io

# Download the archive from `URL` and save it in the working directory:
urllib.request.urlretrieve(URL, "LD2011_2014.txt.zip")

# Extract the archive into the working directory; the context manager closes
# the zip file even if extraction raises:
with zipfile.ZipFile("LD2011_2014.txt.zip", 'r') as zip_ref:
    zip_ref.extractall()

# read dataset:

In [8]:
# path of the raw text file extracted into the working directory:
raw_file_path = 'LD2011_2014.txt'

# parse the raw file as ';'-separated values with ',' as the decimal mark;
# the first line is the header and no column is used as the index:
raw_dataset = pd.read_csv(
    raw_file_path,
    delimiter=';',
    header=0,
    decimal=',',
    index_col=False,
    low_memory=False,
)

# check dataframe info and head; info() prints its own report and returns None,
# so it is called directly instead of being wrapped in print() (which would
# also emit a stray "None"):
raw_dataset.info()
print(raw_dataset.head())

# rename the column containing dates in string format and convert it to datetime format:

In [20]:
# the timestamp column has no name in the raw file (it is labelled 'Unnamed: 0'),
# so relabel it as 'datetime':
data_timecol = raw_dataset.rename({'Unnamed: 0': 'datetime'}, axis='columns')

# show the dtypes before converting the 'datetime' column to a datetime dtype:
print(data_timecol.dtypes)

# parse the renamed column's strings into datetime values:
data_timecol['datetime'] = pd.to_datetime(
    data_timecol['datetime'],
    format='%Y-%m-%d %H:%M:%S',
)

# count columns per dtype to check that the conversion was successful:
print(data_timecol.dtypes.value_counts())

datetime     object
MT_001      float64
MT_002      float64
MT_003      float64
MT_004      float64
             ...   
MT_366      float64
MT_367      float64
MT_368      float64
MT_369      float64
MT_370      float64
Length: 371, dtype: object


# resample data from 15min frequency to hourly frequency:

In [12]:
# bucket the rows into 1-hour bins keyed on 'datetime', add up the readings that
# fall in each bin, and turn the bin timestamps back into a regular column:
data_resamp = (
    data_timecol
    .resample('1H', on='datetime')
    .sum()
    .reset_index()
)

# preview the first rows to confirm the resampling worked:
print(data_resamp.head())

             datetime  MT_001  MT_002  MT_003  MT_004  MT_005  MT_006  MT_007   
0 2011-01-01 00:00:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0  \
1 2011-01-01 01:00:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2 2011-01-01 02:00:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3 2011-01-01 03:00:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4 2011-01-01 04:00:00     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

   MT_008  MT_009  ...  MT_361  MT_362  MT_363  MT_364  MT_365  MT_366   
0     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0  \
1     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   

   MT_367  MT_368  MT_369  MT_370  
0     0.0     0.0     0.0     0.

# to get a tidy dataframe, use melt to unpivot the dataframe into a vertically stacked (long) format:

In [13]:
resampled_data = data_resamp

# unpivot the wide frame: one row per (datetime, client column) pair, with the
# reading stored under 'KwH' (the value_name col will be recalculated after):
melted_data = resampled_data.melt(
    id_vars=['datetime'],
    var_name='client_id',
    value_name='KwH',
)

# strip the 'MT_' prefix from client_id and store the remaining number as an int:
melted_data['client_id'] = melted_data['client_id'].str.replace('MT_', '').astype(int)
print(melted_data.head())

             datetime  client_id  KwH
0 2011-01-01 00:00:00          1  0.0
1 2011-01-01 01:00:00          1  0.0
2 2011-01-01 02:00:00          1  0.0
3 2011-01-01 03:00:00          1  0.0
4 2011-01-01 04:00:00          1  0.0


# For client data that started late and ended early, get first and last nonzero dates for each client:

In [15]:
# keep only rows with a nonzero reading, then record each client's earliest
# ('min') and latest ('max') timestamp among those rows:
nonzero_rows = melted_data['KwH'] != 0
cutoff_dates = melted_data[nonzero_rows].groupby('client_id', as_index=False).agg(
    min=('datetime', 'min'),
    max=('datetime', 'max'),
)

# a bare expression in the middle of a cell is evaluated and discarded, so the
# cutoff table is printed explicitly; the cell's last expression is still shown:
print(cutoff_dates.tail())
melted_data.head()

Unnamed: 0,client_id,min,max
365,366,2012-01-01,2015-01-01
366,367,2012-01-01,2015-01-01
367,368,2012-01-01,2015-01-01
368,369,2012-01-01,2015-01-01
369,370,2013-01-01,2015-01-01


# if a client has empty (zero-consumption) rows at the start or the end of its data, remove them:

In [19]:
# NOTE(review): independent copy of melted_data taken before the filtering step below;
# `tmp` is not referenced in the visible cells — confirm whether later cells rely on it.
tmp = melted_data.copy()

def filter_rows_before_after_date(df, cutoff_dates):
    """Drop each client's rows that fall outside its [min, max] date window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'client_id' and 'datetime'.
    cutoff_dates : pandas.DataFrame
        Must contain the columns 'client_id', 'min' and 'max'; 'min' and 'max'
        are the inclusive bounds compared against ``df['datetime']``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``. Rows whose 'datetime' is
        strictly before 'min' or strictly after 'max' are removed. The index
        holds the row positions of the merged frame, so gaps mark removed
        rows. ``df`` itself is not modified.

    Notes
    -----
    The merge is a left join, so a client missing from ``cutoff_dates`` gets
    missing bounds; comparisons against missing bounds are False, which means
    all rows of such a client are kept.
    """
    merged = df.merge(cutoff_dates, on='client_id', how='left')

    # Build one boolean mask instead of adding flag columns and dropping rows
    # in place twice. The comparisons are negated (rather than testing for
    # "inside the window") so that rows with missing bounds are retained.
    is_before_start = merged['datetime'] < merged['min']
    is_after_end = merged['datetime'] > merged['max']
    inside_window = ~(is_before_start | is_after_end)

    return merged.loc[inside_window].drop(columns=['min', 'max'])

# drop each client's rows outside the per-client bounds held in cutoff_dates;
# NOTE: this rebinds melted_data, so the unfiltered frame is no longer available under this name.
melted_data = filter_rows_before_after_date(melted_data, cutoff_dates)
# preview the first rows of the filtered frame:
melted_data.head()

Unnamed: 0,datetime,client_id,KwH
8760,2012-01-01 00:00:00,1,12.690355
8761,2012-01-01 01:00:00,1,16.497462
8762,2012-01-01 02:00:00,1,19.035533
8763,2012-01-01 03:00:00,1,17.766497
8764,2012-01-01 04:00:00,1,19.035533
