---
# Part 1: Data Preprocessing
---

## Step 1: Merge the Data
---

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import re
import os, fnmatch
import zipfile
from datetime import datetime

In [2]:
### helpers 
def walklevel(some_dir, level=0):
    """Walk ``some_dir`` like ``os.walk`` but descend at most ``level`` levels.

    Yields the same ``(root, dirs, files)`` tuples as ``os.walk``. With the
    default ``level=0`` only ``some_dir`` itself is yielded. Depth is measured
    by counting ``os.path.sep`` occurrences in each visited root.

    Note: the ``dirs`` list of a tuple is emptied once the generator is
    resumed at the depth limit; that is how ``os.walk`` is told to stop
    descending.
    """
    top = some_dir.rstrip(os.path.sep)
    assert os.path.isdir(top)
    # Separator count of the deepest directory whose children are NOT visited.
    deepest_allowed = top.count(os.path.sep) + level
    for entry in os.walk(top):
        yield entry
        current_root, subdirs, _ = entry
        if current_root.count(os.path.sep) >= deepest_allowed:
            # Pruning in place makes os.walk skip everything below this root.
            subdirs[:] = []
def swap_day_month(s):
    """Turn a dot-separated ``day.month.year`` string into ``month/day/year``.

    Raises ValueError when ``s`` does not split into exactly three parts.
    """
    dd, mm, yy = s.split('.')
    return '/'.join([mm, dd, yy])
def swap_day_month2(s):
    """Turn a dash-separated ``year-month-day`` string into ``month/day/year``.

    Raises ValueError when ``s`` does not split into exactly three parts.
    """
    yyyy, mm, dd = s.split('-')
    return '{}/{}/{}'.format(mm, dd, yyyy)

In [15]:
### master DataFrames of each house
# Builds `house_df_dict`: {house number -> DataFrame with a 'date' column
# (one row per second of the plug measurement period) plus one column per
# appliance}.
#
# Layout read from disk (plug archives must already be extracted):
#   ./DataSets/House <N>/*.txt                       house description
#   ./DataSets/House <N>/Plugs/<NN>/<plug id>/*.csv  one CSV per day, file
#                                                    name starts with YYYY-MM-DD
house_df_dict = {}

CUR_PATH = './DataSets'
# No longer used by this cell (paths are built explicitly below instead of
# mutating CUR_PATH); kept in case a later cell still calls it.
change_path = lambda DIR : CUR_PATH + '/' + DIR

SECONDS_PER_DAY = 60 * 60 * 24


def _read_house_description(house_dir):
    """Parse the ``*.txt`` description file found in ``house_dir``.

    Returns ``(plug_names, dates)``: ``plug_names`` maps each two-digit plug
    id (string) to its appliance name, ``dates`` lists every ``dd.mm.yy``
    date in the file in order of appearance.

    Raises FileNotFoundError when the directory contains no ``*.txt`` file.
    Without this check the values parsed for the previous house would be
    silently reused (or a NameError raised for the first house).
    """
    txt_files = sorted(name for name in os.listdir(house_dir)
                       if fnmatch.fnmatch(name, '*.txt'))
    if not txt_files:
        raise FileNotFoundError('No .txt description file in ' + house_dir)
    # If several description files exist, the last one (sorted order) wins.
    with open(os.path.join(house_dir, txt_files[-1])) as f:
        text = f.read()
    plug_entries = re.findall(r'[0-9][0-9]:\s[^\(]*[a-z]', text)
    dates = re.findall(r'[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9]', text)
    return dict(item.split(": ") for item in plug_entries), dates


def _half_open_second_range(start_date, end_date):
    """Return a 1-second DatetimeIndex covering ``[start_date, end_date)``.

    Same result as ``pd.date_range(..., closed='left')``, but without the
    ``closed`` keyword (removed in pandas 2.0) or its replacement
    ``inclusive`` (unknown to pandas < 1.4), so it runs on either.
    """
    start_ts, end_ts = pd.Timestamp(start_date), pd.Timestamp(end_date)
    # Lowercase 's': the uppercase 'S' alias is deprecated since pandas 2.2.
    timeline = pd.date_range(start=start_ts, end=end_ts, freq='s')
    if len(timeline) and timeline[-1] == end_ts:
        timeline = timeline[:-1]  # drop the right endpoint
    return timeline


def _load_appliance_frame(appliance_dir, appliance_name):
    """Stack the daily CSV files of one plug into a single DataFrame.

    The result has a 'date' column (one row per second of each day found) and
    an ``appliance_name`` column with the readings. Returns None when the
    directory contains no ``*.csv`` file.
    """
    daily_frames = []
    seen_dates = set()  # prevent duplicates from House 02/09
    # Sorted so that, among duplicate days, the kept file is deterministic.
    for name in sorted(os.listdir(appliance_dir)):
        if not fnmatch.fnmatch(name, '*.csv'):
            continue
        date = swap_day_month2(os.path.splitext(name)[0][:10])
        if date in seen_dates:
            continue
        seen_dates.add(date)
        csv_df = pd.read_csv(os.path.join(appliance_dir, name), names=[appliance_name])
        cur_df = pd.DataFrame(pd.date_range(start=date, periods=SECONDS_PER_DAY, freq='s'),
                              columns=['date'])
        # Index-aligned assignment: short files leave trailing NaN, extra rows are dropped.
        cur_df[appliance_name] = csv_df.iloc[:, 0]
        daily_frames.append(cur_df)
    if not daily_frames:
        return None
    return pd.concat(daily_frames, ignore_index=True)


for root, dirs, files in walklevel(CUR_PATH):
    ### 6 Houses
    for house in sorted(dirs):
        try:
            house_number = int(house.split(' ')[1])
        except (IndexError, ValueError):
            # e.g. '.ipynb_checkpoints' or other folders that are not 'House <N>'
            print('Skipping non-house directory: ' + house)
            continue
        print('###' + house + '...')
        house_dir = os.path.join(CUR_PATH, house)

        # read .txt file
        num_appliance_dict, dates = _read_house_description(house_dir)
        if len(dates) < 4:
            raise ValueError('Expected at least 4 dates in the description of {}, found {}'
                             .format(house, len(dates)))

        # calculate data range from .txt file (3rd and 4th date = plug period)
        start_date, end_date = swap_day_month(dates[2]), swap_day_month(dates[3]) #Plugs
        # create the master df for the house
        master_df = pd.DataFrame(_half_open_second_range(start_date, end_date),
                                 columns=['date']) #Plugs

        ### Process Occupancy Data (not implemented yet)

        ### Process Plug Data
        print('Processing Plugs Data...')
        plugs_dir = os.path.join(house_dir, 'Plugs', '{:02}'.format(house_number))
        # loop through each appliance
        for n, appliance_name in num_appliance_dict.items():
            df = _load_appliance_frame(os.path.join(plugs_dir, n), appliance_name)
            if df is None:
                # Keep the column so every listed appliance is present in the frame.
                print('No CSV files for {} (plug {}); column left empty'.format(appliance_name, n))
                master_df[appliance_name] = np.nan
                continue
            master_df = master_df.merge(df, on='date', how='left')

        ### Process Meter Data (not now)
        house_df_dict[house_number] = master_df
        print('Completed {}!'.format(house))
print('Successfully merged all the data!!!')

###House 3...
Processing Plugs Data...
Completed House 3!
###House 4...
Processing Plugs Data...
Completed House 4!
###House 5...
Processing Plugs Data...
Completed House 5!
###House 2...
Processing Plugs Data...
Completed House 2!
###House 1...
Processing Plugs Data...
Completed House 1!
###House 6...
Processing Plugs Data...
Completed House 6!


In [16]:
### TEST
from IPython.display import display
# Sanity check: for every house (ascending house number) show how many rows
# the merged frame has, in millions, followed by a preview of its first rows.
for key in sorted(house_df_dict):
    df = house_df_dict[key]
    millions_of_rows = round(len(df) / 1000000, 2)
    display('House {:02d} has {} millions of rows'.format(key, millions_of_rows))
    display(df.head())

'House 01 has 2.42 millions of rows'

Unnamed: 0,date,Fridge,Dryer,Coffee machine,Kettle,Washing machine,Freezer
0,2012-06-01 00:00:00,49.2516,830.508,,0.0,4.39739,2.23178
1,2012-06-01 00:00:01,49.2516,834.774,,0.0,4.39739,2.23178
2,2012-06-01 00:00:02,49.2516,834.774,,0.0,4.39739,2.23178
3,2012-06-01 00:00:03,51.3899,832.641,,0.0,4.39739,2.23178
4,2012-06-01 00:00:04,49.2516,832.641,,0.0,6.5338,2.23178


'House 02 has 21.08 millions of rows'

Unnamed: 0,date,Tablet,Dishwasher,Air exhaust,Fridge,Entertainment,Freezer,Kettle,Lamp,Laptops,Stove,Stereo
0,2012-06-01 00:00:00,2.21504,0.0,,,,53.651,0.0,,0.0,,
1,2012-06-01 00:00:01,4.3293,0.0,,,,55.7929,0.0,,0.0,,
2,2012-06-01 00:00:02,2.21504,0.0,,,,53.651,0.0,,0.0,,
3,2012-06-01 00:00:03,2.21504,0.0,,,,53.651,0.0,,0.0,,
4,2012-06-01 00:00:04,2.21504,0.0,,,,55.7929,0.0,,0.0,,


'House 03 has 8.64 millions of rows'

Unnamed: 0,date,Tablet,Freezer,Coffee machine,Fridge,Kettle,Entertainment
0,2012-10-23 00:00:00,-1.0,-1.0,-1.0,-1.0,,
1,2012-10-23 00:00:01,-1.0,-1.0,-1.0,-1.0,,
2,2012-10-23 00:00:02,-1.0,-1.0,-1.0,-1.0,,
3,2012-10-23 00:00:03,-1.0,-1.0,-1.0,-1.0,,
4,2012-10-23 00:00:04,-1.0,-1.0,-1.0,-1.0,,


'House 04 has 18.14 millions of rows'

Unnamed: 0,date,Fridge,Kitchen appliances,Lamp,Stereo and laptop,Freezer,Tablet,Entertainment,Microwave
0,2012-06-27 00:00:00,102.429,2.16516,2.23978,15.0524,172.72,0.0,10.7178,4.34694
1,2012-06-27 00:00:01,100.296,2.16516,2.23978,15.0524,170.589,0.0,10.7178,2.23214
2,2012-06-27 00:00:02,102.429,0.0,0.0,15.0524,172.72,0.0,10.7178,4.34694
3,2012-06-27 00:00:03,102.429,0.0,0.0,15.0524,172.72,2.22889,10.7178,4.34694
4,2012-06-27 00:00:04,100.296,2.16516,2.23978,15.0524,172.72,0.0,10.7178,2.23214


'House 05 has 18.84 millions of rows'

Unnamed: 0,date,Tablet,Coffee machine,Fountain,Microwave,Fridge,Entertainment,Kettle
0,2012-06-27 00:00:00,2.20778,4.48706,8.72041,4.44332,4.44546,6.56679,
1,2012-06-27 00:00:01,2.20778,2.3477,8.72041,4.44332,4.44546,8.69303,
2,2012-06-27 00:00:02,4.33249,4.48706,8.72041,6.57853,4.44546,8.69303,
3,2012-06-27 00:00:03,4.33249,4.48706,8.72041,4.44332,4.44546,6.56679,
4,2012-06-27 00:00:04,2.20778,2.3477,8.72041,4.44332,4.44546,8.69303,


'House 06 has 18.84 millions of rows'

Unnamed: 0,date,Lamp,Laptop,Router,Coffee machine,Entertainment,Fridge,Kettle
0,2012-06-27 00:00:00,0.0,4.35384,19.3387,0.0,15.0043,2.19884,0.0
1,2012-06-27 00:00:01,0.0,4.35384,19.3387,0.0,15.0043,2.19884,0.0
2,2012-06-27 00:00:02,0.0,6.47995,19.3387,0.0,15.0043,0.0,0.0
3,2012-06-27 00:00:03,0.0,6.47995,19.3387,0.0,15.0043,0.0,0.0
4,2012-06-27 00:00:04,0.0,4.35384,19.3387,0.0,15.0043,0.0,0.0


## Step 2: Clean the Data
---

### (a) Deal with `nan` and `-1` values

---
# Part 2: EDA
---

---
# Part 3: Time Series Spike
---

---
# Part 4: ML
---

---
# Part 5: Linear Programming
---