---
# Part 1: Data Preprocessing
---

## Step 1: Merge the Data
---

- Import necessary packages.

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import re
import os, fnmatch
import zipfile
from datetime import datetime, timedelta
from IPython.display import display

- Create Helpers.

In [2]:
def walklevel(some_dir, level=0):
    some_dir = some_dir.rstrip(os.path.sep)
    assert os.path.isdir(some_dir)
    num_sep = some_dir.count(os.path.sep)
    for root, dirs, files in os.walk(some_dir):
        yield root, dirs, files
        num_sep_this = root.count(os.path.sep)
        if num_sep + level <= num_sep_this:
            del dirs[:]
            
def swap_day_month(s):
    day, month, yr = s.split('.')
    return month +'/' + day +'/' + yr

def swap_day_month2(s):
    yr, month, day = s.split('-')
    return month +'/' + day +'/' + yr

def create_hr_seq():
    now = datetime(2000, 1, 1, 0, 0, 0)
    last = datetime(2000, 1, 2, 0, 0, 0)
    delta = timedelta(seconds=1)
    times = []
    while now < last:
        times.append(repr(now.strftime('%H:%M:%S')))
        now += delta
    return times

def create_dtype_dict():
    dic = {}
    dic['Unnamed: 0'] = str
    for i in create_hr_seq():
        dic[i] = float
    return dic

- Iterate through each house:
  - Create a master DataFrame;
  - Process *occupancy* data:
    -
  - Process *plugs* data:

In [4]:
!echo %cd% # under windows

%cd%


In [9]:
### master DataFrames of each house
house_df_dict = {}

CUR_PATH = '~/Desktop/IEOR135/DataSets'
change_path = lambda DIR : CUR_PATH + '/' + DIR

for root, dirs, files in walklevel(CUR_PATH): 
    ### 6 Houses
    for house in dirs:
        house_number = int(house.split(' ')[1])
        print('...' + house + '...')
        CUR_PATH = change_path(house)
        # read .txt file
        for file in os.listdir(CUR_PATH):  
            if fnmatch.fnmatch(file, '*.txt'):
                with open(change_path(file)) as f:
                    F = f.read()
                    num_appliance_str = re.findall(r'[0-9][0-9]:\s[^\(]*[a-z]', F)
                    dates = re.findall(r'[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9]', F)
            
        # calculate data range from .txt file
        start_date, end_date = swap_day_month(dates[2]), swap_day_month(dates[3]) #Plugs
        # create the master df for the house
        master_df = pd.DataFrame(pd.date_range(start=start_date, end=end_date, freq='S', closed='left'),
                                 columns=['date']) #Plugs>Occupancy
        ### Process Occupancy Data
        if house_number != 6: # not available
            print('Processing Occupancy Data...')
            ROOT = CUR_PATH
            CUR_PATH = change_path('Occupany') if house_number != 1 else change_path('Occupancy')

            # read summer/winter.csv
            csv_tuple = []
            for file in os.listdir(CUR_PATH):  
                if fnmatch.fnmatch(file, '*.csv'):
                    csv_df = pd.read_csv(change_path(file), dtype=create_dtype_dict())
                    # transpose csv_df
                    csv_df = csv_df.set_index('Unnamed: 0')
                    csv_df.index.name = ''
                    csv_df = csv_df.T 
                    csv_tuple.append(csv_df)
            assert csv_tuple[0].shape[0] == csv_tuple[1].shape[0]
            combined_csv = pd.concat(csv_tuple, axis=1)       
            # convert to 2 cols
            combined_csv.index.name = 'date'
            concat_lst = []
            for col in combined_csv.columns:
                df = combined_csv[col].to_frame()
                df.rename(columns={col:'foo'}, inplace=True)
                df.reset_index(level=0, inplace=True)
                df['date'] = pd.date_range(start=col, periods=60*60*24, freq='S')
                concat_lst.append(df)
            # add to master_df
            final_df = pd.concat(concat_lst).rename(columns={'foo':'occupancy'})
            master_df = master_df.merge(final_df, on='date', how='left')
            CUR_PATH = ROOT
        
        
        
        ### Process Plug Data
        print('Processing Plugs Data...')
        CUR_PATH = change_path('Plugs')
        CUR_PATH = change_path('{:02}'.format(house_number))
        ROOT = CUR_PATH
        # get the names of the each electric appliance
        num_appliance_dict = dict(item.split(": ") for item in num_appliance_str)
        # loop through each appliance
        for n in num_appliance_dict.keys():
            appliance_name = num_appliance_dict[n]
            CUR_PATH = change_path(n)
            concat_lst = []
            # loop through each date of this appliance
            done = [] # prevent duplicates from House 02/09
            for file in os.listdir(CUR_PATH):  
                if fnmatch.fnmatch(file, '*.csv'):
                    date = swap_day_month2(os.path.splitext(file)[0][:10])
                    if date in done:
                        continue 
                    else:
                        done.append(date)
                    csv_df = pd.read_csv(change_path(file), names=[appliance_name])
                    cur_df = pd.DataFrame(pd.date_range(start=date, periods=60*60*24, freq='S'), columns=['date'])
                    cur_df[appliance_name] = csv_df.iloc[:,0]
                    concat_lst.append(cur_df)
            df = pd.concat(concat_lst, ignore_index=True)
            master_df = master_df.merge(df, on='date', how='left')
            CUR_PATH = ROOT
                        
        ### Process Metor Data (not now)
        house_df_dict[house_number] = master_df
        CUR_PATH = './DataSets'
        print('Completed {}!'.format(house))
print('Successfully merged all the data!!!')

AssertionError: 

In [None]:
ROOT = './DataSets'
CUR_PATH = ROOT
change_path = lambda DIR : CUR_PATH + '/' + DIR

for root, dirs, files in walklevel(CUR_PATH):
    ### 6 Houses
    for house in dirs:
        house_number = int(house.split(' ')[1])
        print('...' + house + '...')
        HOUSE_ROOT = change_path(house)
        CUR_PATH = HOUSE_ROOT

        if house_number != 6:
            CUR_PATH = change_path('Occupany') if house_number != 1 else change_path('Occupancy')
            for file in os.listdir(CUR_PATH)  :  
                if fnmatch.fnmatch(file, '*.zip'):
                    zip = zipfile.ZipFile(change_path(file), 'r')
                    zip.extractall(CUR_PATH)
                    print('unzipped ' + file + ' to ' + CUR_PATH)
            CUR_PATH = HOUSE_ROOT

        CUR_PATH = change_path('Plugs')
        for file in os.listdir(CUR_PATH)  :  
            if fnmatch.fnmatch(file, '*.zip'):
                zip = zipfile.ZipFile(change_path(file), 'r')
                zip.extractall(CUR_PATH)
                print('unzipped ' + file + ' to ' + CUR_PATH)
        CUR_PATH = ROOT

- Check the created master DataFrames.

In [None]:
for key in sorted(house_df_dict.keys()):
    df = house_df_dict[key]
    print('House {:02d} has {} millions of rows with the preview:'.format(key, round(df.shape[0]/1000000, 2)))
    display(df.head())
    print('Its features/columns are:')
    print(*df.columns, sep=", ") 
    print('')
    print('The general data statistics are:')
    display(df.describe())
    print('The Data Frame information (null, data type, etc.) are:')
    display(df.info())

- Save the created DataFrames to files.

In [None]:
for key in sorted(house_df_dict.keys()):
    df = house_df_dict[key]
    fname = 'house_{:02d}'.format(key, round(df.shape[0]/1000000, 2)) + '.csv'
    df.to_csv(change_path(fname))

## Step 2: Clean the Data
---

### (a) Deal with `nan` and `-1` values

### (b) ...

---
# Part 2: EDA
---

---
# Part 3: Time Series Spike
---

---
# Part 4: ML
---

---
# Part 5: Linear Programming
---