In [1]:
import os
import sys
import csv
import ast
import json
from datetime import datetime
import numpy as np
import pandas as pd

In [32]:
class HomeData():
    """Base class that remembers one home's root directory and offers shared helpers.

    The home and system identifiers are parsed from the final component of
    ``path``, split on ``-``: the second-to-last piece is the home and the
    last piece is the system (e.g. ``.../H3-red`` gives ``H3`` and ``red``).
    """

    def __init__(self, path):
        """Store ``path`` and derive ``self.home`` / ``self.system`` from it.

        Raises:
            IndexError: if the final path component contains no ``-``.
        """
        self.root_dir = path
        # normpath drops a trailing separator, so '.../H3-red/' parses the same
        # way as '.../H3-red' instead of yielding an empty final component.
        folder_parts = os.path.basename(os.path.normpath(path)).split('-')
        self.home = folder_parts[-2]
        self.system = folder_parts[-1]

    def mylistdir(self, directory):
        """Return the entries of ``directory``, skipping dot-files and names containing 'Icon'."""
        return [
            entry for entry in os.listdir(directory)
            if not (entry.startswith('.') or 'Icon' in entry)
        ]

    def make_storage_directory(self, target_dir):
        """Create ``target_dir`` (including parents) if needed and return it."""
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(target_dir, exist_ok=True)
        return target_dir

    def date_segments(self, dates):
        """Group ``dates`` into runs with no gap of more than one day.

        Args:
            dates: sequence of datetime-like values; consecutive elements are
                compared in the order given, so the caller should pass them sorted.

        Returns:
            A list of lists. A new inner list is started whenever the difference
            between an element and its predecessor exceeds one day (``.days > 1``).
            An empty input yields an empty list.
        """
        if not dates:
            return []

        segments = [[dates[0]]]
        for previous, current in zip(dates, dates[1:]):
            if (current - previous).days > 1:
                segments.append([current])
            else:
                segments[-1].append(current)
        return segments
        

In [15]:
class HomeOccupancy(HomeData):
    """Turn per-occupant enter/exit logs into a regularly sampled occupancy table.

    Ground-truth CSV files are read from ``<root_dir>/GroundTruth`` and the
    resulting table is written to ``<root_dir>/Full_Occupancy_Files``.
    """

    def __init__(self, path, freq = '60S'):
        """Set up input/output folders.

        Args:
            path: root directory of the home (parsed by ``HomeData.__init__``).
            freq: pandas frequency string used for the occupancy time index.
        """
        # The parent class in this notebook is HomeData; the previous call to
        # DataReadWrite.__init__ referenced a name that is not defined here.
        HomeData.__init__(self, path)
        self.ground_path = os.path.join(self.root_dir, 'GroundTruth')
        self.write_dir = self.make_storage_directory(os.path.join(self.root_dir, 'Full_Occupancy_Files'))
        self.occ_freq = freq
        self.occupant_names = []

    def mylistdir(self, directory):
        """Return only the ``.csv`` entries of ``directory`` (overrides the base filter)."""
        return [entry for entry in os.listdir(directory) if entry.endswith('.csv')]

    def get_ground_truth(self):
        """Read every occupant CSV and return their status events.

        Each non-empty row is expected to hold the status in column 1 and, in
        column 2, a timestamp such as ``July 16, 2019 at 03:40PM``.

        Side effects:
            Appends each occupant name to ``self.occupant_names`` and sets
            ``self.first_last`` to (earliest first-row time, latest last-row
            time) across all files.

        Returns:
            dict mapping occupant name to a list of ``(status, datetime)``
            tuples in file order. Files without any rows are skipped.

        Raises:
            IndexError: if no file contributed any rows.
        """
        occupant_files = self.mylistdir(self.ground_path)
        occupants = {}
        enter_times, exit_times = [], []

        for occ in occupant_files:
            # splitext removes exactly the extension. str.strip('.csv') would
            # instead strip any of the characters '.', 'c', 's', 'v' from both
            # ends and so mangle names that start or end with those letters.
            base_name = os.path.splitext(occ)[0]
            #occupant_name = base_name.split('-')[1] ## H3-black
            occupant_name = base_name.split('-')[0]  ## H1, H3-red

            ishome = []
            with open(os.path.join(self.ground_path, occ)) as csv_file:
                for row in csv.reader(csv_file, delimiter=','):
                    if not row:
                        # csv.reader yields [] for blank lines; nothing to parse.
                        continue
                    status, when = row[1], row[2].split('at')
                    dt_day = datetime.strptime(str(when[0] + when[1]), '%B %d, %Y  %I:%M%p')
                    ishome.append((status, dt_day))

            if not ishome:
                # Without rows there is no first/last timestamp for this file;
                # skip it rather than reuse a timestamp left over from another file.
                print(occ + ': no ground truth rows, skipping')
                continue

            self.occupant_names.append(occupant_name)
            # First and last row of the file (assumes rows are chronological — TODO confirm).
            enter_times.append(ishome[0][1])
            exit_times.append(ishome[-1][1])
            occupants[occupant_name] = ishome

        self.first_last = (sorted(enter_times)[0], sorted(exit_times)[-1])
        return occupants

    def create_occupancy_df(self, occupants):
        """Build the occupancy table from the events returned by ``get_ground_truth``.

        Args:
            occupants: dict of occupant name -> list of ``(status, datetime)``.

        Returns:
            DataFrame indexed from ``self.first_last[0]`` to ``self.first_last[1]``
            at ``self.occ_freq``, with one 0/1 column per occupant, a ``number``
            column (row sum of the occupant columns) and an ``occupied`` column
            (1 where ``number > 0``).

        Raises:
            ValueError: if an occupant has an empty event list.
        """
        occ_range = pd.date_range(start=self.first_last[0], end=self.first_last[1], freq=self.occ_freq)
        occ_df = pd.DataFrame(index=occ_range)
        unset = 99  # sentinel for "not yet assigned"

        for occ, events in occupants.items():
            if not events:
                raise ValueError('no ground truth events for occupant {}'.format(occ))

            occ_df[occ] = unset
            previous = 'exited'
            for event in events:
                status, when = event[0], event[1]
                # Rows before this event that are still unassigned get the state
                # that held between the previous event and this one.
                if previous == 'exited' and status == 'entered':
                    fill = 0
                elif previous == 'entered' and status == 'exited':
                    fill = 1
                else:
                    fill = None  # repeated or unknown status: leave rows untouched
                if fill is not None:
                    occ_df.loc[(occ_df.index < when) & (occ_df[occ] == unset), occ] = fill
                previous = status

            # Everything from the last event onward keeps the last known state.
            remaining = (occ_df.index >= when) & (occ_df[occ] == unset)
            if previous == 'entered':
                occ_df.loc[remaining, occ] = 1
            elif previous == 'exited':
                occ_df.loc[remaining, occ] = 0

        occ_df['number'] = occ_df[list(occupants.keys())].sum(axis = 1)
        occ_df['occupied'] = 0
        occ_df.loc[occ_df['number'] > 0, 'occupied'] = 1
        return (occ_df)

    def write_occupancy_csv(self, df):
        """Write ``df`` to ``<write_dir>/<home>-Occupancy_df.csv`` unless that file exists."""
        fname = os.path.join(self.write_dir,'{}-Occupancy_df.csv'.format(self.home))
        if not os.path.isfile(fname):
            df.to_csv(fname, index = True)
            print(fname + ': Write Sucessful!')
        else:
            print(fname + ': File already exists')

    def main(self):
        """Read the ground truth, build the occupancy table (kept on ``self.df``) and write it."""
        occupant_status = self.get_ground_truth()
        self.df = self.create_occupancy_df(occupant_status)
        self.write_occupancy_csv(self.df)
        

In [36]:
class ReadEnv(HomeData):
    """Read one sensor hub's environmental JSON files and summarise daily coverage.

    Data is expected under ``<root_dir>/<sensor_hub>/env_params/<day>/<HHMM>/``;
    the four-character folder names are formatted as ``HH:MM`` in the summary.
    """

    def __init__(self, path, sensor_hub):
        """Initialise bookkeeping for hub ``sensor_hub`` located under ``path``."""
        HomeData.__init__(self, path)
        self.name = sensor_hub
        self.env_dir = os.path.join(self.root_dir, self.name, 'env_params')
        self.all_data = {}
        self.first_last = {}
        # 288 folders x 5 files = 1440, i.e. the expected minutes in a full day.
        self.num_folders = 288
        self.files_per = 5
        self.minutes_per_day = 1440
        self.total_minutes = {}
        self.dates_to_use = []
        self.details = []

    def get_date_folders(self, path):
        """Return the sorted day folders in ``path``; records first/last on ``day1``/``dayn``.

        ``self.day1`` and ``self.dayn`` are set to ``None`` when ``path`` has no
        usable entries.
        """
        date_folders = self.mylistdir(path)
        date_folders.sort()
        if date_folders:
            self.day1, self.dayn = date_folders[0], date_folders[-1]
        else:
            self.day1 = self.dayn = None
        return date_folders

    def read_in_data(self, path):
        """Append every measurement found in the JSON file at ``path`` to ``self.measurements``.

        The file is expected to hold a list of dicts keyed by measurement name.
        Files that cannot be parsed or that contain an unexpected structure/key
        are skipped from the point of failure (best effort).
        """
        with open(path, 'r') as f:
            try:
                self.data_dicts = json.loads(f.read())
                for time_point in self.data_dicts:
                    for measure in time_point:
                        self.measurements[measure].append(time_point[measure])
            except (ValueError, KeyError, TypeError):
                # ValueError covers malformed JSON (JSONDecodeError) and bad
                # encodings; KeyError/TypeError cover unexpected keys or shapes.
                # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
                pass

    def get_all_data(self, path, day):
        """Load all readings for ``day`` and count how many expected minutes are present.

        Side effects:
            ``self.measurements`` is reset and filled, then stored in
            ``self.all_data[day]``; ``self.first_last[day]`` receives the first
            folder name and the last folder name plus ``files_per - 1``;
            ``self.total_minutes[day]`` receives the expected minutes minus the
            missing file count.
        """
        self.measurements = {
            'time':[], 'tvoc_ppb':[], 'temp_c':[], 'rh_percent':[],
            'light_lux':[],'co2eq_ppm':[], 'dist_mm':[], 'co2eq_base':[], 'tvoc_base':[]}
        file_path = os.path.join(path, day)
        minute_folders = self.mylistdir(file_path)
        minute_folders.sort()
        # Every absent folder counts as files_per missing files.
        num_missing = self.files_per * (self.num_folders - len(minute_folders))

        if minute_folders:
            min_1, min_L = minute_folders[0], minute_folders[-1]
            # The last folder covers files_per minutes starting at its own name.
            min_n = str(int(min_L) + self.files_per - 1).zfill(4)
            self.first_last[day] = min_1, min_n
        else:
            # Empty day folder: nothing was recorded, use a placeholder range.
            self.first_last[day] = ('0000', '0000')

        for minute in minute_folders:
            sub_files_path = os.path.join(file_path, minute)
            sub_files = self.mylistdir(sub_files_path)
            sub_files.sort()
            num_missing += self.files_per - len(sub_files)
            for file in sub_files:
                if file.endswith('.json'):
                    self.read_in_data(os.path.join(sub_files_path, file))

        self.all_data[day] = self.measurements
        self.total_minutes[day] = self.minutes_per_day - num_missing

    def get_day_summary(self, day):
        """Load ``day`` and return ``(summary_string, fraction_of_day_present)``.

        The summary has the form ``'<hub> <day> (HH:MM, HH:MM) <fraction>'``.
        If the fraction cannot be computed the error is printed and 0 is used.
        """
        self.get_all_data(self.env_dir, day)
        try:
            total = self.total_minutes[day]/self.minutes_per_day
            perc = '{:.2f}'.format(total)
        except Exception as e:
            print('except: {}'.format(e))
            # Keep both values defined (and perc in the same text format) so the
            # return statement below cannot fail on an unbound name.
            total, perc = 0.0, '0.00'
        F1, F2 = self.first_last[day][0], self.first_last[day][1]
        s = (f'({F1[0:2]}:{F1[2:4]}, {F2[0:2]}:{F2[2:4]})')
        details = '{} {} {} {}'.format(self.name, day, s, perc)
        return details, total

    def get_date_splits(self, dates):
        """Split ``'%Y-%m-%d'`` date strings into lists of consecutive days.

        Returns an empty list when ``dates`` is empty.
        """
        if not dates:
            return []
        dt_dates = [datetime.strptime(date, '%Y-%m-%d') for date in dates]
        date_lists = self.date_segments(dt_dates)
        all_lists = [[date.strftime('%Y-%m-%d') for date in sublist] for sublist in date_lists]
        return all_lists

    def main(self):
        """Summarise every day folder and collect the days with more than 85% coverage.

        Results are stored on ``self.details``, ``self.dates_to_use`` and
        ``self.list_of_dates`` (the usable days grouped into consecutive runs).
        """
        date_folders = self.get_date_folders(self.env_dir)

        for day in date_folders:
            day_details, total = self.get_day_summary(day)
            print(day_details)
            self.details.append(day_details)

            if total > 0.85:
                self.dates_to_use.append(day)

        self.list_of_dates = self.get_date_splits(self.dates_to_use)
        

In [None]:
class CleanEnvData(HomeData):#, ReadEnv):
    """Reshape one sensor hub's raw readings into time-indexed, labelled DataFrames.

    NOTE(review): this class is unfinished. ``clean_dates`` calls
    ``self.create_full_dfs``, which is not defined on this class or on
    ``HomeData``; only a commented-out draft exists elsewhere in the notebook.
    """

    def __init__(self, path, sensor_hub):
        """Store the home path and the sensor hub name."""
        HomeData.__init__(self, path)
        #ReadEnv.__init__(self, path, sensor_hub)
        # clean_dates() labels rows with self.name, so it has to be set here now
        # that ReadEnv.__init__ (which assigns it) is no longer called.
        self.name = sensor_hub
        self.all_dfs = {}

    def clean_dates(self, df, day):
        """Index ``df`` by timestamp floored to 10 s and add date/time helper columns.

        Args:
            df: DataFrame with a string ``time`` column such as
                ``2019-07-17T00:05:01Z``.
            day: day label, passed through to ``self.create_full_dfs``.

        Returns:
            A new DataFrame with leading columns ``hour``, ``minute``, ``second``,
            ``time-hr-min-sec`` and ``date``, followed by the remaining columns
            (``time`` dropped), sorted by date and time, plus ``home`` and
            ``sensor`` columns.
        """
        # Work on a copy so the caller's DataFrame is not modified in place.
        df = df.copy()
        df['time'] = df['time'].str.strip('Z').str.replace('T',' ')
        df['datetime_index'] = pd.to_datetime(df['time'])
        df = df.set_index('datetime_index')
        df.index = df.index.floor('10s')
        df2 = self.create_full_dfs(df, day)
        str_date = df2.index.strftime('%Y-%m-%d %H:%M:%S')
        # Each insert goes to position 0, so the columns end up in reverse
        # order of insertion: hour, minute, second, time-hr-min-sec, date, ...
        df2.insert(loc = 0, column = 'str_datetime', value = str_date)
        datetime_col = df2['str_datetime'].str.split(' ', n = 1, expand = True)
        df2.insert(loc = 0, column = 'date', value = datetime_col[0])
        df2.insert(loc = 0, column = 'time-hr-min-sec', value = datetime_col[1])
        time_col = datetime_col[1].str.split(':', n = 2, expand = True)
        df2.insert(loc = 0, column = 'second', value = time_col[2])
        df2.insert(loc = 0, column = 'minute', value = time_col[1])
        df2.insert(loc = 0, column = 'hour', value = time_col[0])
        df2 = df2.drop(columns = ['str_datetime', 'time'])
        df2 = df2.sort_values(by = ['date', 'hour', 'minute', 'second'])
        df2['home'] = self.home
        df2['sensor'] = self.name
        return df2


    def main(self):
        """Entry point for the cleaning pipeline (body was never written)."""
        # A def without a body is a syntax error; fail loudly at call time
        # instead of preventing the whole cell from being defined.
        raise NotImplementedError('CleanEnvData.main is not implemented yet')
        
        
            

In [10]:
# Inspect the raw readings collected for hub 'RS1' on one day.
# `all_sensor_data` is only filled by the ReadEnv loop in a later cell, so that
# cell must have been run first; on a fresh kernel this line raises NameError.
print(all_sensor_data['RS1']['2019-07-17'])

{'time': ['2019-07-17T00:05:01Z', '2019-07-17T00:05:10Z', '2019-07-17T00:05:20Z', '2019-07-17T00:05:31Z', '2019-07-17T00:05:41Z', '2019-07-17T00:05:50Z', '2019-07-17T00:06:01Z', '2019-07-17T00:06:11Z', '2019-07-17T00:06:21Z', '2019-07-17T00:06:31Z', '2019-07-17T00:06:41Z', '2019-07-17T00:06:51Z', '2019-07-17T00:07:01Z', '2019-07-17T00:07:10Z', '2019-07-17T00:07:21Z', '2019-07-17T00:07:31Z', '2019-07-17T00:07:41Z', '2019-07-17T00:07:51Z', '2019-07-17T00:08:01Z', '2019-07-17T00:08:11Z', '2019-07-17T00:08:21Z', '2019-07-17T00:08:31Z', '2019-07-17T00:08:41Z', '2019-07-17T00:08:50Z', '2019-07-17T00:09:01Z', '2019-07-17T00:09:10Z', '2019-07-17T00:09:21Z', '2019-07-17T00:09:31Z', '2019-07-17T00:09:41Z', '2019-07-17T00:09:51Z', '2019-07-17T00:10:01Z', '2019-07-17T00:10:10Z', '2019-07-17T00:10:21Z', '2019-07-17T00:10:32Z', '2019-07-17T00:10:41Z', '2019-07-17T00:10:50Z', '2019-07-17T00:11:01Z', '2019-07-17T00:11:11Z', '2019-07-17T00:11:21Z', '2019-07-17T00:11:31Z', '2019-07-17T00:11:41Z', '2019-

In [37]:
# Root folder of the home being processed. HomeData parses its last component
# ('H3-red') into the home and system identifiers.
path = '/Users/maggie/Desktop/HPD_mobile_data/HPD_mobile-H3/H3-red'

# Sensor hubs to read; ReadEnv looks for each one as a sub-folder of `path`.
sensors = [
    'RS1',
    'RS2',
    'RS3',
    'RS4',
    'RS5',
]

In [7]:
# Build the occupancy table from the ground-truth logs under `path`.
# main() keeps the resulting DataFrame on `o.df` and writes it to CSV unless the
# output file already exists.
o = HomeOccupancy(path)
o.main()

In [38]:
# Per-hub results, each keyed by the sensor hub name.
all_sensor_data, all_dates_to_use, all_details = {}, {}, {}

for sensor in sensors:
    # main() walks the hub's day folders and prints one summary line per day.
    s = ReadEnv(path, sensor)
    s.main()

    all_sensor_data.update({sensor: s.all_data})
    all_dates_to_use.update({sensor: s.dates_to_use})
    all_details.update({sensor: s.details})

    # Usable days grouped into runs of consecutive dates.
    print(s.list_of_dates)
    

RS1 2019-07-16 (15:40, 23:59) 0.32
RS1 2019-07-17 (00:00, 23:59) 0.93
RS1 2019-07-18 (00:00, 23:59) 0.95
RS1 2019-07-19 (00:00, 23:59) 0.95
RS1 2019-07-20 (00:00, 23:59) 0.94
RS1 2019-07-21 (00:00, 23:59) 0.95
RS1 2019-07-22 (00:00, 23:59) 0.95
RS1 2019-07-23 (00:00, 23:59) 0.94
RS1 2019-07-24 (00:00, 23:59) 0.94
RS1 2019-07-25 (00:00, 23:59) 0.96
RS1 2019-07-26 (00:00, 23:59) 0.96
RS1 2019-07-27 (00:00, 23:59) 0.97
RS1 2019-07-28 (00:00, 23:59) 0.96
RS1 2019-07-29 (00:00, 23:59) 0.97
RS1 2019-07-30 (00:00, 23:59) 0.95
RS1 2019-07-31 (00:00, 23:59) 0.97
RS1 2019-08-01 (00:00, 23:59) 0.97
RS1 2019-08-02 (00:00, 23:59) 0.91
RS1 2019-08-03 (00:00, 23:59) 0.94
RS1 2019-08-04 (00:00, 23:59) 0.88
RS1 2019-08-05 (00:00, 07:29) 0.28
RS1 2019-08-14 (18:50, 23:59) 0.20
RS1 2019-08-15 (00:00, 23:59) 0.91
RS1 2019-08-16 (00:00, 23:59) 0.92
RS1 2019-08-17 (00:00, 23:59) 0.92
RS1 2019-08-18 (00:00, 23:59) 0.96
RS1 2019-08-19 (00:00, 23:59) 0.92
RS1 2019-08-20 (00:00, 23:59) 0.94
RS1 2019-08-21 (00:0

In [None]:
#     def create_full_dfs(self, df, day):
#         day_start, day_end = self.first_last[day][0], self.first_last[day][1]
#         df_fullday = self.make_date_range(day1 = day) ##use this for full 24hours
#         df2 = df.reindex(df_fullday, fill_value = 0) 
#         df2.fillna(np.Nan)
#         return df2         
        
        
#     def make_date_range(self, day1, dayn = None, t1 = '0000', tn = '2359'):
#         self.range_start = str(day1 + ' ' + t1[0:2] + ':' + t1[2:4] + ':00')
#         self.range_end = str(day1 + ' ' + tn[0:2] + ':' + tn[2:4] + ':50')
#         date_range = pd.date_range(start=self.range_start, end=self.range_end, freq='10s')
#         return date_range 
   




            # NOTE(review): orphaned fragment. The enclosing method and the per-day
            # loop that the `continue` below requires are not part of this cell, so
            # the cell cannot run as written. It also uses names that no class in the
            # visible notebook defines: self.data, self.missing_days,
            # absolute_humidity, check_rh, attach_ground_truth, write_occupancy_df,
            # write_data, occupancy and date_folders.
            new_df = pd.DataFrame.from_dict(self.data[day])
            
            # A day without readings is recorded as missing and given a placeholder range.
            if new_df.empty:
                self.missing_days.append(day)
                self.first_last[day] = ('0000', '0000')
                continue
            # Per-day pipeline: each step receives the previous step's DataFrame.
            new_df = self.absolute_humidity(new_df)
            new_df = self.check_rh(new_df, day)
            cleaned_data = self.clean_dates(new_df, day)
            dfwTruth = self.attach_ground_truth(cleaned_data, occupancy)
            self.all_dfs[day] = dfwTruth
            

        
        # Dedented relative to the lines above, so presumably meant to run once
        # after the loop — TODO confirm when the method is restored.
        self.write_occupancy_df(occupancy)
        self.write_data(self.all_dfs, date_folders)

def make_storage_directory(self, root):
    """Create ``<root>/<self.name>/0_complete_csv`` if needed and return its path.

    NOTE(review): defined at module level although it takes ``self``; presumably
    meant to become a method of a class that has a ``name`` attribute — confirm.

    Args:
        self: object providing ``name`` (the sensor hub folder name).
        root: directory under which the storage folder is created.

    Returns:
        The path of the storage directory.
    """
    target_dir = os.path.join(root, self.name, '0_complete_csv')
    # Previously the path was computed and then discarded by a bare `return`;
    # create the folder and hand the path back, like HomeData.make_storage_directory.
    os.makedirs(target_dir, exist_ok=True)
    return target_dir
     



