In [None]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import datetime
import pytz

from astral import LocationInfo
from astral.sun import sun

# Notebook-wide pandas settings: turn the chained-assignment check off
# (default='warn') and lift the limit on displayed columns.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', None)

In [None]:
# Lookup from the raw sensor `location_name` to the room label used in this notebook.
# Several raw names share one label (e.g. 'bathroom1' and 'WC1' are both 'Bathroom').
mapping = {
    # rooms
    'bathroom1': 'Bathroom',
    'WC1': 'Bathroom',
    'kitchen': 'Kitchen',
    'hallway': 'Hallway',
    'corridor1': 'Hallway',
    'dining room': 'Dining Room',
    'living room': 'Living Room',
    'lounge': 'Lounge',
    'study': 'Study',
    'office': 'Office',
    'conservatory': 'Conservatory',
    'bedroom1': 'Bedroom',
    # doors
    'main door': 'Front door',
    'front door': 'Front door',
    'back door': 'Back door',
    # everything else
    'cellar': 'Cellar',
    'garage': 'Garage',
    'secondary': 'Secondary',
    'fridge door': 'Fridge Door',
}

In [None]:
# Load the raw light readings from disk.
light = pd.read_parquet('Light.parquet', engine='pyarrow')

# The participant identifier is the first five characters of `patient_id`.
light = light.assign(participant_id=light['patient_id'].apply(lambda pid: pid[:5]))

# Keep only the columns used downstream, in this order.
wanted_columns = ['participant_id','start_date','location_name','value']
light = light[wanted_columns]

# Translate raw sensor locations to room labels; a name missing from `mapping` raises KeyError.
light['location_name'] = light['location_name'].map(lambda raw_name: mapping[raw_name])

# Distinct participants, in order of first appearance.
ids = light['participant_id'].unique()
light.head(3)

In [None]:
def prepare_for_stats( df, location, participant):
    """Build the cleaned light time series for one participant in one room.

    Rows of ``df`` matching ``location`` and ``participant`` are selected. Their
    ``start_date`` is read as UTC, rounded to the nearest minute and converted to
    Europe/London. Rows are then put in chronological order, readings that fall on
    the same minute are collapsed to the first one, and every row gets the time gap
    to the next reading.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``participant_id``, ``location_name``,
        ``start_date`` and ``value``.
    location : str
        Value of ``location_name`` to keep.
    participant : str
        Value of ``participant_id`` to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``participant_id``, ``datetime``, ``location_name``, ``value``,
        ``logvalue``, ``start_date``, ``remove_value?``, ``duration`` with a fresh
        0..n-1 index.  ``duration`` is the number of hours until the next reading
        (NaN on the last row), ``remove_value?`` is ``'yes'`` when that gap is at
        least 24 hours and ``'no'`` otherwise, and ``logvalue`` is
        ``log10(value + 1)``.  When nothing matches, an empty frame with the same
        columns is returned.
    """
    output_columns = ['participant_id','datetime', 'location_name', 'value','logvalue','start_date','remove_value?','duration']

    selected = (df['location_name']==location)&(df['participant_id']==participant)
    df = df[selected].reset_index(drop=True)

    # utc=True reads naive timestamps as UTC and also accepts input that is already
    # tz-aware, which tz_localize would reject.
    timestamps = pd.to_datetime(df['start_date'], utc=True)
    # 'min' is the supported spelling of the minute frequency ('T' is deprecated).
    df['datetime'] = timestamps.dt.round('min').dt.tz_convert('Europe/London')

    # The gap computation below needs chronological order. The sort is stable, so
    # readings sharing a minute keep their input order and keep='first' retains
    # the same reading it would on the unsorted frame.
    df = (df.sort_values('datetime', kind='stable')
            .drop_duplicates(subset=['datetime'], keep='first')
            .reset_index(drop=True))

    # Subtract neighbouring timestamps directly. For minute-rounded data the result
    # in hours is exact, so the >= 24 comparison is not affected by float
    # cancellation, and no reference row is needed (works on an empty selection).
    gap_to_next = df['datetime'].shift(-1) - df['datetime']
    df['duration'] = gap_to_next.dt.total_seconds()/3600

    df['remove_value?'] = 'no'
    df.loc[df['duration']>=24, 'remove_value?'] = 'yes'
    df['logvalue'] = np.log10(df['value']+1)

    return df[output_columns].reset_index(drop=True)


def append_row(df, new_df):
    """Return ``new_df`` stacked underneath ``df``, renumbered with a fresh 0..n-1 index."""
    stacked = pd.concat([df, new_df], ignore_index=True)
    return stacked

In [None]:
def start_light_table():
    """Create the empty table that cleaned light data is accumulated into.

    The column names are the same ones that ``prepare_for_stats`` returns.
    """
    column_names = [
        'participant_id',
        'datetime',
        'location_name',
        'value',
        'logvalue',
        'start_date',
        'remove_value?',
        'duration',
    ]
    return pd.DataFrame(columns=column_names)
    
def update_light_table(home_number, all_df, participant, df):
    """Append one participant's cleaned light data to the running table.

    For each of the rooms Bedroom, Lounge, Kitchen, Hallway and Bathroom, the
    participant's rows are cleaned with ``prepare_for_stats``; rooms without any
    rows are reported with a printed message and skipped.

    Parameters
    ----------
    home_number : int
        Not used by this function; kept so existing calls keep working.
    all_df : pandas.DataFrame
        Light readings for all participants; must contain ``participant_id`` and
        ``location_name`` plus whatever ``prepare_for_stats`` requires.
    participant : str
        Value of ``participant_id`` to process.
    df : pandas.DataFrame
        Table accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the cleaned rows of every room that had data, with a
        fresh 0..n-1 index.  ``df`` itself is returned when no room had data.
    """
    locations =  ['Bedroom','Lounge','Kitchen','Hallway','Bathroom']

    # Select this participant's rows once; the per-room checks and the cleaning
    # step then work on this slice instead of re-scanning all of `all_df` per room.
    participant_rows = all_df[all_df['participant_id']==participant]

    cleaned_rooms = []
    for room in locations:
        if (participant_rows['location_name']==room).any():
            cleaned_rooms.append(prepare_for_stats(participant_rows, room, participant))
        else:
            print('not environmental info at all for ', participant, room)

    if not cleaned_rooms:
        return df

    # A single concat for all rooms avoids re-copying the growing table per room.
    return pd.concat([df, *cleaned_rooms]).reset_index(drop=True)

In [None]:
# Clean every participant's light data and combine the results into `table`.
# Each participant is cleaned into its own small table and everything is joined
# once at the end, so the accumulated result is not re-copied per participant.
participant_tables = []
rows_so_far = 0

for num_participant, participant in enumerate(ids):
    print('current length table:', rows_so_far)
    print('.....................now doing participant: ', num_participant, participant)
    participant_table = update_light_table(num_participant, light, participant, start_light_table())
    participant_tables.append(participant_table)
    rows_so_far += len(participant_table)

# Seeding with the empty table keeps the expected columns even when `ids` is empty.
table = pd.concat([start_light_table(), *participant_tables]).reset_index(drop=True)

# Write the finished table once, after the loop: serialising the whole table with
# BROTLI on every iteration repeats work proportional to everything cleaned so far.
# NOTE(review): no partial file is left on disk if the loop fails midway; if
# checkpoints are wanted, write one file per participant instead.
df = pa.Table.from_pandas(table)  # `df` is a pyarrow Table, not a DataFrame
pq.write_table(df, 'all_participants_light_clean_data_09_FEB_2024.parquet', compression='BROTLI')

table

Unnamed: 0,participant_id,datetime,location_name,value,logvalue,start_date,remove_value?,duration
0,TTTuJ,2021-05-27 21:08:00+01:00,Bedroom,5.0,0.778151,2021-05-27 20:07:52,no,0.250000
1,TTTuJ,2021-05-27 21:23:00+01:00,Bedroom,7.0,0.903090,2021-05-27 20:22:48,no,1.750000
2,TTTuJ,2021-05-27 23:08:00+01:00,Bedroom,289.0,2.462398,2021-05-27 22:07:45,no,0.250000
3,TTTuJ,2021-05-27 23:23:00+01:00,Bedroom,8.0,0.954243,2021-05-27 22:22:42,no,0.250000
4,TTTuJ,2021-05-27 23:38:00+01:00,Bedroom,5.0,0.778151,2021-05-27 22:37:54,no,5.733333
...,...,...,...,...,...,...,...,...
15599364,QbsYB,2023-06-20 21:14:00+01:00,Bathroom,5.0,0.778151,2023-06-20 20:13:32,no,2.066667
15599365,QbsYB,2023-06-20 23:18:00+01:00,Bathroom,454.0,2.658011,2023-06-20 22:17:54,no,0.066667
15599366,QbsYB,2023-06-20 23:22:00+01:00,Bathroom,5.0,0.778151,2023-06-20 22:21:55,no,0.616667
15599367,QbsYB,2023-06-20 23:59:00+01:00,Bathroom,519.0,2.716003,2023-06-20 22:59:17,no,0.100000


In [None]:
# Percentage of rows in `table` that contain at least one missing value.
int(table.isna().any(axis=1).sum())*100/len(table)

In [None]:
# Number of distinct participant ids in `table` (a missing id counts as one value).
table['participant_id'].nunique(dropna=False)

In [None]:
# Show the participant ids taken from the `participant_id` column of `light`.
print(ids)