In [None]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

import datetime
import pytz

# Turn off pandas' chained-assignment check for the whole notebook, so that
# column assignments on filtered frames below do not emit warnings.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Lookup from the raw sensor location name to the room label used in the
# rest of the notebook. Several raw names may share one label.
mapping = {
    'bathroom1': 'Bathroom',
    'WC1': 'Bathroom',
    'kitchen': 'Kitchen',
    'hallway': 'Hallway',
    'corridor1': 'Hallway',
    'dining room': 'Dining Room',
    'living room': 'Living Room',
    'lounge': 'Lounge',
    'study': 'Study',
    'office': 'Office',
    'conservatory': 'Conservatory',
    'bedroom1': 'Bedroom',
    'main door': 'Front door',
    'front door': 'Front door',
    'back door': 'Back door',
    'cellar': 'Cellar',
    'garage': 'Garage',
    'secondary': 'Secondary',
    'fridge door': 'Fridge Door',
}

In [None]:
# Load the raw readings, derive the participant id from the first five
# characters of `patient_id`, keep only the columns used downstream, and
# translate raw location names into room labels. A location name that is
# not a key of `mapping` raises KeyError.
temp = (
    pd.read_parquet('Ambient_Temperature.parquet', engine='pyarrow')
    .assign(participant_id=lambda frame: frame['patient_id'].apply(lambda pid: pid[:5]))
    [['participant_id', 'start_date', 'location_name', 'value']]
    .assign(location_name=lambda frame: frame['location_name'].map(lambda raw_name: mapping[raw_name]))
)

# Distinct participant ids, in order of first appearance.
ids = temp['participant_id'].unique()

In [None]:
def prepare_for_stats(df, location, participant):
    """Select and clean the readings of one participant in one location.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with at least the columns ``participant_id``,
        ``location_name``, ``start_date`` and ``value``. It is not modified.
    location : str
        Value of ``location_name`` to keep.
    participant : str
        Value of ``participant_id`` to keep.

    Returns
    -------
    pandas.DataFrame
        Freshly indexed frame with the columns ``participant_id``,
        ``datetime``, ``location_name``, ``value``, ``start_date``,
        ``remove_value?``, ``duration`` and ``flag``:

        * ``datetime`` is ``start_date`` rounded to the nearest minute and
          expressed in the Europe/London time zone. Naive ``start_date``
          values are interpreted as UTC; tz-aware values are converted.
        * ``flag`` is True for a negative ``value`` whose absolute difference
          to the next reading exceeds 3.
        * ``duration`` is the number of hours until the next retained reading
          (NaN for the last row).
        * ``remove_value?`` is ``'yes'`` when ``duration`` is at least 24 or
          ``flag`` is True, otherwise ``'no'``.

        When no row matches, an empty frame with these columns is returned.
    """
    output_columns = ['participant_id', 'datetime', 'location_name', 'value',
                      'start_date', 'remove_value?', 'duration', 'flag']

    selected = (df['location_name'] == location) & (df['participant_id'] == participant)
    # reset_index returns a new frame, so the caller's data is never touched.
    df = df[selected].reset_index(drop=True)
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    # utc=True localises naive timestamps as UTC and converts aware ones, so
    # both kinds of input are accepted. 'min' replaces the deprecated '1T'.
    df['datetime'] = (
        pd.to_datetime(df['start_date'], utc=True)
        .dt.round('min')
        .dt.tz_convert('Europe/London')
    )

    is_negative = df['value'] < 0
    if is_negative.any():
        print('Found negative value, original data size', len(df))
        # The jump is measured against the next raw reading, i.e. before
        # duplicate timestamps are removed below.
        delta_t = (df['value'].shift(-1) - df['value']).abs()
        df['flag'] = is_negative & (delta_t > 3)
    else:
        df['flag'] = False

    if df.duplicated(['datetime']).any():
        print('Found some duplicate times, removing')
        df = df.drop_duplicates(subset=['datetime'], keep='first').reset_index(drop=True)

    # Hours between each reading and the next one, taken directly from the
    # timestamps so that whole-minute gaps are represented exactly.
    gap = df['datetime'].shift(-1) - df['datetime']
    df['duration'] = gap.dt.total_seconds() / 3600

    df['remove_value?'] = 'no'
    df.loc[(df['duration'] >= 24) | df['flag'], 'remove_value?'] = 'yes'

    return df[output_columns].reset_index(drop=True)


def append_row(df, new_df):
    """Return `df` followed by `new_df`, renumbered with a fresh 0..n-1 index."""
    stacked = pd.concat([df, new_df], ignore_index=True)
    return stacked
    

In [None]:
def start_temp_table():
    """Create the empty accumulator table with the cleaned-readings columns."""
    column_names = [
        'participant_id',
        'datetime',
        'location_name',
        'value',
        'start_date',
        'remove_value?',
        'duration',
        'flag',
    ]
    return pd.DataFrame(columns=column_names)
    
def update_temp_table(home_number, all_df, participant, df):
    """Append one participant's cleaned readings, room by room, to `df`.

    The rooms Bedroom, Lounge, Kitchen, Hallway and Bathroom are visited in
    that order. A room with no rows for the participant in `all_df` is
    reported with a printed message and skipped. `home_number` is accepted
    but not used.

    Returns the accumulated table (a new frame whenever rows were appended).
    """
    belongs_to_participant = all_df['participant_id'] == participant

    for room in ('Bedroom', 'Lounge', 'Kitchen', 'Hallway', 'Bathroom'):
        room_has_data = ((all_df['location_name'] == room) & belongs_to_participant).any()
        if not room_has_data:
            print('not environmental info at all for ', participant, room)
            continue

        cleaned_room = prepare_for_stats(all_df, room, participant)
        df = append_row(df, cleaned_room)

    return df

In [None]:
# Build the combined table of cleaned readings, one participant at a time.
table = start_temp_table()

for num_participant, participant in enumerate(ids):
    print('current length table:', len(table))
    print('.....................now doing participant: ', num_participant, participant)
    table = update_temp_table(num_participant, temp, participant, table)

    # The parquet file is rewritten after every participant, so it always
    # holds everything processed so far.
    # NOTE(review): if this is not meant as a checkpoint, the write could
    # move after the loop.
    df = pa.Table.from_pandas(table)
    pq.write_table(
        df,
        'all_participants_temperature_clean_data_09_FEB_2024.parquet',
        compression='BROTLI',
    )

table

In [None]:
# Number of distinct participant ids in the combined table (a missing id,
# if any, counts as one value).
table['participant_id'].nunique(dropna=False)

In [None]:
# Percentage of rows in the combined table that have at least one missing value.
int(table.isna().any(axis=1).sum()) * 100 / len(table)