In [None]:
import os
import pandas as pd
import numpy as np
from utils import *
import warnings
warnings.filterwarnings('ignore')

LOADING DATA

In [24]:
# Combined accelerometer CSV (one file holding every participant's samples).
accel_file = "data/all_accelerometer_data_pids_13.csv"
accel_data = pd.read_csv(accel_file)

# Path of every file found anywhere below data/, in os.walk order.
extracted_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk("data")
    for name in names
]

# Phone model lookup table, keyed by participant ID further down.
phone_file = "data/phone_types.csv"
phone_data = pd.read_csv(phone_file)

# TAC tables: every CSV whose *full path* contains "clean_tac". The substring
# test is case-sensitive and also matches directory names.
tac_files = [
    path
    for path in extracted_files
    if path.endswith(".csv") and "clean_tac" in path
]
tac_data_list = list(map(pd.read_csv, tac_files))

Extracting the participant IDs (PIDs)

In [25]:
# Map each participant ID to its TAC table. The ID is the part of the file
# name that precedes the first underscore.
extracted_data = {}
for path in extracted_files:
    if not (path.endswith(".csv") and "clean_tac" in path):
        continue
    participant_id = os.path.basename(path).split("_")[0]
    extracted_data[participant_id] = pd.read_csv(path)

print("Extracted PIDs:", list(extracted_data.keys()))

Extracted PIDs: ['DK3500', 'JR8022', 'SA0297', 'BU4707', 'HV0618', 'SF3079', 'MJ8002', 'CC6740', 'PC6771', 'MC7070', 'DC6359', 'BK7610', 'JB3156']


In [26]:
# Epoch milliseconds -> whole seconds (floor division), so accelerometer rows
# can be joined to TAC rows on a shared 'timestamp' column.
m2s = lambda x: x // 1000
accel_data['timestamp'] = m2s(accel_data['time'])

# Rescale each acceleration axis by a constant factor of 40.
# NOTE(review): the meaning of 40 is not visible here - confirm and name it.
# This mutates accel_data in place, so re-running the cell rescales again.
for axis in ('x', 'y', 'z'):
    accel_data[axis] = accel_data[axis].div(40)

# One accelerometer sub-frame per participant listed in phone_data.
unique_pids = np.unique(phone_data['pid'])
filtered_dfs = {}
for pid in unique_pids:
    filtered_dfs[pid] = accel_data.loc[accel_data['pid'] == pid]

# TAC tables keyed by bare file name (e.g. "<pid>_clean_TAC.csv").
dfs = {}
for file_name in extracted_files:
    if "clean_tac" in file_name and file_name.endswith(".csv"):
        dfs[os.path.basename(file_name)] = pd.read_csv(file_name)

# Keep only accelerometer rows whose second-resolution timestamp also
# appears in that participant's TAC table.
merged_dfs = {}
for pid in unique_pids:
    accel_part = filtered_dfs[pid]
    tac_part = dfs[f"{pid}_clean_TAC.csv"]
    merged_dfs[pid] = accel_part.merge(tac_part, on="timestamp", how="inner")

GENERATING SEGMENTS

In [27]:

def generate_tac_segments(sorted_tac_series):
    """Count how many entries carry each distinct TAC value.

    Parameters
    ----------
    sorted_tac_series : pandas.Series
        TAC readings. Missing values are not counted (``value_counts``
        default).

    Returns
    -------
    pandas.Series
        Series named ``'count'`` indexed by the distinct TAC values in
        ascending order; each entry is the number of occurrences.
    """
    occurrences = sorted_tac_series.value_counts()
    occurrences = occurrences.sort_index(ascending=True)
    return pd.Series(data=occurrences.tolist(), index=occurrences.index, name='count')

# Per-participant occurrence count of each TAC value in the merged frames.
tac_segments = {}
for pid, df in merged_dfs.items():
    tac_segments[pid] = generate_tac_segments(df['TAC_Reading'])

In [28]:
# Stack the per-participant TAC tables into one frame. Each table is tagged
# with its participant ID first (file-name prefix before the first "_");
# without it the stacked table cannot be split by participant again, and the
# merge step later in this notebook selects TAC rows with
# tac_data['pid'] == pid. Tables that already carry a 'pid' column are kept
# as they are. tac_files and tac_data_list are parallel lists.
tac_data = pd.concat(
    [
        frame
        if "pid" in frame.columns
        else frame.assign(pid=os.path.basename(path).split("_")[0])
        for path, frame in zip(tac_files, tac_data_list)
    ],
    ignore_index=True,
)

# Accelerometer 'time' is parsed as epoch milliseconds and TAC 'timestamp'
# as epoch seconds.
accel_data["time"] = pd.to_datetime(accel_data["time"], unit="ms")
tac_data["timestamp"] = pd.to_datetime(tac_data["timestamp"], unit="s")

# Attach the phone_data columns to every accelerometer row of that participant.
# NOTE(review): re-running this cell merges phone_data a second time and
# produces suffixed duplicate columns.
accel_data = accel_data.merge(phone_data, on="pid", how="left")

In [29]:
# First export: accelerometer and TAC tables under data/processed_data.
save_path = "data/processed_data"
os.makedirs(save_path, exist_ok=True)
for _frame, _csv_name in (
    (accel_data, "accelerometer_data.csv"),
    (tac_data, "tac_data.csv"),
):
    _frame.to_csv(os.path.join(save_path, _csv_name), index=False)

# Parse the time columns again before the second export.
accel_data["time"] = pd.to_datetime(accel_data["time"])
tac_data["timestamp"] = pd.to_datetime(tac_data["timestamp"])

# Second export: the same two tables under data/merged_data, with
# "_processed" file names.
save_processed_path = "data/merged_data"
os.makedirs(save_processed_path, exist_ok=True)
for _frame, _csv_name in (
    (accel_data, "accelerometer_data_processed.csv"),
    (tac_data, "tac_data_processed.csv"),
):
    _frame.to_csv(os.path.join(save_processed_path, _csv_name), index=False)

In [30]:
# Order every participant's merged frame by TAC level and renumber the rows.
# NOTE(review): the default sort algorithm is not stable, so rows that share
# a TAC value are not guaranteed to keep their previous relative order.
merged_dfs = {
    pid: frame.sort_values(by='TAC_Reading', ignore_index=True)
    for pid, frame in merged_dfs.items()
}

In [31]:
# Number of rows for each distinct TAC value, listed in ascending TAC order.
# The frames in merged_dfs are sorted by TAC_Reading and the segment loop
# slices them positionally, so entry i must be the size of the i-th TAC group.
# Ordering the counts by frequency instead pairs each block size with the
# wrong rows and the wrong TAC value.
tac_reading_counts = {
    pid: pd.Series(
        df['TAC_Reading'].value_counts().sort_index(ascending=True).tolist(),
        name='count',
    )
    for pid, df in merged_dfs.items()
}

In [33]:
# Per participant and per axis, compute permutation-entropy and complexity
# features for the accelerometer samples that belong to each distinct TAC
# value. The result is results[pid][axis]: a DataFrame with one row per
# distinct TAC value (ascending) and the columns 'Pe_results',
# 'Comp_results' and 'TAC_Reading'.
results = {}

for pid, df in merged_dfs.items():
    # Ordinal patterns depend on the order of consecutive samples. Sorting by
    # TAC_Reading alone uses a non-stable algorithm by default and can shuffle
    # rows that share a TAC value, so put the rows back in chronological order
    # first. 'time' is the raw sample time; 'timestamp' (whole seconds, the
    # merge key) is the fallback if 'time' is absent.
    time_col = 'time' if 'time' in df.columns else 'timestamp'
    chronological = df.sort_values(by=time_col, kind='stable')

    pe_results = {'x': [], 'y': [], 'z': []}
    comp_results = {'x': [], 'y': [], 'z': []}
    tac_readings = []

    # groupby yields one group per distinct non-NaN TAC value, in ascending
    # key order, and keeps the incoming row order inside each group. Every
    # segment therefore holds exactly the rows of the TAC value it is labelled
    # with, instead of relying on positional slices driven by a separate list
    # of counts.
    for tac_value, tac_rows in chronological.groupby('TAC_Reading', sort=True):
        tac_readings.append(tac_value)

        for axis in ['x', 'y', 'z']:
            segment = tac_rows[axis].dropna()

            # Fewer than 3 samples: record NaN instead of calling the helpers.
            # NOTE(review): (3, 1) are presumably embedding dimension and
            # delay - confirm against utils.ordinal_patterns.
            if len(segment) < 3:
                pe_results[axis].append(np.nan)
                comp_results[axis].append(np.nan)
            else:
                op = ordinal_patterns(segment.values, 3, 1)
                pe_results[axis].append(p_entropy(op))
                comp_results[axis].append(complexity(op))

    # Rows are already in ascending TAC order with a fresh 0..n-1 index.
    results[pid] = {
        axis: pd.DataFrame({
            'Pe_results': pe_results[axis],
            'Comp_results': comp_results[axis],
            'TAC_Reading': tac_readings
        })
        for axis in ['x', 'y', 'z']
    }


In [None]:
# Add a binary label to every per-axis table: 1 when TAC_Reading is above
# 0.08, otherwise 0 (missing readings also get 0).
# Note: despite the column name, 1 marks readings *above* the threshold.
for axis_frames in results.values():
    for axis in ('x', 'y', 'z'):
        frame = axis_frames[axis]
        frame['Sober_classification'] = frame['TAC_Reading'].apply(
            lambda tac: int(tac > 0.08)
        )


In [None]:
# Write one CSV per participant: the x, y and z tables stacked on top of each
# other, with the axis label stored in an index level named 'Axis'.
output_dir = "data/final_data"
os.makedirs(output_dir, exist_ok=True)

for pid in results:
    axes_data = results[pid]
    # Passing the dict itself uses its keys as the outer index level.
    combined_df = pd.concat(axes_data, names=['Axis'])
    output_path = os.path.join(output_dir, f"{pid}_processed.csv")
    combined_df.to_csv(output_path)

CREATING THE MERGED DATA

In [None]:

# Combine the per-participant feature CSVs from data/final_data into a single
# table, attach a PID and a timestamp column, and write it to merged_data.csv.
folder_path = 'data/final_data'

# TAC table used as the source of timestamps. No cell visible in this notebook
# writes data/merged_data/tac_data.csv (the save cell writes
# tac_data_processed.csv into that directory), so fall back to that file when
# the original path does not exist.
tac_path = 'data/merged_data/tac_data.csv'
if not os.path.exists(tac_path):
    tac_path = 'data/merged_data/tac_data_processed.csv'
tac_data = pd.read_csv(tac_path)

# Rows are selected per participant below, so the table must identify them.
if 'pid' not in tac_data.columns:
    raise KeyError(
        f"{tac_path} has no 'pid' column; the TAC table must record which "
        "participant each reading belongs to"
    )
tac_data['timestamp'] = pd.to_datetime(tac_data['timestamp'])

columns_to_keep = ['Axis', 'Pe_results', 'Comp_results', 'TAC_Reading', 'Sober_classification']
dataframes = []
skipped_pids = []

# sorted() makes the row order of the output independent of directory order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('_processed.csv'):
        continue

    file_path = os.path.join(folder_path, filename)
    pid = filename.split('_')[0]

    df = pd.read_csv(file_path)

    tac_pid_data = tac_data[tac_data['pid'] == pid].reset_index(drop=True)

    # Not enough TAC rows to give every feature row a timestamp: leave this
    # participant out, but remember it so the omission is reported.
    if len(tac_pid_data) < len(df):
        skipped_pids.append(pid)
        continue

    df = df[columns_to_keep].copy()
    # NOTE(review): timestamps are attached purely by row position (the first
    # len(df) TAC rows of this participant). Nothing here links a feature row
    # to the time of the reading it was computed from - confirm this is the
    # intended meaning of 'timestamp'.
    df['timestamp'] = tac_pid_data['timestamp'].iloc[:len(df)].values
    df['PID'] = pid

    dataframes.append(df)

if skipped_pids:
    print("Skipped PIDs with fewer TAC rows than feature rows:", skipped_pids)

# pd.concat raises an opaque ValueError on an empty list; say what went wrong.
if not dataframes:
    raise ValueError(
        f"No usable '*_processed.csv' files found in {folder_path}; nothing to merge"
    )

merged_df = pd.concat(dataframes, ignore_index=True)
merged_df = merged_df[['PID', 'timestamp'] + columns_to_keep]

merged_df.to_csv('merged_data.csv', index=False)
print("Merged data saved to merged_data.csv")


Merged data saved to merged_data.csv
