### Dependencies

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np

2024-03-04 10:50:35.589227: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-04 10:50:35.812311: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 10:50:35.812334: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 10:50:35.813892: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-04 10:50:35.930319: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-04 10:50:35.931319: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

# Preprocessing

## Feature Scaling & File Sorting

In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os
import glob

# input and output directories
root_dir = '/home/joshuavargas/research/datasets/BB-MAS_Dataset/BB-MAS_Dataset/'
output_base_dir = '/home/joshuavargas/research/datasets/BB-MAS_Preprocessed/'

# distinguish between accelerometer and gyroscope data
sensor_types = {'Accelerometer': 'accelerometer', 'Gyroscope': 'gyroscope'}

# folders 1 - 117
for i in range(1, 118):
    folder_path = os.path.join(root_dir, str(i))
    csv_files = glob.glob(os.path.join(folder_path, '*_PocketPhone_*.csv'))
    
    if not csv_files:
        continue
    
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        
        # copy the features that aren't being normalized then normalize to be between -1 and 1
        output_df = df[['time']].copy()
        scaler = MinMaxScaler(feature_range=(-1, 1))
        
        for column in ['Xvalue', 'Yvalue', 'Zvalue']:
            scaled_values = scaler.fit_transform(df[[column]])
            output_df[column] = scaled_values
            
        # convert timestamp to panda datetime -> calculate elapsed times -> drop original timestamp
        output_df['time'] = pd.to_datetime(output_df['time'])
        output_df['elapsed_time'] = output_df['time'].diff().dt.total_seconds().fillna(0)
        output_df.drop(columns=['time'], inplace=True)
        
        # determine sensor type for output directory
        for sensor_key, sensor_folder in sensor_types.items():
            if sensor_key in csv_file:
                sensor_output_dir = os.path.join(output_base_dir, sensor_folder)
                break
        else:
            sensor_output_dir = os.path.join(output_base_dir, 'other')
        
        # make sure folders exist and set output file names to be the input file name
        os.makedirs(sensor_output_dir, exist_ok=True)
        output_filename = os.path.basename(csv_file)
        
        # Write to file in the specific sensor type folder
        output_df.to_csv(os.path.join(sensor_output_dir, output_filename), index=False)


## Resizing 

In [1]:
import pandas as pd
import os
import glob

def resize_files_in_folder(folder_path, sensor_type):
    csv_files = glob.glob(os.path.join(folder_path, '*_PocketPhone_*.csv'))
    num_rows = 1000000  # Start with a large number that will be reduced to the smallest file size

    # First pass: Find the minimum number of rows
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        num_rows = min(num_rows, df.shape[0])

    print(f"Least number of rows in {sensor_type} folder: ", num_rows)

    # Second pass: Resize files
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        resized_df = df.head(num_rows)  # Keep only the top 'num_rows' rows
        resized_df.to_csv(csv_file, index=False)  # Save the resized DataFrame back to the same file

# Paths to accelerometer and gyroscope data folders
folder_path_accelerometer = '/home/joshuavargas/research/datasets/BB-MAS_Preprocessed/accelerometer/'
folder_path_gyroscope = '/home/joshuavargas/research/datasets/BB-MAS_Preprocessed/gyroscope/'

# Process accelerometer and gyroscope data
resize_files_in_folder(folder_path_accelerometer, "accelerometer")
resize_files_in_folder(folder_path_gyroscope, "gyroscope")


Least number of rows in accelerometer folder:  17024
Least number of rows in gyroscope folder:  17023


## Sequencing

NameError: name 'your_label_logic_here' is not defined

In [32]:
import tensorflow as tf
import numpy as np

root_dir = '/home/joshuavargas/research/datasets/BB-MAS_Preprocessed/accelerometer/1_PocketPhone_Accelerometer_(Samsung_S6).csv'
df = pd.read_csv(root_dir)

# Assuming you want to predict Zvalue
features_df = df[['Xvalue', 'Yvalue', 'Zvalue', 'elapsed_time']]
targets_df = df['Zvalue']

# Convert DataFrame and Series to TensorFlow datasets
features_dataset = tf.data.Dataset.from_tensor_slices(features_df.values)
targets_dataset = tf.data.Dataset.from_tensor_slices(targets_df.values)

window_size = 3
batch_size = 5

# Create a dataset of windows
dataset = tf.data.Dataset.zip((features_dataset, targets_dataset))
dataset = dataset.window(window_size, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda f, t: tf.data.Dataset.zip((f.batch(window_size), t.batch(window_size))))
dataset = dataset.map(lambda f, t: (f, t[-1]))
dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

for features, target in dataset.take(1):
    print("Features:\n", features.numpy())
    print("Target:", target.numpy())


Features:
 [[[0.65307536 0.22328091 0.58793919 0.        ]
  [0.64372309 0.27365402 0.56851849 0.011     ]
  [0.6228413  0.32416043 0.54596388 0.009     ]]

 [[0.64372309 0.27365402 0.56851849 0.011     ]
  [0.6228413  0.32416043 0.54596388 0.009     ]
  [0.60794695 0.36380595 0.53452039 0.01      ]]

 [[0.6228413  0.32416043 0.54596388 0.009     ]
  [0.60794695 0.36380595 0.53452039 0.01      ]
  [0.60616556 0.39132461 0.52958212 0.01      ]]

 [[0.60794695 0.36380595 0.53452039 0.01      ]
  [0.60616556 0.39132461 0.52958212 0.01      ]
  [0.61205403 0.40018655 0.51661915 0.01      ]]

 [[0.60616556 0.39132461 0.52958212 0.01      ]
  [0.61205403 0.40018655 0.51661915 0.01      ]
  [0.6303627  0.39012525 0.50650519 0.009     ]]]
Target: [0.54596388 0.53452039 0.52958212 0.51661915 0.50650519]


## Scale Verification 

In [19]:
import matplotlib.pyplot as plt

root_dir   = '/Users/joshuavargas/research/datasets/BB-MAS_Dataset/BB-MAS_Dataset/'
output_dir = '/Users/joshuavargas/research/datasets/BB-MAS_Preprocessed/accelerometer/'

folder_path = os.path.join(root_dir, "1")
csv_files = glob.glob(os.path.join(folder_path, '*_PocketPhone_*.csv'))
scaled_files = glob.glob(os.path.join(output_dir, '1_PocketPhone_*.csv'))

df = pd.read_csv(csv_files[0])
X_train = df['Xvalue']
X_scatter = df[['Xvalue', 'Yvalue']]

df_scaled = pd.read_csv(scaled_files[0])
X_train_scaled = df_scaled['Xvalue']
X_scatter_scaled = df[['Xvalue', 'Yvalue']]

# histograms

# before scaling
plt.figure(figsize=(10, 6))
plt.hist(X_train, bins=50, alpha=0.5, label='Before Scaling')
plt.legend()
plt.show()

# after scaling
plt.figure(figsize=(10, 6))
plt.hist(X_train_scaled, bins=50, alpha=0.5, label='After Scaling')
plt.legend()
plt.show()


# scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(X_scatter['Xvalue'], X_scatter['Yvalue'], alpha=0.5, label='Before Scaling')
plt.scatter(X_scatter_scaled['Xvalue'], X_scatter_scaled['Yvalue'], alpha=0.5, label='After Scaling')
plt.title("Feature Comparison Before and After Scaling")
plt.xlabel("Xvalue")
plt.ylabel("Yvalue")
plt.legend()
plt.show()



NameError: name 'os' is not defined

# LSTM Creation