### Importing Libraries

In [1]:
import pandas as pd
import os
from scipy.io import arff as scipy_arff


### Data Ingestion

In [4]:
def import_raw_data(main_data_directory):
    paths = {
        'phone_accel': os.path.join(main_data_directory, 'raw', 'phone', 'accel'),
        'phone_gyro': os.path.join(main_data_directory, 'raw', 'phone', 'gyro'),
        'watch_accel': os.path.join(main_data_directory, 'raw', 'watch', 'accel'),
        'watch_gyro': os.path.join(main_data_directory, 'raw', 'watch', 'gyro')
    }
    
    data_frames = []
    
    for device_sensor, path in paths.items():
        for filename in os.listdir(path):
            if filename.endswith('.txt'):
                file_path = os.path.join(path, filename)
                try:
                    df = pd.read_csv(file_path, sep=',', header=None, names=['Subject-id', 'Activity Code', 'Timestamp', 'x', 'y', 'z'], engine='python')
                    device, sensor = device_sensor.split('_')
                    df['Device'] = device
                    df['Sensor'] = sensor
                    data_frames.append(df)
                except pd.errors.ParserError as e:
                    print(f"Error parsing file {file_path}: {e}")
    
    combined_df = pd.concat(data_frames, ignore_index=True)
    
    return combined_df


def import_arff_data(main_data_directory):
    paths = {
        'phone_accel': os.path.join(main_data_directory, 'arff_files', 'phone', 'accel'),
        'phone_gyro': os.path.join(main_data_directory, 'arff_files', 'phone', 'gyro'),
        'watch_accel': os.path.join(main_data_directory, 'arff_files', 'watch', 'accel'),
        'watch_gyro': os.path.join(main_data_directory, 'arff_files', 'watch', 'gyro')
    }
    
    data_frames = []
    
    for device_sensor, path in paths.items():
        for filename in os.listdir(path):
            if filename.endswith('.arff'):
                file_path = os.path.join(path, filename)
                try:
                    arff_data, meta = scipy_arff.loadarff(file_path)
                    df = pd.DataFrame(arff_data)
                    # Convert byte strings to string
                    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
                    # Extract device and sensor information
                    device, sensor = device_sensor.split('_')
                    df['Device'] = device
                    df['Sensor'] = sensor
                    data_frames.append(df)
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
    
    if data_frames:
        combined_df = pd.concat(data_frames, ignore_index=True)
        # Convert the first column to categorical
        combined_df.iloc[:, 0] = combined_df.iloc[:, 0].astype('category')
        return combined_df
    else:
        print("No ARFF files found or processed.")
        return None


def load_data():
    main_data_directory = "/Users/subash/Desktop/datasets/archive/wisdm-dataset/wisdm-dataset"
    raw_data_df = import_raw_data(main_data_directory)
    arff_data_df = import_arff_data(main_data_directory)
    return raw_data_df, arff_data_df

In [5]:
raw_df,arff_df = load_data()

In [6]:
raw_df

Unnamed: 0,Subject-id,Activity Code,Timestamp,x,y,z,Device,Sensor
0,1631,A,1553872620859145,-3.231689,0.960129,1.223938;,phone,accel
1,1631,A,1553872671213149,-3.688065,2.486359,2.971283;,phone,accel
2,1631,A,1553872721567153,-2.923523,8.615723,5.365753;,phone,accel
3,1631,A,1553872771921157,0.362640,16.023514,7.0350494;,phone,accel
4,1631,A,1553872822275160,-5.205841,7.684662,6.512863;,phone,accel
...,...,...,...,...,...,...,...,...
15649248,1610,S,18917068249901,-0.508430,0.602261,0.061357632;,watch,gyro
15649249,1610,S,18917118178041,0.014615,0.945276,-0.16980475;,watch,gyro
15649250,1610,S,18917168106181,3.126252,1.933841,-0.83666027;,watch,gyro
15649251,1610,S,18917218034321,3.550228,2.166069,-1.326682;,watch,gyro


In [7]:
arff_df.head()

Unnamed: 0,"""ACTIVITY""","""X0""","""X1""","""X2""","""X3""","""X4""","""X5""","""X6""","""X7""","""X8""",...,"""XYCOS""","""XZCOS""","""YZCOS""","""XYCOR""","""XZCOR""","""YZCOR""","""RESULTANT""","""class""",Device,Sensor
0,A,0.06,0.11,0.215,0.255,0.24,0.07,0.03,0.01,0.005,...,0.751094,-0.005809,-0.376951,0.383184,0.377359,-0.10338,10.7499,1610,phone,accel
1,A,0.05,0.13,0.17,0.22,0.285,0.09,0.02,0.02,0.01,...,0.741898,0.069865,-0.368142,0.361264,0.48803,-0.070615,10.8633,1610,phone,accel
2,A,0.07,0.135,0.165,0.25,0.19,0.13,0.025,0.015,0.01,...,0.68907,0.227904,-0.299957,0.255459,0.568801,-0.134642,10.9722,1610,phone,accel
3,A,0.075,0.145,0.14,0.205,0.275,0.105,0.01,0.025,0.02,...,0.686601,0.244267,-0.280844,0.259784,0.56738,-0.128378,10.9814,1610,phone,accel
4,A,0.085,0.145,0.145,0.175,0.285,0.1,0.03,0.02,0.005,...,0.700471,0.057924,-0.399338,0.311718,0.398679,-0.245695,10.8796,1610,phone,accel


### Pre-processing

In [8]:
arff_df.columns = arff_df.columns.str.strip('"')

In [9]:
raw_df['z'] = raw_df['z'].str.slice(stop=-1)

In [10]:
raw_df.columns

Index(['Subject-id', 'Activity Code', 'Timestamp', 'x', 'y', 'z', 'Device',
       'Sensor'],
      dtype='object')

In [11]:
arff_df.columns

Index(['ACTIVITY', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
       'Y0', 'Y1', 'Y2', 'Y3', 'Y4', 'Y5', 'Y6', 'Y7', 'Y8', 'Y9', 'Z0', 'Z1',
       'Z2', 'Z3', 'Z4', 'Z5', 'Z6', 'Z7', 'Z8', 'Z9', 'XAVG', 'YAVG', 'ZAVG',
       'XPEAK', 'YPEAK', 'ZPEAK', 'XABSOLDEV', 'YABSOLDEV', 'ZABSOLDEV',
       'XSTANDDEV', 'YSTANDDEV', 'ZSTANDDEV', 'XVAR', 'YVAR', 'ZVAR', 'XMFCC0',
       'XMFCC1', 'XMFCC2', 'XMFCC3', 'XMFCC4', 'XMFCC5', 'XMFCC6', 'XMFCC7',
       'XMFCC8', 'XMFCC9', 'XMFCC10', 'XMFCC11', 'XMFCC12', 'YMFCC0', 'YMFCC1',
       'YMFCC2', 'YMFCC3', 'YMFCC4', 'YMFCC5', 'YMFCC6', 'YMFCC7', 'YMFCC8',
       'YMFCC9', 'YMFCC10', 'YMFCC11', 'YMFCC12', 'ZMFCC0', 'ZMFCC1', 'ZMFCC2',
       'ZMFCC3', 'ZMFCC4', 'ZMFCC5', 'ZMFCC6', 'ZMFCC7', 'ZMFCC8', 'ZMFCC9',
       'ZMFCC10', 'ZMFCC11', 'ZMFCC12', 'XYCOS', 'XZCOS', 'YZCOS', 'XYCOR',
       'XZCOR', 'YZCOR', 'RESULTANT', 'class', 'Device', 'Sensor'],
      dtype='object')

In [12]:
arff_df.isna().sum()

ACTIVITY     0
X0           0
X1           0
X2           0
X3           0
            ..
YZCOR        0
RESULTANT    0
class        0
Device       0
Sensor       0
Length: 95, dtype: int64

In [13]:
# Converting timestamp from Unix to readable format
raw_df['Timestamp'] = pd.to_datetime(raw_df['Timestamp'], unit='us')


In [14]:
raw_df

Unnamed: 0,Subject-id,Activity Code,Timestamp,x,y,z,Device,Sensor
0,1631,A,2019-03-29 15:17:00.859145,-3.231689,0.960129,1.223938,phone,accel
1,1631,A,2019-03-29 15:17:51.213149,-3.688065,2.486359,2.971283,phone,accel
2,1631,A,2019-03-29 15:18:41.567153,-2.923523,8.615723,5.365753,phone,accel
3,1631,A,2019-03-29 15:19:31.921157,0.362640,16.023514,7.0350494,phone,accel
4,1631,A,2019-03-29 15:20:22.275160,-5.205841,7.684662,6.512863,phone,accel
...,...,...,...,...,...,...,...,...
15649248,1610,S,1970-08-07 22:44:28.249901,-0.508430,0.602261,0.061357632,watch,gyro
15649249,1610,S,1970-08-07 22:45:18.178041,0.014615,0.945276,-0.16980475,watch,gyro
15649250,1610,S,1970-08-07 22:46:08.106181,3.126252,1.933841,-0.83666027,watch,gyro
15649251,1610,S,1970-08-07 22:46:58.034321,3.550228,2.166069,-1.326682,watch,gyro


In [15]:
raw_df.isna().sum()

Subject-id       0
Activity Code    0
Timestamp        0
x                0
y                0
z                0
Device           0
Sensor           0
dtype: int64

In [16]:
raw_df.to_csv("raw.csv")
arff_df.to_csv("arff.csv")