In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [8]:
# This function goes up N (= levels) directories from the specified path.
# Useful because it allows paths to be valid for both our computers.
# E.g. If you are in your_path = "D:\\Masters_Courses\\ELG5255_Applied_Machine_Learning\\Project_NAPS\\NAPS\\src\\data1\\preprocess.ipynb"
# Then getParent(your_path, 3) = "D:\\Masters_Courses\\ELG5255_Applied_Machine_Learning\\Project_NAPS\\NAPS"
# (starting from the folder "...\\NAPS\\src\\data1" instead, 2 levels give the same result),
# so getParent goes up N folders or directories; each level strips one trailing path component.
#test
def getParent(path, levels = 1):
    """Return the ancestor of `path` that lies `levels` directories above it.

    Every level applies os.path.dirname once, i.e. removes the last
    component of the path. One step is always taken, so any `levels`
    value of 1 or less gives the same result as `levels=1`.

    Args:
        path: File or directory path to start from.
        levels: How many trailing components to strip (default 1).

    Returns:
        The shortened path, as produced by os.path.dirname.
    """
    # The first step is unconditional; further steps depend on `levels`.
    ancestor = os.path.dirname(path)
    remaining = levels
    while remaining > 1:
        ancestor = os.path.dirname(ancestor)
        remaining -= 1
    return ancestor

# Resolve the project root relative to the directory the kernel runs in:
# two levels above the current working directory.
current_working_directory = os.getcwd()
project_directory = getParent(current_working_directory, levels=2)

# Input files are expected under <project_directory>/data/.
data_directory = os.path.join(project_directory, 'data')
file_path_training = os.path.join(data_directory, 'train_series.parquet')
file_path_target   = os.path.join(data_directory, 'train_events.csv')

# Load the training series (Parquet) and the target events (CSV).
training_df = pd.read_parquet(file_path_training)
target_df = pd.read_csv(file_path_target)

# Left join: every training row is kept, and the columns of target_df are
# attached wherever both 'series_id' and 'timestamp' match.
# NOTE(review): a non-key column present in both frames comes back suffixed
# _x/_y after the merge - confirm train_events.csv shares only the join keys.
df = training_df.merge(target_df, how='left', on=['series_id', 'timestamp'])

# Show the merged frame (pandas truncates the printout for large frames).
print(df)

              series_id    step                 timestamp     anglez    enmo
0          038441c925bb       0  2018-08-14T15:30:00-0400   2.636700  0.0217
1          038441c925bb       1  2018-08-14T15:30:05-0400   2.636800  0.0215
2          038441c925bb       2  2018-08-14T15:30:10-0400   2.637000  0.0216
3          038441c925bb       3  2018-08-14T15:30:15-0400   2.636800  0.0213
4          038441c925bb       4  2018-08-14T15:30:20-0400   2.636800  0.0215
...                 ...     ...                       ...        ...     ...
127946335  fe90110788d2  592375  2017-09-08T00:14:35-0400 -27.277500  0.0204
127946336  fe90110788d2  592376  2017-09-08T00:14:40-0400 -27.032499  0.0233
127946337  fe90110788d2  592377  2017-09-08T00:14:45-0400 -26.841200  0.0202
127946338  fe90110788d2  592378  2017-09-08T00:14:50-0400 -26.723900  0.0199
127946339  fe90110788d2  592379  2017-09-08T00:14:55-0400 -31.521601  0.0205

[127946340 rows x 5 columns]


In [7]:
# Distinct series identifiers, in order of first appearance in df.
series_ids = df['series_id'].unique()

# Per-series access: group once instead of filtering the whole frame once
# per id. Iterating yields (series_id, rows_of_that_series) pairs, e.g.
#     for series_id, series_frame in series_groups: ...
series_groups = df.groupby('series_id', sort=False)

# series_data holds the rows of the last id in series_ids. A filter loop over
# every id would end on this same value after one full scan of df per series,
# so it is selected directly with a single scan.
series_data = df[df['series_id'] == series_ids[-1]]

print(series_data)
print(series_ids)

              series_id    step                 timestamp     anglez    enmo
127353960  fe90110788d2       0  2017-08-04T17:30:00-0400 -27.707001  0.0298
127353961  fe90110788d2       1  2017-08-04T17:30:05-0400 -33.867500  0.0488
127353962  fe90110788d2       2  2017-08-04T17:30:10-0400 -15.475000  0.1077
127353963  fe90110788d2       3  2017-08-04T17:30:15-0400 -73.656197  0.0530
127353964  fe90110788d2       4  2017-08-04T17:30:20-0400 -53.152901  0.0601
...                 ...     ...                       ...        ...     ...
127946335  fe90110788d2  592375  2017-09-08T00:14:35-0400 -27.277500  0.0204
127946336  fe90110788d2  592376  2017-09-08T00:14:40-0400 -27.032499  0.0233
127946337  fe90110788d2  592377  2017-09-08T00:14:45-0400 -26.841200  0.0202
127946338  fe90110788d2  592378  2017-09-08T00:14:50-0400 -26.723900  0.0199
127946339  fe90110788d2  592379  2017-09-08T00:14:55-0400 -31.521601  0.0205

[592380 rows x 5 columns]


In [None]:
# Split the merged frame into model inputs and labels.
# NOTE(review): df comes from a left merge, so 'event' is presumably NaN on
# rows without a matching event - confirm before fitting a classifier.
X = df.drop(columns='event')  # every column except the label
y = df.loc[:, 'event']        # label column
