<a href="https://colab.research.google.com/github/mrushad/ml4qs-G39/blob/main/notebooks/mlqs_lydia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os
import pandas as pd
import numpy as np

# Cloning the GitHub repo to access the data

In [None]:
!git clone https://github.com/mrushad/ml4qs-G39

fatal: destination path 'ml4qs-G39' already exists and is not an empty directory.


# Functions

## read_csv_files()

In [None]:
def read_csv_files(directory=None):
    """Load the per-sensor CSV exports of one recording into DataFrames.

    Args:
        directory: Folder that holds the sensor CSV files. ``None`` (the
            default) reads from the current working directory, which is the
            original behaviour of this function.

    Returns:
        dict mapping ``'<file name without .csv>_df'`` (for example
        ``'Accelerometer_df'`` or ``'Linear Accelerometer_df'``) to the
        DataFrame read from that file. Files that are missing or cannot be
        parsed are reported on stdout and left out of the dict.
    """
    csv_files = [
        'Accelerometer.csv',
        'Linear Accelerometer.csv',
        'Barometer.csv',
        'Location.csv',
        'Gyroscope.csv',
        'Magnetometer.csv',
        'Proximity.csv'
    ]

    # Dictionary to store DataFrames
    dfs = {}

    for filename in csv_files:
        # Create a clean DataFrame name (e.g., 'Accelerometer_df')
        df_name = filename.replace('.csv', '') + '_df'
        path = filename if directory is None else os.path.join(directory, filename)

        # Only the read itself is guarded; a failure skips this file and
        # loading continues with the remaining sensors (best effort).
        try:
            df = pd.read_csv(path)
        except FileNotFoundError:
            if directory is None:
                print(f"Error: '{filename}' not found in the current directory ({os.getcwd()}).")
            else:
                print(f"Error: '{filename}' not found in '{directory}'.")
            continue
        except Exception as e:
            print(f"Error loading '{filename}': {e}")
            continue

        dfs[df_name] = df
        print(f"Loaded '{filename}' into DataFrame '{df_name}'. Shape: {df.shape}")

    # Quick visual check of two of the sensors, when they were loaded.
    if 'Accelerometer_df' in dfs:
        print("\nAccelerometer_df head:")
        print(dfs['Accelerometer_df'].head())

    if 'Gyroscope_df' in dfs:
        print("\nGyroscope_df head:")
        print(dfs['Gyroscope_df'].head())

    # Summary of everything that was loaded.
    print("\n--- All loaded DataFrames ---")
    for name, df in dfs.items():
        print(f"DataFrame: {name}, Columns: {df.columns.tolist()}, Shape: {df.shape}")
    return dfs

## combine_dfs()

In [None]:
def combine_dfs(sensor_dfs):
    """Merge per-sensor DataFrames into one wide frame keyed on 'Time (s)'.

    Each input frame is copied, its 'Time (s)' column is rounded to three
    decimals, and every other column is renamed to
    ``'<sensor_name>_<column>'`` with spaces turned into underscores and
    parentheses removed. The frames are then outer-merged on the rounded
    'Time (s)' value, so a row holds NaN for every sensor that has no sample
    at that timestamp.

    Args:
        sensor_dfs: dict mapping a sensor name (used as the column prefix) to
            a DataFrame that contains a 'Time (s)' column.

    Returns:
        The merged DataFrame. When ``sensor_dfs`` is empty, an empty frame
        with only a 'Time (s)' column is returned (previously this case
        raised ``UnboundLocalError`` at the ``return``).
    """
    processed_dfs = []

    for sensor_name, df in sensor_dfs.items():
        # Work on a copy so the caller's per-sensor frames stay unmodified.
        df_copy = df.copy()

        # Round the merge key so the frames are joined on the rounded timestamp.
        df_copy['Time (s)'] = df_copy['Time (s)'].round(3)

        # Prefix every non-key column with the sensor name; spaces become
        # underscores and the parentheses around the unit are removed.
        new_columns = {
            col: f"{sensor_name}_{col.replace(' ', '_').replace('(', '').replace(')', '')}"
            for col in df_copy.columns
            if col != 'Time (s)'
        }
        processed_dfs.append(df_copy.rename(columns=new_columns))

    if not processed_dfs:
        print("No DataFrames to combine.")
        return pd.DataFrame(columns=['Time (s)'])

    # Fold the frames together one at a time with an outer join on the key.
    combined_df = processed_dfs[0]
    for other_df in processed_dfs[1:]:
        combined_df = pd.merge(combined_df, other_df, on='Time (s)', how='outer')

    # Display the combined DataFrame information
    print("Combined DataFrame Head:")
    print(combined_df.head())
    print("\nCombined DataFrame Info:")
    combined_df.info()
    return combined_df

# CSV to DF

## Biking Data

In [None]:
# Move into the cloned repository. The path is relative, so this only works
# while the current directory is the one that contains 'ml4qs-G39'; running
# the cell a second time will therefore fail.
os.chdir('ml4qs-G39')
# List the repository's top-level contents.
!ls -F

data/  LICENSE	notebooks/  README.md  reports/  requirements.txt  src/


In [None]:
# Change into data/raw, relative to the current directory.
%cd data/raw

/content/ml4qs-G39/data/raw


In [None]:
# List the contents of the current directory.
!ls -F

'BikingData1 2025-06-06 10-48-07'/  'WalkingData1 2025-06-06 15-30-08'/


In [None]:
# Enter the biking recording's folder; read_csv_files() reads its CSV files
# from the current working directory.
%cd BikingData1 2025-06-06 10-48-07

/content/ml4qs-G39/data/raw/BikingData1 2025-06-06 10-48-07


In [None]:
# List the files in the current folder.
!ls -F

 Accelerometer.csv  'Linear Accelerometer.csv'	 meta/
 Barometer.csv	     Location.csv		 ml4qs-G39/
 Gyroscope.csv	     Magnetometer.csv		 Proximity.csv


In [None]:
# Load every biking sensor CSV from the current directory.
biking_dfs = read_csv_files()

# Per-sensor handles, named consistently with the walking_* variables.
biking_accelerometer_df = biking_dfs['Accelerometer_df']
biking_linear_accelerometer_df = biking_dfs['Linear Accelerometer_df']
biking_barometer_df = biking_dfs['Barometer_df']
biking_location_df = biking_dfs['Location_df']
biking_gyroscope_df = biking_dfs['Gyroscope_df']
biking_magnetometer_df = biking_dfs['Magnetometer_df']
biking_proximity_df = biking_dfs['Proximity_df']

# Aliases for the original misspelled names ("bkining_"), kept so that any
# cell still referring to them continues to work.
bkining_accelerometer_df = biking_accelerometer_df
bkining_linear_accelerometer_df = biking_linear_accelerometer_df
bkining_barometer_df = biking_barometer_df
bkining_location_df = biking_location_df
bkining_gyroscope_df = biking_gyroscope_df
bkining_magnetometer_df = biking_magnetometer_df
bkining_proximity_df = biking_proximity_df

Loaded 'Accelerometer.csv' into DataFrame 'Accelerometer_df'. Shape: (90423, 4)
Loaded 'Linear Accelerometer.csv' into DataFrame 'Linear Accelerometer_df'. Shape: (90423, 4)
Loaded 'Barometer.csv' into DataFrame 'Barometer_df'. Shape: (847, 2)
Loaded 'Location.csv' into DataFrame 'Location_df'. Shape: (899, 8)
Loaded 'Gyroscope.csv' into DataFrame 'Gyroscope_df'. Shape: (90423, 4)
Loaded 'Magnetometer.csv' into DataFrame 'Magnetometer_df'. Shape: (90423, 4)
Loaded 'Proximity.csv' into DataFrame 'Proximity_df'. Shape: (1, 2)

Accelerometer_df head:
   Time (s)  X (m/s^2)  Y (m/s^2)  Z (m/s^2)
0  0.002582  -2.956801   3.204536  12.733271
1  0.012535  -1.568887   2.797832  11.682157
2  0.022488  -0.466879   3.101850  10.308014
3  0.032441  -1.001867   3.238216   8.759784
4  0.042394  -1.313219   2.810705   8.288564

Gyroscope_df head:
   Time (s)  X (rad/s)  Y (rad/s)  Z (rad/s)
0  0.000094  -0.010970   0.069035   0.648594
1  0.010047   0.057596   0.260039   0.495670
2  0.020000  -0.00142

In [None]:
# Merge the biking sensor frames into one frame keyed on 'Time (s)', then
# preview the first rows.
biking_df = combine_dfs(biking_dfs)
biking_df.head()

Combined DataFrame Head:
   Time (s)  Accelerometer_df_X_m/s^2  Accelerometer_df_Y_m/s^2  \
0    -0.535                       NaN                       NaN   
1     0.000                       NaN                       NaN   
2     0.003                 -2.956801                  3.204536   
3     0.010                       NaN                       NaN   
4     0.013                 -1.568887                  2.797832   

   Accelerometer_df_Z_m/s^2  Linear Accelerometer_df_X_m/s^2  \
0                       NaN                              NaN   
1                       NaN                        -2.192563   
2                 12.733271                              NaN   
3                       NaN                        -0.712324   
4                 11.682157                              NaN   

   Linear Accelerometer_df_Y_m/s^2  Linear Accelerometer_df_Z_m/s^2  \
0                              NaN                              NaN   
1                        -0.171700           

Unnamed: 0,Time (s),Accelerometer_df_X_m/s^2,Accelerometer_df_Y_m/s^2,Accelerometer_df_Z_m/s^2,Linear Accelerometer_df_X_m/s^2,Linear Accelerometer_df_Y_m/s^2,Linear Accelerometer_df_Z_m/s^2,Barometer_df_X_hPa,Location_df_Latitude_°,Location_df_Longitude_°,...,Location_df_Direction_°,Location_df_Horizontal_Accuracy_m,Location_df_Vertical_Accuracy_°,Gyroscope_df_X_rad/s,Gyroscope_df_Y_rad/s,Gyroscope_df_Z_rad/s,Magnetometer_df_X_µT,Magnetometer_df_Y_µT,Magnetometer_df_Z_µT,Proximity_df_Distance_cm
0,-0.535,,,,,,,1008.348923,,,...,,,,,,,,,,
1,0.0,,,,-2.192563,-0.1717,4.255854,,,,...,,,,-0.01097,0.069035,0.648594,-21.498955,-16.158371,-37.33429,
2,0.003,-2.956801,3.204536,12.733271,,,,,,,...,,,,,,,,,,5.0
3,0.01,,,,-0.712324,-0.519467,2.735299,,,,...,,,,0.057596,0.260039,0.49567,-21.380814,-16.132431,-36.897217,
4,0.013,-1.568887,2.797832,11.682157,,,,,,,...,,,,,,,,,,


## Walking Data

In [None]:
# Go up one directory level from the current folder, then list what is there.
# NOTE(review): the saved output shows the repository root rather than
# data/raw, so the stored outputs may come from an out-of-order run —
# confirm with Restart & Run All.
%cd ../
!ls -F

/content/ml4qs-G39
data/  LICENSE	notebooks/  README.md  reports/  requirements.txt  src/


In [None]:
# Enter the walking recording's folder (relative to the current directory);
# read_csv_files() reads its CSV files from the current working directory.
%cd WalkingData1 2025-06-06 15-30-08

/content/ml4qs-G39/data/raw/WalkingData1 2025-06-06 15-30-08


In [None]:
# List the files in the current folder.
!ls -F

 Accelerometer.csv  'Linear Accelerometer.csv'	 meta/
 Barometer.csv	     Location.csv		 Proximity.csv
 Gyroscope.csv	     Magnetometer.csv


In [None]:
# Load every walking sensor CSV from the current directory and keep a
# separately named handle for each sensor's frame.
# NOTE(review): the saved output of this cell is identical to the biking
# cell's (same shapes and head values) — confirm the working directory really
# was the walking folder when this ran.
walking_dfs = read_csv_files()
walking_accelerometer_df = walking_dfs['Accelerometer_df']
walking_linear_accelerometer_df = walking_dfs['Linear Accelerometer_df']
walking_barometer_df = walking_dfs['Barometer_df']
walking_location_df = walking_dfs['Location_df']
walking_gyroscope_df = walking_dfs['Gyroscope_df']
walking_magnetometer_df = walking_dfs['Magnetometer_df']
walking_proximity_df = walking_dfs['Proximity_df']

Loaded 'Accelerometer.csv' into DataFrame 'Accelerometer_df'. Shape: (90423, 4)
Loaded 'Linear Accelerometer.csv' into DataFrame 'Linear Accelerometer_df'. Shape: (90423, 4)
Loaded 'Barometer.csv' into DataFrame 'Barometer_df'. Shape: (847, 2)
Loaded 'Location.csv' into DataFrame 'Location_df'. Shape: (899, 8)
Loaded 'Gyroscope.csv' into DataFrame 'Gyroscope_df'. Shape: (90423, 4)
Loaded 'Magnetometer.csv' into DataFrame 'Magnetometer_df'. Shape: (90423, 4)
Loaded 'Proximity.csv' into DataFrame 'Proximity_df'. Shape: (1, 2)

Accelerometer_df head:
   Time (s)  X (m/s^2)  Y (m/s^2)  Z (m/s^2)
0  0.002582  -2.956801   3.204536  12.733271
1  0.012535  -1.568887   2.797832  11.682157
2  0.022488  -0.466879   3.101850  10.308014
3  0.032441  -1.001867   3.238216   8.759784
4  0.042394  -1.313219   2.810705   8.288564

Gyroscope_df head:
   Time (s)  X (rad/s)  Y (rad/s)  Z (rad/s)
0  0.000094  -0.010970   0.069035   0.648594
1  0.010047   0.057596   0.260039   0.495670
2  0.020000  -0.00142

In [None]:
# Merge the walking sensor frames into one frame keyed on 'Time (s)', then
# preview the first rows.
walking_df = combine_dfs(walking_dfs)
walking_df.head()

Combined DataFrame Head:
   Time (s)  Accelerometer_df_X_m/s^2  Accelerometer_df_Y_m/s^2  \
0    -0.535                       NaN                       NaN   
1     0.000                       NaN                       NaN   
2     0.003                 -2.956801                  3.204536   
3     0.010                       NaN                       NaN   
4     0.013                 -1.568887                  2.797832   

   Accelerometer_df_Z_m/s^2  Linear Accelerometer_df_X_m/s^2  \
0                       NaN                              NaN   
1                       NaN                        -2.192563   
2                 12.733271                              NaN   
3                       NaN                        -0.712324   
4                 11.682157                              NaN   

   Linear Accelerometer_df_Y_m/s^2  Linear Accelerometer_df_Z_m/s^2  \
0                              NaN                              NaN   
1                        -0.171700           

Unnamed: 0,Time (s),Accelerometer_df_X_m/s^2,Accelerometer_df_Y_m/s^2,Accelerometer_df_Z_m/s^2,Linear Accelerometer_df_X_m/s^2,Linear Accelerometer_df_Y_m/s^2,Linear Accelerometer_df_Z_m/s^2,Barometer_df_X_hPa,Location_df_Latitude_°,Location_df_Longitude_°,...,Location_df_Direction_°,Location_df_Horizontal_Accuracy_m,Location_df_Vertical_Accuracy_°,Gyroscope_df_X_rad/s,Gyroscope_df_Y_rad/s,Gyroscope_df_Z_rad/s,Magnetometer_df_X_µT,Magnetometer_df_Y_µT,Magnetometer_df_Z_µT,Proximity_df_Distance_cm
0,-0.535,,,,,,,1008.348923,,,...,,,,,,,,,,
1,0.0,,,,-2.192563,-0.1717,4.255854,,,,...,,,,-0.01097,0.069035,0.648594,-21.498955,-16.158371,-37.33429,
2,0.003,-2.956801,3.204536,12.733271,,,,,,,...,,,,,,,,,,5.0
3,0.01,,,,-0.712324,-0.519467,2.735299,,,,...,,,,0.057596,0.260039,0.49567,-21.380814,-16.132431,-36.897217,
4,0.013,-1.568887,2.797832,11.682157,,,,,,,...,,,,,,,,,,


# Cleaning

## Filling Nulls using interpolation

In [None]:
# Count the missing values in each column of the combined biking frame.
biking_df.isnull().sum()

Unnamed: 0,0
Time (s),0
Accelerometer_df_X_m/s^2,91829
Accelerometer_df_Y_m/s^2,91829
Accelerometer_df_Z_m/s^2,91829
Linear Accelerometer_df_X_m/s^2,91829
Linear Accelerometer_df_Y_m/s^2,91829
Linear Accelerometer_df_Z_m/s^2,91829
Barometer_df_X_hPa,181405
Location_df_Latitude_°,181353
Location_df_Longitude_°,181353


In [None]:
# Count the missing values in each column of the combined walking frame.
walking_df.isnull().sum()

Unnamed: 0,0
Time (s),0
Accelerometer_df_X_m/s^2,91829
Accelerometer_df_Y_m/s^2,91829
Accelerometer_df_Z_m/s^2,91829
Linear Accelerometer_df_X_m/s^2,91829
Linear Accelerometer_df_Y_m/s^2,91829
Linear Accelerometer_df_Z_m/s^2,91829
Barometer_df_X_hPa,181405
Location_df_Latitude_°,181353
Location_df_Longitude_°,181353


In [None]:
# Preview linear interpolation on one real column without modifying biking_df.
# The original cell indexed biking_df['x'], which is not a column of the
# combined frame and therefore raised KeyError.
biking_df['Accelerometer_df_X_m/s^2'].interpolate(method='linear').head()

In [None]:
# prompt: biking_df['x'].interpolate(method='linear', inplace=True) i want this for every column of bking df and then the same for walking df

# Linearly interpolate every column that has missing values, in both frames.
# The result is assigned back to the column instead of calling
# `df[col].interpolate(..., inplace=True)`: that form is chained assignment,
# emits a FutureWarning, and stops updating the frame in pandas 3.0.
# Some NaNs remain afterwards (see the printed counts): with default settings
# `interpolate` does not fill values that precede a column's first valid sample.
for frame in (biking_df, walking_df):
    for col in frame.columns[frame.isnull().any()]:
        frame[col] = frame[col].interpolate(method='linear')

print("Interpolation complete for biking_df:")
print(biking_df.isnull().sum())

print("\nInterpolation complete for walking_df:")
print(walking_df.isnull().sum())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  biking_df[col].interpolate(method='linear', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  walking_df[col].interpolate(method='linear', inplace=True)


Interpolation complete for biking_df:
Time (s)                               0
Accelerometer_df_X_m/s^2               2
Accelerometer_df_Y_m/s^2               2
Accelerometer_df_Z_m/s^2               2
Linear Accelerometer_df_X_m/s^2        1
Linear Accelerometer_df_Y_m/s^2        1
Linear Accelerometer_df_Z_m/s^2        1
Barometer_df_X_hPa                     0
Location_df_Latitude_°               116
Location_df_Longitude_°              116
Location_df_Height_m                 116
Location_df_Velocity_m/s             464
Location_df_Direction_°              871
Location_df_Horizontal_Accuracy_m    116
Location_df_Vertical_Accuracy_°      116
Gyroscope_df_X_rad/s                   1
Gyroscope_df_Y_rad/s                   1
Gyroscope_df_Z_rad/s                   1
Magnetometer_df_X_µT                   1
Magnetometer_df_Y_µT                   1
Magnetometer_df_Z_µT                   1
Proximity_df_Distance_cm               2
dtype: int64

Interpolation complete for walking_df:
Time (s