# Feature Selection
## Reducing the number of features used, in order to optimise a machine learning human activity recognition model
#### Natasha Qayyum - 2021

MEx is a multimodal dataset containing data for 7 different physiotherapy exercises performed by 30 subjects, recorded by four sensor modalities. This notebook utilises the data recorded by one of these modalities, the Sensing Tex Pressure Mat (sampling frequency 15Hz, frame size 32 * 16), which is both rich and sparse.

The aim is to intelligently select a subset of the 512 features, to train a machine learning (ML) model whilst demonstrating a beneficial application of Feature Selection in the pre-processing steps. The ML model seeks to predict which physiotherapy exercise a subject is performing on the pressure mat, according to sensor readings. 

Benefits of feature selection include:
- Reduced training times
- Reduced computational requirements
- Remove irrelevant features, for example those pressure points which have never picked up a signal
- Reduce overfitting, separating the signal from the noise

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from timeit import default_timer as timer
import seaborn as sns

Unable to revert mtime: /Library/Fonts


In [2]:
#headers for the additional (non pressure point) columns
#a = actual, r = relative, p = pressure
colNames_test = ["Subject", "Exercise", "a_Time", "r_Time", "Total_p"]

#naming the 512 pressure point columns "<x>_<y>" (x = 1..32, y = 1..16),
#i.e. X and Y co-ordinates that display their position on the mat
coords_test = ["{}_{}".format(row, col) for row in range(1, 33) for col in range(1, 17)]

#the co-ordinate headers follow the additional headers
colNames_test.extend(coords_test)

#an empty dataframe, only used to visualise the desired structure of the data
pm_df_empty = pd.DataFrame(columns = colNames_test)

pm_df_empty

Unnamed: 0,Subject,Exercise,a_Time,r_Time,Total_p,1_1,1_2,1_3,1_4,1_5,...,32_7,32_8,32_9,32_10,32_11,32_12,32_13,32_14,32_15,32_16


In [19]:
#set to True to print the path of every .csv file as it is read
debug = False

#defining the root directory of the pressure mat data
pmRoot = "data/"

#choosing headers for desired additional columns
colNames = ["Subject", "Exercise", "a_Time", "r_Time", "Total_p"]
#naming the 512 pressure point columns "<x>_<y>" (x = 1..32, y = 1..16),
#i.e. X and Y co-ordinates that display their position on the mat
coords = ["{}_{}".format(x, y) for x in range(1, 33) for y in range(1, 17)]
colNames.extend(coords)


def _parse_timestamp(timeString):
    """Convert one timestamp string from the first .csv column into a datetime.

    A string shorter than 20 characters gets a "." appended, and the string is
    then right-padded with "0" up to 26 characters, so that every value can be
    parsed with the single format "%Y-%m-%d %H:%M:%S.%f".

    Raises ValueError (from datetime.strptime) if the padded string still does
    not match that format.
    """
    if len(timeString) < 20:
        timeString = timeString + "."
    return datetime.strptime(timeString.ljust(26, "0"), "%Y-%m-%d %H:%M:%S.%f")


#one dataframe per (subject, exercise) file; they are joined once after the loop.
#Growing the master dataframe with DataFrame.append inside the loop copied every
#previously loaded row again for each file, and DataFrame.append no longer
#exists in pandas 2.0.
pm_frames = []

#timing information - start timer
start = timer()

#for subjects 1 to 30
for i in range(1, 31):

    #for exercises 1 to 7
    for j in range(1, 8):

        #formatting the numbers to match file system (zero padded to 2 digits)
        folder = "{:02d}/".format(i)
        file = "{:02d}_pm_1.csv".format(j)

        #read in the .csv file
        if debug: print(pmRoot + folder + file)
        #NOTE(review): read_csv treats the first line of the file as a header row;
        #if these files have no header line the first frame of every recording is
        #lost here - confirm against the data files
        pm_df_in = pd.read_csv(pmRoot + folder + file)

        #every column after the first (the timestamp) is a pressure point;
        #relabel them by position with the co-ordinate headers
        pm_temp_df = pm_df_in.iloc[:, 1:].copy()
        pm_temp_df.columns = coords

        #calculate the total pressure for each time increment
        pm_temp_totals = pm_temp_df.sum(axis = 1)

        #absolute time: the timestamp strings parsed into datetime objects
        #(dtype = object keeps them as plain datetime objects in the column)
        pm_temp_a_Time = [_parse_timestamp(t) for t in pm_df_in.iloc[:, 0]]

        #relative time: elapsed time since the first row of this file
        pm_temp_r_Time = [t - pm_temp_a_Time[0] for t in pm_temp_a_Time]

        #put the label and time columns in front of the pressure points,
        #in the same order as colNames
        pm_temp_df.insert(0, "Subject", i)
        pm_temp_df.insert(1, "Exercise", j)
        pm_temp_df.insert(2, "a_Time", pd.Series(pm_temp_a_Time, index = pm_temp_df.index, dtype = object))
        pm_temp_df.insert(3, "r_Time", pm_temp_r_Time)
        pm_temp_df.insert(4, "Total_p", pm_temp_totals)

        pm_frames.append(pm_temp_df)

#creating our master dataframe to store all the data
#(each file keeps its own 0..n-1 row index, the index is not reset)
pm_df = pd.concat(pm_frames)

#timing information - end timer
end = timer()
dt = end - start
print("Importing time elapsed: {:.2f}".format(dt))

pm_df

data/01/01_pm_1.csv
data/01/02_pm_1.csv
data/01/03_pm_1.csv
data/01/04_pm_1.csv
data/01/05_pm_1.csv
data/01/06_pm_1.csv
data/01/07_pm_1.csv
data/02/01_pm_1.csv
data/02/02_pm_1.csv
data/02/03_pm_1.csv
data/02/04_pm_1.csv
data/02/05_pm_1.csv
data/02/06_pm_1.csv
data/02/07_pm_1.csv
data/03/01_pm_1.csv
data/03/02_pm_1.csv
data/03/03_pm_1.csv
data/03/04_pm_1.csv
data/03/05_pm_1.csv
data/03/06_pm_1.csv
data/03/07_pm_1.csv
data/04/01_pm_1.csv
data/04/02_pm_1.csv
data/04/03_pm_1.csv
data/04/04_pm_1.csv
data/04/05_pm_1.csv
data/04/06_pm_1.csv
data/04/07_pm_1.csv
data/05/01_pm_1.csv
data/05/02_pm_1.csv
data/05/03_pm_1.csv
data/05/04_pm_1.csv
data/05/05_pm_1.csv
data/05/06_pm_1.csv
data/05/07_pm_1.csv
data/06/01_pm_1.csv
data/06/02_pm_1.csv
data/06/03_pm_1.csv
data/06/04_pm_1.csv
data/06/05_pm_1.csv
data/06/06_pm_1.csv
data/06/07_pm_1.csv
data/07/01_pm_1.csv
data/07/02_pm_1.csv
data/07/03_pm_1.csv
data/07/04_pm_1.csv
data/07/05_pm_1.csv
data/07/06_pm_1.csv
data/07/07_pm_1.csv
data/08/01_pm_1.csv


Unnamed: 0,Subject,Exercise,a_Time,r_Time,Total_p,1_1,1_2,1_3,1_4,1_5,...,32_7,32_8,32_9,32_10,32_11,32_12,32_13,32_14,32_15,32_16
0,1,1,2018-11-08 11:34:51.535000,0 days 00:00:00,27375.0,20.0,3.0,2.0,0.0,0.0,...,104.0,93.0,58.0,78.0,53.0,192.0,388.0,330.0,123.0,6.0
1,1,1,2018-11-08 11:34:51.602000,0 days 00:00:00.067000,27466.0,20.0,3.0,2.0,0.0,0.0,...,106.0,93.0,64.0,78.0,53.0,195.0,390.0,330.0,119.0,7.0
2,1,1,2018-11-08 11:34:51.669000,0 days 00:00:00.134000,27423.0,20.0,3.0,2.0,0.0,0.0,...,108.0,94.0,66.0,79.0,55.0,196.0,391.0,324.0,106.0,5.0
3,1,1,2018-11-08 11:34:51.737000,0 days 00:00:00.202000,27651.0,20.0,3.0,2.0,0.0,0.0,...,125.0,94.0,64.0,79.0,55.0,194.0,391.0,321.0,114.0,6.0
4,1,1,2018-11-08 11:34:51.804000,0 days 00:00:00.269000,27416.0,20.0,3.0,2.0,0.0,0.0,...,129.0,95.0,64.0,79.0,55.0,194.0,381.0,321.0,114.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
922,30,7,2019-03-26 16:45:51.454000,0 days 00:01:01.976000,578.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
923,30,7,2019-03-26 16:45:51.522000,0 days 00:01:02.044000,633.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
924,30,7,2019-03-26 16:45:51.589000,0 days 00:01:02.111000,696.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
925,30,7,2019-03-26 16:45:51.656000,0 days 00:01:02.178000,704.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
