In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import time

In [2]:
path = "/Users/niharawarawita/Desktop/MSc Project/Data/EMG_data_collection/combined_stats_nihara.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,participant_id,clothes_id,property_id,property_name,interaction_id,rating,sub_window_num,subwindow_start_time,subwindow_end_time,max_ch1_Hand0,...,std_z_Hand1,std_AVx_Hand1,std_AVy_Hand1,std_AVz_Hand1,std_AAx_Hand1,std_AAy_Hand1,std_AAz_Hand1,std_AJx_Hand1,std_AJy_Hand1,std_AJz_Hand1
0,19,3,16,softness,119,6,1,2022-02-13 11:24:11.302,2022-02-13 11:24:17.236,0.293434,...,0.034045,0.223631,0.400869,0.198252,5.807455,10.633557,4.88632,434.490981,853.585591,445.739808
1,19,3,16,softness,119,6,2,2022-02-13 11:24:17.266,2022-02-13 11:24:22.230,0.122884,...,0.034718,0.102369,0.202003,0.215416,2.62433,4.994368,4.763141,163.059962,293.701684,359.806687
2,19,3,16,softness,119,6,3,2022-02-13 11:24:22.250,2022-02-13 11:24:27.250,0.15943,...,0.045145,0.190866,0.258466,0.341591,4.035735,7.234644,6.105591,307.386108,462.341199,430.277295
3,19,3,15,flexibility,120,5,1,2022-02-13 11:27:08.930,2022-02-13 11:27:15.106,0.815143,...,0.042501,0.30734,0.676318,0.321149,8.205791,18.275548,7.814122,633.186926,1362.201276,664.974739
4,19,3,15,flexibility,120,5,2,2022-02-13 11:27:15.175,2022-02-13 11:27:20.095,0.597988,...,0.038397,0.308847,0.553856,0.392333,6.602712,17.628923,9.375255,463.76634,1186.051852,711.834698


In [3]:
# Initialise the random state
num = random.randint(1, 500)
print(f"The generated random seed is {num}") #451

The generated random seed is 95


## Section A) Physical properties

### Task 1: Predicting the property based on the provided data (excluding enjoyment data)

In [4]:
physical_data = data[data.property_name != 'enjoyment']
physical_data.head()

Unnamed: 0,participant_id,clothes_id,property_id,property_name,interaction_id,rating,sub_window_num,subwindow_start_time,subwindow_end_time,max_ch1_Hand0,...,std_z_Hand1,std_AVx_Hand1,std_AVy_Hand1,std_AVz_Hand1,std_AAx_Hand1,std_AAy_Hand1,std_AAz_Hand1,std_AJx_Hand1,std_AJy_Hand1,std_AJz_Hand1
0,19,3,16,softness,119,6,1,2022-02-13 11:24:11.302,2022-02-13 11:24:17.236,0.293434,...,0.034045,0.223631,0.400869,0.198252,5.807455,10.633557,4.88632,434.490981,853.585591,445.739808
1,19,3,16,softness,119,6,2,2022-02-13 11:24:17.266,2022-02-13 11:24:22.230,0.122884,...,0.034718,0.102369,0.202003,0.215416,2.62433,4.994368,4.763141,163.059962,293.701684,359.806687
2,19,3,16,softness,119,6,3,2022-02-13 11:24:22.250,2022-02-13 11:24:27.250,0.15943,...,0.045145,0.190866,0.258466,0.341591,4.035735,7.234644,6.105591,307.386108,462.341199,430.277295
3,19,3,15,flexibility,120,5,1,2022-02-13 11:27:08.930,2022-02-13 11:27:15.106,0.815143,...,0.042501,0.30734,0.676318,0.321149,8.205791,18.275548,7.814122,633.186926,1362.201276,664.974739
4,19,3,15,flexibility,120,5,2,2022-02-13 11:27:15.175,2022-02-13 11:27:20.095,0.597988,...,0.038397,0.308847,0.553856,0.392333,6.602712,17.628923,9.375255,463.76634,1186.051852,711.834698


#### Step 1: Basic implementation, without CV

In [5]:
def rf_properties(data, random_state=num): #drop=None, 
    print(f'Classification accuracy when predicting properties at random : {round(1/len(data.property_id.unique())*100,2)}% ')

    # Data preparation
    y = data['property_id'].values
    X = data.iloc[:,9:].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle=True, random_state = num) 

    # Instantiate model with 1000 decision trees
    rf_model = RandomForestClassifier(n_estimators = 1000, random_state = num) 

    # Train the model on the training data
    rf_model.fit(X_train, y_train)

    # Using the model, obtain predictions for the test data
    predictions = rf_model.predict(X_test)

    # Calculate the number of correct predictions
    acc = accuracy_score(y_test, predictions)

    # Print out the percentage classification accuracy for the test set
    print(f'Classification accuracy for the test set when predicting properties: {round(acc*100,2)}%')

rf_properties(data=physical_data, random_state=num)  


Classification accuracy when predicting properties at random : 20.0% 
Classification accuracy for the test set when predicting properties: 46.3%


#### Step 2: LOCOCV (Leave One Cloth(Sock) Out CV)

In [8]:
# Note: predicting_feature = 'property_id' or 'rating'

def rf_LOCOCV_properties(data, num_inner_folds=5, predicting_feature = 'property_id', random_state=num):
    f1_lst = []
    acc_lst = []

    for cloth_id in range(1,7):      
        # Split the data into training and testing
        training_data = data[data.clothes_id != cloth_id]
        testing_data = data[data.clothes_id == cloth_id]
        
        # Data preparation
        X_train = training_data.iloc[:,9:].values        
        y_train = training_data[predicting_feature].values
        X_test = testing_data.iloc[:,9:].values        
        y_test = testing_data[predicting_feature].values
        
        # Configure the cross-validation procedure
        cv_inner = KFold(n_splits=num_inner_folds, shuffle=True, random_state=num)
        
        # Define the model
        rf_model = RandomForestClassifier(random_state=num)
        
        # Create a dictionary with the hyperparameters to tune
        features_dict = dict()
        features_dict['n_estimators'] = [100, 500, 1000]
        
        # Define the Grid Search
        search = GridSearchCV(rf_model, features_dict, scoring='accuracy', cv=cv_inner, refit=True) #accuracy
        
        # Execute the search
        result = search.fit(X_train, y_train)
        print(f"Hyper parameters that maximise accuracy: {result.best_params_}")    
        
        # Obtain the best performing model fit on the whole training set
        best_rf_model = result.best_estimator_
        
        # Using the model, obtain predictions for the test data
        predictions = best_rf_model.predict(X_test)
        
        # Evaluate the model
        f1_score_val = f1_score(y_test, predictions, average=None)
        avg_f1_score_val = sum(f1_score_val) / len(f1_score_val)        
        acc = accuracy_score(y_test, predictions)
        print(f"When sock {cloth_id} was left out, the F1 score was {round(avg_f1_score_val,2)} and the classification accuracy was {round(acc*100,2)}%")
        
        # store the result
        f1_lst.append(avg_f1_score_val)        
        acc_lst.append(acc)
    

    avg_f1_score = sum(f1_lst) / len(f1_lst)
    avg_acc = sum(acc_lst) / len(acc_lst)   
    
    print(f"Results for predicting the property when LOCOCV (Leave One Cloth Out Cross Validation) was used: average f1 score = {round(avg_f1_score,2)} and average classification accuracy = {round(avg_acc*100,2)}%")
            
rf_LOCOCV_properties(data=physical_data, num_inner_folds=5, predicting_feature = 'property_id', random_state=num) 


Hyper parameters that maximise accuracy: {'n_estimators': 500}
When sock 1 was left out, the F1 score was 0.35 and the classification accuracy was 35.96%
Hyper parameters that maximise accuracy: {'n_estimators': 100}
When sock 2 was left out, the F1 score was 0.4 and the classification accuracy was 40.0%
Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When sock 3 was left out, the F1 score was 0.42 and the classification accuracy was 43.33%
Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When sock 4 was left out, the F1 score was 0.44 and the classification accuracy was 45.56%
Hyper parameters that maximise accuracy: {'n_estimators': 100}
When sock 5 was left out, the F1 score was 0.38 and the classification accuracy was 37.78%
Hyper parameters that maximise accuracy: {'n_estimators': 500}
When sock 6 was left out, the F1 score was 0.41 and the classification accuracy was 42.22%
Results for predicting the property when LOSOCV (Leave One Sock Out Cross Va

#### Step 3: LOPOCV (Leave One Participant Out CV)

In [14]:
# Note: predicting_feature = 'property_id' or 'rating'

def rf_LOPOCV_properties(data, num_inner_folds=5, predicting_feature = 'property_id', random_state=num):
    f1_lst = []
    acc_lst = []
    for participant in range(19,26):    
        if participant == 20:
            pass
        else:
            # Split the data into training and testing
            training_data = data[data.participant_id != participant]
            testing_data = data[data.participant_id == participant]

            # Data preparation
            X_train = training_data.iloc[:,9:].values        
            y_train = training_data[predicting_feature].values
            X_test = testing_data.iloc[:,9:].values        
            y_test = testing_data[predicting_feature].values

            # Configure the cross-validation procedure
            cv_inner = KFold(n_splits=num_inner_folds, shuffle=True, random_state=num)

            # Define the model
            rf_model = RandomForestClassifier(random_state=num)

            # Create a dictionary with the hyperparameters to tune
            features_dict = dict()
            features_dict['n_estimators'] = [100, 500, 1000]

            # Define the Grid Search
            search = GridSearchCV(rf_model, features_dict, scoring='accuracy', cv=cv_inner, refit=True) #accuracy

            # Execute the search
            result = search.fit(X_train, y_train)
            print(f"Hyper parameters that maximise accuracy: {result.best_params_}")    

            # Obtain the best performing model fit on the whole training set
            best_rf_model = result.best_estimator_

            # Using the model, obtain predictions for the test data
            predictions = best_rf_model.predict(X_test)

            # Evaluate the model
            f1_score_val = f1_score(y_test, predictions, average=None)
            avg_f1_score_val = sum(f1_score_val) / len(f1_score_val)        
            acc = accuracy_score(y_test, predictions)
            print(f"When participant {participant} was left out, the F1 score was {round(avg_f1_score_val,2)} and the classification accuracy was {round(acc*100,2)}%")

            # store the result
            f1_lst.append(avg_f1_score_val)        
            acc_lst.append(acc)


    avg_f1_score = sum(f1_lst) / len(f1_lst)
    avg_acc = sum(acc_lst) / len(acc_lst)   

    print(f"Results for predicting the property when LOPOCV (Leave One Participant Out Cross Validation) was used: average f1 score = {round(avg_f1_score,2)} and average classification accuracy = {round(avg_acc*100,2)}%")

            
rf_predicting_properties_LOPOCV(data=physical_data, num_inner_folds=5, predicting_feature = 'property_id', random_state=num) 


Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When participant 19 was left out, the F1 score was 0.21 and the classification accuracy was 30.0%
Hyper parameters that maximise accuracy: {'n_estimators': 100}
When participant 21 was left out, the F1 score was 0.14 and the classification accuracy was 20.0%
Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When participant 22 was left out, the F1 score was 0.26 and the classification accuracy was 30.0%
Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When participant 23 was left out, the F1 score was 0.37 and the classification accuracy was 42.22%
Hyper parameters that maximise accuracy: {'n_estimators': 500}
When participant 24 was left out, the F1 score was 0.15 and the classification accuracy was 21.35%
Hyper parameters that maximise accuracy: {'n_estimators': 500}
When participant 25 was left out, the F1 score was 0.29 and the classification accuracy was 33.33%
Results for predicting the p

### Task 2: Predicting the rating of the property based on the data

#### Step 1: Basic implementation, without CV

In [12]:
def rf_rating(data, random_state = num):
    lst = ['smoothness', 'thickness', 'warmth', 'flexibility', 'softness']#, 'enjoyment']
    print(f'Classification accuracy when predicting properties at random: {round((1/7)*100,2)}% ')
    for prop in lst:
        # Create dataset
        property_data = data[data.property_name == prop]
        
        # Data preparation
        y_property = property_data['rating'].values
        X_property = property_data.iloc[:,9:].values
        
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_property, y_property, test_size = 0.3, shuffle=True, random_state = num) 

        # Instantiate model with 1000 decision trees
        rf_model = RandomForestClassifier(n_estimators = 1000, random_state = num) 

        # Train the model on the training data
        rf_model.fit(X_train, y_train)
        
        # Using the model, obtain predictions for the test data
        predictions = rf_model.predict(X_test)

        # Calculate the number of correct predictions
        acc = accuracy_score(y_test, predictions)

        # Print out the percentage classification accuracy for the test set
        print(f'Classification accuracy for the test set when predicting the rating for {prop}: {round(acc*100,2)}%')
        print('__________________________________________')

rf_rating(data=physical_data)

     

Classification accuracy when predicting properties at random: 14.29% 
Classification accuracy for the test set when predicting the rating for smoothness: 39.39%
__________________________________________
Classification accuracy for the test set when predicting the rating for thickness: 39.39%
__________________________________________
Classification accuracy for the test set when predicting the rating for warmth: 54.55%
__________________________________________
Classification accuracy for the test set when predicting the rating for flexibility: 57.58%
__________________________________________
Classification accuracy for the test set when predicting the rating for softness: 39.39%
__________________________________________


#### Step 2: LOCOCV (Leave One Cloth(Sock) Out CV)

In [15]:
def rf_LOCOCV_rating(data, num_inner_folds=5, predicting_feature = 'rating', random_state=num):
    lst = ['smoothness', 'thickness', 'warmth', 'flexibility', 'softness']#, 'enjoyment']
    print(f'Classification accuracy when predicting properties at random: {round((1/7)*100,2)}% ')
    for prop in lst:
        # Create dataset
        property_data = data[data.property_name == prop]
        
        print(f"For {prop}:")
        rf_predicting_properties_LOCOCV(data=property_data, num_inner_folds=5, predicting_feature = 'rating', random_state=num)
        print('__________________________________________')
        

rf_LOCOCV_rating(data=physical_data, num_inner_folds=5, predicting_feature = 'rating', random_state=num)

     

Classification accuracy when predicting properties at random: 14.29% 
Classification accuracy for the test set when predicting the rating for smoothness: 39.39%
__________________________________________
Classification accuracy for the test set when predicting the rating for thickness: 39.39%
__________________________________________
Classification accuracy for the test set when predicting the rating for warmth: 54.55%
__________________________________________
Classification accuracy for the test set when predicting the rating for flexibility: 57.58%
__________________________________________
Classification accuracy for the test set when predicting the rating for softness: 39.39%
__________________________________________


#### Step 3: LOPOCV (Leave One Participant Out CV)

In [None]:
def rf_LOPOCV_rating(data, num_inner_folds=5, predicting_feature = 'rating', random_state=num):
    lst = ['smoothness', 'thickness', 'warmth', 'flexibility', 'softness']#, 'enjoyment']
    print(f'Classification accuracy when predicting properties at random: {round((1/7)*100,2)}% ')
    for prop in lst:
        # Create dataset
        property_data = data[data.property_name == prop]
        
        print(f"For {prop}:")
        rf_predicting_properties_LOPOCV(data=property_data, num_inner_folds=5, predicting_feature = 'rating', random_state=num)
        print('__________________________________________')
        
rf_LOPOCV_rating(data=physical_data, num_inner_folds=5, predicting_feature = 'rating', random_state=num)

     

Classification accuracy when predicting properties at random: 14.29% 
For smoothness:
Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When participant 19 was left out, the F1 score was 0.0 and the classification accuracy was 0.0%
Hyper parameters that maximise accuracy: {'n_estimators': 1000}
When participant 21 was left out, the F1 score was 0.16 and the classification accuracy was 22.22%
Hyper parameters that maximise accuracy: {'n_estimators': 100}
When participant 22 was left out, the F1 score was 0.04 and the classification accuracy was 5.56%
Hyper parameters that maximise accuracy: {'n_estimators': 500}
When participant 23 was left out, the F1 score was 0.0 and the classification accuracy was 0.0%
Hyper parameters that maximise accuracy: {'n_estimators': 100}
When participant 24 was left out, the F1 score was 0.06 and the classification accuracy was 11.76%
Hyper parameters that maximise accuracy: {'n_estimators': 100}
When participant 25 was left out, the F1 score

## Section B) Affective properties - Enjoyment

### Task 1: Predicting the property based on the provided data (including enjoyment data)

#### Step 1: Basic implementation, without CV

In [None]:
rf_predicting_properties(data, random_state=num)    

#### Step 2: LOCOCV (Leave One Cloth(Sock) Out CV)

### Task 2: Predicting the rating of the property based on the data

In [None]:
enjoyment_data = data[data.property_name == 'enjoyment']

In [None]:
def rf_predicting_rating(data, random_state = num):
    print(f'Classification accuracy when predicting properties at random: {round((1/7)*100,2)}% ')

    # Data preparation
    y_property = data['rating'].values
    X_property = data.iloc[:,9:].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_property, y_property, test_size = 0.3, shuffle=True, random_state = num) 

    # Instantiate model with 1000 decision trees
    rf_model = RandomForestClassifier(n_estimators = 1000, random_state = num) 

    # Train the model on the training data
    rf_model.fit(X_train, y_train)

    # Using the model, obtain predictions for the test data
    predictions = rf_model.predict(X_test)

    # Calculate the number of correct predictions
    acc = accuracy_score(y_test, predictions)   

    # Print out the percentage classification accuracy for the test set
    print(f'Classification accuracy for the test set when predicting the rating for enjoyment: {round(acc*100,2)}%')

rf_predicting_rating(enjoyment_data)
     

In [None]:
gjvhkbjln

#### First, consider only the physical properties

In [None]:
print(data.shape)
data = data[data.property_name != 'enjoyment']
print(data.shape)

In [None]:
# manual nested cross-validation for random forest on a classification dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# create dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=1, n_informative=10, n_redundant=10)
# configure the cross-validation procedure
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
# enumerate splits
num_inner_folds = 5
outer_results = list()
for train_ix, test_ix in cv_outer.split(X):
    # split data
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    # configure the cross-validation procedure
    cv_inner = KFold(n_splits=num_inner_folds, shuffle=True, random_state=num)
    # define the model
    model = RandomForestClassifier(random_state=num)
    # define search space
    space = dict()
    space['n_estimators'] = [10, 100, 500, 1000]
    # define search
    search = GridSearchCV(model, space, scoring='accuracy', cv=cv_inner, refit=True) #accuracy
    # execute search
    result = search.fit(X_train, y_train)
    # get the best performing model fit on the whole training set
    best_model = result.best_estimator_
    # evaluate model on the hold out dataset
    yhat = best_model.predict(X_test)
    # evaluate the model
    acc = accuracy_score(y_test, yhat)
    # store the result
    outer_results.append(acc)
    # report progress
print('>acc=%.3f, est=%.3f, cfg=%s' % (acc, result.best_score_, result.best_params_))
# summarize the estimated performance of the model
print('Accuracy: %.3f (%.3f)' % (mean(outer_results), std(outer_results)))