## Neural Network models - Leave One Participant Out CV to predict properties using 3 subwindows

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
#pd.set_option("display.max_rows", None)
from pandas.core.common import SettingWithCopyWarning

import warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

import random
import datetime
import time
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset
import torch.optim as optim



In [2]:
# Initialise the random state
#num = random.randint(1, 500)
num = 58
torch.manual_seed(num)
np.random.seed(num)
print(f"The generated random seed is {num}") #347

The generated random seed is 58


### Load data

In [3]:
path = "complete_dataset_"+str(15)+"subwindows_"+str(3)+"slices.csv"
df = pd.read_csv(path)
#data.iloc[12959:12965,:]



In [4]:
# Remove enjoyment as we are only considering physical properties
print(df.shape)
physical_df = df[df.property_name!='enjoyment']
physical_df.reset_index(inplace=True, drop=True)
physical_df.shape

(22679, 190)


(18899, 190)

In [5]:
existing_data = physical_df.iloc[:10800,:]
new_data = physical_df.iloc[10800:,:]

### Normalise the data

In [6]:
starting_index = 10

# Create a df of features
existing_features_df = existing_data.iloc[:,starting_index:]
new_features_df = new_data.iloc[:,starting_index:]

# Create a df with the first 10 columns
existing_info = existing_data.iloc[:, :starting_index]
new_info = new_data.iloc[:, :starting_index]

normalised_existing_features = existing_features_df.copy()
normalised_new_features = new_features_df.copy()
print(normalised_existing_features.shape)
print(normalised_new_features.shape)

# create scaler
scaler = MinMaxScaler(feature_range=(-1,1)) # As this is the range of the activation function - tanh

# fit scaler and apply transform
normalised_existing_features[normalised_existing_features.columns] = scaler.fit_transform(existing_features_df[existing_features_df.columns])
normalised_new_features[normalised_new_features.columns] = scaler.fit_transform(new_features_df[new_features_df.columns])

print(normalised_existing_features.shape)
print(normalised_new_features.shape)

normalised_existing_df = pd.concat([existing_info, normalised_existing_features], axis=1)
normalised_new_df = pd.concat([new_info, normalised_new_features], axis=1)




(10800, 180)
(8099, 180)
(10800, 180)
(8099, 180)


### Create X and y data

In [7]:
def create_X_2d(df, features_starting_idx):
    
    X_2d = df.iloc[:,features_starting_idx:].values
    
    X_tensor_2d = torch.Tensor(X_2d)    
    return X_tensor_2d



In [8]:
def create_y_train_for_2d_X(df, predicting_feature = 'property_id', output_as_tensor='Yes'):
    # CreatE an instance of a one-hot-encoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Perform one-hot encoding on the specified column 
    encoder_df = pd.DataFrame(encoder.fit_transform(df[[predicting_feature]]).toarray())
    
    # Convert to a numpy array
    y_train = encoder_df.to_numpy()
    
    if output_as_tensor == 'Yes':
        # Convert to a tensor
        y_train = torch.Tensor(y_train)

    return y_train



In [9]:
def create_y_test_for_2d_X(df, predicting_feature = 'property_id'):   
    y_test = df[predicting_feature].values
    #if predicting_feature == 'property_id':
       # y_test = y_test - 3
    if predicting_feature == 'rating_level_num':
        y_test = y_test - 1
    
    y_test_tensor = torch.Tensor(y_test)    
    y_test_tensor = y_test_tensor.type(torch.LongTensor)
    
    return y_test_tensor


In [10]:
def create_X_3d(df, features_starting_idx):
    dim1 = df.new_interaction_id.nunique()
    #print(dim1)
    dim2 = df.slice_num.nunique()
    dim3 = df.iloc[:,features_starting_idx:].shape[1]
        
    X = np.zeros((dim1, dim2, dim3)) 

    itr_id_lst = df.new_interaction_id.unique().tolist()
    #print(itr_id_lst[0], itr_id_lst[-1])

    for itr_id in itr_id_lst: #range(len(itr_id_lst)):
        #itr_id = itr_id_lst[i]
        itr_id_df = df[df.new_interaction_id==itr_id]  
        
        for j in range(itr_id_df.shape[0]):
            vals_arr = itr_id_df.iloc[j,features_starting_idx:].values
            if itr_id-1 == dim1:
                print(itr_id)
            X[itr_id-1,j] = vals_arr
    
    X_tensor = torch.Tensor(X)    
    return X_tensor


In [11]:
def create_y_train_for_3d_X(df, predicting_feature = 'property_id'):
    # Create a dataset with only the required columns
    df2 = df[['new_interaction_id', 'property_id', 'rating_level_num']]

    # Remove duplicates
    df2.drop_duplicates(keep='first', inplace=True)

    # Reset the indexes
    df2.reset_index(drop=True, inplace=True) 
    
    ## Create y train
    # CreatE an instance of a one-hot-encoder
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Perform one-hot encoding on the specified column 
    encoder_df = pd.DataFrame(encoder.fit_transform(df2[[predicting_feature]]).toarray())
    
    # Convert to a numpy array
    y_train = encoder_df.to_numpy()
    
    # Convert to a tensor
    y_train = torch.Tensor(y_train)
  
    return y_train



In [12]:
def create_y_test_for_3d_X(df, predicting_feature = 'property_id'):
    # Create a dataset with only the required columns
    df2 = df[['new_interaction_id', 'property_id', 'rating_level_num']]

    # Remove duplicates
    df2.drop_duplicates(keep='first', inplace=True)

    # Reset the indexes
    df2.reset_index(drop=True, inplace=True) 
    
    y_test = df2[predicting_feature].values
    #if predicting_feature == 'property_id':
       # y_test = y_test - 3
    if predicting_feature == 'rating_level_num':
        y_test = y_test - 1
    
    y_test = torch.Tensor(y_test)    
    y_test = y_test.type(torch.LongTensor)
    
    return y_test



## Model 2 - LSTM model using all 180 features

In [13]:
class LSTM_all_features_properties(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.LSTM(90, 40, 1, batch_first=True)
        self.fc1 = nn.Linear(3 * 40 * 2, 20)
        self.fc2 = nn.Linear(20, 10)
        self.fc3 = nn.Linear(10, 5)
          
    def forward(self, x1, x2): #, x2
        x1, (hn, cn) = self.rnn(x1) #, (self.h0, self.c0)
        x1 = F.tanh(x1)
        x2, (hm, cm) = self.rnn(x2) # (self.h0, self.c0)
        x2 = F.tanh(x2)
        x = torch.cat((x1, x2), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.tanh(self.fc1(x))
        x = F.tanh(self.fc2(x))
        x = F.softmax(self.fc3(x), dim=1)
        return x
    

In [14]:
def find_best_model_for_3d_X(train_dataloader, learning_rate, num_epochs, model):

    # Model
    train_model = model

    # Loss and Optimiser
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.SGD(train_model.parameters(), lr=learning_rate, momentum=0.7)

    best_train_loss = np.inf
    best_model = None
    #best_model_epoch_num = np.inf
    train_loss_lst = []
   # val_loss_lst = []
   # avg_loss_lst = []

    for epoch in range(num_epochs):  # loop over the dataset multiple times

        #Set the model in training mode
        train_model.train()

        # Initialise the total training and validation loss
        epoch_train_loss = 0
        epoch_val_loss = 0
        avg_loss = 0

        #running_loss = 0.0
        for i, train_data in enumerate(train_dataloader, 0):
            #print(len(train_data))

            # get the inputs; data is a list of [input1, input2, label]
            train_input1, train_input2, train_labels = train_data #train_input2, 

            #train_labels = train_labels.type(torch.LongTensor)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            train_preds = train_model(train_input1, train_input2)  
            #print(train_labels)#

            train_loss = criterion(train_preds, train_labels)
            train_loss.backward()
            optimizer.step()

            # Update training loss
            epoch_train_loss += train_loss.item()

     
        avg_training_loss = epoch_train_loss / len(train_dataloader) #count_train
        
        train_loss_lst.append(avg_training_loss)
         
        #print(f'epoch {epoch+1}: train loss = {round(avg_training_loss,3)}, val loss = {round(avg_validation_loss,3)}, average loss = {round(avg_loss,3)}')
        if epoch % 10 == 0:
            print(f'epoch {epoch+1}: train loss = {round(avg_training_loss,2)}')
        
        if avg_training_loss < best_train_loss:
            best_train_loss = avg_training_loss
            best_model = train_model.state_dict()

    return best_train_loss, best_model, train_loss_lst 

    #return best_avg_loss, best_model, train_loss_lst, val_loss_lst, avg_loss_lst   #, avg_loss_lst, 


In [15]:
# Original
def LSTM_LOP0CV(lili_0, lili_1, new_0, new_1, model, num_folds=5, predicting_feature='property_id', learning_rate=0.01, num_epochs=10, random_state=num): #, num_inner_folds=5
    # Set fixed random number seed
    torch.manual_seed(num)
        
    total_conf_mat = 0
    micro_f1_lst = []
    acc_lst = []
    if predicting_feature == 'property_id':
        macro_f1_lst = []
    elif predicting_feature == 'rating_level_num':
        weighted_f1_lst = [] 
        
        data0_add = create_y_train_for_2d_X(data_0, predicting_feature = 'property_id', output_as_tensor='No')
        data0_add_pd = pd.DataFrame(data0_add, columns = ['smoothness','thickness','warmth', 'flexibility', 'softness'])
        data_0 = pd.concat([data_0.reset_index(drop=True), data0_add_pd.reset_index(drop=True)], axis=1)
        data1_add = create_y_train_for_2d_X(data_1, predicting_feature = 'property_id', output_as_tensor='No')
        data1_add_pd = pd.DataFrame(data1_add, columns = ['smoothness','thickness','warmth', 'flexibility', 'softness'])
        data_1 = pd.concat([data_1.reset_index(drop=True), data1_add_pd.reset_index(drop=True)], axis=1)
    
    #for participant in sorted(data_0.participant_id.unique()): # # #:lst: #
       # print(f'LEAVING PARTICIPANT {participant} OUT:')
        
        # Split the data into training and testing
        #training_data_0 = data_0[data_0.participant_id != participant] 
        #training_data_1 = data_1[data_1.participant_id != participant] 
        #testing_data_0 = data_0[data_0.participant_id == participant] 
        #testing_data_1 = data_1[data_1.participant_id == participant] 

    # Data preparation
    X_train_0 = create_X_3d(lili_0, 11)
    X_train_1 = create_X_3d(lili_1, 11) 
    X_test_0 = create_X_3d(new_0, 11) 
    X_test_1 = create_X_3d(new_1, 11)           
    y_train = create_y_train_for_3d_X(lili_0, predicting_feature = predicting_feature)
    y_test = create_y_test_for_3d_X(new_0, predicting_feature = predicting_feature)        


        #print(X_train_0.shape)
        #print(X_train_1.shape)
        #print(y_train.shape)
        #print(X_test_0.shape)
        #print(X_test_1.shape)
        #print(y_test.shape)
 
    # Create the datasets and dataloaders
    train_dataset = TensorDataset(X_train_0, X_train_1, y_train) 
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=y_train.shape[0]) # num_workers=2,

    test_dataset = TensorDataset(X_test_0, X_test_1, y_test)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=True, batch_size=y_test.shape[0]) # num_workers=2,       
            
    train_loss, best_model, train_loss_lst = find_best_model_for_3d_X(train_dataloader, learning_rate, num_epochs, model)

    # save trained model 
    name = 'best_model.pt'
    torch.save(best_model, name)
    print(f'The model has been saved')
      
        
    test_model = model
    test_model.load_state_dict(torch.load(name))

    dataiter = iter(test_dataloader) 
    test_input1, test_input2, test_labels = dataiter.next() 

    test_preds = test_model(test_input1, test_input2) 

    test_preds_np = test_preds.detach().numpy()
    test_predicted_np = np.argmax(test_preds_np, axis = 1)

    test_labels_np = test_labels.numpy()    
    
    if predicting_feature == 'property_id':
        conf_mat = confusion_matrix(test_labels_np, test_predicted_np, labels=[0, 1, 2, 3, 4])
        macro_f1_score = f1_score(test_labels_np, test_predicted_np, average='macro') 
        #macro_f1_lst.append(macro_f1_score) 
    elif predicting_feature == 'rating_level_num':
        conf_mat = confusion_matrix(test_labels_np, test_predicted_np, labels=[0,1,2])
        weighted_f1_score = f1_score(test_labels_np, test_predicted_np, average='weighted') 
        weighted_f1_lst.append(weighted_f1_score)

    #total_conf_mat += conf_mat
    micro_f1_score = f1_score(test_labels_np, test_predicted_np, average='micro')  
    #micro_f1_lst.append(micro_f1_score)
    acc = accuracy_score(test_labels_np, test_predicted_np)
    #acc_lst.append(acc)


    #print(f"Leaving participant {participant} out")
    print("(1) Confusion matrix:\n", conf_mat)
    print(f"(2) micro F1 score = {round(micro_f1_score,2)}") 
    if predicting_feature == 'property_id':
        print(f"(3) Macro F1 score = {round(macro_f1_score,2)}")
    elif predicting_feature == 'rating_level_num':
        print(f"(3) Weighted F1 score = {round(weighted_f1_score,2)}")            
    print(f"(4) Percentage Classification accuracy = {round(acc*100,2)}%")

    print('--------------------------------')
        
   

In [16]:
# Add a column to interact

normalised_existing_df.insert(5, "new_interaction_id", None)
normalised_existing_df['new_interaction_id'] = normalised_existing_df.groupby(['participant_id', 'clothes_id', 'property_id', 'sub_window_num'], sort=False).ngroup() + 1

normalised_new_df.insert(5, "new_interaction_id", None)
normalised_new_df['new_interaction_id'] = normalised_new_df.groupby(['participant_id', 'clothes_id', 'property_id', 'sub_window_num'], sort=False).ngroup() + 1



In [17]:
a_hand1_emg = normalised_existing_df.iloc[:,35:59]

a_hand0_acc = normalised_existing_df.iloc[:,59:86]
a_hand1_acc = normalised_existing_df.iloc[:,86:113]

a_hand0_qua = normalised_existing_df.iloc[:,113:152]
a_hand1_qua = normalised_existing_df.iloc[:,152:]

a_df_info = normalised_existing_df.iloc[:,:11]

# Combine the data to ceate a df for each hand
a_emg_0 = normalised_existing_df.iloc[:,:35]
#a_emg_1 = pd.concat([df_info, hand1_emg], axis=1)

existing_0 = pd.concat([a_emg_0, a_hand0_acc, a_hand0_qua], axis=1)
existing_1 = pd.concat([a_df_info, a_hand1_emg, a_hand1_acc, a_hand1_qua], axis=1)

In [18]:
b_hand1_emg = normalised_new_df.iloc[:,35:59]

b_hand0_acc = normalised_new_df.iloc[:,59:86]
b_hand1_acc = normalised_new_df.iloc[:,86:113]

b_hand0_qua = normalised_new_df.iloc[:,113:152]
b_hand1_qua = normalised_new_df.iloc[:,152:]

b_df_info = normalised_new_df.iloc[:,:11]

# Combine the data to ceate a df for each hand
b_emg_0 = normalised_new_df.iloc[:,:35]
#a_emg_1 = pd.concat([df_info, hand1_emg], axis=1)

new_0 = pd.concat([b_emg_0, b_hand0_acc, b_hand0_qua], axis=1)
new_1 = pd.concat([b_df_info, b_hand1_emg, b_hand1_acc, b_hand1_qua], axis=1)

In [19]:
print(existing_0.shape)
print(existing_1.shape)

print(new_0.shape)
print(new_1.shape)

(10800, 101)
(10800, 101)
(8099, 101)
(8099, 101)


In [20]:

LSTM_LOP0CV(lili_0=existing_0, lili_1=existing_1, new_0=new_0, new_1=new_1, model=LSTM_all_features_properties(), num_folds=5, predicting_feature='property_id', learning_rate=0.5, num_epochs=1500, random_state=num)

epoch 1: train loss = 1.61
epoch 11: train loss = 1.61
epoch 21: train loss = 1.61
epoch 31: train loss = 1.61
epoch 41: train loss = 1.61
epoch 51: train loss = 1.61
epoch 61: train loss = 1.61
epoch 71: train loss = 1.61
epoch 81: train loss = 1.61
epoch 91: train loss = 1.6
epoch 101: train loss = 1.6
epoch 111: train loss = 1.58
epoch 121: train loss = 1.64
epoch 131: train loss = 1.62
epoch 141: train loss = 1.61
epoch 151: train loss = 1.58
epoch 161: train loss = 1.62
epoch 171: train loss = 1.54
epoch 181: train loss = 1.51
epoch 191: train loss = 1.55
epoch 201: train loss = 1.49
epoch 211: train loss = 1.56
epoch 221: train loss = 1.58
epoch 231: train loss = 1.47
epoch 241: train loss = 1.5
epoch 251: train loss = 1.48
epoch 261: train loss = 1.43
epoch 271: train loss = 1.53
epoch 281: train loss = 1.46
epoch 291: train loss = 1.52
epoch 301: train loss = 1.47
epoch 311: train loss = 1.48
epoch 321: train loss = 1.44
epoch 331: train loss = 1.46
epoch 341: train loss = 1.5
