#Finding and Cleaning Datasets


In [1]:
!sudo pip install tensorflow



In [2]:
import pandas as pd
import numpy as np
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" # suppress info and warning messages
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import tensorflow.keras as keras
from scipy.stats.mstats import winsorize

In [3]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#Initialize datasets for each gesture
four_df = pd.DataFrame()
paper_df = pd.DataFrame()
phone_df = pd.DataFrame()
rock_df = pd.DataFrame()
scissors_df = pd.DataFrame()
three_df = pd.DataFrame()

In [5]:
# Define the dataset folder
dataset_folder = '/content/drive/MyDrive/PIE_Final_Project/ML_stuff/emg_data_4_sensor'
# Process each folder
for folder in os.listdir(dataset_folder):
    for filename in os.listdir(os.path.join(dataset_folder, folder)):
        if filename.endswith('.csv'):
            file_path = os.path.join(dataset_folder, folder, filename)
            if folder == 'four':
                four_df = pd.concat([four_df, pd.read_csv(file_path)], ignore_index=True)
            elif folder == 'paper':
                paper_df = pd.concat([paper_df, pd.read_csv(file_path)], ignore_index=True)
            elif folder == 'phone':
                phone_df = pd.concat([phone_df, pd.read_csv(file_path)], ignore_index=True)
            elif folder == 'rock':
                rock_df = pd.concat([rock_df, pd.read_csv(file_path)], ignore_index=True)
            elif folder == 'scissors':
                scissors_df = pd.concat([scissors_df, pd.read_csv(file_path)], ignore_index=True)
            elif folder == 'three':
                three_df = pd.concat([three_df, pd.read_csv(file_path)], ignore_index=True)

# Truncate each dataframe to 20055 rows
four_df = four_df.iloc[:20055]
paper_df = paper_df.iloc[:20055]
phone_df = phone_df.iloc[:20055]
rock_df = rock_df.iloc[:20055]
scissors_df = scissors_df.iloc[:20055]
three_df = three_df.iloc[:20055]
#Add column to each dataset with finger name
four_df['gesture'] = 'four'
paper_df['gesture'] = 'paper'
phone_df['gesture'] = 'phone'
rock_df['gesture'] = 'rock'
scissors_df['gesture'] = 'scissors'
three_df['gesture'] = 'three'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  four_df['gesture'] = 'four'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  paper_df['gesture'] = 'paper'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phone_df['gesture'] = 'phone'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

In [6]:
#Make sure dataframes were properly created
print(four_df.head())
print(paper_df.head())
print(phone_df.head())
print(rock_df.head())
print(scissors_df.head())
print(three_df.head())

      timestamp  emg_value_sensor1  emg_value_sensor2  emg_value_sensor3  \
0  1.733775e+09                154                123                  5   
1  1.733775e+09                160                120                  5   
2  1.733775e+09                153                121                  5   
3  1.733775e+09                156                120                  5   
4  1.733775e+09                156                120                  5   

   emg_value_sensor4 gesture  
0                 67    four  
1                 65    four  
2                 65    four  
3                 62    four  
4                 61    four  
      timestamp  emg_value_sensor1  emg_value_sensor2  emg_value_sensor3  \
0  1.733771e+09                118                 26                  4   
1  1.733771e+09                118                 26                  5   
2  1.733771e+09                119                 28                  5   
3  1.733771e+09                119                 31

In [7]:
# Make sure they each have the correct number of rows (20200)
print(four_df.shape)
print(paper_df.shape)
print(phone_df.shape)
print(rock_df.shape)
print(scissors_df.shape)
print(three_df.shape)

(20055, 6)
(20055, 6)
(20055, 6)
(20055, 6)
(20055, 6)
(20055, 6)


In [8]:
# Check types of each column
print(four_df.dtypes)
print(paper_df.dtypes)
print(phone_df.dtypes)
print(rock_df.dtypes)
print(scissors_df.dtypes)
print(three_df.dtypes)

timestamp            float64
emg_value_sensor1      int64
emg_value_sensor2      int64
emg_value_sensor3      int64
emg_value_sensor4      int64
gesture               object
dtype: object
timestamp            float64
emg_value_sensor1      int64
emg_value_sensor2      int64
emg_value_sensor3      int64
emg_value_sensor4      int64
gesture               object
dtype: object
timestamp            float64
emg_value_sensor1      int64
emg_value_sensor2      int64
emg_value_sensor3      int64
emg_value_sensor4      int64
gesture               object
dtype: object
timestamp            float64
emg_value_sensor1      int64
emg_value_sensor2      int64
emg_value_sensor3      int64
emg_value_sensor4      int64
gesture               object
dtype: object
timestamp            float64
emg_value_sensor1      int64
emg_value_sensor2      int64
emg_value_sensor3      int64
emg_value_sensor4      int64
gesture               object
dtype: object
timestamp            float64
emg_value_sensor1      int64
em

Need to one hot encode gesture column

In [9]:
# Make a function to specifically get only numbered columns to make life easier
def cols_num(df):
  return (df.dtypes == np.int64) | (df.dtypes == np.float64)
# Make a function to winsorize
def win(df):
  df[df.columns[cols_num(df)]] = df[df.columns[cols_num(df)]].apply(lambda x: winsorize(x, limits=[0.01, 0.01]))
  return None

In [10]:
# Winsorize to handle possible outliers
win(four_df)
win(paper_df)
win(phone_df)
win(rock_df)
win(scissors_df)
win(three_df)

In [11]:
# Combine datasets into one big dataset
final_df = pd.concat([four_df, paper_df, phone_df, rock_df, scissors_df, three_df], ignore_index=True)

In [12]:
#look at final dataset
final_df

Unnamed: 0,timestamp,emg_value_sensor1,emg_value_sensor2,emg_value_sensor3,emg_value_sensor4,gesture
0,1.733775e+09,154,123,5,67,four
1,1.733775e+09,160,120,5,65,four
2,1.733775e+09,153,121,5,65,four
3,1.733775e+09,156,120,5,62,four
4,1.733775e+09,156,120,5,61,four
...,...,...,...,...,...,...
120325,1.733774e+09,188,130,5,57,three
120326,1.733774e+09,186,119,4,58,three
120327,1.733774e+09,188,115,5,59,three
120328,1.733774e+09,188,114,5,60,three


In [13]:
final_df.shape

(120330, 6)

In [14]:
#perform one hot encoding
df_finger = pd.get_dummies(final_df['gesture'])
final_df.drop(labels = 'gesture', axis = 1, inplace = True)
final_df = final_df.join(df_finger)

In [15]:
# check new df
final_df

Unnamed: 0,timestamp,emg_value_sensor1,emg_value_sensor2,emg_value_sensor3,emg_value_sensor4,four,paper,phone,rock,scissors,three
0,1.733775e+09,154,123,5,67,True,False,False,False,False,False
1,1.733775e+09,160,120,5,65,True,False,False,False,False,False
2,1.733775e+09,153,121,5,65,True,False,False,False,False,False
3,1.733775e+09,156,120,5,62,True,False,False,False,False,False
4,1.733775e+09,156,120,5,61,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
120325,1.733774e+09,188,130,5,57,False,False,False,False,False,True
120326,1.733774e+09,186,119,4,58,False,False,False,False,False,True
120327,1.733774e+09,188,115,5,59,False,False,False,False,False,True
120328,1.733774e+09,188,114,5,60,False,False,False,False,False,True


In [17]:
# Check new column types
final_df.dtypes

Unnamed: 0,0
emg_value_sensor1,int64
emg_value_sensor2,int64
emg_value_sensor3,int64
emg_value_sensor4,int64
four,bool
paper,bool
phone,bool
rock,bool
scissors,bool
three,bool


In [18]:
# Function to turn pandas column into type float
def col_to_float(df):
  df[df.columns] = df[df.columns].astype(float)
  return None
col_to_float(final_df)

In [19]:
final_df.dtypes

Unnamed: 0,0
emg_value_sensor1,float64
emg_value_sensor2,float64
emg_value_sensor3,float64
emg_value_sensor4,float64
four,float64
paper,float64
phone,float64
rock,float64
scissors,float64
three,float64


In [20]:
final_df

Unnamed: 0,emg_value_sensor1,emg_value_sensor2,emg_value_sensor3,emg_value_sensor4,four,paper,phone,rock,scissors,three
0,154.0,123.0,5.0,67.0,1.0,0.0,0.0,0.0,0.0,0.0
1,160.0,120.0,5.0,65.0,1.0,0.0,0.0,0.0,0.0,0.0
2,153.0,121.0,5.0,65.0,1.0,0.0,0.0,0.0,0.0,0.0
3,156.0,120.0,5.0,62.0,1.0,0.0,0.0,0.0,0.0,0.0
4,156.0,120.0,5.0,61.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
120325,188.0,130.0,5.0,57.0,0.0,0.0,0.0,0.0,0.0,1.0
120326,186.0,119.0,4.0,58.0,0.0,0.0,0.0,0.0,0.0,1.0
120327,188.0,115.0,5.0,59.0,0.0,0.0,0.0,0.0,0.0,1.0
120328,188.0,114.0,5.0,60.0,0.0,0.0,0.0,0.0,0.0,1.0


#Create Training and Validation Sets

In [21]:
#Seperate label and features
gestures = ['four', 'paper', 'phone', 'rock', 'scissors', 'three']
y = final_df[gestures]
x = final_df.drop(columns = gestures)

In [22]:
#Make sure label and features are correct
print(y.columns)
print(x.columns)

Index(['four', 'paper', 'phone', 'rock', 'scissors', 'three'], dtype='object')
Index(['emg_value_sensor1', 'emg_value_sensor2', 'emg_value_sensor3',
       'emg_value_sensor4'],
      dtype='object')


In [23]:
# Make training and test sets - test set is 25% of the total dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1234)

In [24]:
X_train.shape

(90247, 4)

Using a model I made over the summer can change structure if needed

In [31]:
# 1. Create the model object
nn_model = keras.Sequential()

# 2. Create the input layer and add it to the model
input_layer = keras.layers.InputLayer(input_shape=(X_train.shape[1],))
nn_model.add(input_layer)

# 3. Add 10 hidden layers
nn_model.add(keras.layers.Dense(units=512, activation='relu'))  # Hidden Layer 1
nn_model.add(keras.layers.Dense(units=256, activation='relu'))  # Hidden Layer 2
nn_model.add(keras.layers.Dense(units=128, activation='relu'))  # Hidden Layer 3
nn_model.add(keras.layers.Dense(units=128, activation='relu'))  # Hidden Layer 4
nn_model.add(keras.layers.Dense(units=64, activation='relu'))   # Hidden Layer 5
nn_model.add(keras.layers.Dense(units=64, activation='relu'))   # Hidden Layer 6
nn_model.add(keras.layers.Dense(units=32, activation='relu'))   # Hidden Layer 7
nn_model.add(keras.layers.Dense(units=32, activation='relu'))   # Hidden Layer 8
nn_model.add(keras.layers.Dense(units=16, activation='relu'))   # Hidden Layer 9
nn_model.add(keras.layers.Dense(units=16, activation='relu'))   # Hidden Layer 10

# 4. Create and add the output layer
output_layer = keras.layers.Dense(units=6, activation='softmax')
nn_model.add(output_layer)

# Print summary of the neural network structure
nn_model.summary()



In [32]:
#Using a stochastic gradient descent optimizer - learning rate will be changed later for optimization
learning_rate = 0.05
sgd_optimizer = keras.optimizers.SGD(learning_rate = learning_rate)

In [33]:
#categorical cross entropy loss function
loss_fn = keras.losses.CategoricalCrossentropy(from_logits=False)

In [34]:
# Compile Model
nn_model.compile(optimizer=sgd_optimizer, loss=loss_fn, metrics=['accuracy'])

In [35]:
# Callback class I wrote over the summer to output model info while training
class ProgBarLoggerNEpochs(keras.callbacks.Callback):

    def __init__(self, num_epochs: int, every_n: int = 50):
        self.num_epochs = num_epochs
        self.every_n = every_n

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.every_n == 0:
            s = 'Epoch [{}/ {}]'.format(epoch + 1, self.num_epochs)
            logs_s = ['{}: {:.4f}'.format(k.capitalize(), v)
                      for k, v in logs.items()]
            s_list = [s] + logs_s
            print(', '.join(s_list))

In [36]:
t0 = time.time() # start time

num_epochs = 290 # epochs

history = nn_model.fit(X_train, y_train, epochs=num_epochs, verbose=0, callbacks=[ProgBarLoggerNEpochs(num_epochs, every_n=5)], validation_split = 0.2)


t1 = time.time() # stop time

print('Elapsed time: %.2fs' % (t1-t0))

Epoch [5/ 290], Accuracy: 0.7346, Loss: 0.6051, Val_accuracy: 0.7111, Val_loss: 0.6565


KeyboardInterrupt: 

In [None]:
# Plot training and validation loss
plt.plot(range(1, num_epochs + 1), history.history['loss'], label='Training Loss')
plt.plot(range(1, num_epochs + 1), history.history['val_loss'], label='Validation Loss')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()


# Plot training and validation accuracy
plt.plot(range(1, num_epochs + 1), history.history['accuracy'], label='Training Accuracy')
plt.plot(range(1, num_epochs + 1), history.history['val_accuracy'], label='Validation Accuracy')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Use model on test data
loss, accuracy = nn_model.evaluate(X_test, y_test)

print('Loss: {0} Accuracy: {1}'.format(loss, accuracy))

In [None]:
# Make predictions on the test set
probability_predictions = nn_model.predict(X_test)
class_label_predictions = []

# Get the class with the highest probability for each prediction
class_label_predictions = np.argmax(probability_predictions, axis=1)

In [None]:
class_label_predictions

In [None]:
# Compare predicted labels to real labels
# Assuming y_test is a pandas DataFrame
actual_labels = np.argmax(y_test.values, axis=1)
# Example: Check if the first 10 predictions are correct
for i in range(10):
    print(f"Sample {i}: Predicted - {class_label_predictions[i]}, Actual - {actual_labels[i]}")
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(actual_labels, class_label_predictions)
print(f"Accuracy: {accuracy}")

# Generate classification report with precision, recall, F1-score
report = classification_report(actual_labels, class_label_predictions, target_names=fingers)  # 'fingers' from your code
print(report)