In [62]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required.
# Compare the numeric (major, minor) pair: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import re
import sklearn
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int
        Passed to ``plt.savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.utils import io

with io.capture_output() as captured:
    print('no prints in here')

import time
import pandas as pd

In [63]:
# Integer class labels; TransformData writes one of these into each recording's "state" column.
Alone = 0    # recordings whose CSV is tagged "Alone"
Spontan = 1  # recordings whose CSV is tagged "Spontan"
Sync = 2     # recordings whose CSV is tagged "Sync"

First, let's load the solo right-hand recording.
From the get-go we will merge it into the solo ("Alone") recordings.

In [64]:
# Build the path from its parts instead of embedding a backslash in the literal:
# "\H" is not a valid escape sequence (newer Python versions warn about it) and a
# hard-coded backslash only works as a separator on Windows.
HandRight = pd.read_csv(os.path.join("Unity Data", "HandRight.csv"))

In [65]:
# Positional indices of the leading columns removed from every recording.
# NOTE(review): the original remark here said the hands count is dropped because it
# leaks the label, but the "# hands" column is still present in the frames displayed
# further down and correlates strongly with "state" — confirm which columns 0-2 really are.
drop_indecies = [0,1,2]

In [66]:
# Remove the columns listed in drop_indecies, then keep only rows 500..3999 by position.
# NOTE(review): the row window presumably trims the start/end of the recording — confirm.
# This cell rebinds HandRight, so running it twice drops three more columns.
HandRight = HandRight.drop(HandRight.columns[drop_indecies], axis=1)
HandRight = HandRight.iloc[500:4000]

In [67]:
HandRight.head()
#HandRight.info()

Unnamed: 0,# hands,Position X,Position Y,Position Z,Velocity X,Velocity Y,Velocity Z,Pitch,Roll,Yaw,Wrist Pos X,Wrist Pos Y,Wrist Pos Z,Elbow pos X,Elbow Pos Y,Elbow Pos Z,Grab Strenth,Grab Angle,Pinch Strength
500,1,1.854658,163.3199,12.51509,67.3389,-6.28195,101.4422,-0.155174,0.078463,-0.764891,59.46534,173.0233,60.61472,254.6348,127.2725,240.6239,0.0,0.401652,0.0
501,1,2.990307,163.0106,14.22045,63.82683,-27.71448,91.91269,-0.166827,0.079468,-0.760001,60.39677,173.5084,62.45371,257.0284,134.3396,242.4251,0.0,0.405177,0.0
502,1,4.197042,162.9949,15.69018,69.52037,12.69234,67.46247,-0.172697,0.079392,-0.758363,61.55299,174.0561,63.92721,259.6933,142.5166,243.7431,0.0,0.378955,0.0
503,1,5.585192,163.327,17.14896,81.45722,20.71213,85.95974,-0.162573,0.073489,-0.754197,62.87757,173.9819,65.59348,262.3326,143.5807,244.1491,0.0,0.341598,0.0
504,1,6.809369,163.075,18.31746,64.54992,-20.25474,60.26554,-0.162684,0.0714,-0.747182,63.8033,173.8077,67.11089,263.0048,144.4967,246.1332,0.0,0.299372,0.0


In [68]:
# Shape experiment: put every 4th row (offset 0) next to every 4th row (offset 1),
# doubling the column count while dividing the row count by four.
k = pd.concat([HandRight.iloc[::4,:].reset_index(drop=True), HandRight.iloc[1::4,:].reset_index(drop=True)], axis=1)
k.shape


(875, 38)

In [69]:
#HandRight.iloc[1:5*3+1:3,:]

In [70]:
def TransformData(df, type):
    """Trim one raw recording and attach its class label.

    Reads the module-level names ``drop_indecies``, ``Alone`` and ``HandRight``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw recording as read from CSV. It is not modified.
    type : int
        Class label written to the ``"state"`` column.

    Returns
    -------
    pandas.DataFrame
        For ``Alone`` recordings: rows 500..3999 interleaved with the rows of
        ``HandRight`` (``HandRight`` row i, then recording row i). For every other
        label: rows 1000..8999 truncated to an even number of rows. In both cases
        the first remaining column is decremented by one and ``"state"`` is added.
    """
    df = df.drop(df.columns[drop_indecies], axis=1)
    if type == Alone:
        df = df.iloc[500:4000]
        # hstack needs the same number of rows on both sides, so cut both frames to
        # the shorter one (previously only HandRight was truncated, which failed
        # whenever the recording was the longer of the two).
        n_rows = min(len(HandRight), len(df))
        side_by_side = np.hstack([HandRight.iloc[:n_rows].values, df.iloc[:n_rows].values])
        # Reshaping back to the original width turns each side-by-side row into two
        # consecutive rows: the HandRight row followed by the recording row.
        df = pd.DataFrame(side_by_side.reshape(-1, df.shape[1]), columns=df.columns)
    else:
        df = df.iloc[1000:9000]
        even_rows = (len(df) // 2) * 2
        # Copy so the column update below writes to an independent frame rather
        # than to a slice of the caller's data.
        df = df.iloc[:even_rows].copy()
    first_column = df.columns[0]
    df[first_column] = df[first_column] - 1
    df["state"] = type
    return df

In [71]:
import glob

class DataLoader:
    """Load every labelled recording found one directory level below ``path``.

    Each ``*.csv`` file is labelled from a tag in its file name ("Alone", "Sync"
    or "Spontan", checked in that order), passed through ``TransformData`` and
    collected.

    Attributes set by ``__init__``:
        dataRaw    -- list of the transformed per-file DataFrames, in load order.
        dataMerged -- all of ``dataRaw`` concatenated row-wise.
    """

    def __init__(self, path):
        li = []
        all_files = glob.glob(path + "/*")
        v = 0
        for folder in all_files:
            print("\nloading in" ,folder, ':')
            files = glob.glob(folder + "/*.csv")
            for filename in files:
                # Look for the tag in the file name only, so that a folder name
                # containing one of the tags cannot mislabel a recording.
                base_name = os.path.basename(filename)
                if "Alone" in base_name:
                    label = Alone
                elif "Sync" in base_name:
                    label = Sync
                elif "Spontan" in base_name:
                    label = Spontan
                else:
                    # Without a tag there is no label to assign; loading the file
                    # anyway would put None into the "state" column.
                    print('skipped ', filename, '(no Alone/Sync/Spontan tag in file name)')
                    continue
                df = pd.read_csv(filename, index_col=None, header=0)
                df = TransformData(df, label)
                v += df.shape[0]
                li.append(df)
                print('loaded ', filename, label)
        self.dataRaw = li
        self.dataMerged = pd.concat(li, axis=0, sort=False)
        # The merged row count and the running total v should agree.
        print(self.dataMerged.shape, v)

In [72]:
# Join the path parts explicitly: the previous literals contained "\T" and "\V",
# which are not valid escape sequences, and only worked as paths on Windows.
trainSet = DataLoader(os.path.join("Unity Data", "Training"))
testSet = DataLoader(os.path.join("Unity Data", "Validation"))


loading in Unity Data\Training\Evyatar Cohen :
loaded  Unity Data\Training\Evyatar Cohen\Evyatar636771052727603804Spontan.csv 1
loaded  Unity Data\Training\Evyatar Cohen\Evyatar636771053639929594Sync.csv 2
loaded  Unity Data\Training\Evyatar Cohen\Evyatar636771054555711409Alone.csv 0

loading in Unity Data\Training\Nofar Social_Nuero :
loaded  Unity Data\Training\Nofar Social_Nuero\Nofar636759795182793299Spontan.csv 1
loaded  Unity Data\Training\Nofar Social_Nuero\Nofar636759796290435160Alone.csv 0
loaded  Unity Data\Training\Nofar Social_Nuero\Nofar636759797397919664Sync.csv 2

loading in Unity Data\Training\Oriya Social_Nuero :
loaded  Unity Data\Training\Oriya Social_Nuero\Oriya636759804404113837Spontan.csv 1
loaded  Unity Data\Training\Oriya Social_Nuero\Oriya636759805268396661Alone.csv 0
loaded  Unity Data\Training\Oriya Social_Nuero\Oriya636759806131350399Sync.csv 2

loading in Unity Data\Training\Orya Kalmanovitz :
loaded  Unity Data\Training\Orya Kalmanovitz\OryaB6367710827366

In [73]:
# Short alias for the merged training frame. (The earlier `data = pd.DataFrame`
# bound the class itself and was immediately overwritten, so it is removed.)
data = trainSet.dataMerged

`dataRaw` holds all the dataframes that were already set up for us; what we need now is to generate, from all of this,

our actual training data.

In [74]:
# Demo on one recording: pull columns 1-3 out and re-attach them in front of the rest.
k = trainSet.dataRaw[4]
w = k.iloc[:,[1,2,3]]  # the three columns to move
k = k.drop(k.columns[[1,2,3]], axis=1)
k.head()  # frame without the moved columns
# Reset both indexes so the column-wise concat pairs rows by position.
k = pd.concat([w.reset_index(drop=True), k.reset_index(drop=True)], axis=1)
k

Unnamed: 0,# hands,Velocity X,Velocity Y,Velocity Z,Pitch,Roll,Yaw,Wrist Pos X,Wrist Pos Y,Wrist Pos Z,Elbow pos X,Elbow Pos Y,Elbow Pos Z,Grab Strenth,Grab Angle,Pinch Strength,state
0,0.0,67.3389,-6.28195,101.4422,-0.155174,0.078463,-0.764891,59.46534,173.0233,60.61472,254.6348,127.2725,240.6239,0.0,0.401652,0.0,0
1,0.0,49.71883,-341.2516,-14.95178,1.199416,0.119953,1.519325,-92.19111,218.0329,12.57041,-309.3688,85.11283,-24.33749,0.0,0.603028,0.0,0
2,0.0,63.82683,-27.71448,91.91269,-0.166827,0.079468,-0.760001,60.39677,173.5084,62.45371,257.0284,134.3396,242.4251,0.0,0.405177,0.0,0
3,0.0,36.02574,-309.4118,0.574942,1.089347,0.092826,1.515014,-91.69262,214.4963,13.19102,-314.42,89.30956,-17.0896,0.0,0.313147,0.0,0
4,0.0,69.52037,12.69234,67.46247,-0.172697,0.079392,-0.758363,61.55299,174.0561,63.92721,259.6933,142.5166,243.7431,0.0,0.378955,0.0,0


Unnamed: 0,Position X,Position Y,Position Z,# hands,Velocity X,Velocity Y,Velocity Z,Pitch,Roll,Yaw,Wrist Pos X,Wrist Pos Y,Wrist Pos Z,Elbow pos X,Elbow Pos Y,Elbow Pos Z,Grab Strenth,Grab Angle,Pinch Strength,state
0,1.854658,163.3199,12.51509,0.0,67.33890,-6.28195,101.442200,-0.155174,0.078463,-0.764891,59.46534,173.0233,60.61472,254.6348,127.27250,240.62390,0.000000,0.401652,0.000000,0
1,-20.808590,226.0860,17.08860,0.0,49.71883,-341.25160,-14.951780,1.199416,0.119953,1.519325,-92.19111,218.0329,12.57041,-309.3688,85.11283,-24.33749,0.000000,0.603028,0.000000,0
2,2.990307,163.0106,14.22045,0.0,63.82683,-27.71448,91.912690,-0.166827,0.079468,-0.760001,60.39677,173.5084,62.45371,257.0284,134.33960,242.42510,0.000000,0.405177,0.000000,0
3,-20.061710,220.5413,17.05443,0.0,36.02574,-309.41180,0.574942,1.089347,0.092826,1.515014,-91.69262,214.4963,13.19102,-314.4200,89.30956,-17.08960,0.000000,0.313147,0.000000,0
4,4.197042,162.9949,15.69018,0.0,69.52037,12.69234,67.462470,-0.172697,0.079392,-0.758363,61.55299,174.0561,63.92721,259.6933,142.51660,243.74310,0.000000,0.378955,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,-73.616340,242.2852,69.19761,0.0,-188.33350,-50.49825,29.529870,1.857212,0.791704,1.893267,-119.80380,195.0393,45.57323,-299.4263,41.95221,-32.44111,0.163370,1.244664,0.201331,0
6996,3.230397,169.1729,-4.81981,0.0,224.79620,-538.24620,-463.401900,0.351476,0.213730,-0.117169,22.59152,148.0959,66.18939,158.3324,41.64271,276.20630,0.000000,0.514317,0.000000,0
6997,-75.451450,242.0424,69.58482,0.0,-215.89570,-28.57200,45.553780,1.852507,0.801023,1.897196,-120.83340,194.0461,45.91526,-299.5178,40.43322,-33.21206,0.236677,1.400911,0.336688,0
6998,6.434268,161.4118,-12.09352,0.0,194.23230,-373.04310,-408.729000,0.309624,0.220371,-0.108358,25.10394,143.9342,60.14394,156.3949,60.15020,282.87230,0.000000,0.383218,0.000000,0


Let's generate our NumPy array. First of all, let's create a new class.

The class takes the following settings: the columns it should drop,

the jump in time between sampled frames, the number of frames combined per row, and the overall step over the dataset.

In [75]:
# Small experiment for averaging same-named columns: take one column at two
# different row offsets, place the samples side by side, then average them.
w1 = trainSet.dataRaw[4].iloc[::5,[7]][:2]
w2 = trainSet.dataRaw[4].iloc[1::5,[7]][:2]

w1.columns
w1
w2
w3 = pd.concat([w1.reset_index(drop=True), w2.reset_index(drop=True)], axis = 1)
# Average the columns that share a name. Grouping the transposed frame by its
# index replaces `groupby(..., axis=1)`, which newer pandas deprecates/removes.
w3.T.groupby(level=0).mean().T

Index([' Pitch'], dtype='object')

Unnamed: 0,Pitch
0,-0.155174
5,0.913572


Unnamed: 0,Pitch
1,1.199416
6,-0.162573


Unnamed: 0,Pitch
0,0.522121
1,0.375499


In [76]:
class DataToNP:
    """Turn a per-frame recording into wide rows that each span several frames.

    Parameters
    ----------
    label_index : column label
        Column holding the class label; its first value labels the whole output.
    jumps : int
        Stride, in paired rows, between the frames that are sampled.
    combine : int
        Number of consecutive sampled frames placed side by side in one output row.
    skips : int
        Step between the starting offsets ``0..jumps-1`` that are processed.
        Negative values are taken modulo ``jumps``; values below 1 are treated as 1.
    drop_indecies : sequence of int
        Positional indices of columns to discard.
    merge_indecies : sequence of int
        Positional indices of columns that are not kept per frame but averaged
        over everything that ends up in one output row.
    """

    def __init__(self, label_index, jumps, combine = 4, skips = 1, drop_indecies = [], merge_indecies = []):
        # Store copies so instances never alias the shared default lists (or the
        # caller's own object); this also accepts ranges and other sequences.
        self.drop_indecies = list(drop_indecies)
        self.label_index = label_index
        self.jumps = jumps
        self.combine = combine
        self.merge_indecies = list(merge_indecies)
        if skips < 0:
            skips = skips % self.jumps
        self.skips = skips

    @staticmethod
    def _pair_rows(frame):
        """Place each even-positioned row next to the odd-positioned row after it."""
        evens = frame.iloc[::2, :].reset_index(drop=True)
        odds = frame.iloc[1::2, :].reset_index(drop=True)
        return pd.concat([evens, odds], axis=1)

    @staticmethod
    def _mean_by_column_name(frame):
        """Average all columns that share a name (NaN values are ignored)."""
        # Grouping the transposed frame avoids `groupby(..., axis=1)`, which is
        # deprecated in recent pandas releases.
        return frame.T.groupby(level=0).mean().T

    def transform(self, data, skips = -1):
        """Build the combined frame for a single recording.

        Steps: read the label from the first row; remove the dropped, label and
        merge columns; pair consecutive rows (the two hands) side by side; for
        each starting offset take every ``jumps``-th paired row and put
        ``combine`` consecutive samples into one output row; append the averaged
        merge columns; drop rows containing NaN; finally add a ``'state'`` column.

        Parameters
        ----------
        data : pandas.DataFrame
            One recording, including the label column.
        skips : int
            Overrides the instance's ``skips`` when non-negative.

        Returns
        -------
        pandas.DataFrame
            The combined rows of all processed offsets stacked vertically. The
            index restarts at 0 for every offset, so index values repeat.
        """
        if skips < 0:
            skips = self.skips
        if skips < 1:
            skips = 1
        state = data[self.label_index].iloc[0]

        current = data
        if len(self.drop_indecies) > 0:
            current = current.drop(data.columns[self.drop_indecies], axis=1)
        current = current.drop(self.label_index, axis=1)

        has_merge = len(self.merge_indecies) > 0
        merged = self._pair_rows(data.iloc[:, self.merge_indecies])
        current = current.drop(data.columns[self.merge_indecies], axis=1)
        current = self._pair_rows(current)

        li = []
        for i in range(0, self.jumps, skips):
            strided = current.iloc[i::self.jumps, :]
            # Sample j holds strided rows j, j+combine, ...; side by side, output
            # row r therefore contains strided rows r*combine .. r*combine+combine-1.
            comb = [strided.iloc[j::self.combine].reset_index(drop=True)
                    for j in range(self.combine)]
            if has_merge:
                # Slice the merge columns once per offset instead of once per j.
                merged_strided = merged.iloc[i::self.jumps, :]
                mis = [merged_strided.iloc[j::self.combine].reset_index(drop=True)
                       for j in range(self.combine)]
                comb.append(self._mean_by_column_name(pd.concat(mis, axis=1)))
            # A trailing row that lacks some of its frames contains NaN and is dropped.
            li.append(pd.concat(comb, axis=1).dropna())
        df = pd.concat(li, axis=0, sort=False)
        df['state'] = state
        return df

    def transform_arr(self, df_arr, skips = -1):
        """Apply ``transform`` to every frame in ``df_arr`` and return the results as a list."""
        print("combining", len(df_arr), "dataframes")
        return [self.transform(df, skips) for df in df_arr]
        
            

A small test to see that the class actually works.

It does, so cool!

We want to combine every 2 rows (right and left hand), and then we want

to sample with jumps (here it is 2), combining every 2 sampled frames.

Also, if we get a row with missing values, i.e. we combined a row with a non-existent one,

we drop that row.

In [77]:
# Two tiny 10-row frames with distinct labels (1 and 2) to exercise DataToNP by hand.
test_data = {'Name':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'Age':[20, 21, 19, 18, 20, 21, 19, 18, 17, 16],
        'label':[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
test_data2 = {'Name':[11, 22, 33, 44, 55, 66, 77, 88, 99, 1010],
        'Age':[20, 21, 19, 18, 20, 21, 19, 18, 17, 16],
        'label':[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}

test_df = pd.DataFrame(test_data)
test_df2 = pd.DataFrame(test_data2)
li = [test_df, test_df2]

# Show the input, then the combined result for one frame and for the list of frames.

test_df
# jumps=2, combine=2; skips keeps its default of 1
combinerTest = DataToNP('label', 2, combine=2)
combinerTest.transform(test_df)

combinerTest.transform_arr(li)

Unnamed: 0,Name,Age,label
0,1,20,1
1,2,21,1
2,3,19,1
3,4,18,1
4,5,20,1
5,6,21,1
6,7,19,1
7,8,18,1
8,9,17,1
9,10,16,1


Unnamed: 0,Name,Age,Name.1,Age.1,Name.2,Age.2,Name.3,Age.3,state
0,1,20,2,21,5.0,20.0,6.0,21.0,1
0,3,19,4,18,7.0,19.0,8.0,18.0,1


combining 2 dataframes


[   Name  Age  Name  Age  Name   Age  Name   Age  state
 0     1   20     2   21   5.0  20.0   6.0  21.0      1
 0     3   19     4   18   7.0  19.0   8.0  18.0      1,
    Name  Age  Name  Age  Name   Age  Name   Age  state
 0    11   20    22   21  55.0  20.0  66.0  21.0      2
 0    33   19    44   18  77.0  19.0  88.0  18.0      2]

Let's see how each feature correlates with the state variable.

In [78]:
corr_matrix = trainSet.dataMerged.corr()
corr_matrix["state"]

 # hands           0.830492
 Position X        0.077259
 Position Y        0.089554
 Position Z        0.047028
 Velocity X       -0.007780
 Velocity Y        0.008675
 Velocity Z        0.003729
 Pitch             0.257066
 Roll             -0.023557
 Yaw              -0.162749
 Wrist Pos X       0.110207
 Wrist Pos Y      -0.058052
 Wrist Pos Z      -0.137829
 Elbow pos X       0.108218
 Elbow Pos Y      -0.252712
 Elbow Pos Z      -0.399522
 Grab Strenth     -0.080044
 Grab Angle       -0.219273
 Pinch Strength   -0.137151
state              1.000000
Name: state, dtype: float64

In [79]:
jumps = 15
combine = 5
skip = 1
label = 'state'
# NOTE: this rebinds the module-level `drop_indecies` that TransformData reads;
# re-running the loading cells after this cell would drop different columns.
drop_indecies = range(1,trainSet.dataMerged.shape[1]-4)

trainSet.dataMerged.shape[1]
MyCombiner = DataToNP(label_index = label, skips=skip, jumps=jumps, combine=combine,
    drop_indecies=drop_indecies, merge_indecies = [0,16,17,18])

# Optional dimension check (if the shapes agree, the combiner works):
# MyCombiner.transform(trainSet.dataRaw[4]).shape
# (trainSet.dataRaw[4].shape[1] - 1) * combine * 2
# rows_afterJoining = (trainSet.dataRaw[4].shape[0]/2)
# import math
# int(int(int(rows_afterJoining/jumps )/combine) * math.ceil(jumps / skip))

# `training` was never defined in this notebook; the loaded set is `trainSet`.
w = MyCombiner.transform(trainSet.dataRaw[3])
w

20

Unnamed: 0,# hands,Grab Angle,Grab Strenth,Pinch Strength,state
0,0.8,0.805184,0.072362,0.134520,1
1,0.8,0.411345,0.042449,0.003385,1
2,1.0,0.945570,0.100084,0.184384,1
3,1.0,1.120967,0.209261,0.312322,1
4,1.0,1.598069,0.362430,0.355919,1
...,...,...,...,...,...
49,1.0,0.770665,0.068845,0.028501,1
50,1.0,1.011708,0.100000,0.221990,1
51,1.0,1.008322,0.050274,0.307427,1
52,1.0,1.451354,0.308101,0.337846,1


Looks good, so the combiner works as expected!

There should be no problems with multiple dataframes, as it just applies the same function to each and combines the end results.

In [80]:
# Run the combiner over every recording. Despite their names, train_df and test_df
# are lists of DataFrames (one per recording); the *_merged frames stack them row-wise.
train_df = MyCombiner.transform_arr(trainSet.dataRaw)
test_df = MyCombiner.transform_arr(testSet.dataRaw)
train_merged = pd.concat(train_df, axis=0, sort=False) 
test_merged = pd.concat(test_df, axis=0, sort=False) 


combining 27 dataframes
combining 9 dataframes


In [81]:
train_merged.shape
test_merged.shape

# Column count of the combined training frame (feature columns plus "state").
m = train_merged.shape[1]

(19527, 5)

(5686, 5)

In [82]:
corr_matrix = train_merged.corr()
corr_matrix['state']

corr_matrix = test_merged.corr()
corr_matrix['state']

 # hands           0.843713
 Grab Angle       -0.378694
 Grab Strenth     -0.168940
 Pinch Strength   -0.266196
state              1.000000
Name: state, dtype: float64

 # hands           0.839211
 Grab Angle       -0.479526
 Grab Strenth     -0.301672
 Pinch Strength   -0.382238
state              1.000000
Name: state, dtype: float64

Indeed, we can see that the training and test data look alike!

In [101]:
def dataframeToXY(df):
    """Split a combined frame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'state'`` column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a NumPy array of every column except
        ``'state'`` and ``y`` is the ``'state'`` column as a NumPy array.
    """
    # Select the features from the frame itself rather than through a global
    # column count set in another cell, so the function works for any frame and
    # the label can never end up in X.
    X = df.drop('state', axis=1).to_numpy()
    y = df['state'].to_numpy()
    return (X, y)

In [103]:
# Feature matrix and label vector for the whole (still unshuffled) training set.
X_train_full, y_train_unshuffled = dataframeToXY(train_merged)

# Spot-check: every 500th row of the arrays should match the same rows of the frame.
print(X_train_full[::500][:5], y_train_unshuffled[::500][:5])
train_merged.iloc[::500][:5]

[[1.00000000e+00 1.11863117e+00 1.85557819e-01 1.16387910e-02]
 [1.00000000e+00 6.79856350e-01 1.34175800e-03 3.31147600e-02]
 [1.00000000e+00 1.78942120e-01 0.00000000e+00 0.00000000e+00]
 [1.00000000e+00 5.14093991e-02 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 2.10833270e+00 2.82609220e-01 3.91836210e-01]] [1 1 2 2 0]


Unnamed: 0,# hands,Grab Angle,Grab Strenth,Pinch Strength,state
0,1.0,1.118631,0.185558,0.011639,1
14,1.0,0.679856,0.001342,0.033115,1
28,1.0,0.178942,0.0,0.0,2
42,1.0,0.051409,0.0,0.0,2
4,0.0,2.108333,0.282609,0.391836,0


Let's create a validation set!

In [104]:
def split(X, y, size = 0.8):
    """Cut ``X`` and ``y`` at the same position.

    The first ``int(len(X) * size)`` items form the leading part and the rest
    form the trailing part.

    Returns
    -------
    tuple
        ``(X_head, y_head, X_tail, y_tail)``.
    """
    cut = int(len(X) * size)
    head = slice(None, cut)
    tail = slice(cut, None)
    return (X[head], y[head], X[tail], y[tail])

In [96]:
np.random.seed(42)

# One permutation shared by features and labels keeps the rows aligned.
shuffle_indecies = np.random.permutation(len(X_train_full))
train_size = int(len(X_train_full) * 0.8)

# Apply the permutation once per array, instead of re-indexing the full array
# for every slice, then cut at the 80% mark.
X_shuffled = X_train_full[shuffle_indecies]
y_shuffled = y_train_unshuffled[shuffle_indecies]

X_train, X_validate = X_shuffled[:train_size], X_shuffled[train_size:]
y_train, y_validate = y_shuffled[:train_size], y_shuffled[train_size:]

assert len(X_train) + len(X_validate) == len(X_train_full)
assert len(X_train) == len(y_train) and len(X_validate) == len(y_validate)

In [None]:
np.random.seed(42)

# NOTE(review): this call was `len()` with no argument, which raises TypeError.
# The training array is assumed to be the intended argument; with the same seed
# this reproduces the permutation used for the train/validation split — confirm.
shuffle_indecies = np.random.permutation(len(X_train_full))

In [99]:
X_train[:5]
y_train[:5]
train_merged.iloc[shuffle_indecies][:5]

array([[1.        , 0.87018249, 0.1       , 0.21372421],
       [1.        , 0.68609325, 0.1       , 0.14866803],
       [0.        , 2.27305884, 0.5850421 , 0.51026192],
       [0.5       , 1.03578963, 0.14153682, 0.03012584],
       [1.        , 0.0761939 , 0.        , 0.        ]])

array([1, 1, 0, 2, 1], dtype=int64)

Unnamed: 0,# hands,Grab Angle,Grab Strenth,Pinch Strength,state
18,1.0,0.870182,0.1,0.213724,1
5,1.0,0.686093,0.1,0.148668,1
34,0.0,2.273059,0.585042,0.510262,0
26,0.5,1.03579,0.141537,0.030126,2
4,1.0,0.076194,0.0,0.0,1


Nice, we've got a training set, a validation set, and a test set.

Now we can test some models.

In [100]:
from sklearn.linear_model import LinearRegression

# Baseline model. NOTE(review): "state" is a class label (0/1/2), so a classifier
# may be a better fit than a regression — confirm the intent.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# score() needs both the features and the true targets.
lin_reg.score(X_train, y_train)
lin_reg.score(X_validate, y_validate)

# The held-out arrays were never built, and score() was called without targets;
# derive both from the combined test frame first.
X_test, y_test = dataframeToXY(test_merged)
lin_reg.score(X_test, y_test)

LinearRegression()

0.7584489032245849

0.7555033787575105