In [61]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    #print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.utils import io

with io.capture_output() as captured:
    print('no prints in here')

import time
import pandas as pd

In [62]:
Alone = 0
Spontan = 1
Sync = 2

First lets load the right hand solo,
and we would actually from the get go merge it to the solo hands

In [63]:
HandRight = pd.read_csv("Unity Data\HandRight.csv")
#HandRight.head(10)

In [64]:
drop_indecies = [0,1,2] # notice we are droping hands count because this is cheating!

In [65]:
HandRight = HandRight.drop(HandRight.columns[drop_indecies], axis=1)
HandRight = HandRight.iloc[500:4000]

In [66]:
HandRight.head()
#HandRight.info()

Unnamed: 0,# hands,Position X,Position Y,Position Z,Velocity X,Velocity Y,Velocity Z,Pitch,Roll,Yaw,Wrist Pos X,Wrist Pos Y,Wrist Pos Z,Elbow pos X,Elbow Pos Y,Elbow Pos Z,Grab Strenth,Grab Angle,Pinch Strength
500,1,1.854658,163.3199,12.51509,67.3389,-6.28195,101.4422,-0.155174,0.078463,-0.764891,59.46534,173.0233,60.61472,254.6348,127.2725,240.6239,0.0,0.401652,0.0
501,1,2.990307,163.0106,14.22045,63.82683,-27.71448,91.91269,-0.166827,0.079468,-0.760001,60.39677,173.5084,62.45371,257.0284,134.3396,242.4251,0.0,0.405177,0.0
502,1,4.197042,162.9949,15.69018,69.52037,12.69234,67.46247,-0.172697,0.079392,-0.758363,61.55299,174.0561,63.92721,259.6933,142.5166,243.7431,0.0,0.378955,0.0
503,1,5.585192,163.327,17.14896,81.45722,20.71213,85.95974,-0.162573,0.073489,-0.754197,62.87757,173.9819,65.59348,262.3326,143.5807,244.1491,0.0,0.341598,0.0
504,1,6.809369,163.075,18.31746,64.54992,-20.25474,60.26554,-0.162684,0.0714,-0.747182,63.8033,173.8077,67.11089,263.0048,144.4967,246.1332,0.0,0.299372,0.0


In [67]:
k = pd.concat([HandRight.iloc[::4,:].reset_index(drop=True), HandRight.iloc[1::4,:].reset_index(drop=True)], axis=1)
k.shape


(875, 38)

In [68]:
#HandRight.iloc[1:5*3+1:3,:]

In [69]:
def TransformData(df, type):
    df = df.drop(df.columns[drop_indecies], axis=1)
    if(type == Alone):
        df = df.iloc[500:4000]
        maxX = df.shape[0]
        combine = np.hstack([HandRight[:maxX].values, df.values]).reshape(-1, df.shape[1])
        df = pd.DataFrame(combine, columns=df.columns)
    else:
        df = df.iloc[1000:9000]
        maxX = int(df.shape[0] / 2) * 2
        df = df.iloc[:maxX]
    df[df.columns[0]] -= 1
    df.columns = df.columns.str.replace('#','')
    df.columns = df.columns.str.lstrip()
    df.columns = df.columns.str.rstrip()
    df.columns = df.columns.str.replace(' ', '_')
    df["state"] = type
    return df

In [70]:
import glob

class DataLoader:
    def __init__(self, path):
        li = []
        all_files = glob.glob(path + "/*")
        v = 0
        for folder in all_files:
            print("\nloading in" ,folder, ':')
            files = glob.glob(folder + "/*.csv")
            for filename in files:
                df = pd.read_csv(filename, index_col=None, header=0)
                type = None
                if("Alone" in filename):
                    type = Alone
                elif("Sync" in filename):
                    type = Sync
                elif("Spontan" in filename):
                    type = Spontan
                df = TransformData(df, type)
                v += df.shape[0]
                li.append(df)
                print('loaded ', filename, type)
        self.dataRaw = li
        self.dataMerged = pd.concat(li, axis=0, sort=False)
        print(self.dataMerged.shape, v)

In [71]:
trainSet = DataLoader('Unity Data\Training')
testSet = DataLoader('Unity Data\Validation')


loading in Unity Data\Training\Evyatar Cohen :
loaded  Unity Data\Training\Evyatar Cohen\Evyatar636771052727603804Spontan.csv 1
loaded  Unity Data\Training\Evyatar Cohen\Evyatar636771053639929594Sync.csv 2
loaded  Unity Data\Training\Evyatar Cohen\Evyatar636771054555711409Alone.csv 0

loading in Unity Data\Training\Nofar Social_Nuero :
loaded  Unity Data\Training\Nofar Social_Nuero\Nofar636759795182793299Spontan.csv 1
loaded  Unity Data\Training\Nofar Social_Nuero\Nofar636759796290435160Alone.csv 0
loadedUnity Data\Training\Nofar Social_Nuero\Nofar636759797397919664Sync.csv 2

loading in Unity Data\Training\Oriya Social_Nuero :
loaded  Unity Data\Training\Oriya Social_Nuero\Oriya636759804404113837Spontan.csv 1
loaded  Unity Data\Training\Oriya Social_Nuero\Oriya636759805268396661Alone.csv 0
loaded  Unity Data\Training\Oriya Social_Nuero\Oriya636759806131350399Sync.csv 2

loading in Unity Data\Training\Orya Kalmanovitz :
loaded  Unity Data\Training\Orya Kalmanovitz\OryaB636771082736601

In [72]:
data = pd.DataFrame
data = trainSet.dataMerged

dataRaw is all the daraframes that we have already setup for us, now what we need is to generate from all of this,

our actuall training data

In [73]:
k = trainSet.dataRaw[4]
w = k.iloc[:,[1,2,3]]
k = k.drop(k.columns[[1,2,3]], axis=1)
k.head()
k = pd.concat([w.reset_index(drop=True), k.reset_index(drop=True)], axis=1)
k

Unnamed: 0,hands,Velocity_X,Velocity_Y,Velocity_Z,Pitch,Roll,Yaw,Wrist_Pos_X,Wrist_Pos_Y,Wrist_Pos_Z,Elbow_pos_X,Elbow_Pos_Y,Elbow_Pos_Z,Grab_Strenth,Grab_Angle,Pinch_Strength,state
0,0.0,67.3389,-6.28195,101.4422,-0.155174,0.078463,-0.764891,59.46534,173.0233,60.61472,254.6348,127.2725,240.6239,0.0,0.401652,0.0,0
1,0.0,49.71883,-341.2516,-14.95178,1.199416,0.119953,1.519325,-92.19111,218.0329,12.57041,-309.3688,85.11283,-24.33749,0.0,0.603028,0.0,0
2,0.0,63.82683,-27.71448,91.91269,-0.166827,0.079468,-0.760001,60.39677,173.5084,62.45371,257.0284,134.3396,242.4251,0.0,0.405177,0.0,0
3,0.0,36.02574,-309.4118,0.574942,1.089347,0.092826,1.515014,-91.69262,214.4963,13.19102,-314.42,89.30956,-17.0896,0.0,0.313147,0.0,0
4,0.0,69.52037,12.69234,67.46247,-0.172697,0.079392,-0.758363,61.55299,174.0561,63.92721,259.6933,142.5166,243.7431,0.0,0.378955,0.0,0


Unnamed: 0,Position_X,Position_Y,Position_Z,hands,Velocity_X,Velocity_Y,Velocity_Z,Pitch,Roll,Yaw,Wrist_Pos_X,Wrist_Pos_Y,Wrist_Pos_Z,Elbow_pos_X,Elbow_Pos_Y,Elbow_Pos_Z,Grab_Strenth,Grab_Angle,Pinch_Strength,state
0,1.854658,163.3199,12.51509,0.0,67.33890,-6.28195,101.442200,-0.155174,0.078463,-0.764891,59.46534,173.0233,60.61472,254.6348,127.27250,240.62390,0.000000,0.401652,0.000000,0
1,-20.808590,226.0860,17.08860,0.0,49.71883,-341.25160,-14.951780,1.199416,0.119953,1.519325,-92.19111,218.0329,12.57041,-309.3688,85.11283,-24.33749,0.000000,0.603028,0.000000,0
2,2.990307,163.0106,14.22045,0.0,63.82683,-27.71448,91.912690,-0.166827,0.079468,-0.760001,60.39677,173.5084,62.45371,257.0284,134.33960,242.42510,0.000000,0.405177,0.000000,0
3,-20.061710,220.5413,17.05443,0.0,36.02574,-309.41180,0.574942,1.089347,0.092826,1.515014,-91.69262,214.4963,13.19102,-314.4200,89.30956,-17.08960,0.000000,0.313147,0.000000,0
4,4.197042,162.9949,15.69018,0.0,69.52037,12.69234,67.462470,-0.172697,0.079392,-0.758363,61.55299,174.0561,63.92721,259.6933,142.51660,243.74310,0.000000,0.378955,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,-73.616340,242.2852,69.19761,0.0,-188.33350,-50.49825,29.529870,1.857212,0.791704,1.893267,-119.80380,195.0393,45.57323,-299.4263,41.95221,-32.44111,0.163370,1.244664,0.201331,0
6996,3.230397,169.1729,-4.81981,0.0,224.79620,-538.24620,-463.401900,0.351476,0.213730,-0.117169,22.59152,148.0959,66.18939,158.3324,41.64271,276.20630,0.000000,0.514317,0.000000,0
6997,-75.451450,242.0424,69.58482,0.0,-215.89570,-28.57200,45.553780,1.852507,0.801023,1.897196,-120.83340,194.0461,45.91526,-299.5178,40.43322,-33.21206,0.236677,1.400911,0.336688,0
6998,6.434268,161.4118,-12.09352,0.0,194.23230,-373.04310,-408.729000,0.309624,0.220371,-0.108358,25.10394,143.9342,60.14394,156.3949,60.15020,282.87230,0.000000,0.383218,0.000000,0


Lets generate our np_array so first of all lets create a new class,

class would do the following it would get a settings : colloums it should drop, 

(jumps in time, and number of frames per row, andoverall jumps on the dataset),

In [74]:
# k = training.dataRaw[4].iloc[1:2*2+1:2,:].to_numpy()
# w = training.dataRaw[4].iloc[:2*2:2,:].to_numpy()
# k
# w

# np.concatenate((k, w), axis=0)

#training.dataRaw[4]['state'][0]

w1 =  trainSet.dataRaw[4].iloc[::5,[7]][:2]
w2 = trainSet.dataRaw[4].iloc[1::5,[7]][:2]

w1.columns
w1
w2
w3 = pd.concat([w1.reset_index(drop=True), w2.reset_index(drop=True)], axis = 1)
w3.groupby(lambda x:x, axis=1).mean()

#training.dataRaw[4].drop('state', axis=1)

Index(['Pitch'], dtype='object')

Unnamed: 0,Pitch
0,-0.155174
5,0.913572


Unnamed: 0,Pitch
1,1.199416
6,-0.162573


Unnamed: 0,Pitch
0,0.522121
1,0.375499


In [75]:
class DataToNP:
    def __init__(self, label_index, jumps, combine = 4, skips = 1, drop_indecies = [], merge_indecies = [], merge_seperate = []):
        self.drop_indecies = drop_indecies
        self.label_index = label_index
        self.jumps = jumps
        self.combine = combine
        self.merge_indecies = merge_indecies
        self.merge_seperate = merge_seperate
        if(skips < 0):
            skips = skips % self.jumps
        self.skips = skips

    def transform(self, data, skips = -1):
        if(skips < 0):
            skips = self.skips
        state = data[self.label_index].iloc[0]
        if(len(self.drop_indecies) > 0):
            current = data.drop(data.columns[self.drop_indecies], axis=1)
            current = current.drop(self.label_index, axis=1)
        else:
            current = data.drop(self.label_index, axis=1)
        

        mi = data.iloc[:, self.merge_indecies]
        msi = data.iloc[:, self.merge_seperate]
        mi = pd.concat([mi.iloc[::2,:].reset_index(drop=True), mi.iloc[1::2,:].reset_index(drop=True)], axis=1)

        msi1 = msi.iloc[::2,:].reset_index(drop=True)
        msi2 = msi.iloc[1::2,:].reset_index(drop=True)

        msi1.columns += "_r"
        msi2.columns += "_l"

        _drop = list(self.merge_indecies) + list(self.merge_seperate)
        current = current.drop(data.columns[_drop], axis=1)

        current_r = current.iloc[::2,:].reset_index(drop=True)
        current_l = current.iloc[1::2,:].reset_index(drop=True)


        current_r.columns = current_r.columns + "_r"
        current_l.columns = current_l.columns + "_l"
        current = pd.concat([current_r, current_l], axis=1)
        li = []
        if(skips < 1):
            skips = 1
        for i in range(0, self.jumps, skips):
            df = current.iloc[i::self.jumps,:]
            comb = []
            mis = []

            msis1 = []
            msis2 = []
            for j in range(self.combine):
                sample = df.iloc[j::self.combine]
                comb.append(sample.reset_index(drop=True))

                mi_sample = mi.iloc[i::self.jumps,:].iloc[j::self.combine]
                mis.append(mi_sample.reset_index(drop=True))

                msi_sample1 = msi1.iloc[i::self.jumps,:].iloc[j::self.combine]
                msi_sample2 = msi2.iloc[i::self.jumps,:].iloc[j::self.combine]
                       
                msis1.append(msi_sample1.reset_index(drop=True))
                msis2.append(msi_sample2.reset_index(drop=True))
            
            vmi = pd.concat(mis, axis=1).groupby(lambda x:x, axis=1).mean()
            vmsi1 = pd.concat(msis1, axis=1).groupby(lambda x:x, axis=1).mean()
            vmsi2 = pd.concat(msis2, axis=1).groupby(lambda x:x, axis=1).mean()

            vmsi = pd.concat([vmsi1.reset_index(drop=True), vmsi2.reset_index(drop=True)], axis=1)
            #comb.append(mi.iloc[i::self.jumps,:][::self.combine].reset_index(drop=True))
            comb.append(vmi)
            comb.append(vmsi)
            df = pd.concat(comb, axis=1)
            #df['state'] = type
            li.append(df.dropna())
        df = pd.concat(li, axis=0, sort=False) 
        df['state'] = state     
        return df
    
    def transform_arr(self, df_arr, skips = -1):
        arr = []
        print("combining", len(df_arr), "dataframes")
        for df in df_arr:
            arr.append(self.transform(df, skips))
            #X = np.concatenate((X, t[0]), axis=0)
            #y = np.concatenate((y, t[1]), axis=0)
        return arr
        
            

Small test seeing that the class actually works!

it does so cool,

we want to combine every 2 rows ( right and left hand), then we want,

to sample with jumps ( here its 2 ), and we will combine every 2 jumps.

also if we got a line that has missing properties a.k.a we combined a line with a non existent one,

we will drop that line

In [76]:
test_data = {'Name':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'Age':[20, 21, 19, 18, 20, 21, 19, 18, 17, 16],
        'label':[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
test_data2 = {'Name':[11, 22, 33, 44, 55, 66, 77, 88, 99, 1010],
        'Age':[20, 21, 19, 18, 20, 21, 19, 18, 17, 16],
        'label':[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
 
test_df = pd.DataFrame(test_data)
test_df2 = pd.DataFrame(test_data2)
li = [test_df, test_df2]

# Create DataFrame

test_df
combinerTest = DataToNP('label', 2, combine=2)
combinerTest.transform(test_df)

combinerTest.transform_arr(li)

Unnamed: 0,Name,Age,label
0,1,20,1
1,2,21,1
2,3,19,1
3,4,18,1
4,5,20,1
5,6,21,1
6,7,19,1
7,8,18,1
8,9,17,1
9,10,16,1


Unnamed: 0,Name_r,Age_r,Name_l,Age_l,Name_r.1,Age_r.1,Name_l.1,Age_l.1,state
0,1,20,2,21,5.0,20.0,6.0,21.0,1
0,3,19,4,18,7.0,19.0,8.0,18.0,1


combining 2 dataframes


[   Name_r  Age_r  Name_l  Age_l  Name_r  Age_r  Name_l  Age_l  state
 0       1     20       2     21     5.0   20.0     6.0   21.0      1
 0       3     19       4     18     7.0   19.0     8.0   18.0      1,
    Name_r  Age_r  Name_l  Age_l  Name_r  Age_r  Name_l  Age_l  state
 0      11     20      22     21    55.0   20.0    66.0   21.0      2
 0      33     19      44     18    77.0   19.0    88.0   18.0      2]

Lets see what is the corroletion to the state variable

In [77]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

def between(corr, min, max):
    first = (abs(corr["state"]) > min)
    second = (abs(corr["state"]) < max)
    first = [i for i, x in enumerate(first) if x]
    second = [i for i, x in enumerate(second) if x]
    return intersection(first, second)

In [78]:
corr_matrix = trainSet.dataMerged.corr()
corr_matrix["state"].sort_values()

import math
delete_arr_f = (abs(corr_matrix["state"]) <= 0.05)
delete_arr_f = [i for i, x in enumerate(delete_arr_f) if x]
delete_arr_f.remove(5)

merge_seperate = between(corr_matrix, 0.05, 0.2)

#merge_seperate.remove(0)

trainSet.dataMerged.columns

merge = list(range(0, trainSet.dataMerged.shape[1] - 1))
save = []

#trainSet.dataMerged.columns[merge_seperate]
#trainSet.dataMerged.columns[drop_i]

merge = np.setdiff1d(merge, merge_seperate)
merge = np.setdiff1d(merge, save)
merge = np.setdiff1d(merge, delete_arr_f)

print("\nremove : \n",  trainSet.dataMerged.columns[delete_arr_f])
print("\n\nmerged : \n",  trainSet.dataMerged.columns[merge])
print("\n\nmerged with RL : \n",  trainSet.dataMerged.columns[merge_seperate])
#print(merge, merge_seperate, delete_arr_f)

Elbow_Pos_Z      -0.399522
Elbow_Pos_Y      -0.252712
Grab_Angle       -0.219273
Yaw              -0.162749
Wrist_Pos_Z      -0.137829
Pinch_Strength   -0.137151
Grab_Strenth     -0.080044
Wrist_Pos_Y      -0.058052
Roll             -0.023557
Velocity_X       -0.007780
Velocity_Z        0.003729
Velocity_Y        0.008675
Position_Z        0.047028
Position_X        0.077259
Position_Y        0.089554
Elbow_pos_X       0.108218
Wrist_Pos_X       0.110207
Pitch             0.257066
hands             0.830492
state             1.000000
Name: state, dtype: float64

Index(['hands', 'Position_X', 'Position_Y', 'Position_Z', 'Velocity_X',
       'Velocity_Y', 'Velocity_Z', 'Pitch', 'Roll', 'Yaw', 'Wrist_Pos_X',
       'Wrist_Pos_Y', 'Wrist_Pos_Z', 'Elbow_pos_X', 'Elbow_Pos_Y',
       'Elbow_Pos_Z', 'Grab_Strenth', 'Grab_Angle', 'Pinch_Strength', 'state'],
      dtype='object')


remove : 
 Index(['Position_Z', 'Velocity_X', 'Velocity_Z', 'Roll'], dtype='object')


merged : 
 Index(['hands', 'Velocity_Y', 'Pitch', 'Elbow_Pos_Y', 'Elbow_Pos_Z',
       'Grab_Angle'],
      dtype='object')


merged with RL : 
 Index(['Position_X', 'Position_Y', 'Yaw', 'Wrist_Pos_X', 'Wrist_Pos_Y',
       'Wrist_Pos_Z', 'Elbow_pos_X', 'Grab_Strenth', 'Pinch_Strength'],
      dtype='object')


Lets remove positions, velocities, and roll and wrist pos Y, and grabStrength,

they give to little effect and would probably hurt.

In [79]:
jumps = 20
combine = 5
skip = 1
label = 'state'

features =  trainSet.dataMerged.shape[1]
# MyCombiner = DataToNP(label_index = label, skips=skip, jumps=jumps, combine=combine, 
#     drop_indecies=[], merge_indecies = [0,16,17,18])

MyCombiner = DataToNP(label_index = label, skips=skip, jumps=jumps, combine=combine, 
    drop_indecies = delete_arr_f, merge_indecies = merge, merge_seperate = merge_seperate)

# test if we get the same dementions its working

# MyCombiner.transform(training.dataRaw[4]).shape
# (training.dataRaw[4].shape[1] - 1) * combine * 2
# rows_afterJoining = (training.dataRaw[4].shape[0]/2) 
# import math
# int(int(int(rows_afterJoining/jumps )/combine) * math.ceil(jumps / skip))

w = MyCombiner.transform(trainSet.dataRaw[3])
w

#training.dataRaw[3].columns

Unnamed: 0,Elbow_Pos_Y,Elbow_Pos_Z,Grab_Angle,Pitch,Velocity_Y,hands,Elbow_pos_X_r,Grab_Strenth_r,Pinch_Strength_r,Position_X_r,...,Elbow_pos_X_l,Grab_Strenth_l,Pinch_Strength_l,Position_X_l,Position_Y_l,Wrist_Pos_X_l,Wrist_Pos_Y_l,Wrist_Pos_Z_l,Yaw_l,state
0,39.823218,8.821516,0.558606,1.951806,-9.772053,0.6,47.492740,0.000000,0.051396,20.320220,...,-349.700680,0.000000,0.252371,-164.143060,254.764200,-209.794440,222.678120,24.159870,2.246202,1
1,42.504997,70.235662,0.620478,1.401055,-24.130205,1.0,-363.819520,0.000000,0.177196,-148.724740,...,360.315440,0.035093,0.000000,171.782640,247.259560,206.349480,193.630740,82.827944,-0.637905,1
2,47.573770,63.465303,1.211767,1.654547,13.360834,1.0,-385.832580,0.552808,0.682555,-145.205740,...,379.277820,0.000000,0.000000,168.240520,245.700880,211.307960,194.363520,82.502964,-0.861536,1
3,51.279769,71.299116,1.354158,0.926535,20.784507,1.0,-380.944940,0.200000,0.586907,-150.382880,...,387.781200,0.195183,0.000000,176.754660,250.525160,213.734560,200.235760,96.436426,-0.574027,1
4,48.321454,85.919137,0.707550,0.772073,-15.139557,1.0,-380.750100,0.093348,0.276380,-163.980380,...,338.547940,0.000000,0.000000,133.113600,218.211660,172.939080,180.457340,93.681730,-0.560945,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35,55.999232,96.102148,0.963165,0.823711,-67.256190,1.0,-359.389480,0.400000,0.260173,-80.093830,...,391.543060,0.000000,0.000000,138.083800,195.102840,185.969340,149.850020,116.670180,-1.209123,1
36,19.082920,76.799393,1.216995,1.454497,63.846141,1.0,-370.458800,0.385013,0.481766,-83.081300,...,362.558720,0.167032,0.000000,149.230240,217.958280,192.978400,160.187960,94.994286,-1.456968,1
37,-13.134776,45.806365,1.032783,1.251151,16.944638,1.0,-359.675760,0.200000,0.373226,-92.883898,...,335.888720,0.038420,0.000000,146.824220,213.155840,191.488220,156.176920,79.009750,-1.469253,1
38,-2.426512,57.593913,1.131793,1.404055,48.849163,1.0,-350.172140,0.310415,0.451635,-95.276534,...,343.700480,0.000000,0.000000,160.676680,230.442380,201.862680,171.227240,80.999386,-1.396328,1


Loots good so the combiner works as expected!

there should be no problems with multiple dataFrames as it just doing the same function and combining the end result

In [80]:
train_df = MyCombiner.transform_arr(trainSet.dataRaw)
test_df = MyCombiner.transform_arr(testSet.dataRaw)
train_merged = pd.concat(train_df, axis=0, sort=False) 
test_merged = pd.concat(test_df, axis=0, sort=False) 
#X_train_df[0]
#MyCombiner.transform(training.dataRaw[0])


combining 27 dataframes
combining 9 dataframes


In [81]:
train_merged.shape
test_merged.shape

m = train_merged.shape[1]

(19367, 25)

(5680, 25)

In [82]:
corr_matrix = train_merged.corr()
corr_matrix['state'].sort_values()

corr_matrix2 = test_merged.corr()
corr_matrix2['state'].sort_values()

corr_matrix2['state'] - corr_matrix['state']

Elbow_Pos_Z        -0.737063
Wrist_Pos_Z_r      -0.454723
Elbow_Pos_Y        -0.422112
Grab_Angle         -0.392061
Pinch_Strength_l   -0.267390
Yaw_l              -0.240990
Grab_Strenth_l     -0.192406
Wrist_Pos_Y_l      -0.150854
Yaw_r              -0.133663
Pinch_Strength_r   -0.102741
Grab_Strenth_r     -0.058546
Elbow_pos_X_r      -0.032033
Wrist_Pos_Y_r      -0.004474
Position_X_r        0.035015
Wrist_Pos_X_r       0.035316
Velocity_Y          0.038036
Position_Y_l        0.065449
Wrist_Pos_Z_l       0.109200
Position_X_l        0.138378
Wrist_Pos_X_l       0.209902
Position_Y_r        0.219273
Elbow_pos_X_l       0.267262
Pitch               0.473682
hands               0.845177
state               1.000000
Name: state, dtype: float64

Elbow_Pos_Z        -0.666839
Grab_Angle         -0.495884
Elbow_Pos_Y        -0.460887
Pinch_Strength_l   -0.434554
Wrist_Pos_Z_r      -0.432349
Yaw_l              -0.419605
Grab_Strenth_l     -0.388303
Wrist_Pos_Y_l      -0.364634
Position_Y_l       -0.198770
Pinch_Strength_r   -0.117654
Yaw_r              -0.110416
Elbow_pos_X_r      -0.075414
Wrist_Pos_Z_l      -0.057360
Grab_Strenth_r     -0.056759
Wrist_Pos_Y_r      -0.045704
Wrist_Pos_X_r       0.025776
Position_X_l        0.030806
Position_X_r        0.037284
Velocity_Y          0.058195
Wrist_Pos_X_l       0.148226
Position_Y_r        0.197893
Elbow_pos_X_l       0.262397
Pitch               0.510126
hands               0.841148
state               1.000000
Name: state, dtype: float64

Elbow_Pos_Y        -0.038775
Elbow_Pos_Z         0.070224
Grab_Angle         -0.103822
Pitch               0.036445
Velocity_Y          0.020159
hands              -0.004028
Elbow_pos_X_r      -0.043381
Grab_Strenth_r      0.001786
Pinch_Strength_r   -0.014913
Position_X_r        0.002270
Position_Y_r       -0.021381
Wrist_Pos_X_r      -0.009539
Wrist_Pos_Y_r      -0.041230
Wrist_Pos_Z_r       0.022374
Yaw_r               0.023247
Elbow_pos_X_l      -0.004865
Grab_Strenth_l     -0.195897
Pinch_Strength_l   -0.167164
Position_X_l       -0.107572
Position_Y_l       -0.264219
Wrist_Pos_X_l      -0.061676
Wrist_Pos_Y_l      -0.213780
Wrist_Pos_Z_l      -0.166560
Yaw_l              -0.178615
state               0.000000
Name: state, dtype: float64

With this setup we can see that most of the features we got are very much relevant!

sadly we can see that there is quite a difference between the test, and validation set.

In [83]:
def dataframeToXY(df):
    X = df.iloc[:,range(0,m - 1)].to_numpy()
    y = df['state'].to_numpy()
    return (X, y)

In [84]:
X_train_full, y_train_unshuffled = dataframeToXY(train_merged)

#print(X_train_full[::500][:5], y_train_unshuffled[::500][:5])
#train_merged.iloc[::500][:5]

Lets create a validation set!

In [85]:
def split(X, y, size = 0.8):
    m = int(len(X) * size)
    return (X[:m], y[:m], X[m:], y[m:])

In [86]:
np.random.seed(42)

shuffle_indecies = np.random.permutation(len(X_train_full))

X_train_full_s = X_train_full[shuffle_indecies]
y_train_full = y_train_unshuffled[shuffle_indecies]

X_train, y_train, X_validate, y_validate = split(X_train_full[shuffle_indecies], y_train_unshuffled[shuffle_indecies], 0.8)

X_test, y_test = dataframeToXY(test_merged)

shuffle_indecies_t = np.random.permutation(len(X_test))
X_test = X_test[shuffle_indecies_t]
y_test = y_test[shuffle_indecies_t]

In [87]:
print(len(X_train_full_s), len(X_train), len(X_validate), len(X_test))

19367 15493 3874 5680


In [88]:
#X_train[:5]
#y_train[:5]
#train_merged.iloc[shuffle_indecies][:5]

Nice we've got a training set, validation set, and test set

now we can test some models

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

log_reg = Pipeline([
    ('std_scaler', StandardScaler()),
    ('pca', PCA(0.999)),
    ('lin', LogisticRegression())
])

from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(log_reg, X_train, y_train, cv=5)
svm_scores.mean()

0.925644083109756

LogisticRegression() having a score of 92% on cross validation

In [90]:
log_reg.fit(X_train_full_s, y_train_full)

#log_reg.score(X_train, y_train)
log_reg.score(X_validate, y_validate)
log_reg.score(X_test, y_test)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca', PCA(n_components=0.999)),
                ('lin', LogisticRegression())])

0.9220443985544656

0.8985915492957747

almost 90% accuracity on the test set that's impresive!

Lets test the SVC model now

In [91]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svm_model = Pipeline([
    ('std_scaler', StandardScaler()),
    ('pca', PCA(0.999)),
    ('svm', SVC(probability=True))
])

In [92]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_model, X_train, y_train, cv=5)
svm_scores.mean()

0.9941909247886249

we got a real good score with the svm model on the cross_validtion 98%!

with suggest that the SVM model doing very well on the training set, and does not significantly overfitting,

as it performs well on different folds

In [93]:
svm_model.fit(X_train_full_s, y_train_full)

svm_model.score(X_train, y_train)
svm_model.score(X_validate, y_validate)

svm_model.score(X_test, y_test)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca', PCA(n_components=0.999)),
                ('svm', SVC(probability=True))])

0.9967727360743561

0.9976768198244709

0.9373239436619718

So we get a score of 93.5%+ on the test set, that very good,

but we have about 6% drop from the training set, with suggest the testing data is somehow different

In [94]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

tree = Pipeline([
    #('PCA', PCA(0.99)),
    ('std_scaler', StandardScaler()),
    ('tree', DecisionTreeClassifier(max_depth=4)),
])

svm_scores = cross_val_score(tree, X_train, y_train, cv=5)
svm_scores.mean()

0.8763967676530321

In [95]:
tree.fit(X_train, y_train)

tree.score(X_train, y_train)
tree.score(X_validate, y_validate)

tree.score(X_test, y_test)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('tree', DecisionTreeClassifier(max_depth=4))])

0.8792357839024075

0.8673205988642231

0.7813380281690141

Same stuff happens here the score for the training data is significantly better then,

the score on the testing, again that support the idea that the testing data somehow different.

now i will test many different models, and in the end i will try to use ensemble to boost our testing score.

In [96]:
from sklearn.linear_model import SGDClassifier


sgd_clf = Pipeline([
    #('PCA', PCA(0.99)),
    ('std_scaler', StandardScaler()),
    ('sgd_clf', SGDClassifier())
])

sgd_clf.fit(X_train, y_train)

sgd_clf.score(X_validate, y_validate)
sgd_clf.score(X_test, y_test)

Pipeline(steps=[('std_scaler', StandardScaler()), ('sgd_clf', SGDClassifier())])

0.9277232834279814

0.8852112676056338

In [97]:
from sklearn.ensemble import RandomForestClassifier


rnd_clf = Pipeline([
    ('std_scaler', StandardScaler()),
    ('pca', PCA(0.99)),
    ("forest" , RandomForestClassifier(max_depth=5, n_estimators=150, max_leaf_nodes=55, n_jobs=-1, oob_score=True))
])
cross_scores = cross_val_score(rnd_clf, X_train, y_train, cv=5)
cross_scores.mean()

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca', PCA(n_components=0.99)),
                ('forest',
                 RandomForestClassifier(max_depth=5, max_leaf_nodes=55,
                                        n_estimators=150, n_jobs=-1,
                                        oob_score=True))])

0.9054577464788732

0.9340998397825491

In [None]:
rnd_clf.fit(X_train_full_s, y_train_full)
rnd_clf.score(X_test, y_test)

In [98]:
from sklearn.ensemble import GradientBoostingClassifier

gbrt = Pipeline([
    ('std_scaler', StandardScaler()),
    ('pca', PCA(0.99)),
    ("gbrt" ,  GradientBoostingClassifier(max_depth=4, n_estimators=20, learning_rate=0.15, subsample=0.7))
])
cross_scores = cross_val_score(gbrt, X_train, y_train, cv=5)
cross_scores.mean()

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca', PCA(n_components=0.99)),
                ('gbrt',
                 GradientBoostingClassifier(learning_rate=0.15, max_depth=4,
                                            n_estimators=20, subsample=0.7))])

0.9063380281690141

0.9584408879710893

In [None]:
gbrt.fit(X_train_full_s, y_train_full)
gbrt.score(X_test, y_test)
gbrt.score(X_validate, y_validate)

In [100]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = Pipeline([
    #('std_scaler', StandardScaler()),
    #('pca', PCA(0.99)),
    ("gbrt" ,  AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=10, learning_rate=0.11))
])
ada_clf.fit(X_train, y_train)
ada_clf.score(X_validate, y_validate)
ada_clf.score(X_test, y_test)

Pipeline(steps=[('gbrt',
                 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                                    learning_rate=0.11, n_estimators=10))])

0.8644366197183099

0.8789364997418688

In [103]:
from sklearn.naive_bayes import GaussianNB
model_gausiyan = Pipeline([
    #('std_scaler', StandardScaler()),
    ('std_scaler', StandardScaler()),
    ('pca', PCA(0.999)),
    ("gnb", GaussianNB()),
])

model_gausiyan.fit(X_train, y_train)
model_gausiyan.score(X_validate, y_validate)
model_gausiyan.score(X_test, y_test)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca', PCA(n_components=0.999)), ('gnb', GaussianNB())])

0.8980382034073309

0.8697183098591549

In [104]:
from sklearn.ensemble import AdaBoostClassifier

ada_g2 = AdaBoostClassifier(
    VotingClassifier(estimators=[("gnb", GaussianNB()), ('log_reg', LogisticRegression()), ('tree', DecisionTreeClassifier(max_depth=3))], voting='soft'), n_estimators= 15,
    algorithm="SAMME.R", learning_rate=0.2)
model_super_complex = Pipeline([
    ('std_scaler', StandardScaler()),
    ('pca', PCA(0.999)),
    ('adag2', ada_g2)
    #("gnb", GaussianNB()),
])

model_super_complex.fit(X_train, y_train)
model_super_complex.score(X_train, y_train)
model_super_complex.score(X_validate, y_validate)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('pca', PCA(n_components=0.999)),
                ('adag2',
                 AdaBoostClassifier(base_estimator=VotingClassifier(estimators=[('gnb',
                                                                                 GaussianNB()),
                                                                                ('log_reg',
                                                                                 LogisticRegression()),
                                                                                ('tree',
                                                                                 DecisionTreeClassifier(max_depth=3))],
                                                                    voting='soft'),
                                    learning_rate=0.2, n_estimators=15))])

0.9388756212483057

0.9370160041300981

In [105]:
model_super_complex.score(X_test, y_test)

0.882218309859155

In [106]:
from sklearn.ensemble import AdaBoostClassifier
ada_g = AdaBoostClassifier(
    VotingClassifier(estimators=[("gnb", GaussianNB()), ('log_reg', LogisticRegression()), ('tree', DecisionTreeClassifier(max_depth=3))], voting='soft'), n_estimators= 20,
    algorithm="SAMME.R", learning_rate=0.2)
ada_g.fit(X_train, y_train)
ada_g.score(X_train, y_train)
ada_g.score(X_validate, y_validate)
ada_g.score(X_test, y_test)

AdaBoostClassifier(base_estimator=VotingClassifier(estimators=[('gnb',
                                                                GaussianNB()),
                                                               ('log_reg',
                                                                LogisticRegression()),
                                                               ('tree',
                                                                DecisionTreeClassifier(max_depth=3))],
                                                   voting='soft'),
                   learning_rate=0.2, n_estimators=20)

0.8907248434776996

0.8861641713990708

0.8698943661971831

In [107]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
        estimators=[('log_reg', log_reg), ('svm', svm_model),  ("forest", rnd_clf), ("ada", ada_clf), ('adaj', ada_g)], voting = "soft")

voting_clf.fit(X_train_full_s, y_train_full)

voting_clf.score(X_train, y_train)
voting_clf.score(X_validate, y_validate)

voting_clf.score(X_test, y_test)

VotingClassifier(estimators=[('log_reg',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('lin', LogisticRegression())])),
                             ('svm',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('svm', SVC(probability=True))])),
                             ('forest',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_compone...
                              Pipeline(steps=[('gbrt',
                                               AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                                                                  learning_ra

0.9833473181436778

0.9819308208569953

0.9376760563380282

In [108]:


from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
        estimators=[('log_reg', log_reg), ('svm', svm_model), ("adaboost", ada_clf), ("gaus", model_gausiyan), 
        ('adaj', ada_g), ("forest", rnd_clf), ("gbrt", gbrt)], voting = "soft")

voting_clf.fit(X_train_full_s, y_train_full)

voting_clf.score(X_train, y_train)
voting_clf.score(X_validate, y_validate)

voting_clf.score(X_test, y_test)

VotingClassifier(estimators=[('log_reg',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('lin', LogisticRegression())])),
                             ('svm',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('svm', SVC(probability=True))])),
                             ('adaboost',
                              Pipeline(steps=[('gbrt',
                                               AdaBoostClassifier(base_estimator=Decision...
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.99)),
                                              ('forest',
                                   

0.9767636997353644

0.9731543624161074

0.9373239436619718

In [109]:
models = [('log_reg', log_reg), ('svm', svm_model), ('svm2', svm_model), ('adaj2', ada_g2), ('adaj', ada_g) ]

In [110]:
voting_clf = VotingClassifier(
        estimators = models, voting = "soft")

voting_clf.fit(X_train_full_s, y_train_full)

voting_clf.score(X_train, y_train)
voting_clf.score(X_validate, y_validate)

voting_clf.score(X_test, y_test)

VotingClassifier(estimators=[('log_reg',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('lin', LogisticRegression())])),
                             ('svm',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('svm', SVC(probability=True))])),
                             ('svm2',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_component...
                                                                                              LogisticRegression()),
                                                                                             ('tree',
                               

0.9958691021751759

0.996386164171399

0.9394366197183098

In [111]:
voting_clf = VotingClassifier(
        estimators = [('log_reg', log_reg), ('svm', svm_model), ("adaboost", ada_clf), ("sgd", sgd_clf) ], voting = "hard")

voting_clf.fit(X_train_full_s, y_train_full)

voting_clf.score(X_train, y_train)
voting_clf.score(X_validate, y_validate)

voting_clf.score(X_test, y_test)

VotingClassifier(estimators=[('log_reg',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('lin', LogisticRegression())])),
                             ('svm',
                              Pipeline(steps=[('std_scaler', StandardScaler()),
                                              ('pca', PCA(n_components=0.999)),
                                              ('svm', SVC(probability=True))])),
                             ('adaboost',
                              Pipeline(steps=[('gbrt',
                                               AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                                                                  learning_rate=0.11,
                                                                  n_estimators=10))])),
                             ('sgd',
               

0.9472665074549796

0.9465668559628291

0.8966549295774648

In [112]:
def combinations(_arr):
    m = len(_arr)
    arr = np.array(_arr)
    combs = []
    for i in range(0,2**m):
        pr = [bool(int(x)) for x in bin(i)[2:]]
        for j in range(m - len(pr)):
            pr.insert(0, False)
        combination = arr[pr]
        combs.append(combination)
    return combs

In [113]:
combinations([1,2,3])

[array([], dtype=int32),
 array([3]),
 array([2]),
 array([2, 3]),
 array([1]),
 array([1, 3]),
 array([1, 2]),
 array([1, 2, 3])]

instead of trying every combination possible with voting classifier, we would just generate all the possible combinations

In [60]:
models_pr = [[('log_reg', log_reg)], [('svm', svm_model)], [('log_reg', log_reg), ('svm', svm_model)]]

add_models = [("gbrt_add", gbrt), ("gaus_add", model_gausiyan), ('svm_add', svm_model), ("forest_add", rnd_clf), 
    ('adaboost_mix', ada_g), ('adaboost_mix_pip', model_super_complex), ("adaboost", ada_clf)]

best_estimators = []
best_score = 0

iteration  = 0
for model in models_pr:
    for combination in combinations(add_models):
        iteration += 1
        estimators = list(model) + list(combination)
        test_model = VotingClassifier(estimators = estimators, voting = "soft", n_jobs = -1)
        with io.capture_output() as captured:
            test_model.fit(X_train_full_s, y_train_full)
        score_test = test_model.score(X_test, y_test)
        print("iteration : ", iteration, " score : ", score_test)
        if(score_test > best_score):
            best_score = score_test
            best_estimators = estimators
print("--------------------------------")
best_score
best_estimators

NameError: name 'log_reg' is not defined

models_pr = [[('log_reg', log_reg)], [('svm', svm_model)], [('log_reg', log_reg), ('svm', svm_model)]]

add_models = [("gbrt_add", gbrt), ("gaus_add", model_gausiyan), ('log_reg_add', log_reg), ('svm_add', svm_model), ('tree', tree),
     ("forest_add", rnd_clf), ('adaboost_mix', ada_g), ('adaboost_mix_pip', model_super_complex), ("adaboost", ada_clf)]

best_estimators2 = []
best_score2 = 0

iteration  = 0
for model in models_pr:
    for combination in combinations(add_models):
        iteration += 1
        estimators = list(model) + list(combination)
        test_model = VotingClassifier(estimators = estimators, voting = "soft", n_jobs = -1)
        with io.capture_output() as captured:
            test_model.fit(X_train, y_train)
        score_test = test_model.score(X_validate, y_validate)
        print("iteration : ", iteration, " score : ", score_test)
        if(score_test > best_score2):
            best_score2 = score_test
            best_estimators2 = estimators
print("--------------------------------")
best_score2
best_estimators2

In [58]:
best_estimators

NameError: name 'best_estimators' is not defined