In [1]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import random

from sklearn.preprocessing import MinMaxScaler


In [125]:
# INTERNAL FUNCS
def splitdata(data, label, ntrainbatch=10):
    """Split one subject's 45 clips into a train and a test DataFrame.

    Clip indices 0..44 are treated as three consecutive groups of 15. Within
    each group the position ``i % 15`` decides the destination: positions
    below ``ntrainbatch`` go to train, the remaining positions go to test.

    Parameters
    ----------
    data, label : indexable by clip number 0..44
        ``data[i]`` and ``label[i]`` are each wrapped in a DataFrame and joined
        column-wise (label columns last).
    ntrainbatch : int, default 10
        How many leading clips of every 15-clip group are used for training.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
        Row-wise concatenation of the selected clips. ``pd.concat`` raises
        ``ValueError`` if either side ends up with no clips.
    """
    last_train_pos = ntrainbatch - 1

    def clip_frame(idx):
        # Features and labels of a single clip, side by side.
        return pd.concat([pd.DataFrame(data[idx]), pd.DataFrame(label[idx])], axis=1)

    train_parts, test_parts = [], []
    for idx in range(45):
        pos = idx % 15
        if pos - 1 < last_train_pos:
            train_parts.append(clip_frame(idx))
        if pos > last_train_pos:
            test_parts.append(clip_frame(idx))

    return pd.concat(train_parts), pd.concat(test_parts)

def gatherdata(X, y):
    """Stack all 45 clips of one subject into a single DataFrame.

    For every clip index 0..44, ``X[i]`` and ``y[i]`` are wrapped in DataFrames
    and joined column-wise (label columns last); the per-clip frames are then
    concatenated row-wise. Each clip keeps its own row index, so index values
    repeat across clips.
    """
    per_clip = [
        pd.concat([pd.DataFrame(X[clip]), pd.DataFrame(y[clip])], axis=1)
        for clip in range(45)
    ]
    return pd.concat(per_clip)

def allsets(X,y,slice_size=13, trackdict=False):
    """Cut every clip of one subject into consecutive fixed-length slices.

    For each clip index 0..44, ``X[i]`` and ``y[i]`` are joined column-wise
    (label columns last) and split into ``len(clip) // slice_size`` slices of
    exactly ``slice_size`` rows. Trailing rows that do not fill a whole slice
    are discarded.

    Parameters
    ----------
    X, y : indexable by clip number 0..44
    slice_size : int, default 13
        Number of rows in each slice.
    trackdict : bool, default False
        When true, also return a dict mapping ``"clip {i}"`` to
        ``"slices:{n}"`` for every clip that produced at least one slice.

    Returns
    -------
    pd.DataFrame, or (pd.DataFrame, dict) when ``trackdict`` is true.
        All slices concatenated row-wise. ``pd.concat`` raises ``ValueError``
        if no clip is long enough to yield a slice.
    """
    slices = []
    dicc = {}
    for i in range(45):
        conc = pd.concat([pd.DataFrame(X[i]), pd.DataFrame(y[i])], axis=1)
        setscomp = len(conc) // slice_size  # number of complete slices in this clip

        if setscomp:
            dicc[f"clip {i}"] = f"slices:{setscomp}"  # dicc {clip, slice count}

        for e in range(setscomp):
            # The original end bound used an undefined name (`min_size`);
            # both bounds must be derived from slice_size.
            start = slice_size * e
            slices.append(conc.iloc[start:start + slice_size])

    df = pd.concat(slices)
    if trackdict:
        return df, dicc
    return df

In [141]:
# GLOBAL FUNCS

#Train-Test from Full DF function 
def fulldfsplit(nsubjects=16):
    '''Build train and test DataFrames across all subjects.

    Files must be named '{subject#}_123.npz' and live in '../data' relative to
    the working directory. Each file must hold pickled 'data' and 'label'
    entries; every subject is split with ``splitdata(..., 10)`` and the
    per-subject results are concatenated row-wise.

    Parameters
    ----------
    nsubjects : int, default 16
        Subjects 1..nsubjects are loaded.

    Returns
    -------
    (Xytrain16_DF, Xytest16_DF) : tuple of pd.DataFrame
    '''
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads executes arbitrary code; only use trusted files.
        with np.load(f'../data/{i}_123.npz') as npz:
            data  = pickle.loads(npz['data'])
            label = pickle.loads(npz['label'])

        train, test = splitdata(data, label, 10)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)

    # One unified DataFrame per side.
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)

    return Xytrain16_DF, Xytest16_DF

#Full DF no split
def fulldf(nsubjects=16):
    '''Build one DataFrame holding every clip of every subject (no split).

    Files must be named '{subject#}_123.npz' and live in '../data' relative to
    the working directory. Each file must hold pickled 'data' and 'label'
    entries; every subject is flattened with ``gatherdata`` and the results
    are concatenated row-wise.

    Parameters
    ----------
    nsubjects : int, default 16
        Subjects 1..nsubjects are loaded.

    Returns
    -------
    pd.DataFrame
        All rows of all subjects; the last column is renamed to 'target'.
    '''
    Xy16_list = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads executes arbitrary code; only use trusted files.
        with np.load(f'../data/{i}_123.npz') as npz:
            data  = pickle.loads(npz['data'])
            label = pickle.loads(npz['label'])
        Xy16_list.append(gatherdata(data, label))

    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    return XyDF

#Full DF, no split, slicing each clip to multiples of 13
def fulldfslices(nsubjects=16, slice_size =13, trackdict=False):
    '''Build one DataFrame of fixed-length slices across all subjects.

    Files must be named '{subject#}_123.npz' and live in '../data' relative to
    the working directory. Each file must hold pickled 'data' and 'label'
    entries; every subject is sliced with ``allsets``.

    Parameters
    ----------
    nsubjects : int, default 16
        Subjects 1..nsubjects are loaded.
    slice_size : int, default 13
        Desired row length of each slice.
    trackdict : bool, default False
        When true, also return the per-subject bookkeeping dicts produced by
        ``allsets``, keyed by "subject {i}".

    Returns
    -------
    pd.DataFrame, or (pd.DataFrame, dict) when ``trackdict`` is true.
        The last column of the DataFrame is renamed to 'target'.
    '''
    Xy16_list = []
    dicc16  = {}
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads executes arbitrary code; only use trusted files.
        with np.load(f'../data/{i}_123.npz') as npz:
            data  = pickle.loads(npz['data'])
            label = pickle.loads(npz['label'])

        if trackdict:
            Xy, dicc = allsets(data, label, slice_size, trackdict=True)
            dicc16[f"subject {i}"] = dicc  # {subject: {clip: slice count}}
        else:
            Xy = allsets(data, label, slice_size)
        Xy16_list.append(Xy)

    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    if trackdict:
        # Previously the tracking dict was built and then silently dropped.
        return XyDF, dicc16
    return XyDF

In [6]:
#Dayus formula
def get_X_y(df, 
            X_length=13, # 
            y_length=13, 
            number_of_sequences=51, 
            number_of_targets=1, 
            val=False, 
            val_cutoff=0.8):
    '''Sample random (feature window, target window) pairs from a DataFrame.

    The last column of ``df`` is the target; every other column is a feature.
    Features are min-max scaled, then ``number_of_sequences`` distinct start
    positions are drawn at random. For a start position ``s`` the feature
    window is rows ``s - X_length .. s - 1`` and the target window is rows
    ``s .. s + y_length - 1``.

    Parameters
    ----------
    df : pd.DataFrame
    X_length : int, default 13
        Rows in each feature window (taken before the start position).
    y_length : int, default 13
        Rows in each target window (taken from the start position on).
    number_of_sequences : int, default 51
        How many pairs to sample.
    number_of_targets : int, default 1
        Accepted for compatibility; not used by the implementation.
    val : bool, default False
        When true, start positions are restricted to the tail of the frame,
        from ``int(len(df) * val_cutoff)`` on.
    val_cutoff : float, default 0.8

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` stacks the feature windows, ``y`` the target windows.

    Raises
    ------
    IndexError
        If fewer than ``number_of_sequences`` start positions are available.
    '''
    # Split by position: dropping the last column by label would also remove
    # any feature column that happens to carry the same label.
    df_X = df.iloc[:, :-1].copy()
    df_y = df.iloc[:, -1].copy()

    # Min-max scale the features.
    # NOTE(review): the scaler is fit on every row, including the tail used
    # when val=True.
    scaler = MinMaxScaler()
    df_X_scaled = pd.DataFrame(scaler.fit_transform(df_X), columns=df_X.columns, index=df_X.index)

    # Candidate start positions. A start below X_length would make the feature
    # window begin at a negative index, so the validation range is clamped too.
    first_start = int(X_length)
    if val:
        first_start = max(first_start, int(len(df_y)*val_cutoff))
    sample_list = list(range(first_start, int(len(df_y)-y_length)))
    if number_of_sequences > len(sample_list):
        raise IndexError(
            f"requested {number_of_sequences} sequences but only "
            f"{len(sample_list)} start positions are available"
        )
    random.shuffle(sample_list)

    def get_Xi_yi(df_X, 
              df_y,
              random_start, # start position drawn from sample_list
              X_length, # rows taken before random_start
              y_length # rows taken from random_start on
                 ): 
        '''Slice one feature window and one target window around random_start.'''
        Xi = df_X.iloc[random_start-X_length:random_start]
        yi = df_y.iloc[random_start:random_start+y_length]
        return Xi, yi

    # Collect the windows as nested lists, stacked into arrays at the end.
    X, y = [], []
    for _ in range(number_of_sequences):
        Xi, yi = get_Xi_yi(df_X_scaled, df_y, sample_list.pop(), X_length, y_length)
        X.append(Xi.values.tolist())
        y.append(yi.values.tolist())

    return np.array(X), np.array(y)

In [8]:
#tests on D func
# NOTE: `df` is only assigned by the `df = fulldfslices()` cell further down
# the notebook; run that cell first on a fresh kernel.
X,y= get_X_y(df, 
            X_length=13, # 
            y_length=13, 
            number_of_sequences=51, 
            number_of_targets=4, 
            val=True, 
            val_cutoff=0.8)
X.shape, y.shape #((51, 13, 310), (51, 13))

# Reproduce the validation start-index range by hand.
y_length = 3
X_length = 3
val_cutoff = .999
df_y = df.iloc[:, -1]  # df_y is local to get_X_y, so it has to be rebuilt here
sample_list = list(range(int(len(df_y)*val_cutoff), int(len(df_y)-y_length))) #y_length pans the end
sample_list

In [95]:
# Random-start exercise

#random.shuffle(list(range(length-min_size)))
#randslicestart = False
#start=random.choice(list(range(length-min_size))
#if randslicestart:
#slic = conc.iloc[randstart:randstart+13]
#if not randslicestart:
#slic = conc.iloc[0:13]      

In [96]:
# Sliced dataset with the default arguments (16 subjects, slice_size 13);
# fulldfslices renames the last column to 'target'.
df = fulldfslices()

In [163]:
# Work out where each fixed-size slice of one clip would start.
# NOTE(review): `data` is not assigned in any visible cell; presumably
# leftover kernel state. TODO confirm where it comes from.
xlen = len(data[2])  # len(df)
slice_size = 13
fullslices = xlen // slice_size            # number of complete slices
slicind = list(range(0, xlen, slice_size)) # every row index divisible by slice_size
randslic = random.choice(slicind)          # one slice start picked at random
slicind, randslic, fullslices

([0, 13, 26, 39, 52], 13, 4)

In [167]:
# Cut the clip into its complete slices. The original loop had no body (a
# syntax error) and iterated over `randslic`, which is a single int.
# Wrapping in a DataFrame makes .iloc available whether data[2] is an array
# or already a DataFrame.
clip = pd.DataFrame(data[2])
slices = [clip.iloc[start:start + slice_size] for start in slicind[:fullslices]]
    
    

[0, 1, 2, 3]

### Padding

In [169]:
#Max1
import numpy as np
#New function to collect all Data across all 16 subjects without split
#Full DF no split
def fulldfmax(nsubjects=16):
    """Load the unpickled 'data' entry of every subject file.

    Files must be named '{subject#}_123.npz' and live in '../data' relative to
    the working directory.

    Parameters
    ----------
    nsubjects : int, default 16
        Subjects 1..nsubjects are loaded.

    Returns
    -------
    list
        One unpickled 'data' object per subject, in subject order.
    """
    data16 = []
    for i in range(1, nsubjects+1):
        # Close each archive as soon as its payload has been read.
        # NOTE: pickle.loads executes arbitrary code; only use trusted files.
        with np.load(f'../data/{i}_123.npz') as npz:
            data16.append(pickle.loads(npz['data']))
    return data16
# Accumulator for the padded per-subject arrays built in the next cell.
pad_list = []
# Per-subject 'data' objects for all 16 subjects (fulldfmax default).
ddd = fulldfmax()


In [184]:
#Max2
# Imported once for the cell rather than on every loop iteration.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad each subject's clips to a common length. pad_list is rebuilt here rather
# than appended to, so re-running the cell does not duplicate entries.
pad_list = []
for subject_clips in ddd:
    X = list(subject_clips.values())
    X_pad = pad_sequences(X, dtype='float32', value=-42069) # int32 by default
    pad_list.append(X_pad)
#12:35
# Stack all subjects along the first axis.
X = np.concatenate(pad_list)  #X.shape (720, 74, 310)

(720, 74, 310)

In [253]:
def fulldfpad(nsubjects=16, Xymerge=True):
    '''Collect every clip of every subject as a flat list, ready for padding.

    Files must be named '{subject#}_123.npz' and live in '../data' relative to
    the working directory. Each file must hold pickled 'data' and 'label'
    entries.

    Parameters
    ----------
    nsubjects : int, default 16
        Subjects 1..nsubjects are loaded.
    Xymerge : bool, default True
        True  -> each item is an np.ndarray of the clip's features with the
                 label columns appended on the right.
        False -> each item is a pd.DataFrame holding the features only.

    Returns
    -------
    list
        ``nsubjects * 45`` items, ordered by subject and then by clip.
    '''
    def gatherdatapad(X, y, Xymerge):
        # One entry per clip index 0..44 of a single subject.
        Xyframes = []
        for i in range(45):
            if Xymerge:
                merge = pd.concat([pd.DataFrame(X[i]), pd.DataFrame(y[i])], axis=1)
                Xyframes.append(np.array(merge))
            else:
                Xyframes.append(pd.DataFrame(X[i]))
        return Xyframes

    Xy16_list = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads executes arbitrary code; only use trusted files.
        with np.load(f'../data/{i}_123.npz') as npz:
            data  = pickle.loads(npz['data'])
            label = pickle.loads(npz['label'])
        Xy16_list += gatherdatapad(data, label, Xymerge=Xymerge)
    return Xy16_list

In [255]:
# Features only: with Xymerge=False no label column is appended to each clip.
Xtest = fulldfpad(Xymerge=False)
Xtest[0].shape

(18, 310)

In [248]:
# NOTE(review): this cell's execution count is lower than the cell above, so
# its saved output presumably comes from an earlier run in which Xtest was
# built with the label column merged in. TODO confirm, or re-run in order.
Xtest[0].shape

(18, 311)

In [232]:
#16x45 = 720 y's (1 per video across all participants)
#for i in range(1,nsubjects+1):
#    label16.append(pickle.loads(np.load(f'../data/{i}_123.npz')['label']))
def y_unique(nsubjects=16):
    """Return one label per clip across all subjects.

    Files must be named '{subject#}_123.npz' and live in '../data' relative to
    the working directory. For each of the 45 clips of every subject, the
    clip's label array must contain a single distinct value, which becomes
    that clip's label.

    Parameters
    ----------
    nsubjects : int, default 16
        Subjects 1..nsubjects are loaded.

    Returns
    -------
    np.ndarray of float32, shape (nsubjects * 45,)

    Raises
    ------
    ValueError
        If a clip's labels do not collapse to exactly one distinct value.
    """
    yunique = []

    for i in range(1, nsubjects+1):
        # NOTE: pickle.loads executes arbitrary code; only use trusted files.
        with np.load(f'../data/{i}_123.npz') as npz:
            y = pickle.loads(npz['label'])
        for e in range(45):
            values = np.unique(y[e])
            # Check explicitly instead of relying on int(<1-d array>), which is
            # deprecated in NumPy and fails opaquely on mixed labels.
            if values.size != 1:
                raise ValueError(
                    f"subject {i}, clip {e}: expected one distinct label, "
                    f"found {values.size}"
                )
            yunique.append(int(values[0]))

    return np.array(yunique).astype(np.float32)

In [233]:
# One float32 label per clip: 16 subjects x 45 clips with the default argument.
y=y_unique()
y.shape

(720,)

In [193]:
X = fulldfpad() #list of 16*45 = 720 per-clip arrays (features + label columns), not a single (720,18,311) array; clip lengths presumably differ, hence the padding later on (TODO confirm)

[          0         1         2         3         4          5         6    \
 0   11.082522  8.915990  7.894088  8.393629  8.576055  10.450283  8.682803   
 1   11.081816  8.915104  7.893646  8.393852  8.575908  10.449770  8.681773   
 2   11.081139  8.914245  7.893116  8.394334  8.575671  10.449391  8.680753   
 3   11.080857  8.913749  7.893061  8.394893  8.575294  10.449425  8.680200   
 4   11.081297  8.913816  7.893386  8.395509  8.575009  10.449813  8.680219   
 5   11.082385  8.914133  7.893591  8.395986  8.574627  10.450640  8.680702   
 6   11.083825  8.914366  7.893565  8.396130  8.573922  10.451684  8.681255   
 7   11.085158  8.914192  7.892964  8.395734  8.572777  10.452605  8.681552   
 8   11.085906  8.913450  7.891825  8.394749  8.571212  10.452949  8.681341   
 9   11.086201  8.911913  7.890041  8.393327  8.569228  10.452640  8.680249   
 10  11.086416  8.910022  7.888055  8.391777  8.567214  10.451958  8.678671   
 11  11.086846  8.908668  7.886208  8.390266  8.5653

In [207]:
# Pad the per-clip arrays in X to a common length, filling with -42069.
# Relies on pad_sequences having been imported by the earlier padding cell.
# NOTE(review): if X comes from fulldfpad() with Xymerge=True, the label column
# is padded with the same fill value. Confirm this is intended.
X_pad = pad_sequences(X, dtype='float32', value=-42069) # int32 by default
# NOTE(review): pad_list also receives the per-subject arrays from the earlier
# loop, so it may end up holding arrays of different kinds. TODO confirm.
pad_list.append(X_pad)
#12:35
#pad_list

In [206]:
# First row of the second padded sequence. It shows only the fill value,
# presumably because pad_sequences pads at the start of shorter sequences by
# default (confirm against the Keras documentation).
# NOTE(review): the "17x45 = 765" in the trailing comment does not match the
# 16-subject default used elsewhere in this notebook. TODO confirm.
X_pad[1][0] #list with 17x45= 765,74,310

array([-42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -42069., -42069., -42069., -42069., -42069., -42069.,
       -42069., -420