In [1]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import random

from sklearn.preprocessing import MinMaxScaler

In [125]:
# INTERNAL FUNCS
def splitdata(data, label, ntrainbatch=10):
    '''Split the 45 clips of one subject into a train and a test DataFrame.

    Clips are visited by index 0..44. Each clip's position inside its
    consecutive group of 15 (``clip % 15``) decides where it goes: with the
    default ``ntrainbatch=10`` positions 0-9 are train and 10-14 are test.

    data, label: indexable by clip number; ``data[i]`` and ``label[i]`` are
        joined column-wise, so the label ends up as the last column.
    Returns (train, test), each the row-wise concatenation of its clips.
    '''
    threshold = ntrainbatch - 1

    def clip_frame(idx):
        # Features first, label last.
        return pd.concat([pd.DataFrame(data[idx]), pd.DataFrame(label[idx])], axis=1)

    train_parts, test_parts = [], []
    for clip in range(45):
        position = clip % 15
        if position - 1 < threshold:
            train_parts.append(clip_frame(clip))
        if position > threshold:
            test_parts.append(clip_frame(clip))

    return pd.concat(train_parts), pd.concat(test_parts)

def gatherdata(X, y):
    '''Stack the 45 clips of one subject into a single DataFrame.

    For every clip index 0..44, ``X[i]`` and ``y[i]`` are joined column-wise
    (label last) and the per-clip frames are concatenated row-wise.
    '''
    return pd.concat(
        [
            pd.concat([pd.DataFrame(X[clip]), pd.DataFrame(y[clip])], axis=1)
            for clip in range(45)
        ]
    )

def allsets(X,y,slice_size=13, trackdict=False):
    '''Cut each of the 45 clips into consecutive, non-overlapping slices.

    For every clip index 0..44, ``X[i]`` and ``y[i]`` are joined column-wise
    (label last) and split into ``len(clip) // slice_size`` slices of exactly
    ``slice_size`` rows; leftover rows at the end of a clip are dropped.

    slice_size: number of rows per slice.
    trackdict: when True, also return a dict mapping ``"clip {i}"`` to
        ``"slices:{count}"`` for every clip that produced at least one slice.
    Returns the row-wise concatenation of all slices (and the dict if asked).
    Raises ValueError (from pd.concat) if no clip is long enough for a slice.
    '''
    slices = []
    dicc = {}
    for i in range(45):
        conc = pd.concat([pd.DataFrame(X[i]), pd.DataFrame(y[i])], axis=1)
        setscomp = len(conc) // slice_size  # number of complete slices in this clip

        for e in range(setscomp):
            start = slice_size * e
            # The end bound previously used the undefined name `min_size`.
            slices.append(conc.iloc[start:start + slice_size])
        if setscomp:
            dicc[f"clip {i}"] = f"slices:{setscomp}"  # dicc {clip,slice}

    df = pd.concat(slices)
    if trackdict:
        return df, dicc
    return df

In [141]:
# GLOBAL FUNCS

#Train-Test from Full DF function 
def fulldfsplit(nsubjects=16):
    '''Build train and test DataFrames across all subjects.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects, unpickles
    the 'data' and 'label' entries, splits each subject with
    ``splitdata(..., 10)`` and concatenates the per-subject results.

    Returns (Xytrain16_DF, Xytest16_DF).
    '''
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        train, test = splitdata(data, label, 10)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)

    # One unified DataFrame per split.
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)

    return Xytrain16_DF, Xytest16_DF

#Full DF no split
def fulldf(nsubjects=16):
    '''Build one DataFrame with every clip of every subject, without splitting.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects, unpickles
    the 'data' and 'label' entries and stacks them with ``gatherdata``.
    The last column of the result is renamed to 'target'.
    '''
    Xy16_list = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])
        Xy16_list.append(gatherdata(data, label))

    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    return XyDF

#Full DF, no split, slicing each clip to multiples of 13
def fulldfslices(nsubjects=16, slice_size =13, trackdict=False):
    '''Build one DataFrame of fixed-length slices across all subjects.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects, unpickles
    the 'data' and 'label' entries and slices every clip with ``allsets``.
    The last column of the result is renamed to 'target'.

    slice_size: desired row length of each slice.
    trackdict: when True, also return a dict ``{"subject {i}": per-clip dict}``
        as produced by ``allsets(..., trackdict=True)``.
    Returns XyDF, or (XyDF, dicc16) when trackdict is True.
    '''
    Xy16_list = []
    dicc16  = {}
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        if trackdict:
            Xy, dicc = allsets(data, label, slice_size, trackdict=True)
            dicc16[f"subject {i}"] = dicc  # per-subject {clip: slices}
        else:
            Xy = allsets(data, label, slice_size)
        Xy16_list.append(Xy)

    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    if trackdict:
        # The tracking dict used to be built and then discarded.
        return XyDF, dicc16
    return XyDF

In [6]:
#Dayus formula
def get_X_y(df, 
            X_length=13,
            y_length=13, 
            number_of_sequences=51, 
            number_of_targets=1, 
            val=False, 
            val_cutoff=0.8):
    '''Sample random (X, y) window pairs from a frame whose last column is the target.

    All columns but the last are min-max scaled (the scaler is fitted on the
    whole frame). For each sampled start position ``s`` the X window is rows
    ``s-X_length .. s-1`` of the scaled features and the y window is rows
    ``s .. s+y_length-1`` of the target column.

    val: when True, start positions are drawn from
        ``int(len(df)*val_cutoff) .. len(df)-y_length-1``; otherwise from
        ``X_length .. len(df)-y_length-1``.
    number_of_sequences: how many windows to draw (without replacement).
    number_of_targets: currently unused.
    Returns (np.array(X), np.array(y)).
    Raises IndexError if more sequences are requested than start positions exist.
    '''
    # Separate features from the target (last column).
    features = df.drop(columns=df.columns[-1])
    targets = df.iloc[:, -1].copy()

    # Scale features to [0, 1], keeping the original labels and index.
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    # Candidate start positions; y_length is reserved at the end of the frame.
    last_start = int(len(targets) - y_length)
    if val:
        candidate_starts = list(range(int(len(targets) * val_cutoff), last_start))
    else:
        candidate_starts = list(range(int(X_length), last_start))
    random.shuffle(candidate_starts)

    X, y = [], []
    for _ in range(number_of_sequences):
        start = candidate_starts.pop()
        window_X = scaled.iloc[start - X_length:start]
        window_y = targets.iloc[start:start + y_length]
        X.append(window_X.values.tolist())
        y.append(window_y.values.tolist())

    return np.array(X), np.array(y)

In [8]:
#tests on D func
# Smoke test for get_X_y: draw 51 validation windows and look at the shapes.
# NOTE(review): `df` is not assigned anywhere above in this notebook (the only
# candidate, `df = fulldfslices()`, is commented out in a later cell), so this
# cell relies on hidden kernel state — confirm which loader was intended.
X,y= get_X_y(df, 
            X_length=13, # 
            y_length=13, 
            number_of_sequences=51, 
            number_of_targets=4, 
            val=True, 
            val_cutoff=0.8)
X.shape, y.shape #((51, 13, 310), (51, 13))

# Replays the validation start-position range of get_X_y with a very late cutoff.
# NOTE(review): `df_y` only exists as a local inside get_X_y; at top level it
# must come from earlier kernel state — presumably `df.iloc[:, -1]`, confirm.
y_length = 3
X_length = 3
val_cutoff = .999
sample_list = list(range(int(len(df_y)*val_cutoff), int(len(df_y)-y_length))) #y_length reserves rows at the end
sample_list

In [95]:
# Random-start exercise

#random.shuffle(list(range(length-min_size)))
#randslicestart = False
#start=random.choice(list(range(length-min_size))
#if randslicestart:
#slic = conc.iloc[randstart:randstart+13]
#if not randslicestart:
#slic = conc.iloc[0:13]      

In [96]:
#df = fulldfslices()

([0, 13, 26, 39, 52], 13, 4)

In [167]:
slices = [] 
# NOTE(review): `randslic` is not defined in any earlier cell of this notebook;
# this scratch cell depends on hidden kernel state.
for i in randslic:
    # The append is disabled; `pass` keeps the loop syntactically valid
    # (a for-loop whose body is only a comment is a SyntaxError).
    #slices.append(data[2].iloc[i:i+slice_size])
    pass
    
    

[0, 1, 2, 3]

### Padding

In [291]:
def fulldfpad(nsubjects=16, Xymerge=True):
    '''Collect every clip of every subject as a flat list of 2-D arrays, ready to pad.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects and unpickles
    the 'data' and 'label' entries; 45 clips are taken per subject.

    Xymerge: when True each array is the clip's features with the label
        appended as the last column; when False only the features are kept.
    Returns a list with ``45 * nsubjects`` np.arrays.
    '''
    Xy16_list = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        for clip in range(45):
            if Xymerge:
                merge = pd.concat([pd.DataFrame(data[clip]), pd.DataFrame(label[clip])], axis=1)
                Xy16_list.append(np.array(merge))
            else:
                # Arrays here too, so both modes return the same element type.
                Xy16_list.append(np.array(pd.DataFrame(data[clip])))
    return Xy16_list

In [65]:
#y np.array (720,), 1 value per array
def y_unique(nsubjects=16):
    '''One label per clip, for fitting an RNN on padded X.

    Reads the 'label' entry of ``../data/{subject}_123.npz`` for subjects
    1..nsubjects and reduces each of the 45 clips to its single label value.

    Returns a float32 array of shape ``(45 * nsubjects,)``.
    Raises ValueError if a clip does not contain exactly one distinct label.
    '''
    yunique = []

    for i in range(1, nsubjects+1):
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            y = pickle.loads(archive['label'])
        for e in range(45):
            values = np.unique(y[e])
            # int() on a 1-D array is deprecated in NumPy, so take the element explicitly.
            if values.size != 1:
                raise ValueError(
                    f"subject {i}, clip {e}: expected exactly one distinct label, found {values.size}"
                )
            yunique.append(int(values[0]))

    return np.array(yunique).astype(np.float32)
y = y_unique()

In [66]:
#Max1
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
#New function to collect all Data across all 16 subjects without split
#Full DF no split
def fulldfmax(nsubjects=16):
    '''Return every clip of every subject as one padded float32 array.

    Reads the 'data' entry of ``../data/{subject}_123.npz`` for subjects
    1..nsubjects and pads all clips to the longest clip found, using
    pad_sequences defaults and the filler value -42069.
    For the 16-subject data set the notebook reports shape (720, 74, 310).
    '''
    clips = []
    for i in range(1, nsubjects+1):
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            clips.extend(pickle.loads(archive['data']).values())

    # Pad once over all clips so the target length is global. Padding per
    # subject only concatenates if every subject has the same longest clip.
    return pad_sequences(clips, dtype='float32', value=-42069) # int32 by default
X = fulldfmax()
#    return pad_list
#Xlist=fulldfmax()

In [34]:
# NOTE(review): `ddd` and `yyy` are not assigned in any visible cell; this check
# depends on hidden kernel state (presumably the padded X and per-clip y — confirm).
ddd.shape, yyy.shape, ddd.shape[0]

((720, 74, 310), (720,), 720)

In [64]:
# NOTE(review): `ddd` and `yyy` are not assigned in any visible cell (hidden kernel state).
type(ddd), type(yyy)

(numpy.ndarray, numpy.ndarray)

In [None]:
def tvtsplit(X,y,train_size=.7, val_size=.2):
    '''Randomly split X and y into train, validation and test parts.

    X, y: numpy arrays with the same first dimension,
        e.g. X.shape, y.shape = ((720, 74, 310), (720,)).
    train_size, val_size: fractions between 0 and 1; the test part receives
        every sample left over after train and validation are taken.
    Shuffling uses the ``random`` module, so call ``random.seed`` beforehand
    for a reproducible split.

    Returns X_train, X_val, X_test, y_train, y_val, y_test.
    Raises ValueError if X and y differ in their first dimension.
    '''
    n = X.shape[0]
    if y.shape[0] != n:
        raise ValueError(f"X has {n} samples but y has {y.shape[0]}")

    ntrain, nval = int(train_size*n), int(val_size*n)
    sample_list = list(range(n))
    random.shuffle(sample_list)

    # Explicit integer dtype keeps empty index lists valid.
    random_train = np.asarray(sample_list[:ntrain], dtype=int)
    random_val   = np.asarray(sample_list[ntrain:ntrain+nval], dtype=int)
    random_test  = np.asarray(sample_list[ntrain+nval:], dtype=int)

    return (X[random_train], X[random_val], X[random_test],
            y[random_train], y[random_val], y[random_test])

In [108]:
#gpt func
def RNN_split_data(X, y, train_size, val_size, random_state=42):
    '''Shuffle X and y together and split them into train, validation and test.

    Takes fulldfmax() as X, y_unique() as y.

    train_size, val_size: non-negative fractions whose sum is at most 1; the
        test part receives whatever remains after train and validation.
    random_state: seed for the shuffle. A private generator is used, so the
        global NumPy random state is left untouched; the permutation is the
        same one ``np.random.seed(random_state)`` would produce.

    Returns X_train, X_val, X_test, y_train, y_val, y_test.
    Raises AssertionError on invalid sizes or mismatched X/y lengths.
    '''
    test_size = 1 - train_size - val_size
    # The old check compared floats for exact equality and was true by
    # construction; validate the fractions with a small tolerance instead.
    assert train_size >= 0 and val_size >= 0 and test_size > -1e-9, \
        "train_size and val_size must be non-negative and sum to at most 1.0"

    total_size = X.shape[0] #720
    assert y.shape[0] == total_size, "X and y must have the same number of samples"

    # Random sequence of total_size indices.
    permutation = np.random.RandomState(random_state).permutation(total_size)

    # Shuffle X and y with the same permutation so rows stay aligned.
    X = X[permutation]
    y = y[permutation]

    # int() truncates, so train/val may be one sample short of the nominal
    # fraction; the test part absorbs the remainder.
    train_end = int(total_size * train_size)
    val_end = train_end + int(total_size * val_size)

    X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
    y_train, y_val, y_test = y[:train_end], y[train_end:val_end], y[val_end:]

    return X_train, X_val, X_test, y_train, y_val, y_test

In [109]:
# 70 % train / 20 % validation / remaining ~10 % test.
# Presumably X is the padded array from `X = fulldfmax()` and y the labels from
# `y = y_unique()` assigned in earlier cells — confirm after a fresh run.
X_train, X_val, X_test, y_train, y_val, y_test = RNN_split_data(X, y, 0.7, 0.2)

In [110]:
# Shapes of the six splits. The recorded output has 503 train rows rather than
# 504 — presumably 720*0.7 lands just below 504 in floating point and int() truncates.
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((503, 74, 310), (144, 74, 310), (73, 74, 310), (503,), (144,), (73,))

In [102]:
# Sanity check: the three split sizes add up to the 720 samples.
503+144+73

720

In [73]:
# Scratch: replays the shuffling step of RNN_split_data at top level.
# Note that this reseeds NumPy's global random state for later cells.
random_state=42
np.random.seed(random_state)
total_size = X.shape[0]
permutation = np.random.permutation(total_size)

In [106]:
#permutation.shape
train_size, val_size, test_size = 0.7, 0.2, 0.1
print(train_size+ val_size+ test_size)
train_size, val_size = 0.7, 0.2
test_size = 1-train_size- val_size
print(train_size+ val_size+ test_size)

0.9999999999999999
1.0


In [62]:
# Scratch: prototype of an index-based train/val/test split on 20 items.
n=20
ll=[e for e in range(n)]
random.shuffle(ll)  # unseeded, so the printed order changes between runs
ptrain, pval = .7,.2
ptest = 1-ptrain-pval
# Nominal sizes; `test` is only printed — the test slice below takes the remainder.
train, val, test = int(ptrain*n), int(pval*n), int(ptest*n)

ttrain=ll[0:train]
tval=ll[train:train+val]
ttest=ll[train+val:]
print(ll)
print(train, val, test)
print(ttrain, tval, ttest)
print(len(ttrain), len(tval), len(ttest))

[5, 12, 0, 2, 18, 10, 7, 11, 16, 8, 1, 19, 15, 13, 9, 3, 14, 17, 4, 6]
14 4 2
[5, 12, 0, 2, 18, 10, 7, 11, 16, 8, 1, 19, 15, 13] [9, 3, 14, 17] [4, 6]
14 4 2


In [35]:
# Scratch: start offsets of the complete 13-row slices in one clip.
# NOTE(review): `data` is not assigned at top level in any visible cell (hidden kernel state).
xlen= len(data[2]) #len(df)
slice_size = 13
fullslices = xlen//slice_size  # number of complete slices
slicind = list(e for e in range(xlen) if e%slice_size==0)  # every multiple of slice_size below xlen
randslic = random.choice(slicind)  # a single start offset (an int, not a list)
slicind, randslic, fullslices


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### Train Test

In [270]:
def splitdata(X, y, ntrainbatch=10):
    '''Split the 45 clips of one subject into a train and a test DataFrame.

    Clips are visited by index 0..44. Each clip's position inside its
    consecutive group of 15 (``i % 15``) decides where it goes: with the
    default ``ntrainbatch=10`` positions 0-9 are train and 10-14 are test.

    X, y: indexable by clip number; ``X[i]`` and ``y[i]`` are joined
        column-wise, so the label ends up as the last column.
    Returns (train, test), each the row-wise concatenation of its clips.
    Raises ValueError (from pd.concat) if either side receives no clip.
    '''
    nbatch=ntrainbatch-1
    trainframes=[]
    testframes =[]
    for i in range(45):
        # Use the arguments; the body used to read the names `data` and
        # `label`, which are not parameters of this function.
        frame = pd.concat([pd.DataFrame(X[i]), pd.DataFrame(y[i])], axis=1)
        if i%15-1<nbatch:
            trainframes.append(frame)
        if i%15>nbatch:
            testframes.append(frame)
    train = pd.concat(trainframes)
    test  = pd.concat(testframes)
    return train, test


def fulldfsplit(nsubjects=16):
    '''Train-Test from Full DF function.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects, unpickles
    the 'data' and 'label' entries, splits each subject with
    ``splitdata(..., 10)`` and concatenates the per-subject results.

    Returns (Xytrain16_DF, Xytest16_DF).
    '''
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        train, test = splitdata(data, label, 10)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)

    # One unified DataFrame per split.
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)

    return Xytrain16_DF, Xytest16_DF

In [8]:
###slice to even values
def allsets(X,y,slice_size=13, trackdict=False,Xymerge=False):
    '''Cut every clip into consecutive, non-overlapping slices of slice_size rows.

    Clips are read as ``X[0] .. X[len(X)-1]``. Each clip yields
    ``len(clip) // slice_size`` slices; leftover rows at the end are dropped.

    Xymerge: when True the matching ``y[i]`` is appended as the last column
        of every slice; when False only X is sliced.
    trackdict: when True, also return a dict mapping ``"clip {i}"`` to
        ``"slices:{count}"`` for every clip that produced at least one slice.
    Returns a flat list of np.arrays (and the dict if asked).
    '''
    slices = []
    dicc={}
    for i in range(len(X)):
        if Xymerge:
            merge = pd.concat([pd.DataFrame(X[i]), pd.DataFrame(y[i])], axis=1)
        else:
            merge = pd.DataFrame(X[i])
        setscomp = len(merge)//slice_size  # number of complete slices in this clip
        for e in range(setscomp):
            slic = merge.iloc[slice_size*e:slice_size*e+slice_size]
            slices.append(np.array(slic))
        if setscomp:
            # This bookkeeping was commented out, so trackdict always returned {}.
            dicc[f"clip {i}"]=f"slices:{setscomp}" #dicc {clip,slice}
    if trackdict:
        return slices, dicc
    return slices #flat list of sliced arrays
#.----------
#13-slices
def fulldfslices(nsubjects=16, slice_size =13, trackdict=False, Xymerge=False):
    '''Collect fixed-length slices of every clip of every subject.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects, unpickles
    the 'data' and 'label' entries and slices every clip with ``allsets``.

    slice_size is the desired row length of each slice.
    trackdict returns a dict with the slices per clip, keyed by "subject {i}".
        To retrieve: slices, dicc = fulldfslices(trackdict=True)
    Xymerge=True appends y as the last column of every slice.
    Returns the flat list of slices, or (list, dicc16) when trackdict is True.
    '''
    Xy16_list = []
    dicc16  = {}
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        if trackdict:
            Xy, dicc = allsets(data, label, slice_size, trackdict=True, Xymerge=Xymerge)
            dicc16[f"subject {i}"] = dicc  # per-subject {clip: slices}
        else:
            Xy = allsets(data, label, slice_size, Xymerge=Xymerge)
        Xy16_list += Xy

    if trackdict:
        # The tracking dict used to be built and then discarded.
        return Xy16_list, dicc16
    return Xy16_list #list of all slices

def fulldfsplit(nsubjects=16):
    '''Train-Test from Full DF function.

    Reads ``../data/{subject}_123.npz`` for subjects 1..nsubjects, unpickles
    the 'data' and 'label' entries, splits each subject with
    ``splitdata(..., 10)`` and concatenates the per-subject results.

    Returns (Xytrain16_DF, Xytest16_DF).
    '''
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        train, test = splitdata(data, label, 10)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)

    # One unified DataFrame per split.
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)

    return Xytrain16_DF, Xytest16_DF

In [11]:
# Features-only slices (no label column) for all 16 subjects.
Xylist = fulldfslices(Xymerge=False)

In [12]:
len(Xylist), Xylist[0].shape #(1904, (13, 310)) with Xymerge=False; presumably 311 columns with Xymerge=True

(1904, (13, 310))

In [116]:
def y_unique(nsubjects=16):
    '''One label per clip, as a column vector, for fitting an RNN on padded X.

    Reads the 'label' entry of ``../data/{subject}_123.npz`` for subjects
    1..nsubjects and reduces each of the 45 clips to its single label value.

    Returns a float32 array of shape ``(45 * nsubjects, 1)``.
    Raises ValueError if a clip does not contain exactly one distinct label.
    '''
    yunique = []
    for i in range(1, nsubjects+1):
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            y = pickle.loads(archive['label'])
        for e in range(45):
            values = np.unique(y[e])
            # int() on a 1-D array is deprecated in NumPy, so take the element explicitly.
            if values.size != 1:
                raise ValueError(
                    f"subject {i}, clip {e}: expected exactly one distinct label, found {values.size}"
                )
            yunique.append(int(values[0]))
    return np.array(yunique).astype(np.float32).reshape(-1, 1)

In [77]:
def access16(y=True, nsubjects=16):
    '''Load the per-subject files into dicts keyed by subject number (1..nsubjects).

    Reads ``../data/{subject}_123.npz`` and unpickles the 'data' entry and,
    when ``y`` is True, the 'label' entry.

    if y: returns data16 and label16.
    if not y: returns only data16.
    '''
    data16  = {}
    label16 = {}
    for i in range(1, nsubjects+1):
        # Open each archive once and close it deterministically.
        # NOTE: pickle.loads runs arbitrary code; only use trusted data files.
        with np.load(f'../data/{i}_123.npz') as archive:
            data16[i] = pickle.loads(archive['data'])
            if y:
                label16[i] = pickle.loads(archive['label'])
    if y:
        return data16, label16
    return data16

def rnn_df(nsubjects=16):
    '''Return a list with the unpickled 'data' entry of each subject file, in subject order.'''
    data16 = []
    for i in range(1, nsubjects+1):
        with np.load(f'../data/{i}_123.npz') as archive:
            data16.append(pickle.loads(archive['data']))
    return data16

def concate(X,y):
    '''Join an X frame and a y frame column-wise (y last). Returns a DataFrame.'''
    Xyframe = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
    return Xyframe

def gatherdata(X, y):
    '''Stack the 45 clips of one subject into a single DataFrame.

    X, y: indexable by clip number 0..44, e.g. data16[1], label16[1].
    Returns the row-wise concatenation of ``concate(X[i], y[i])``
    (the notebook reports shape (1823, 311) for one subject).
    '''
    Xyframes = [concate(X[i], y[i]) for i in range(45)]
    XyDF = pd.concat(Xyframes)
    return XyDF

def fulldf(nsubjects=16):
    '''Return one DataFrame with X and y for subjects 1..nsubjects; y is labelled 'target'.'''
    # Pass nsubjects through so exactly the requested subjects are loaded.
    data16, label16 = access16(nsubjects=nsubjects)
    # One stacked frame per subject, then stack the subjects.
    Xy16_list = [gatherdata(data16[e], label16[e]) for e in range(1, nsubjects+1)]
    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    return XyDF   #pd.DF with X and y. y labelled as 'target'