In [1]:
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import random

from sklearn.preprocessing import MinMaxScaler


In [90]:
# INTERNAL FUNCS
def splitdata(data, label, ntrainbatch=10):
    """Split the 45 indexed entries of ``data``/``label`` into train and test frames.

    For every index ``i`` in ``range(45)`` the entry ``data[i]`` is placed
    side by side (column-wise) with ``label[i]``. The position of ``i`` inside
    its group of 15 (``i % 15``) decides the destination: positions below
    ``ntrainbatch`` go to the train set, the remaining positions go to the
    test set.

    Parameters
    ----------
    data, label : indexable by the integers 0..44
        Each entry must be accepted by ``pd.DataFrame``.
    ntrainbatch : int, default 10
        Number of leading positions (out of each group of 15) used for training.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
        Row-wise concatenation of the selected frames. Original row indices
        are kept (no index reset).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when one of the two sets ends up empty.
    """
    def clip_frame(idx):
        # Features and label of one entry, joined column-wise.
        return pd.concat([pd.DataFrame(data[idx]), pd.DataFrame(label[idx])], axis=1)

    train_parts, test_parts = [], []
    for clip in range(45):
        position = clip % 15
        if position < ntrainbatch:
            train_parts.append(clip_frame(clip))
        if position > ntrainbatch - 1:
            test_parts.append(clip_frame(clip))

    return pd.concat(train_parts), pd.concat(test_parts)

def gatherdata(X, y):
    """Stack the 45 indexed entries of ``X`` and ``y`` into a single DataFrame.

    For each index ``i`` in ``range(45)``, ``X[i]`` and ``y[i]`` are turned
    into DataFrames and joined column-wise; the 45 resulting frames are then
    concatenated row-wise. Row indices are not reset.
    """
    per_clip = [
        pd.concat([pd.DataFrame(X[clip]), pd.DataFrame(y[clip])], axis=1)
        for clip in range(45)
    ]
    return pd.concat(per_clip)

def allsets(X, y, slice_size=13):
    """Cut every entry into consecutive, non-overlapping slices of ``slice_size`` rows.

    For each index ``i`` in ``range(45)``, ``X[i]`` and ``y[i]`` are joined
    column-wise and split into as many complete slices of ``slice_size`` rows
    as fit. Trailing rows that do not fill a complete slice are discarded.

    Parameters
    ----------
    X, y : indexable by the integers 0..44
        Each entry must be accepted by ``pd.DataFrame``.
    slice_size : int, default 13
        Number of rows in every slice; must be at least 1.

    Returns
    -------
    df : pd.DataFrame
        All slices concatenated row-wise (row indices are not reset).
    dicc : dict
        Maps ``"clip {i}"`` to ``"slices:{count}"`` for every entry that
        contributed at least one slice.

    Raises
    ------
    ValueError
        If ``slice_size`` is smaller than 1, or (from ``pd.concat``) if no
        entry is long enough to yield a single slice.
    """
    if slice_size < 1:
        raise ValueError(f"slice_size must be >= 1, got {slice_size}")

    slices = []
    dicc = {}
    for i in range(45):
        conc = pd.concat([pd.DataFrame(X[i]), pd.DataFrame(y[i])], axis=1)
        setscomp = len(conc) // slice_size  # number of complete slices in this entry

        for e in range(setscomp):
            start = slice_size * e
            # Slice the frame of the *current* entry; both bounds use slice_size.
            slices.append(conc.iloc[start:start + slice_size])

        if setscomp:
            dicc[f"clip {i}"] = f"slices:{setscomp}"  # dicc {clip, slice count}

    df = pd.concat(slices)
    return df, dicc

In [92]:
# GLOBAL FUNCS

#Train-Test from Full DF function 
def fulldfsplit(nsubjects=16, ntrainbatch=10):
    '''Load every subject file and return unified train and test DataFrames.

    Files must be named '{subject#}_123.npz' and live in '../data/' relative
    to the working directory. Each archive must contain a 'data' and a 'label'
    entry holding pickled objects, which are passed to splitdata.

    nsubjects:   subjects 1..nsubjects are loaded.
    ntrainbatch: forwarded to splitdata (defaults to 10).

    Returns (Xytrain16_DF, Xytest16_DF): the per-subject train and test frames
    concatenated row-wise. Column labels are left exactly as splitdata
    produces them.
    '''
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1, nsubjects + 1):
        # Open each archive once and close it as soon as both entries are read.
        with np.load(f'../data/{i}_123.npz') as archive:
            # NOTE(review): pickle.loads can execute arbitrary code; only use
            # this on data files from a trusted source.
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        # Split this subject's data into train/test frames.
        train, test = splitdata(data, label, ntrainbatch)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)

    # Create one unified DF from each list.
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)

    return Xytrain16_DF, Xytest16_DF

#Full DF no split
def fulldf(nsubjects=16):
    '''Load every subject file and return a single, unsplit DataFrame.

    Files must be named '{subject#}_123.npz' and live in '../data/' relative
    to the working directory. Each archive must contain a 'data' and a 'label'
    entry holding pickled objects, which are passed to gatherdata.

    nsubjects: subjects 1..nsubjects are loaded.

    Returns the per-subject frames concatenated row-wise, with the last
    column renamed to 'target'.
    '''
    Xy16_list = []
    for i in range(1, nsubjects + 1):
        # Open each archive once and close it as soon as both entries are read.
        with np.load(f'../data/{i}_123.npz') as archive:
            # NOTE(review): pickle.loads can execute arbitrary code; only use
            # this on data files from a trusted source.
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        # Gather this subject's data into one frame.
        Xy16_list.append(gatherdata(data, label))

    XyDF = pd.concat(Xy16_list)
    # Only the last column label is replaced; the others are kept as they are.
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    return XyDF

#Full DF, no split, slicing each clip to multiples of 13
def fulldfslices(nsubjects=16, slice_size =13, trackdict=False):
    '''Load every subject file and return one DataFrame built from fixed-size slices.

    Files must be named '{subject#}_123.npz' and live in '../data/' relative
    to the working directory. Each archive must contain a 'data' and a 'label'
    entry holding pickled objects, which are passed to allsets.

    nsubjects:  subjects 1..nsubjects are loaded.
    slice_size: the desired row length of each slice (forwarded to allsets).
    trackdict:  when True, also return the per-subject tracking dict.

    Returns XyDF (last column renamed to 'target'), or (XyDF, dicc16) when
    trackdict is True, where dicc16 maps "subject {i}" to the dict returned
    by allsets for that subject.
    '''
    Xy16_list = []
    dicc16  = {}
    for i in range(1, nsubjects + 1):
        # Open each archive once and close it as soon as both entries are read.
        with np.load(f'../data/{i}_123.npz') as archive:
            # NOTE(review): pickle.loads can execute arbitrary code; only use
            # this on data files from a trusted source.
            data  = pickle.loads(archive['data'])
            label = pickle.loads(archive['label'])

        # Slice this subject's data and keep its tracking dict.
        Xy, dicc = allsets(data, label, slice_size)
        Xy16_list.append(Xy)
        dicc16[f"subject {i}"] = dicc  # dict of dicc {clip, slice count}

    XyDF = pd.concat(Xy16_list)
    # Only the last column label is replaced; the others are kept as they are.
    XyDF.columns = [*XyDF.columns[:-1], 'target']

    if trackdict:
        return XyDF, dicc16
    return XyDF

In [6]:
#Dayus formula
def get_X_y(df, 
            X_length=13, # 
            y_length=13, 
            number_of_sequences=51, 
            number_of_targets=1, 
            val=False, 
            val_cutoff=0.8):
    """Sample random (X window, y window) pairs from a feature/target frame.

    The last column of ``df`` (by position) is the target; every other column
    is a feature. Features are scaled with a ``MinMaxScaler`` fitted on the
    whole frame. For each sampled start position ``s``, the X window is the
    ``X_length`` scaled feature rows immediately before ``s`` and the y
    window is the ``y_length`` target values starting at ``s``.

    Parameters
    ----------
    df : pd.DataFrame
        Features followed by one target column.
    X_length : int
        Rows per X window.
    y_length : int
        Values per y window.
    number_of_sequences : int
        Number of (X, y) pairs to draw; start positions are drawn without
        replacement.
    number_of_targets : int
        Currently unused; kept for interface compatibility.
    val : bool
        When True, start positions are restricted to rows at or after
        ``int(len(df) * val_cutoff)``.
    val_cutoff : float
        Fraction of the rows that precedes the validation region.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` has shape (number_of_sequences, X_length, n_features) and ``y``
        has shape (number_of_sequences, y_length).

    Raises
    ------
    IndexError
        If ``number_of_sequences`` exceeds the number of available start
        positions.

    Notes
    -----
    Sampling uses the global ``random`` module; call ``random.seed`` first
    for reproducible results.
    """
    # Split features/target by position so that duplicated column labels
    # cannot cause more than the target column to be removed.
    df_X = df.iloc[:, :-1]
    df_y = df.iloc[:, -1]

    # Scale the features to [0, 1]; labels and index are preserved.
    scaler = MinMaxScaler()
    df_X_scaled = pd.DataFrame(scaler.fit_transform(df_X), columns=df_X.columns, index=df_X.index)

    # Candidate start positions. A start always needs X_length rows before it
    # (otherwise the X slice would get a negative start) and y_length rows
    # from it onwards.
    first_start = int(X_length)
    if val:
        first_start = max(first_start, int(len(df_y) * val_cutoff))
    sample_list = list(range(first_start, int(len(df_y) - y_length)))

    if number_of_sequences > len(sample_list):
        raise IndexError(
            f"requested {number_of_sequences} sequences but only "
            f"{len(sample_list)} start positions are available"
        )
    random.shuffle(sample_list)

    # Nested lists that become the returned arrays.
    X, y = [], []
    for _ in range(number_of_sequences):
        random_start = sample_list.pop()
        Xi = df_X_scaled.iloc[random_start - X_length:random_start]
        yi = df_y.iloc[random_start:random_start + y_length]
        X.append(Xi.values.tolist())
        y.append(yi.values.tolist())

    return np.array(X), np.array(y)

In [8]:
# Tests on the get_X_y function.
# NOTE: this cell needs the notebook-level `df` created by `df = fulldfslices()`,
# which is defined in a later cell; run that cell first.
X,y= get_X_y(df, 
            X_length=13, # 
            y_length=13, 
            number_of_sequences=51, 
            number_of_targets=4, 
            val=True, 
            val_cutoff=0.8)
print(X.shape, y.shape) #((51, 13, 310), (51, 13))

# Reproduce the validation start-position range by hand for a tiny window.
y_length = 3
X_length = 3
val_cutoff = .999
# The target column has one value per row of df, so len(df) is the length
# get_X_y uses internally (its df_y is local and not visible here).
sample_list = list(range(int(len(df)*val_cutoff), int(len(df)-y_length))) #y_length pans the end
sample_list

In [12]:
# Randomstart exercise

#random.shuffle(list(range(length-min_size)))
#randslicestart = False
#start=random.choice(list(range(length-min_size))
#if randslicestart:
#slic = conc.iloc[randstart:randstart+13]
#if not randslicestart:
#slic = conc.iloc[0:13]      

[0, 1, 2, 3, 4]
1


[1, 4, 0, 3, 2]

In [94]:
# Build the sliced dataset with the default settings spelled out, then display it.
df = fulldfslices(nsubjects=16, slice_size=13, trackdict=False)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,300,301,302,303,304,305,306,307,308,target
0,10.351274,8.478359,7.657808,9.115215,9.617855,9.965765,8.179382,7.175567,7.617582,7.930847,...,8.604859,7.827406,6.886811,5.196589,3.738288,8.809076,7.937103,7.085711,5.365358,3.865277
1,10.349975,8.477551,7.657058,9.115191,9.618256,9.964904,8.178533,7.174970,7.617338,7.931454,...,8.603685,7.827231,6.886709,5.195572,3.737141,8.808018,7.937104,7.085549,5.364444,3.864036
2,10.347788,8.476277,7.655869,9.115107,9.618442,9.963447,8.177262,7.173795,7.616820,7.931954,...,8.602335,7.827093,6.886249,5.193952,3.735543,8.806674,7.937112,7.085094,5.363152,3.862595
3,10.345007,8.474367,7.654376,9.115119,9.618277,9.961652,8.175384,7.172226,7.616229,7.932134,...,8.601078,7.827251,6.885465,5.192035,3.733665,8.805615,7.937364,7.084372,5.361651,3.861099
4,10.341874,8.471907,7.652900,9.115251,9.617869,9.959593,8.172932,7.170506,7.615714,7.932137,...,8.599649,7.827607,6.884306,5.190031,3.731868,8.804490,7.937713,7.083435,5.360154,3.859576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34,10.454476,8.569670,7.742804,9.134632,9.598853,10.025414,8.246529,7.220644,7.643463,7.912133,...,8.690939,7.817322,6.859976,5.202574,3.770688,8.871830,7.911786,7.075238,5.378821,3.910919
35,10.458923,8.575193,7.745847,9.135550,9.597961,10.029033,8.251487,7.222618,7.644589,7.910926,...,8.692500,7.817568,6.860357,5.204365,3.774398,8.874079,7.911761,7.076301,5.381045,3.914878
36,10.463055,8.580253,7.748671,9.136538,9.596827,10.032337,8.255966,7.224398,7.645612,7.909481,...,8.694603,7.818440,6.860554,5.206705,3.778204,8.877085,7.912314,7.077314,5.383770,3.918960
37,10.467096,8.584745,7.751183,9.137651,9.595519,10.035440,8.259823,7.225847,7.646520,7.907844,...,8.696740,7.819728,6.860508,5.209431,3.782042,8.880173,7.913196,7.078140,5.386946,3.923182


In [81]:
# Bring every array in `data` to a common (n, m) shape and stack them into
# a single array with one leading entry per key.
n, m = 13, 310  # replace with your desired shape

dict_of_arrays = data.copy()
# np.resize fills the target shape from the flattened input: it truncates
# inputs that are too long and repeats inputs that are too short.
dict_of_arrays.update(
    {name: np.resize(values, (n, m)) for name, values in data.items()}
)

stacked_array = np.stack([*dict_of_arrays.values()])
print(stacked_array.shape)

(45, 13, 310)
