<a href="https://colab.research.google.com/github/mmarcato/dog_posture/blob/ICML/posture_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Posture Algorithm
This notebook was created on 13th of January 2021. The code was migrated from Python Modules which were stored on GitHub.

## Modules

**Run this!** It imports external modules used in the code.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Setting up Caching
# `memory` is handed to the sklearn Pipelines below (memory = memory) so that
# fitted transformer steps are cached on disk under `location`.
import joblib
# rmtree is only referenced in commented-out cleanup code (see gs_dump).
from shutil import rmtree
location = './cachedir'
# verbose=10 makes joblib print a line for every cache load/store.
memory = joblib.Memory(location=location, verbose=10)

# Setting up logger
#import logging
#logger = log(__name__)
#logger.info('Modules imported')


# Importing Data & Extracting Features

**Skip this step** if you don't want to create new dataframes.


## Importing and Creating Raw Posture Dataset

**Skip this step!** The Subjects folder with the raw Timestamps and Actigraph files is not available on Google Drive.


Creates *df_raw* - Raw Posture Labelled Dataset - by combining the data from TimeStamps and Actigraph datasets through the following functions:

- *timestamp*: imports timestamps file with recordings of Episodes and Positions during the behaviour test (df_info, df_pos, df_ep)
- *actigraph*: imports and concatenates raw dataset from three Actigraph sensors (Back, Chest and Neck) to create a single dataset (df_imu)
- *label*: creates a label for each row in the raw Actigraph data based on the recordings in the timestamps file and saves the labelled dataset (df_raw)


In [None]:
import os
import pandas as pd
import glob

def _read_timeline(f_name, start, label_col, vt_col):
    '''
    Reads one (label, virtual time) column pair from a timestamps file.

    f_name: path to the <dc>_Timestamps.csv file
    start: real start time of the behaviour test
    label_col: name of the label column ('Episode' or 'Position')
    vt_col: name of the matching virtual-time column ('Ep-VT' or 'Pos-VT')

    return: df indexed by real time (start + virtual time) containing the
            lower-cased label, the virtual time and a 'Duration' column
    '''
    # The table header sits on the 7th line of the file
    df = pd.read_csv(f_name, skiprows = 6, usecols = [label_col, vt_col]).dropna()
    # Real Time (RT) = test start + Virtual Time (VT), used as the index
    df.index = start + pd.to_timedelta(df[vt_col])
    # Duration = time until the next marking (NaT for the last marking)
    df['Duration'] = df.index.to_series().diff().shift(-1)
    df[label_col] = df[label_col].str.lower()
    return df


def timestamps(subjects, dcs, base_dir): 
    '''
    Imports the timestamps files and organises the data.

    subjects: list of subject (dog) folder names inside base_dir
    dcs: list of data collection names, only the last character is used to
         build the file name (e.g. 'DC1' -> '1_Timestamps.csv')
    base_dir: path to the subjects folder

    return (df_info, df_pos, df_ep):
    df_info: df containing 'Subject', 'DC', 'Date', 'Start time'
    df_pos: dict[subject][dc] of df indexed by real time containing
            'Position', 'Pos-VT', 'Duration', 'Type' (None if file is missing)
    df_ep: dict[subject][dc] of df indexed by real time containing
           'Episode', 'Ep-VT' and 'Duration' (None if file is missing)
    '''
    # Local import: the notebook never defines a module-level `logger`
    import logging

    print('\nImporting Timestamp files - Episode and Position Data') 

    # Keys must be lower case because 'Position' is lower-cased before mapping
    pos_type = {'walking': 'dynamic', 'w-sniffing floor': 'dynamic',
                'standing':'static', 'sitting':'static', 
                'lying down': 'static', 'jumping up': 'dynamic',
                'jumping down':'dynamic', 'body shake':'dynamic',
                's-sniffing floor': 'static', 'pull on leash': 'dynamic',
                'moving': 'dynamic'}

    stats = []
    df_ep, df_pos = {},{}
    for subj in subjects:
        df_ep[subj], df_pos[subj] = {},{}
        for dc in dcs:
            df_ep[subj][dc], df_pos[subj][dc] = None, None
            f_name = os.path.join(base_dir, subj, '%s_Timestamps.csv' % dc[-1])
            if not os.path.exists(f_name):
                continue

            # Read the information about the behaviour test 
            # (the value column is named after the subject)
            header = pd.read_csv(f_name, index_col = 0, nrows = 4, usecols = [0,1])
            date = header[subj]['Date']
            time = header[subj]['Start time']
            dt = pd.to_datetime(date + time, format = '%d/%m/%Y%H:%M:%S' )            
            stats.append([subj, dc, date, time])

            df_ep[subj][dc] = _read_timeline(f_name, dt, 'Episode', 'Ep-VT')
            df_pos[subj][dc] = _read_timeline(f_name, dt, 'Position', 'Pos-VT')
            # Positions that are not in pos_type get NaN as 'Type'
            df_pos[subj][dc]['Type'] = df_pos[subj][dc]['Position'].map(pos_type)

    df_info = pd.DataFrame(stats, columns = ['Subject', 'DC', 'Date', 'Start time'])
    logging.getLogger(__name__).info('\t Imported Timestamps for \n{}'.format(df_info))

    return(df_info, df_pos, df_ep)

def actigraph(df_info, base_dir):
    '''
    Imports the Actigraph (IMU) files of the three sensors and joins them.

    df_info: df containing 'Subject' and 'DC' (as returned by timestamps)
    base_dir: path to the subjects folder

    return: dict[subject][dc] of df indexed by 'Timestamp' whose columns are
            named bodypart.sensor.axis (e.g. Back.Acc.X), or None when the
            <dc>_Actigraph folder does not exist

    raises FileNotFoundError if the folder exists but a sensor file is missing
    '''
    # Local import: the notebook never defines a module-level `logger`
    import logging
    logger = logging.getLogger(__name__)

    df_imu = {}
    bps = ['Back', 'Chest', 'Neck']
    print('\nImporting Actigraph files - IMU Data')
    logger.info('\t Started Importing Actigraph data')
    for subj in df_info['Subject'].unique():
        df_imu[subj] = {}       
        # Iterating through data collections
        for dc in df_info[df_info.Subject == subj]['DC']:
            df_imu[subj][dc] = None
            dc_dir = os.path.join(base_dir, subj, '%s_Actigraph' % dc[-1])
            # Data collections without an Actigraph folder stay as None
            if not os.path.isdir(dc_dir):
                continue

            df_list = []
            # Looping through all bps
            for bp in bps:
                # Find file path for each bp (sorted so the choice is deterministic)
                matches = sorted(glob.glob(os.path.join(dc_dir, '*_%s.csv' % bp)))
                if not matches:
                    raise FileNotFoundError(
                        'No Actigraph file for %s found in %s' % (bp, dc_dir))

                df_bp = pd.read_csv(matches[0], index_col = 'Timestamp')
                # Parsing the index after reading avoids the deprecated date_parser argument
                df_bp.index = pd.to_datetime(df_bp.index, format = '%Y-%m-%d %H:%M:%S.%f')
                df_list.append(df_bp.drop(['Temperature'], axis = 1))

            # Concatenating dataframes for different body parts in one single dataframe
            # Results in one dataframe per dog per data collection
            df_imu[subj][dc] = pd.concat(df_list, axis = 1, keys = bps, \
                                         names = ['Body Parts', 'Sensor Axis'])
            # Change column names to be bodypart.sen.axis (Back.Acc.X):
            # first three letters of the sensor name + last character (the axis)
            df_imu[subj][dc].columns = [f'{i}.{j[:3]}.{j[-1]}' for i,j in df_imu[subj][dc].columns]
    logger.info('\t Finished Importing Actigraph data')
    return(df_imu)


def label(df_info, df_pos, df_imu, df_dir):
    '''
        Combines data from df_imu and df_pos to create a 
            df containing raw df_imu data plus Dog, DC, Type, Position 
                based on the markings in df_pos, and saves it as df_raw.csv

        df_info: df containing 'Subject', 'DC', 'Date' and 'Start time'
        df_pos: dict[subject][dc] of df indexed by real time containing 
                'Position', 'Type' (as returned by timestamps)
        df_imu: dict[subject][dc] of df containing Actigraph data 
                (as returned by actigraph)
        df_dir: directory to save new dataframe

        return: labelled df without unlabelled rows and without 'moving' rows

        Note: the dataframes inside df_imu are modified in place 
              (columns Dog, DC, Type and Position are added to them).
    '''
    # Local import: the notebook never defines a module-level `logger`
    import logging
    logger = logging.getLogger(__name__)

    logger.info('\t Started creating labeled raw data')
    df_list = []  
    for subj in df_info['Subject'].unique():        
        # Iterating through data collections
        for dc in df_info[df_info.Subject == subj]['DC']:     
            print('\t',subj, dc)
            imu, pos = df_imu[subj][dc], df_pos[subj][dc]
            # actigraph/timestamps store None when the files were not found
            if imu is None or pos is None:
                logger.warning('\t Skipping {} {}: missing IMU or position data'.format(subj, dc))
                continue

            # Constant for the whole data collection, so set once outside the loop
            imu['Dog'] = subj
            imu['DC'] = dc

            starts = pos.index.to_series()
            # Each marking labels the rows from its own time up to the next marking;
            # the end of the last marking is NaT
            for (s_time, f_time) in zip(starts, starts.shift(-1)):
                imu.loc[s_time:f_time, 'Type'] = pos.loc[s_time, 'Type']
                imu.loc[s_time:f_time, 'Position'] = pos.loc[s_time, 'Position']

            df_list.append(imu)

    if not df_list:
        raise ValueError('No data collection with both IMU and position data to label')

    df = pd.concat(df_list)
    # Deleting rows with nan 
    df = df.dropna(axis = 0)
    # Deleting rows with 'Moving'
    df = df[df['Position'] != 'moving']
    df.to_csv(os.path.join(df_dir, '%s.csv' % 'df_raw'))
    logger.info('\t Finished creating labeled raw data')
    return(df)


In [None]:
# ------------------------------------------------------------------------- #
#                        Raw Data Importing parameters                      #    
# ------------------------------------------------------------------------- #
base_dir = 'C:\\Users\\marinara.marcato\\Project\\Data\\Subjects' # path to subjects folder
# directory where the labelled raw dataset (df_raw.csv) is saved;
# it has to be defined here because label() below needs it
df_dir = 'C:\\Users\\marinara.marcato\\Project\\Scripts\\dog_posture\\dfs'
# NOTE(review): the first directory entry is skipped - presumably a non-subject
# file/folder; os.listdir does not guarantee any order, confirm this is intended
subjects = os.listdir(base_dir)[1:]
dcs = ['DC1', 'DC2']

# creating info, positions, episode dataframes
df_info, df_pos, df_ep = timestamps(subjects, dcs, base_dir)
# creating imu dataset considering the timestamps created
df_imu = actigraph(df_info, base_dir)
# creating raw dataset and saving it 
df_raw = label(df_info, df_pos, df_imu, df_dir)

FileNotFoundError: ignored

## Importing and Creating Processed Dataset

**Skip this step!** If you don't want to create a new Posture Processed Dataset 

Importing the Posture Datasets (df_raw or df*)

- *posture*: importing raw (df_raw) or other processed dataset (dfs*), as they have the same structure (first row is the column name) 

In [None]:
def posture(df_dir, df_name = 'df_raw'):
  ''' imports data from any posture file, be it raw or processed

  df_dir: directory containing the dataset
  df_name: file name without the .csv extension
  return: df indexed by 'Timestamp' (parsed as datetime)
  '''
  # os.path.join keeps the path valid on both Windows and Linux/Colab
  df = pd.read_csv(os.path.join(df_dir, '%s.csv' % df_name), index_col = 'Timestamp')
  # Parsing the index after reading avoids the deprecated date_parser argument
  df.index = pd.to_datetime(df.index, format = '%Y-%m-%d %H:%M:%S.%f')
  return(df)
  
# importing created raw dataset - shortcut for all the processes above    
# (df_dir must already hold the directory that contains df_raw.csv)
df_raw = posture(df_dir, 'df_raw')


Creating Processed Datasets

Processing df_raw using the following functions:
- *transitions*: detects transitions in time and position in **df_raw**
- *features*: calculates simple features


In [None]:
def transitions(df, sample_period = None): 
    '''
        Process df_raw to create:
            1. Trans-Pos: transition in Position between consecutive rows
            2. Trans-Time: transition in Time, i.e. the gap to the previous 
               or to the next row differs from the sampling period 
               (moving in between two positions or different dog)
            3. Transition: column combining both step 1. and 2.

        df: dataframe indexed by timestamp containing a 'Position' column;
            the three columns are added to it in place
        sample_period: expected time between rows, defaults to 10,000 
                       microseconds (100Hz)
        return: the same dataframe
    '''
    if sample_period is None:
        # 100Hz -> 10,000 microseconds
        sample_period = pd.Timedelta(microseconds = 10000)

    # Time elapsed since the previous row (NaT for the first row)
    gaps = df.index.to_series().diff()

    # Finding transitions in posture
    df['Trans-Pos'] = df['Position'].shift()  != df['Position']
    # Finding transitions in time: gap before or after the row is not one sample
    df['Trans-Time'] = (gaps != sample_period) | (gaps.shift(-1) != sample_period)
    # Combining the time and position transitions
    df['Transition'] = df['Trans-Pos'] | df['Trans-Time']
    # Changing last row into a transition, Transition column has s_time and f_time of the BT
    # (single positional assignment: df.iloc[-1]['Transition'] would write to a copy)
    if len(df) > 0:
        df.iloc[-1, df.columns.get_loc('Transition')] = True
    
    return(df)


def features(df_raw, df_dir, df_name, w_size, w_offset, t_time):
    '''     
    Extracts 'min', 'max', 'mean','std', 'median', 'sum', 'skew', 'kurt' from a 
    centred rolling window over each period between transitions in df_raw.
    Saves the transformed df to df_dir/df_name.csv and writes the parameters 
    used to df_dir/df_name.log

    df_raw: dataframe with all raw IMU measurements followed by the info 
            columns 'Dog', 'DC', 'Type', 'Position' 
            (the transition columns are added to it by transitions())
    df_dir: directory where the dataset and its log are saved
    df_name: dataset name
    w_size: size of the rolling window, in rows
    w_offset: resampling rule (offset between the values kept)
    t_time: transition time removed after the start and before the end of 
            each period

    return:
    df containing features calculated and label 'Position' and 'Type'
    '''
    # Local import: the notebook never defines `log` or a module-level `logger`
    import logging

    print('Processing simple features \n df_name {}, w_size {}, w_offset {}, t_time{}'.format(df_name, w_size, w_offset, t_time))

    # Finding transitions in posture
    df_raw = transitions(df_raw)

    stat_names = ['min', 'max', 'mean','std', 'median', 'sum', 'skew', 'kurt']
    # Period starts: every transition row except the last one, moved forward by t_time
    s_times = df_raw.loc[df_raw['Transition'] == True].index[:-1] + t_time
    # Period ends: every row that is followed by a transition, moved back by t_time
    f_times = df_raw.loc[df_raw['Transition'].shift(-1) == True].index - t_time

    df_l2 = []
    # Iterating over the periods between transitions
    for (s_time, f_time) in zip(s_times, f_times):
        # Only periods whose second row is not a transition in time are used
        next_row = df_raw.index.get_loc(s_time - t_time) + 1
        if df_raw['Trans-Time'].iloc[next_row]:
            continue

        # Last 7 columns are Dog, DC, Type, Position and the 3 transition columns
        rolling = df_raw.loc[s_time:f_time].iloc[:, :-7].rolling(w_size, center = True)
        df_l1 = [getattr(rolling, stat)().resample(w_offset).first() for stat in stat_names]

        df_l2.append( pd.concat(df_l1, axis = 1, keys = stat_names,\
            names = ['Statistics','BodyPart.SensorAxis'])\
            .assign(Dog = df_raw.loc[s_time,'Dog'], DC = df_raw.loc[s_time,'DC'], Type = df_raw.loc[s_time,'Type'], Position = df_raw.loc[s_time, 'Position']))  

    if not df_l2:
        raise ValueError('No period between transitions was long enough to calculate features')

    df = pd.concat(df_l2)
    # Renaming the columns to contain stats.bodypart.sensor.axis, e.g. mean.Back.Acc.X, keeping last 4 columns (info and label) the same
    df.columns = df.columns[:-4].map('{0[0]}.{0[1]}'.format).append(df.columns[-4:].droplevel(1))
    print('Shape before dropping NAs', df.shape)
    df = df.dropna()
    print('Shape after dropping NAs', df.shape)

    print('Save df to csv')
    df.to_csv(os.path.join(df_dir, '%s.csv' % df_name))

    # Dataset-specific log file stored next to the dataset
    df_logger = logging.getLogger(df_name)
    df_logger.setLevel(logging.INFO)
    handler = logging.FileHandler(os.path.join(df_dir, '%s.log' % df_name))
    df_logger.addHandler(handler)
    try:
        df_logger.info('\n\t Dataset created with simple_feature parameters: \n\ndf_name: {}, w_size: {}, w_offset: {}us, t_time: {}us'.format(df_name, w_size, w_offset, t_time))
        df_logger.info('\n\t Number of Examples in raw dataframe \n{} \n\n{}\n'.format(df['Position'].value_counts(), df['Type'].value_counts()))
        df_logger.info('\n\t Including data from  \n{}\n\n'.format( df.groupby(['Dog', 'DC']).size() ))
    finally:
        # Releasing the file so repeated calls do not stack handlers
        df_logger.removeHandler(handler)
        handler.close()
    logging.getLogger(__name__).info('{}: Dataset created. See log for parameter details'.format(df_name))

    return (df)


In [None]:
# ------------------------------------------------------------------------- #
#                            Feature Engineering                            #    
# ------------------------------------------------------------------------- #

# defining hyperparameters: window size, window offset and transition time 
# df_name:    dataset file name 
# t_time:     transition time - between positions used for creating a position window 
# w_size:     window size - for feature calculation, considering that raw data are recorded at 100Hz
# w_offset:   window offset - for resampling, taken from start_time + t_time + w_size/2 * as feature are calculated from centre of window
df_dir = 'C:\\Users\\marinara.marcato\\Project\\Scripts\\dog_posture\\dfs'
df_name = 'df_32'
w_size = 25
# pd.Timedelta is used because `timedelta` is not imported in this notebook
w_offset = pd.Timedelta(seconds = .10)
t_time = pd.Timedelta(seconds = .25)

# creating dataset with defined hyperparameter 
print(df_name, w_size, w_offset, t_time)
# features() is defined in this notebook (it used to live in the `process` module)
df_feat = features(df_raw, df_dir, df_name, w_size, w_offset, t_time)


# Handling Dataset

**Run this!** Import, Visualise and Prepare Dataset for Machine Learning steps

## Importing and Visualising Dataset

**Run this!** If you already have the Processed Datasets (df*) available. Importing df* and Visualising Distribution of Classes per Subjects

### Functions

- *posture*: imports processed dataset (dfs*)
- *distribution*: plots the distribution of posture(label) given different dogs in df 

In [None]:
def posture(df_dir, df_name = 'df_raw'):
  ''' imports data from any posture file, be it raw or processed

  df_dir: directory containing the dataset
  df_name: file name without the .csv extension
  return: df indexed by 'Timestamp' (parsed as datetime)
  '''
  df = pd.read_csv(os.path.join(df_dir, '%s.csv' % df_name), index_col = 'Timestamp')
  # Parsing the index after reading avoids the deprecated date_parser argument
  df.index = pd.to_datetime(df.index, format = '%Y-%m-%d %H:%M:%S.%f')
  return(df)

def distribution (df, df_desc):
    '''
        Prints a summary of df and plots the number of examples per position 
        for each dog.

        df: dataframe containing 'Dog', 'DC' and 'Position' columns
        df_desc: description printed before the summary
        return: df with the number of examples ('count') per 'Position' and 'Dog'
    '''
    print(df_desc)
    # number of dogs and of (dog, data collection) pairs included
    n_dogs = df['Dog'].unique().size
    print('\nNumber of Dogs: {}'.format(n_dogs))
    n_dcs = df.groupby(['Dog' ,'DC']).size().count()
    print('Number of DCs: {}\n'.format(n_dcs))

    # number and percentage of examples, first per dog and then per position
    for column in ('Dog', 'Position'):
        summary = df[column].value_counts().reset_index(name= 'count')
        summary['percentage'] = summary['count']*100 /summary['count'].sum()
        print(summary)

    title = 'Distribution of Positions per dog'
    print('\n' + title)
    plt.figure(figsize=(10,5))
    # positions are shown from the most to the least frequent
    position_order = df['Position'].value_counts().index
    chart = sns.countplot(x="Position", hue="Dog", data = df, order = position_order)
    chart.set_title(title)
    chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
    # legend in multiple columns
    chart.legend(ncol = 4, bbox_to_anchor = (1,1))

    counts = df.groupby(['Position', 'Dog']).size()
    return(counts.reset_index(name='count'))

### Main

In [None]:
# ------------------------------------------------------------------------- #
#                             Machine Learning                              #    
# ------------------------------------------------------------------------- #
''' setting parameters

''' 
# directory (on the mounted Google Drive) and name of the processed dataset
df_dir = '/content/drive/My Drive/Posture Algorithm/dfs'
df_name = 'df_32'

# importing previously created datasets
df_feat = posture(df_dir, df_name)  

# visualising feature distribution  
#df_dist = distribution(df_feat, 'Original Dataset')


## Preparing Dataset
**Run this!** The split function was designed to make sure every dog's data is constrained to one of the sets only. 


### Modules

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
        Pipeline step that picks columns of a dataframe and returns them as 
        an array.

        attribute_names: columns to select
        dtype: optional dtype the selected columns are cast to
    '''
    def __init__(self, attribute_names, dtype=None):
        self.dtype = dtype
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # nothing to learn, the selection is fixed at construction
        return self

    def transform(self, X):
        subset = X[self.attribute_names]
        # cast only when a dtype was given
        subset = subset.astype(self.dtype) if self.dtype else subset
        return subset.values


### Functions

The dogs are first sorted by the number of different postures recorded for each dog/subject, so that df_test, followed by df_val, receives the data from the dogs that performed the most diverse set of postures.

- *split*: spliting the raw df into two df given a proportion
- *stats*: displays size (examples and dogs) and proportion (examples) of the split dataframes


In [None]:
def split (df, prop):
  '''
      split the dataset into two sets
      selects different dogs for each set
      dogs with most diverse position set are placed in the second set 

      df: dataframe containing 'Dog' and 'Position' columns
      prop: target proportion of examples (rows) for the second set
      return (df1, df2): df2 holds the dogs whose cumulative share of examples 
                         is the closest to prop, df1 holds the remaining dogs
  '''
  if len(df) == 0:
    raise ValueError('Cannot split an empty dataframe')

  df_counts = df.groupby(['Dog','Position']).size().reset_index(name = 'Counts')
  # Only 'Counts' is aggregated, the 'Position' strings are left out of the sum
  per_dog = df_counts.groupby('Dog')['Counts']
  df_summary = pd.DataFrame({'Counts': per_dog.sum(), 'Positions': per_dog.size()})
  # Dogs with the most positions first, larger recordings breaking the ties
  df_summary = df_summary.sort_values(['Positions', 'Counts'], ascending = False)
  cum_percentage = (df_summary['Counts'].cumsum()/df_summary['Counts'].sum()).to_numpy()
  # Position (not label) of the dog at which the cumulative share is closest to prop
  idx = int(np.argmin(np.abs(cum_percentage - prop)))
  dogs_chunk = df_summary.index[:idx+1].to_list()

  in_chunk = df.Dog.isin(dogs_chunk)
  df1 = df[~in_chunk]
  df2 = df[in_chunk]

  return(df1, df2)

def stats(dfs):
  ''' prints, for a list of dataframes, the number of examples, the share of 
      examples and the number of dogs in each of them
  '''
  # number of examples in each dataframe
  sizes = [len(df) for df in dfs]
  print(sizes)
  # share of the examples held by each dataframe
  total = sum(sizes)
  print([size/total for size in sizes])
  # number of different dogs in each dataframe
  print([len(df['Dog'].unique()) for df in dfs])


### Main
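The resulting dataframes follow one of two splits. The proportions refer to the share of examples (rows); each dog's data is kept in a single set, so the share of dogs in each set can differ.

**test-dev** split:
- *df_dev*: contains approximately 80% of the examples
- *df_test*: contains approximately 20% of the examples

**test-val-train** split:
- *df_train*: contains approximately 60% of the examples
- *df_val*: contains approximately 20% of the examples
- *df_test*: contains approximately 20% of the examples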



In [None]:
# creating dev and test sets
# (0.2 is the target share of examples for the second set, df_test)
df_dev, df_test = split(df_feat, 0.2)
stats([df_dev, df_test])

# creating train and val sets
# (0.25 of df_dev goes to df_val)
df_train, df_val = split(df_dev, 0.25)
stats([df_train, df_val, df_test])

# visualising feature distribution for dev and test sets
#process.distribution(df_dev, 'Development Dataset')
#process.distribution(df_test, 'Test Dataset')

# visualising feature distribution for dev and test sets
#process.distribution(df_train, 'Train Dataset')
#process.distribution(df_val, 'Validation Dataset')
#process.distribution(df_test, 'Test Dataset')

[115992, 30380]
[0.7924466428005357, 0.20755335719946438]
[32, 6]
[85503, 30489, 30380]
[0.5841486076572022, 0.20829803514333342, 0.20755335719946438]
[26, 6, 6]


In [None]:
# Select dataset used for the model selection below
df = df_train
# Select feature names (the last 4 columns are Dog, DC, Type and Position)
feat = df.columns[:-4]
# Removing all Magnetometer features 
# NOTE: from here on `features` is this list and no longer the features() function
features = [x for x in feat if "Mag" not in x]

# select features (the pipelines pick the `features` columns through DataFrameSelector)
X = df.loc[:, feat]
# setting label
# NOTE: from here on `label` is this string and no longer the label() function
label = 'Position'
# select label
y = df.loc[:, label].values
# setting a cv strategy that accounts for dogs
# split() returns a one-shot generator, so the folds are stored in lists to
# allow the same folds to be used by more than one grid search
cv0 = list(GroupKFold(n_splits = 10).split(X, y, groups = df.loc[:,'Dog']))
cv1 = list(LeaveOneGroupOut().split(X, y, groups = df.loc[:,'Dog']))

# number of different positions in the selected dataset
df[label].unique().size


9

9

# Grid Search

In [None]:
class gs_results:
    '''
        Storing Grid Search results: keeps only the attributes of a fitted 
        grid search that are needed to report on it later
    '''
    def __init__(self, gs):
        kept = ('cv_results_', 'best_estimator_', 'best_params_', 'best_score_')
        for attribute in kept:
            setattr(self, attribute, getattr(gs, attribute))

def gs_output(gs):
    '''
        Prints the key metrics of the parameter combination with the highest 
        mean test score: mean and std of the test and train scores, and the 
        best parameters

        gs: object with cv_results_ (including train scores) and best_params_
    '''
    results = gs.cv_results_
    best = np.argmax(results['mean_test_score'])
    template = "Best Estimator \nTest mean: {:.6f}\t std: {:.6f}\nTrain mean: {:.6f} \t std:  {:.6f}\nparameters: {}"
    print(template.format(
        results['mean_test_score'][best],
        results['std_test_score'][best],
        results['mean_train_score'][best],
        results['std_train_score'][best],
        gs.best_params_))

def gs_dump(gs, gs_name, gs_dir, memory, location):    
    '''
        Saving Grid Search Results to pickle file gs_dir/gs_name.pkl

        memory and location are currently unused: the cache clean-up that 
        would use them is commented out below
    '''
    target = '{}/{}.pkl'.format(gs_dir, gs_name)
    joblib.dump(gs, target, compress = 1 )
    #memory.clear()
    #rmtree(location)

## Random Forest

In [None]:
#################### RF
gs_pipe = Pipeline([
    ('selector', DataFrameSelector(features,'float64')),
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()), 
    ('estimator', RandomForestClassifier())       
], memory = memory) 

gs_params = {
    'reduce_dim__n_components' : [0.85, 0.90, 0.95],
    'estimator__max_depth' : [7, 10, 15],
    #'estimator__max_features' : [75, 82, 90, 100],
    'estimator__n_estimators' : [7, 10, 15, 20]
}
#n_cpus = multiprocessing.cpu_count()

gs_rf = GridSearchCV(gs_pipe, \
    cv = cv0, \
    scoring = 'f1_weighted', \
    param_grid = gs_params, \
    return_train_score = True, n_jobs = 1)
    
gs_rf.fit(X,y, groups = df_train.loc[:,'Dog'])
gs_output(gs_rf)
%time

# Saving Grid Search Results to pickle file 
gs_dir = '/content/drive/My Drive/Posture Algorithm/models'
gs_name = 'GS-RF-df_32-5'

joblib.dump(gs_results(gs_rf), '{}/{}.pkl'.format(gs_dir, gs_name), compress = 1 )
#gs_dump(gs=gs_rf, gs_name = gs_name, gs_dir = gs_dir, memory=memory, location)

In [None]:
#################### RF
gs_pipe = Pipeline([
    ('selector', DataFrameSelector(features,'float64')),
    ('scaler', StandardScaler()),
    ('reduce_dim', SelectKBest(f_classif)), 
    ('estimator', RandomForestClassifier())       
], memory = memory) 

gs_params = {
    'reduce_dim__k' : [70, 80, 90],
    'estimator__max_depth' : [7, 10, 15],
    #'estimator__max_features' : [75, 82, 90, 100],
    'estimator__n_estimators' : [2, 5, 7, 10]
    #'reduce_dim__n_components' : [80, 100, 120], 
}

gs_rf = GridSearchCV(gs_pipe, \
    cv = cv0, \
    scoring = 'f1_weighted', \
    param_grid = gs_params, \
    return_train_score = True, n_jobs = 1)
    
gs_rf.fit(X,y, groups = df_train.loc[:,'Dog'])
gs_output(gs_rf)
%time

# Saving Grid Search Results to pickle file 
gs_dir = '/content/drive/My Drive/Posture Algorithm/models'
gs_name = 'GS-RF-df_32-7'

joblib.dump(gs_results(gs_rf), '{}/{}.pkl'.format(gs_dir, gs_name), compress = 1 )
#gs_dump(gs=gs_rf, gs_name = gs_name, gs_dir = gs_dir, memory=memory, location)

[Memory]0.7s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/aecd4f96e7e7da40d078b510a8202f4e
___________________________________fit_transform_one cache loaded - 0.3s, 0.0min
[Memory]1.4s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/63ac992a5d1485b03c0f9831180f1a7b
___________________________________fit_transform_one cache loaded - 0.3s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(SelectKBest(k=70, score_func=<function f_classif at 0x7f5ff62c2378>), array([[ 0.578348, ...,  0.502025],
       ...,
       [-0.179725, ...,  0.054325]]), 
array(['standing', ..., 'jumping down'], dtype=object), None, message_clsname='Pipeline', message=None)
________________________________________________fit_transform_one - 0.9s, 0.0min
[Memory]0.7s, 0.0min    : Loading _fit_transform_o

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 0.8s, 0.0min
[Memory]0.7s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/c0bea16bfeb216b5c34a66f4d941be53
___________________________________fit_transform_one cache loaded - 0.1s, 0.0min
[Memory]1.3s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/9b80715ed522483c4729688360a92b3e
___________________________________fit_transform_one cache loaded - 0.1s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(SelectKBest(k=70, score_func=<function f_classif at 0x7f5ff62c2378>), array([[ 0.62341 , ...,  0.494886],
       ...,
       [-0.158451, ...,  0.048488]]), 
array(['standing', ..., 'jumping down'], dtype=object), None, message_clsname='Pipeline', message=None)
________________________________________________fi

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 1.0s, 0.0min
[Memory]0.8s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/42f1a0a38315ff66037b658ce0164cd5
___________________________________fit_transform_one cache loaded - 0.1s, 0.0min
[Memory]1.2s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/47231c21f9f70680feb5e08d3fdd4e76
___________________________________fit_transform_one cache loaded - 0.1s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(SelectKBest(k=80, score_func=<function f_classif at 0x7f5ff62c2378>), array([[ 0.589807, ...,  0.494872],
       ...,
       [-0.20462 , ...,  0.049165]]), 
array(['standing', ..., 'jumping down'], dtype=object), None, message_clsname='Pipeline', message=None)
________________________________________________fi

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])


________________________________________________fit_transform_one - 0.8s, 0.0min
[Memory]0.7s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/c0bea16bfeb216b5c34a66f4d941be53
___________________________________fit_transform_one cache loaded - 0.1s, 0.0min
[Memory]1.2s, 0.0min    : Loading _fit_transform_one from ./cachedir/joblib/sklearn/pipeline/_fit_transform_one/9b80715ed522483c4729688360a92b3e
___________________________________fit_transform_one cache loaded - 0.0s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.pipeline._fit_transform_one...
_fit_transform_one(SelectKBest(k=80, score_func=<function f_classif at 0x7f5ff62c2378>), array([[ 0.62341 , ...,  0.494886],
       ...,
       [-0.158451, ...,  0.048488]]), 
array(['standing', ..., 'jumping down'], dtype=object), None, message_clsname='Pipeline', message=None)
________________________________________________fi

['/content/drive/My Drive/Posture Algorithm/models/GS-RF-df_32-7.pkl']

In [None]:
# Loading Grid Search Results from pickle file
# NOTE: joblib.load unpickles the file, so only load files created by this notebook
gs_dir = '/content/drive/My Drive/Posture Algorithm/models'
gs_name = 'GS-RF-df_32-7'
# `gs` is the gs_results object stored by the grid search cell above
gs = joblib.load('{}/{}.pkl'.format(gs_dir, gs_name))
gs_output(gs)

Best Estimator 
Test mean: 0.787763	 std: 0.055293
Train mean: 0.940938 	 std:  0.002723
parameters: {'estimator__max_depth': 15, 'estimator__n_estimators': 5, 'reduce_dim__k': 80}


In [None]:
# converting cv_results_ of the loaded grid search (gs) into a dataframe:
# one row per parameter combination with its mean test score
# (the grid searches above are scored with f1_weighted, not accuracy)
pd.concat([pd.DataFrame(gs.cv_results_["params"]),pd.DataFrame(gs.cv_results_["mean_test_score"], columns=["f1_weighted"])],axis=1)

## Gradient Boosting

In [None]:
#################### GB
# Pipeline and parameter grid for Gradient Boosting.
# NOTE: this cell only defines them, no grid search is run here.
gs_pipe = Pipeline([
    ('selector', DataFrameSelector(features,'float64')),
    ('estimator', GradientBoostingClassifier())
], memory = memory)

gs_params = {
    'estimator__max_depth' : [10],
    'estimator__max_features' : [20],
    'estimator__n_estimators': [3, 5, 10]
}

## KNN


In [None]:
################## KNN
# KNeighborsClassifier is not part of the imports in the Modules cell
from sklearn.neighbors import KNeighborsClassifier

# Pipeline and parameter grid for K-Nearest Neighbours.
# NOTE: this cell only defines them, no grid search is run here.
gs_pipe = Pipeline([
    # DataFrameSelector is defined in this notebook (it used to live in the `learn` module)
    ('selector', DataFrameSelector(features,'float64')),
    ('scaler', StandardScaler()),
    ('estimator', KNeighborsClassifier(n_jobs=-1))
], memory = memory)

gs_params = {
    'estimator__n_neighbors' : [2,5,10,20,40],
    'estimator__weights': ['uniform', 'distance']
}

# Functions in Development

In [None]:
def balance (df, label, random_state = None):
    '''
        Balances df based on label
        Naive Undersampling, does not take into account the dogs  

        df: dataframe to balance
        label: name of the column holding the classes
        random_state: optional seed (or numpy RandomState) passed to 
                      DataFrame.sample so that the result is reproducible
        return: df with the same number of rows for every class, equal to 
                the size of the smallest class
    '''
    print('\nBalancing df for label', label , '\n')
    # Nothing to balance, and np.min would fail on an empty count
    if len(df) == 0:
        return df.copy()

    min_sample = np.min(df[label].value_counts())
    # Rows without a label belong to no class and are left out
    classes = df[label].dropna().unique()
    df_list = [df[df[label] == pos].sample(min_sample, random_state = random_state)
               for pos in classes]
    df_balanced = pd.concat(df_list)
    return df_balanced