# Multiclass MLP Classifier Implementation

A bit of copypasta from Jasmine's code:

In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt

plt.rc("font", size=14) 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

#from mpl_toolkits.basemap import Basemap


import os
import datetime
import pytz
import re

import peakutils
import statsmodels.api as sm

import requests


  from pandas.core import datetools


In [4]:
ride_ids = ['15692']


#ride_ids = ['14827']
# 14743 - Motion Control July 10th
# 14750 - Magnetometer Control July 11th
# 14814 - Pool Displacement Control July 17th
# 14815 - Compass Orientation (Lying on Charger Side) July 19th
# 14816 - Orientation w Higher Sampling (Lying on Charger Side) July 20th
# 14827 - Pool Displacement Control w Higher Sampling (Jul 23)
# 14888 - First Buoy Calibration Experiment (July 30)
# 15218 - Jasmine's Second Ride Sesh filmed with GoPro (Aug 29) //no footage
# 15629 - Jasmine's First Ride Sesh filmed with VIRB (Oct. 24) //first labelled footage!
# 15669 - Jasmine's Second Ride Sesh filmed with VIRB (Nov. 7) //second labelled footage!
# 15692 - Jasmine's 3rd Ride Sesh filmed with VIRB (Nov. 9) //third labelled footage!
# 15686 - Jasmine's 4th Ride Sesh filmed with VIRB (Nov. 11) //fourth labelled footage!


In [10]:
#%% Fin ID scraper
# Input fin ID, get all ride IDs
# base URL to which we'll append given fin IDs
fin_url_base = 'http://surf.smartfin.org/fin/'

# Look for the following text in the HTML contents in fcn below
str_id_ride = 'rideId = \'' # backslash allows us to look for single quote
str_id_date = 'var date = \'' # backslash allows us to look for single quote

#%% Ride ID scraper
# Input ride ID, get ocean and motion CSVs
# Base URL to which we'll append given ride IDs
ride_url_base = 'https://surf.smartfin.org/ride/'

# Look for the following text in the HTML contents in fcn below
str_id_csv = 'img id="temperatureChart" class="chart" src="' 

def get_csv_from_ride_id(rid):
    # Build URL for each individual ride
    ride_url = ride_url_base+str(rid)
    print(ride_url)
    
    # Get contents of ride_url
    html_contents = requests.get(ride_url).text
    
    # Find CSV identifier 
    loc_csv_id = html_contents.find(str_id_csv)
    
    # Different based on whether user logged in with FB or Google
    offset_googleOAuth = [46, 114]
    offset_facebkOAuth = [46, 112]
    if html_contents[loc_csv_id+59] == 'f': # Facebook login
        off0 = offset_facebkOAuth[0]
        off1 = offset_facebkOAuth[1]
    else: # Google login
        off0 = offset_googleOAuth[0]
        off1 = offset_googleOAuth[1]
        
    csv_id_longstr = html_contents[loc_csv_id+off0:loc_csv_id+off1]
    
#    print(csv_id_longstr)
    
    # Stitch together full URL for CSV
    if ("media" in csv_id_longstr) & ("Calibration" not in html_contents): # other junk URLs can exist and break everything
        
        ocean_csv_url = 'https://surf.smartfin.org/'+csv_id_longstr+'Ocean.CSV'
        motion_csv_url = 'https://surf.smartfin.org/'+csv_id_longstr+'Motion.CSV'
        
        print(ocean_csv_url)
        # Go to ocean_csv_url and grab contents (theoretically, a CSV)
        ocean_df_small = pd.read_csv(ocean_csv_url, parse_dates = [0])
        elapsed_timedelta = (ocean_df_small['UTC']-ocean_df_small['UTC'][0])
        ocean_df_small['elapsed'] = elapsed_timedelta/np.timedelta64(1, 's')
        
        motion_df_small = pd.read_csv(motion_csv_url, parse_dates = [0])
        
        # Reindex on timestamp if there are at least a few rows
        if len(ocean_df_small) > 1:
            ocean_df_small.set_index('UTC', drop = True, append = False, inplace = True)
            motion_df_small.set_index('UTC', drop = True, append = False, inplace = True)
            
            #print(ocean_df_small)
            #print(motion_df_small)
            
            #May need to change this sampling interval:
            sample_interval = '33ms'
            
            
            ocean_df_small_resample = ocean_df_small.resample(sample_interval).mean()
            motion_df_small_resample = motion_df_small.resample(sample_interval).mean()
            
            # No need to save many extra rows with no fix
            motion_df_small = motion_df_small[~np.isnan(motion_df_small.Latitude)]
            
            return ocean_df_small_resample, motion_df_small_resample

    else:
        ocean_df_small_resample = pd.DataFrame() # empty DF just so something is returned
        motion_df_small_resample = pd.DataFrame() 
        return ocean_df_small_resample, motion_df_small_resample
    
appended_ocean_list = [] # list of DataFrames from original CSVs
appended_motion_list = []
appended_multiIndex = [] # fin_id & ride_id used to identify each DataFrame

## Nested loops (for each fin ID, find all ride IDs, then build a DataFrame from all ride CSVs)
## (Here, ride IDS are either ocean or motion dataframes)
count_good_fins = 0
    
# Loop over ride_ids and find CSVs
for rid in ride_ids:
    try:
        new_ocean_df, new_motion_df = get_csv_from_ride_id(rid) # get given ride's CSV from its ride ID using function above
        #print(len(new_ocean_df))
        #print(len(new_motion_df))
        if not new_ocean_df.empty: # Calibration rides, for example
            # Append only if DF isn't empty. There may be a better way to control empty DFs which are created above
            appended_multiIndex.append(str(rid)) # build list to be multiIndex of future DataFrame
            appended_ocean_list.append(new_ocean_df)
            appended_motion_list.append(new_motion_df)
            print("Ride data has been uploaded.")
            #print("Ride: ", rid, "data has been uploaded.")
            count_good_fins += 1
        
    except: 
        print("Ride threw an exception!")
        #print("Ride ", rid, "threw an exception!")    

#%% Build the "Master" DataFrame

# appended_ocean_df.summary()
df_keys = tuple(appended_multiIndex) # keys gotta be a tuple, a list which data in it cannot be changed
ocean_df = pd.concat(appended_ocean_list, keys = df_keys, names=['ride_id'])
motion_df = pd.concat(appended_motion_list, keys = df_keys, names = ['ride_id'])


##Here, maybe just use info from the motion_df and don't worry about ocean_df data for now.
##If you do want ocean_df data, look at how Phil was getting it from "July 10th and 11th Calibration" jupyter notebook file.

#We can also check to see if the surfboard was recording "in-water-freq" or 
#"out-of-water-freq" based on how many NaN values we see. 
motion_df.head(10)

https://surf.smartfin.org/ride/15692
https://surf.smartfin.org/media/201811/google_105349665704999793400_0006667E229D_181109191556_Ocean.CSV
Ride data has been uploaded.


Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Latitude,Longitude
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,493.0,48.0,110.0,75.0,-124.0,-86.0,-309.0,209.0,39.0,3285871.0,-11725690.0
15692,2018-11-09 19:16:03.822,,,,,,,,,,,,
15692,2018-11-09 19:16:03.855,,,,,,,,,,,,
15692,2018-11-09 19:16:03.888,,,,,,,,,,,,
15692,2018-11-09 19:16:03.921,,,,,,,,,,,,
15692,2018-11-09 19:16:03.954,,,,,,,,,,,,
15692,2018-11-09 19:16:03.987,,,,,,,,,,,,
15692,2018-11-09 19:16:04.020,,,,,,,,,,,,
15692,2018-11-09 19:16:04.053,1414743000.0,513.0,89.0,62.0,34.0,-36.0,-92.0,-320.0,194.0,38.0,,
15692,2018-11-09 19:16:04.086,,,,,,,,,,,,


In [9]:
#Drop the latitude and longitude values since most of them are Nan:
motion_df_dropped = motion_df.drop(columns=['Latitude', 'Longitude'])


#Drop the NAN values from the motion data:
motion_df_dropped = motion_df_dropped.dropna(axis=0, how='any')
motion_df_dropped.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,493.0,48.0,110.0,75.0,-124.0,-86.0,-309.0,209.0,39.0
15692,2018-11-09 19:16:04.053,1414743000.0,513.0,89.0,62.0,34.0,-36.0,-92.0,-320.0,194.0,38.0
15692,2018-11-09 19:16:04.284,1414743000.0,494.0,92.0,80.0,69.0,-63.0,-42.0,-329.0,189.0,49.0
15692,2018-11-09 19:16:04.548,1414744000.0,421.0,205.0,-104.0,192.0,-92.0,-37.0,-330.0,180.0,64.0
15692,2018-11-09 19:16:04.812,1414744000.0,534.0,306.0,-32.0,-421.0,-233.0,-229.0,-325.0,161.0,97.0
15692,2018-11-09 19:16:05.043,1414744000.0,455.0,149.0,-102.0,-355.0,-376.0,-397.0,-337.0,117.0,151.0
15692,2018-11-09 19:16:05.307,1414744000.0,474.0,342.0,-219.0,-234.0,-527.0,-465.0,-311.0,25.0,217.0
15692,2018-11-09 19:16:05.571,1414745000.0,363.0,323.0,-131.0,60.0,-662.0,-305.0,-238.0,-8.0,272.0
15692,2018-11-09 19:16:05.802,1414745000.0,-21.0,510.0,-447.0,78.0,-643.0,-153.0,-159.0,-21.0,321.0
15692,2018-11-09 19:16:06.066,1414745000.0,35.0,283.0,-132.0,-114.0,-430.0,132.0,-86.0,-38.0,326.0


In [11]:
#Create an elapsed_timedelta field:

#timedelta_values = (motion_df_dropped['Time']-motion_df_dropped['Time'][0])
#motion_df_dropped.insert(loc=1, column='TimeDelta', value=timedelta_values, drop=True)
motion_df_dropped['TimeDelta'] = (motion_df_dropped['Time']-motion_df_dropped['Time'][0])
#print(elapsed_timedelta)
#motion_df_dropped.head()
motion_df_dropped.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,493.0,48.0,110.0,75.0,-124.0,-86.0,-309.0,209.0,39.0,0.0
15692,2018-11-09 19:16:04.053,1414743000.0,513.0,89.0,62.0,34.0,-36.0,-92.0,-320.0,194.0,38.0,252.5
15692,2018-11-09 19:16:04.284,1414743000.0,494.0,92.0,80.0,69.0,-63.0,-42.0,-329.0,189.0,49.0,501.5
15692,2018-11-09 19:16:04.548,1414744000.0,421.0,205.0,-104.0,192.0,-92.0,-37.0,-330.0,180.0,64.0,753.5
15692,2018-11-09 19:16:04.812,1414744000.0,534.0,306.0,-32.0,-421.0,-233.0,-229.0,-325.0,161.0,97.0,1003.5
15692,2018-11-09 19:16:05.043,1414744000.0,455.0,149.0,-102.0,-355.0,-376.0,-397.0,-337.0,117.0,151.0,1253.5
15692,2018-11-09 19:16:05.307,1414744000.0,474.0,342.0,-219.0,-234.0,-527.0,-465.0,-311.0,25.0,217.0,1504.5
15692,2018-11-09 19:16:05.571,1414745000.0,363.0,323.0,-131.0,60.0,-662.0,-305.0,-238.0,-8.0,272.0,1755.5
15692,2018-11-09 19:16:05.802,1414745000.0,-21.0,510.0,-447.0,78.0,-643.0,-153.0,-159.0,-21.0,321.0,2006.5
15692,2018-11-09 19:16:06.066,1414745000.0,35.0,283.0,-132.0,-114.0,-430.0,132.0,-86.0,-38.0,326.0,2258.5


In [13]:
#Footage sync code written by Alina:

import time

#simple method: only walking, paddling, floating, surfing
#complex method: columns created based on footage file labels
def label_data( footage_file = 'Footage.txt', labelling_method = 'simple', sync_threshold = 20000 ):
    
    #First, perform sync
    sync_buf = 0
    with open(footage_file) as file:
        for line in file:
            labelled_time = line.split(None, 2) 
            try:
                cur_time = time.strptime(labelled_time[0], '%M:%S')
            except:
                continue
            labelled_time[1] = labelled_time[1].rstrip()
            if labelled_time[1].lower() == 'sync': #Assumption that first word in sync line is "sync"
                sync_time = cur_time.tm_min * 60 * 1000 + cur_time.tm_sec * 1000
                index = 0
                start = 0
                end = 0
                #Syncing occurs when IMU A2 data is negative for a longer period than the provided threshold
                #Default is 20 seconds
                for data in motion_df_dropped['IMU A2']:
                    if data < 0 and start == 0:
                        start = motion_df_dropped['TimeDelta'][index]
                    elif data > 0 and start != 0:
                        end = motion_df_dropped['TimeDelta'][index]
                        if end - start > sync_threshold:
                            sync_buf = start - sync_time
                            break
                        start = 0
                    index += 1

    accepted_labels = set()
    if labelling_method == 'simple':
        accepted_labels = {'WALKING', 'PADDLING', 'FLOATING', 'SURFING', 'SYNC (FLIP BOARD UPSIDE DOWN TO SYNC DATA/FOOTAGE)'}

        #Create new DataFrame containing label info
        label_frame = pd.DataFrame(0, index = motion_df_dropped.index, columns = accepted_labels)
        for label in accepted_labels:
            label_frame[label] = [0] * len(motion_df_dropped['Time'])
    
    #Convention of labelled footage text: "MINUTE:SECOND LABEL"
    elapsed_time = 0
    cur_label = ''
    buffer = 0
    with open(footage_file) as file:
        for line in file:
            
            if labelling_method == 'simple':
                labelled_time = line.split(None, 2) #simple categorizes on a one-word basis
            else:
                labelled_time = line.split(None, 1) #complex requires the entire label
                
            #If the first word is not a properly formatted time, the line cannot be read
            try:
                cur_time = time.strptime(labelled_time[0], '%M:%S')
                cur_timeMS = cur_time.tm_min * 60 * 1000 + cur_time.tm_sec * 1000 + sync_buf
            except:
                continue
            labelled_time[1] = labelled_time[1].rstrip() #Remove potential newline
                
            #Check for end of video and modify buffer accordingly
            if labelled_time[1].lower() == 'end of video': #Assumption that label end video with "end of video"
                buffer += cur_timeMS
                
            #Modify accepted labels list if reading a new label and in complex mode
            elif labelling_method == 'complex' and (labelled_time[1].upper() not in accepted_labels):
                accepted_labels.add(labelled_time[1].upper())
                if not cur_label:
                    label_frame = pd.DataFrame(0, index = motion_df_dropped.index, columns = accepted_labels)
                label_frame[labelled_time[1].upper()] = [0] * len(motion_df_dropped['Time'])
                
            if labelled_time[1].upper() in accepted_labels:
                while (elapsed_time < len(motion_df_dropped['Time']) and
                      (np.isnan(motion_df_dropped['TimeDelta'][elapsed_time]) or
                       motion_df_dropped['TimeDelta'][elapsed_time] < cur_timeMS + buffer)):
                    if cur_label != '':
                        label_frame[cur_label][elapsed_time] = 1
                    elapsed_time += 1
                if labelled_time[1].upper() != 'end of video':
                    cur_label = labelled_time[1].upper()

    labelled = pd.concat([motion_df_dropped, label_frame], axis = 1)

    return labelled

pd.options.display.max_rows = 5000
pd.options.display.max_columns = 5000

motion_df_simple = label_data('Footage3.txt')
motion_df_simple.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta,WALKING,FLOATING,SYNC (FLIP BOARD UPSIDE DOWN TO SYNC DATA/FOOTAGE),SURFING,PADDLING
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,493.0,48.0,110.0,75.0,-124.0,-86.0,-309.0,209.0,39.0,0.0,0,0,0,0,0
15692,2018-11-09 19:16:04.053,1414743000.0,513.0,89.0,62.0,34.0,-36.0,-92.0,-320.0,194.0,38.0,252.5,0,0,0,0,0
15692,2018-11-09 19:16:04.284,1414743000.0,494.0,92.0,80.0,69.0,-63.0,-42.0,-329.0,189.0,49.0,501.5,0,0,0,0,0
15692,2018-11-09 19:16:04.548,1414744000.0,421.0,205.0,-104.0,192.0,-92.0,-37.0,-330.0,180.0,64.0,753.5,0,0,0,0,0
15692,2018-11-09 19:16:04.812,1414744000.0,534.0,306.0,-32.0,-421.0,-233.0,-229.0,-325.0,161.0,97.0,1003.5,0,0,0,0,0
15692,2018-11-09 19:16:05.043,1414744000.0,455.0,149.0,-102.0,-355.0,-376.0,-397.0,-337.0,117.0,151.0,1253.5,0,0,0,0,0
15692,2018-11-09 19:16:05.307,1414744000.0,474.0,342.0,-219.0,-234.0,-527.0,-465.0,-311.0,25.0,217.0,1504.5,0,0,0,0,0
15692,2018-11-09 19:16:05.571,1414745000.0,363.0,323.0,-131.0,60.0,-662.0,-305.0,-238.0,-8.0,272.0,1755.5,0,0,0,0,0
15692,2018-11-09 19:16:05.802,1414745000.0,-21.0,510.0,-447.0,78.0,-643.0,-153.0,-159.0,-21.0,321.0,2006.5,0,0,0,0,0
15692,2018-11-09 19:16:06.066,1414745000.0,35.0,283.0,-132.0,-114.0,-430.0,132.0,-86.0,-38.0,326.0,2258.5,0,0,0,0,0


In [14]:
motion_df_complex = label_data('Footage3.txt', 'complex')
motion_df_complex.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta,PUSHING BOARD INTO WATER,SYNC (FLIP BOARD UPSIDE DOWN TO SYNC DATA/FOOTAGE),FLIP BOARD RIGHT SIDE UP,WALKING,PUSH-OFF,PADDLING INTO WAVES,WALKING IN WATER,SIT-UP,FLOATING,TURNING TO SURFER'S LEFT,LAY-DOWN,PADDLING FOR A WAVE,POP-UP,SURFING,STEP-OFF,TURNING TO SURFER'S RIGHT,SIT-BACK,OFF-BOARD,PADDLING,WIPE-OUT,PULL-BACK LEASH,PADDLING FOR POSITION,NEW,"DONE, OUT OF WATER",WALKING OUT OF WATER
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,493.0,48.0,110.0,75.0,-124.0,-86.0,-309.0,209.0,39.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.053,1414743000.0,513.0,89.0,62.0,34.0,-36.0,-92.0,-320.0,194.0,38.0,252.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.284,1414743000.0,494.0,92.0,80.0,69.0,-63.0,-42.0,-329.0,189.0,49.0,501.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.548,1414744000.0,421.0,205.0,-104.0,192.0,-92.0,-37.0,-330.0,180.0,64.0,753.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.812,1414744000.0,534.0,306.0,-32.0,-421.0,-233.0,-229.0,-325.0,161.0,97.0,1003.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.043,1414744000.0,455.0,149.0,-102.0,-355.0,-376.0,-397.0,-337.0,117.0,151.0,1253.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.307,1414744000.0,474.0,342.0,-219.0,-234.0,-527.0,-465.0,-311.0,25.0,217.0,1504.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.571,1414745000.0,363.0,323.0,-131.0,60.0,-662.0,-305.0,-238.0,-8.0,272.0,1755.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.802,1414745000.0,-21.0,510.0,-447.0,78.0,-643.0,-153.0,-159.0,-21.0,321.0,2006.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:06.066,1414745000.0,35.0,283.0,-132.0,-114.0,-430.0,132.0,-86.0,-38.0,326.0,2258.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
#correct IMU data

#make a deep copy of motion_df_labelled
df_converted = motion_df_complex.copy(deep = 'true')

#for rows in df_corrected
for row in range(0, df_converted.shape[0]):
    
    #convert acceleromters (new: m/s^2)
    df_converted.iloc[row, df_converted.columns.get_loc('IMU A1')] *= 0.019141
    df_converted.iloc[row, df_converted.columns.get_loc('IMU A2')] *= 0.019141
    df_converted.iloc[row, df_converted.columns.get_loc('IMU A3')] *= 0.019141
 
    #convert gyroscopes (new: deg/s)
    df_converted.iloc[row, df_converted.columns.get_loc('IMU G1')] /= 8.2
    df_converted.iloc[row, df_converted.columns.get_loc('IMU G2')] /= 8.2
    df_converted.iloc[row, df_converted.columns.get_loc('IMU G3')] /= 8.2

motion_df_complex.head(10)    

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta,PUSHING BOARD INTO WATER,SYNC (FLIP BOARD UPSIDE DOWN TO SYNC DATA/FOOTAGE),FLIP BOARD RIGHT SIDE UP,WALKING,PUSH-OFF,PADDLING INTO WAVES,WALKING IN WATER,SIT-UP,FLOATING,TURNING TO SURFER'S LEFT,LAY-DOWN,PADDLING FOR A WAVE,POP-UP,SURFING,STEP-OFF,TURNING TO SURFER'S RIGHT,SIT-BACK,OFF-BOARD,PADDLING,WIPE-OUT,PULL-BACK LEASH,PADDLING FOR POSITION,NEW,"DONE, OUT OF WATER",WALKING OUT OF WATER
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,493.0,48.0,110.0,75.0,-124.0,-86.0,-309.0,209.0,39.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.053,1414743000.0,513.0,89.0,62.0,34.0,-36.0,-92.0,-320.0,194.0,38.0,252.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.284,1414743000.0,494.0,92.0,80.0,69.0,-63.0,-42.0,-329.0,189.0,49.0,501.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.548,1414744000.0,421.0,205.0,-104.0,192.0,-92.0,-37.0,-330.0,180.0,64.0,753.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.812,1414744000.0,534.0,306.0,-32.0,-421.0,-233.0,-229.0,-325.0,161.0,97.0,1003.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.043,1414744000.0,455.0,149.0,-102.0,-355.0,-376.0,-397.0,-337.0,117.0,151.0,1253.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.307,1414744000.0,474.0,342.0,-219.0,-234.0,-527.0,-465.0,-311.0,25.0,217.0,1504.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.571,1414745000.0,363.0,323.0,-131.0,60.0,-662.0,-305.0,-238.0,-8.0,272.0,1755.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.802,1414745000.0,-21.0,510.0,-447.0,78.0,-643.0,-153.0,-159.0,-21.0,321.0,2006.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:06.066,1414745000.0,35.0,283.0,-132.0,-114.0,-430.0,132.0,-86.0,-38.0,326.0,2258.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
df_converted.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta,PUSHING BOARD INTO WATER,SYNC (FLIP BOARD UPSIDE DOWN TO SYNC DATA/FOOTAGE),FLIP BOARD RIGHT SIDE UP,WALKING,PUSH-OFF,PADDLING INTO WAVES,WALKING IN WATER,SIT-UP,FLOATING,TURNING TO SURFER'S LEFT,LAY-DOWN,PADDLING FOR A WAVE,POP-UP,SURFING,STEP-OFF,TURNING TO SURFER'S RIGHT,SIT-BACK,OFF-BOARD,PADDLING,WIPE-OUT,PULL-BACK LEASH,PADDLING FOR POSITION,NEW,"DONE, OUT OF WATER",WALKING OUT OF WATER
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
15692,2018-11-09 19:16:03.789,1414743000.0,9.436513,0.918768,2.10551,9.146341,-15.121951,-10.487805,-309.0,209.0,39.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.053,1414743000.0,9.819333,1.703549,1.186742,4.146341,-4.390244,-11.219512,-320.0,194.0,38.0,252.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.284,1414743000.0,9.455654,1.760972,1.53128,8.414634,-7.682927,-5.121951,-329.0,189.0,49.0,501.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.548,1414744000.0,8.058361,3.923905,-1.990664,23.414634,-11.219512,-4.512195,-330.0,180.0,64.0,753.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:04.812,1414744000.0,10.221294,5.857146,-0.612512,-51.341463,-28.414634,-27.926829,-325.0,161.0,97.0,1003.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.043,1414744000.0,8.709155,2.852009,-1.952382,-43.292683,-45.853659,-48.414634,-337.0,117.0,151.0,1253.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.307,1414744000.0,9.072834,6.546222,-4.191879,-28.536585,-64.268293,-56.707317,-311.0,25.0,217.0,1504.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.571,1414745000.0,6.948183,6.182543,-2.507471,7.317073,-80.731707,-37.195122,-238.0,-8.0,272.0,1755.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:05.802,1414745000.0,-0.401961,9.76191,-8.556027,9.512195,-78.414634,-18.658537,-159.0,-21.0,321.0,2006.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15692,2018-11-09 19:16:06.066,1414745000.0,0.669935,5.416903,-2.526612,-13.902439,-52.439024,16.097561,-86.0,-38.0,326.0,2258.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
print("Creating multiclass dataframe...")

#Can also add "UTC Time" and "Time Delta" columns if needed later. 
multiclass_df = df_converted[['IMU A1', 'IMU A2', 'IMU A3', 'IMU G1', 'IMU G2', 'IMU G3', 'IMU M1', 'IMU M2', 'IMU M3']].copy()
parse_df = df_converted.copy()
       
#Parse the columns of the original dataframe to extract labels into a single column for the new multiclass df. 
def parse_func(x):
    l=[]
    val = 1
    for col in parse_df:
        if x[col] == val:
            l.append(col)

    return pd.Series(l)

parse_df = parse_df.apply(parse_func, axis=1)

#Drop the 2nd column of parse_df (which has all NaN values)
cols = [1]
parse_df.drop(parse_df.columns[cols],axis=1,inplace=True)
parse_df.columns=['Complex Surfer Motion Label']
#print(parse_df.head(100))


#Concatenate the parse column with the multiclass dataframe. 
multiclass_df = pd.concat([multiclass_df, parse_df], axis=1)
#print(multiclass_df.head(100))

#Drop the NaN rows from the beginning/end of the df.
print("Shape before dropping NA:", multiclass_df.shape)
multiclass_df = multiclass_df.dropna()
print("Shape after dropping NA:", multiclass_df.shape)

print("\n Done creating complex multiclass df.")


Creating multiclass dataframe...
Shape before dropping NA: (21645, 10)
Shape after dropping NA: (9388, 10)

 Done creating complex multiclass df.


In [46]:
#Create simple multiclass df: (Surfing, Floating, Paddling, Misc.)

simple_multiclass_df = multiclass_df.copy()

simple_multiclass_df.loc[simple_multiclass_df['Complex Surfer Motion Label'].str.contains('SURFING|PADDLING|FLOATING')==False, 'Complex Surfer Motion Label'] = 'MISC'
simple_multiclass_df.loc[simple_multiclass_df['Complex Surfer Motion Label'].str.contains('PADDLING'), 'Complex Surfer Motion Label'] = 'PADDLING'
#simple_multiclass_df.loc[simple_multiclass_df['Complex Surfer Motion Label'].str.contains('TURNING'), 'Complex Surfer Motion Label'] = 'FLOATING'


column_names = list(simple_multiclass_df)
column_names = column_names[:-1]
column_names.append("Simple Surfer Motion Label")

simple_multiclass_df.columns=column_names

#Check to see that labels exist: 
#simple_multiclass_df[simple_multiclass_df['Simple Surfer Motion Label'].str.match('FLOATING|PADDLING|SURFING')]
simple_multiclass_df

Unnamed: 0_level_0,Unnamed: 1_level_0,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,Simple Surfer Motion Label
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15692,2018-11-09 19:16:27.945,-1.684408,8.900565,3.292252,-2.682927,12.560976,2.560976,80.0,-132.0,56.0,MISC
15692,2018-11-09 19:16:28.209,-1.741831,9.091975,3.521944,2.195122,14.390244,4.512195,70.0,-136.0,68.0,MISC
15692,2018-11-09 19:16:28.440,-1.512139,9.187680,3.253970,0.365854,19.268293,6.219512,77.0,-127.0,73.0,MISC
15692,2018-11-09 19:16:28.704,-1.588703,9.225962,3.177406,6.219512,22.317073,7.682927,73.0,-141.0,83.0,MISC
15692,2018-11-09 19:16:28.968,-1.856677,9.417372,2.775445,-1.341463,27.682927,4.268293,68.0,-142.0,112.0,MISC
15692,2018-11-09 19:16:29.199,-1.091037,9.034552,3.024278,2.682927,23.780488,7.804878,57.0,-141.0,125.0,MISC
15692,2018-11-09 19:16:29.463,-0.861345,9.053693,2.966855,6.097561,26.829268,9.878049,45.0,-149.0,129.0,MISC
15692,2018-11-09 19:16:29.694,-1.186742,9.895897,1.875818,4.390244,29.390244,9.634146,36.0,-148.0,152.0,MISC
15692,2018-11-09 19:16:29.958,-1.684408,9.589641,1.569562,2.195122,28.536585,5.731707,15.0,-153.0,167.0,MISC
15692,2018-11-09 19:16:30.189,-1.780113,9.340808,1.837536,1.829268,27.804878,5.975610,2.0,-164.0,176.0,MISC


# Implementing the Multi-Class MLP Classifier

Hooray, we have the data in proper form. Now lets get our MLP going.

In [128]:
simple_multi_dataset = simple_multiclass_df.copy()

In [129]:
X = simple_multi_dataset.iloc[:, :-1].values  #selects everything until the last column listed

y = simple_multi_dataset.iloc[:, -1].values   #selects the last column 
print("X.shape:", X.shape)
print("y.shape:", y.shape)

print("y:", y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 0)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X.shape: (9388, 9)
y.shape: (9388,)
y: ['MISC' 'MISC' 'MISC' ... 'MISC' 'MISC' 'MISC']
X_train shape: (8449, 9)
y_train shape: (8449,)
X_test shape: (939, 9)
y_test shape: (939,)


SMOTE does not necessarily work for multiple target classes, but we can apply multiple passes of SMOTE by looking at only two target classes at a time. I will revisit this later.

In [1]:
X_train_res, y_train_res = X_train, y_train


#We still need to scale our data since, at least according to sklearn, MLP's do not handle unscaled data well.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_res)
X_train_res = scaler.transform(X_train_res)
X_test = scaler.transform(X_test)

NameError: name 'X_train' is not defined

In [131]:
from sklearn.neural_network import MLPClassifier
# A lower regularization coeffecient (alpha) improves accuracy across the board by around 1-5%
# The dataset appears to be heavily imbalanced, so SMOTE may need to be done or we need to choose a better dataset
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,15), random_state=1, verbose=True)
clf.fit(X_train_res, y_train_res)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(15, 15), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=True,
       warm_start=False)

In [132]:
# Predicting the Test set results
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=["FLOATING", "PADDLING", "SURFING", "MISC"])
print("When printing the confusion matrix, the first row tells us the number of correct predictions while the second row tells us the number of incorrect predictions.")
print(cm)

When printing the confusion matrix, the first row tells us the number of correct predictions while the second row tells us the number of incorrect predictions.
[[362  34   0  42]
 [ 64 108   0  45]
 [ 10   3   0   3]
 [ 80  50   0 138]]


In [133]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator = clf, X = X_train_res, y = y_train_res, cv = 10)
print(score)

[0.64935065 0.6257379  0.62174941 0.64184397 0.61018957 0.62085308
 0.59834123 0.62085308 0.61137441 0.6405694 ]


In [95]:
unique, counts = np.unique(y_pred, return_counts=True)
dict(zip(unique, counts))

{'MISC': 936, 'PADDLING': 3}

In [107]:
unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{'FLOATING': 3874, 'MISC': 2432, 'PADDLING': 1979, 'SURFING': 164}

In [111]:
unique, counts = np.unique(y_test, return_counts=True)
dict(zip(unique, counts))

{'FLOATING': 438, 'MISC': 268, 'PADDLING': 217, 'SURFING': 16}

In [116]:
print("Average score from confusion matrix:")
print(sum(score)/len(score))

0.6240862697855855

In [118]:
from sklearn import metrics
print("Multiclass MLP Train Accuracy :: ", metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Multiclass MLP Test Accuracy :: ", metrics.accuracy_score(y_test, clf.predict(X_test)))

Multiclass MLP Train Accuracy ::  0.28512249970410697
Multiclass MLP Test Accuracy ::  0.24494142705005326
