## Creating a logistic regression model in Python!

In [1]:
# Reference: https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Logistic%20Regression%20balanced.ipynb

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14) 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from mpl_toolkits.basemap import Basemap

import os
import datetime
import pytz
import re

import peakutils
import statsmodels.api as sm

import requests

#Read data from a local csv file:

##Will change this to scrape files from the Smartfin.org website later.
data = pd.read_csv('Motion_13735.CSV', header=0)   
data = data.dropna()

#Print out the column headings:
print(data.shape)
print(list(data.columns))

(0, 13)
['UTC', 'Time', 'IMU A1', 'IMU A2', 'IMU A3', 'IMU G1', 'IMU G2', 'IMU G3', 'IMU M1', 'IMU M2', 'IMU M3', 'Latitude', 'Longitude']


In [2]:
ride_ids = ['15629']


#ride_ids = ['14827']
# 14743 - Motion Control July 10th
# 14750 - Magnetometer Control July 11th
# 14814 - Pool Displacement Control July 17th
# 14815 - Compass Orientation (Lying on Charger Side) July 19th
# 14816 - Orientation w Higher Sampling (Lying on Charger Side) July 20th
# 14827 - Pool Displacement Control w Higher Sampling (Jul 23)
# 14888 - First Buoy Calibration Experiment (July 30)
# 15218 - Jasmine's Second Ride Sesh filmed with GoPro (Aug 29) //no footage
# 15629 - Jasmine's First Ride Sesh filmed with VIRB (Oct. 24) //first labelled footage!
# 15669 - Jasmine's Second Ride Sesh filmed with VIRB (Nov. 7) //second labelled footage!


In [3]:
#%% Fin ID scraper
# Input fin ID, get all ride IDs
# base URL to which we'll append given fin IDs
fin_url_base = 'http://surf.smartfin.org/fin/'

# Look for the following text in the HTML contents in fcn below
str_id_ride = 'rideId = \'' # backslash allows us to look for single quote
str_id_date = 'var date = \'' # backslash allows us to look for single quote

#%% Ride ID scraper
# Input ride ID, get ocean and motion CSVs
# Base URL to which we'll append given ride IDs
ride_url_base = 'https://surf.smartfin.org/ride/'

# Look for the following text in the HTML contents in fcn below
str_id_csv = 'img id="temperatureChart" class="chart" src="' 

def get_csv_from_ride_id(rid):
    # Build URL for each individual ride
    ride_url = ride_url_base+str(rid)
    print(ride_url)
    
    # Get contents of ride_url
    html_contents = requests.get(ride_url).text
    
    # Find CSV identifier 
    loc_csv_id = html_contents.find(str_id_csv)
    
    # Different based on whether user logged in with FB or Google
    offset_googleOAuth = [46, 114]
    offset_facebkOAuth = [46, 112]
    if html_contents[loc_csv_id+59] == 'f': # Facebook login
        off0 = offset_facebkOAuth[0]
        off1 = offset_facebkOAuth[1]
    else: # Google login
        off0 = offset_googleOAuth[0]
        off1 = offset_googleOAuth[1]
        
    csv_id_longstr = html_contents[loc_csv_id+off0:loc_csv_id+off1]
    
#    print(csv_id_longstr)
    
    # Stitch together full URL for CSV
    if ("media" in csv_id_longstr) & ("Calibration" not in html_contents): # other junk URLs can exist and break everything
        
        ocean_csv_url = 'https://surf.smartfin.org/'+csv_id_longstr+'Ocean.CSV'
        motion_csv_url = 'https://surf.smartfin.org/'+csv_id_longstr+'Motion.CSV'
        
        print(ocean_csv_url)
        # Go to ocean_csv_url and grab contents (theoretically, a CSV)
        ocean_df_small = pd.read_csv(ocean_csv_url, parse_dates = [0])
        elapsed_timedelta = (ocean_df_small['UTC']-ocean_df_small['UTC'][0])
        ocean_df_small['elapsed'] = elapsed_timedelta/np.timedelta64(1, 's')
        
        motion_df_small = pd.read_csv(motion_csv_url, parse_dates = [0])
        
        # Reindex on timestamp if there are at least a few rows
        if len(ocean_df_small) > 1:
            ocean_df_small.set_index('UTC', drop = True, append = False, inplace = True)
            motion_df_small.set_index('UTC', drop = True, append = False, inplace = True)
            
            #print(ocean_df_small)
            #print(motion_df_small)
            
            #May need to change this sampling interval:
            sample_interval = '33ms'
            
            
            ocean_df_small_resample = ocean_df_small.resample(sample_interval).mean()
            motion_df_small_resample = motion_df_small.resample(sample_interval).mean()
            
            # No need to save many extra rows with no fix
            motion_df_small = motion_df_small[~np.isnan(motion_df_small.Latitude)]
            
            return ocean_df_small_resample, motion_df_small_resample

    else:
        ocean_df_small_resample = pd.DataFrame() # empty DF just so something is returned
        motion_df_small_resample = pd.DataFrame() 
        return ocean_df_small_resample, motion_df_small_resample
    
appended_ocean_list = [] # list of DataFrames from original CSVs
appended_motion_list = []
appended_multiIndex = [] # fin_id & ride_id used to identify each DataFrame

## Nested loops (for each fin ID, find all ride IDs, then build a DataFrame from all ride CSVs)
## (Here, ride IDS are either ocean or motion dataframes)
count_good_fins = 0
    
# Loop over ride_ids and find CSVs
for rid in ride_ids:
    try:
        new_ocean_df, new_motion_df = get_csv_from_ride_id(rid) # get given ride's CSV from its ride ID using function above
        #print(len(new_ocean_df))
        #print(len(new_motion_df))
        if not new_ocean_df.empty: # Calibration rides, for example
            # Append only if DF isn't empty. There may be a better way to control empty DFs which are created above
            appended_multiIndex.append(str(rid)) # build list to be multiIndex of future DataFrame
            appended_ocean_list.append(new_ocean_df)
            appended_motion_list.append(new_motion_df)
            print("Ride data has been uploaded.")
            #print("Ride: ", rid, "data has been uploaded.")
            count_good_fins += 1
        
    except: 
        print("Ride threw an exception!")
        #print("Ride ", rid, "threw an exception!")    

#%% Build the "Master" DataFrame

# appended_ocean_df.summary()
df_keys = tuple(appended_multiIndex) # keys gotta be a tuple, a list which data in it cannot be changed
ocean_df = pd.concat(appended_ocean_list, keys = df_keys, names=['ride_id'])
motion_df = pd.concat(appended_motion_list, keys = df_keys, names = ['ride_id'])


##Here, maybe just use info from the motion_df and don't worry about ocean_df data for now.
##If you do want ocean_df data, look at how Phil was getting it from "July 10th and 11th Calibration" jupyter notebook file.

#We can also check to see if the surfboard was recording "in-water-freq" or 
#"out-of-water-freq" based on how many NaN values we see. 
print(motion_df)

https://surf.smartfin.org/ride/15629
https://surf.smartfin.org/media/201810/google_105349665704999793400_0006667E229D_181031013846_Ocean.CSV
Ride data has been uploaded.
                                       Time  IMU A1  IMU A2  IMU A3  IMU G1  \
ride_id UTC                                                                   
15629   2018-10-31 19:33:01.839  11992061.0    -6.0   540.0    47.0   -24.0   
        2018-10-31 19:33:01.872         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:01.905         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:01.938         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:01.971         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:02.004         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:02.037         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:02.070         NaN     NaN     NaN     NaN     NaN   
        2018-10-31 19:33:02.103  1199231

In [16]:
#Drop the latitude and longitude values since most of them are Nan:
motion_df_dropped = motion_df.drop(columns=['Latitude', 'Longitude'])


#Drop the NAN values from the motion data:
motion_df_dropped = motion_df_dropped.dropna(axis=0, how='any')
print(motion_df_dropped)

                                       Time  IMU A1  IMU A2  IMU A3  IMU G1  \
ride_id UTC                                                                   
15629   2018-10-31 19:33:01.839  11992061.0    -6.0   540.0    47.0   -24.0   
        2018-10-31 19:33:02.103  11992311.0     4.0   514.0    93.0    10.0   
        2018-10-31 19:33:02.334  11992561.0    14.0   532.0   126.0    40.0   
        2018-10-31 19:33:02.598  11992813.0    13.0   528.0   133.0   -72.0   
        2018-10-31 19:33:02.829  11993063.0    23.0   481.0   121.0   -30.0   
        2018-10-31 19:33:03.093  11993313.0    24.0   452.0   105.0    74.0   
        2018-10-31 19:33:03.357  11993564.0    10.0   467.0    39.0    33.0   
        2018-10-31 19:33:03.588  11993815.0    -5.0   490.0    60.0  -151.0   
        2018-10-31 19:33:03.852  11994065.0    16.0   475.0    94.0  -117.0   
        2018-10-31 19:33:04.116  11994316.0   -17.0   461.0    38.0   -34.0   
        2018-10-31 19:33:04.347  11994567.0   -20.0 

In [5]:
#Sync the video footage with the UTC time from the Smartfin. 
#Code to convert UTC Time to PCT time (UTC Time is 7 hrs. ahead of PCT Time):

#A reference: https://stackoverflow.com/questions/22800079/converting-time-zone-pandas-dataframe


#sLength = len(motion_df_dropped['UTC'])
#motion_df_dropped['PCT'] = p.Series(np.random.randn(sLength), index=motion_df_dropped.index)


#motion_df_dropped['PCT'] = (motion_df_dropped.UTC)

from datetime import datetime, timedelta
from pytz import timezone
import pytz

#To get the first column i.e. 'Time': 
#print(motion_df_dropped.iloc[:,0])


pacific = pytz.timezone('US/Pacific')
#motion_df_dropped.index = motion_df_dropped.index.tz_localize(pytz.utc).tz_convert(pacific)
#motion_df_dropped.index = motion_df_dropped.index.tz_localize(pytz.utc).tz_convert(pacific)

df = pd.DataFrame(motion_df_dropped)
df.index = df.index.tz_localize('UTC')
df.index = df.index.tz_convert(pacific)


AttributeError: 'MultiIndex' object has no attribute 'tz_localize'

In [17]:
#Create an elapsed_timedelta field:

#timedelta_values = (motion_df_dropped['Time']-motion_df_dropped['Time'][0])
#motion_df_dropped.insert(loc=1, column='TimeDelta', value=timedelta_values, drop=True)
motion_df_dropped['TimeDelta'] = (motion_df_dropped['Time']-motion_df_dropped['Time'][0])
#print(elapsed_timedelta)
#motion_df_dropped.head()
motion_df_dropped.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15629,2018-10-31 19:33:01.839,11992061.0,-6.0,540.0,47.0,-24.0,25.0,-38.0,28.0,-136.0,18.0,0.0
15629,2018-10-31 19:33:02.103,11992311.0,4.0,514.0,93.0,10.0,58.0,-32.0,40.0,-142.0,14.0,250.0
15629,2018-10-31 19:33:02.334,11992561.0,14.0,532.0,126.0,40.0,96.0,8.0,51.0,-139.0,39.0,500.0
15629,2018-10-31 19:33:02.598,11992813.0,13.0,528.0,133.0,-72.0,105.0,47.0,46.0,-146.0,26.0,752.0
15629,2018-10-31 19:33:02.829,11993063.0,23.0,481.0,121.0,-30.0,39.0,83.0,41.0,-141.0,21.0,1002.0
15629,2018-10-31 19:33:03.093,11993313.0,24.0,452.0,105.0,74.0,-26.0,64.0,32.0,-142.0,26.0,1252.0
15629,2018-10-31 19:33:03.357,11993564.0,10.0,467.0,39.0,33.0,-92.0,0.0,20.0,-152.0,40.0,1503.0
15629,2018-10-31 19:33:03.588,11993815.0,-5.0,490.0,60.0,-151.0,-112.0,-58.0,28.0,-148.0,18.0,1754.0
15629,2018-10-31 19:33:03.852,11994065.0,16.0,475.0,94.0,-117.0,52.0,-62.0,46.0,-136.0,-2.0,2004.0
15629,2018-10-31 19:33:04.116,11994316.0,-17.0,461.0,38.0,-34.0,122.0,-9.0,39.0,-121.0,-15.0,2255.0


In [18]:
#Create a "surf" label, label each individual line of data from reviewing surf videotape: 

#Creates a surf array, with all values = 0:
sLength = len(motion_df_dropped['Time'])
surf_array = [0] * sLength
motion_df_dropped['surfing'] = pd.Series(surf_array, index=motion_df_dropped.index)



#Creates a surf array, where we can now choose values based on the TimeDelta,
#this will be useful for labelling data from our videotaped sessions:
surf_array = []
for td in motion_df_dropped['TimeDelta']:
    if td > 99.0 and td < 265.0:        #TimeDelta units are milliseconds
        surf_array.append(1)
    else:
        surf_array.append(0)
        
motion_df_dropped['surfing'] = pd.Series(surf_array, index=motion_df_dropped.index)
        
    

motion_df_dropped.head(20)



Unnamed: 0_level_0,Unnamed: 1_level_0,Time,IMU A1,IMU A2,IMU A3,IMU G1,IMU G2,IMU G3,IMU M1,IMU M2,IMU M3,TimeDelta,surfing
ride_id,UTC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
15629,2018-10-31 19:33:01.839,11992061.0,-6.0,540.0,47.0,-24.0,25.0,-38.0,28.0,-136.0,18.0,0.0,0
15629,2018-10-31 19:33:02.103,11992311.0,4.0,514.0,93.0,10.0,58.0,-32.0,40.0,-142.0,14.0,250.0,1
15629,2018-10-31 19:33:02.334,11992561.0,14.0,532.0,126.0,40.0,96.0,8.0,51.0,-139.0,39.0,500.0,0
15629,2018-10-31 19:33:02.598,11992813.0,13.0,528.0,133.0,-72.0,105.0,47.0,46.0,-146.0,26.0,752.0,0
15629,2018-10-31 19:33:02.829,11993063.0,23.0,481.0,121.0,-30.0,39.0,83.0,41.0,-141.0,21.0,1002.0,0
15629,2018-10-31 19:33:03.093,11993313.0,24.0,452.0,105.0,74.0,-26.0,64.0,32.0,-142.0,26.0,1252.0,0
15629,2018-10-31 19:33:03.357,11993564.0,10.0,467.0,39.0,33.0,-92.0,0.0,20.0,-152.0,40.0,1503.0,0
15629,2018-10-31 19:33:03.588,11993815.0,-5.0,490.0,60.0,-151.0,-112.0,-58.0,28.0,-148.0,18.0,1754.0,0
15629,2018-10-31 19:33:03.852,11994065.0,16.0,475.0,94.0,-117.0,52.0,-62.0,46.0,-136.0,-2.0,2004.0,0
15629,2018-10-31 19:33:04.116,11994316.0,-17.0,461.0,38.0,-34.0,122.0,-9.0,39.0,-121.0,-15.0,2255.0,0


In [None]:
#If we ever need to calculate averages from data rows/columns here's a link to do that:
#https://stackoverflow.com/questions/31698861/add-column-to-the-end-of-pandas-dataframe-containing-average-of-previous-data

