In [1]:
import pandas as pd
import numpy as np

import glob
import csv

In [2]:
# Set path to files
path = 'Data/tc4tl_train/data/train/'

# Read the .tsv key file
df_train_key = pd.read_csv('Data/tc4tl_train/docs/tc4tl_train_key.tsv', sep='\t', index_col="fileid")

In [3]:
def get_dict(path):
    # This bit will allow us to cycle through all files in the folder
    all_files = glob.glob(path + "/*.csv")

    """
    EDIT - Adapted from the first part of Owen's code
    Changed list to dictionary so that file name would be stored also
        - might be easier to then link with the key files?
        - "li" changed to "dic"
        
    Output should be a dictionary where each key value pair is as follows
        - key = file name
        - value = dataframe with bluetooth, accelerometer data etc
    """

    # Create an empty list
    dic = {}
    # Read in each file from the folder and add them to the list
    # WARNING This may not read in the CSVs in the order they were in in the folder
    for filename in all_files:
        df = pd.read_csv(filename, header=None, skiprows=7, sep='\n', skip_blank_lines=False, quoting=csv.QUOTE_NONE)
        df = df[0].str.split(',', expand=True)
        dic[filename[len(path):]] = df
    
        
    return dic

dic = get_dict(path)

In [4]:
#-- TAKES A WHILE TO RUN
"""
For best run go back to min-max rather than quartiles and remove gyro_2
"""

"""
This function will create a dataframe from the key tsv file provided by NIST

Then it will add a column with the average bluetooth, accelerometer etc. for each file

It also gets dummy variables for the coarse_grain variable
"""

from scipy.stats import kurtosis, skew

def add_predictor_aggregated(dic, key_df):
    key_df = key_df.copy()
    """
    dic - output of the get_dict() function
    key_df - the dataframe obtained from reading the .tsv file provided by nist
    """
    # Get rough summary statistics for each file to create a baseline model and append to key dataframe
    key_df["bluetooth_avg"] = np.nan
    key_df["bluetooth_lwr"] = np.nan
    key_df["bluetooth_upr"] = np.nan
    key_df["bluetooth_std"] = np.nan
    key_df["bluetooth_skew"] = np.nan
    key_df["accelerometer_1_avg"] = np.nan
    key_df["accelerometer_1_lwr"] = np.nan
    key_df["accelerometer_1_upr"] = np.nan
    key_df["accelerometer_1_std"] = np.nan
    key_df["accelerometer_1_skew"] = np.nan
    key_df["accelerometer_2_avg"] = np.nan
    key_df["accelerometer_2_lwr"] = np.nan
    key_df["accelerometer_2_upr"] = np.nan
    key_df["accelerometer_2_std"] = np.nan
    key_df["accelerometer_2_skew"] = np.nan
    key_df["gyro_1_avg"] = np.nan
    key_df["gyro_1_lwr"] = np.nan
    key_df["gyro_1_upr"] = np.nan
    key_df["gyro_1_std"] = np.nan
    key_df["gyro_2_avg"] = np.nan
    key_df["gyro_2_lwr"] = np.nan
    key_df["gyro_2_upr"] = np.nan
    key_df["gyro_2_std"] = np.nan
    key_df["gyro_3_avg"] = np.nan
    key_df["gyro_3_lwr"] = np.nan
    key_df["gyro_3_upr"] = np.nan
    key_df["gyro_3_std"] = np.nan
    #key_df["attitude_avg"] = np.nan
    #key_df["gravity_avg"] = np.nan

    for file_id, value in dic.items():
        bt_temp = value[value.iloc[:,1]=="Bluetooth"].iloc[:,2].astype(float)
        bt_mean = np.mean(bt_temp)
        bt_lwr = np.quantile(bt_temp, 0.1)
        bt_upr = np.quantile(bt_temp, 0.9)
        bt_std = np.std(bt_temp)
        bt_skew = skew(bt_temp)
        
        acc_1_temp = value[value.iloc[:,1]=="Accelerometer"].iloc[:,2].astype(float)
        accelerometer_mean = np.mean(acc_1_temp)
        accelerometer_lwr = np.quantile(acc_1_temp, 0.1)
        accelerometer_upr = np.quantile(acc_1_temp, 0.9)
        accelerometer_std = np.std(acc_1_temp)
        accelerometer_skew = skew(acc_1_temp)
        
        acc_2_temp = value[value.iloc[:,1]=="Accelerometer"].iloc[:,3].astype(float)
        accelerometer_2_mean = np.mean(acc_2_temp)
        accelerometer_2_lwr = np.quantile(acc_2_temp, 0.1)
        accelerometer_2_upr = np.quantile(acc_2_temp, 0.9)
        accelerometer_2_std = np.std(acc_2_temp)
        accelerometer_2_skew = skew(acc_2_temp)
        
        gyro_1_temp = value[value.iloc[:,1]=="Gyroscope"].iloc[:,2].astype(float)
        gyro_1_mean = np.mean(gyro_1_temp)
        gyro_1_lwr = np.quantile(gyro_1_temp, 0.1)
        gyro_1_upr = np.quantile(gyro_1_temp, 0.9)
        gyro_1_std = np.std(gyro_1_temp)
        
        gyro_2_temp = value[value.iloc[:,1]=="Gyroscope"].iloc[:,3].astype(float)
        gyro_2_mean = np.mean(gyro_2_temp)
        gyro_2_lwr = np.quantile(gyro_2_temp, 0.1)
        gyro_2_upr = np.quantile(gyro_2_temp, 0.9)
        gyro_2_std = np.std(gyro_2_temp)
        
        gyro_3_temp = value[value.iloc[:,1]=="Gyroscope"].iloc[:,4].astype(float)
        gyro_3_mean = np.mean(gyro_3_temp)
        gyro_3_lwr = np.quantile(gyro_3_temp, 0.1)
        gyro_3_upr = np.quantile(gyro_3_temp, 0.9)
        gyro_3_std = np.std(gyro_3_temp)
        
        #attitude_mean = np.mean(value[value.iloc[:,1]=="Attitude"].iloc[:,2].astype(float))
        #gravity_mean = np.mean(value[value.iloc[:,1]=="Gravity"].iloc[:,2].astype(float))

        key_df.loc[file_id, "bluetooth_avg"] = bt_mean
        key_df.loc[file_id, "bluetooth_lwr"] = bt_lwr
        key_df.loc[file_id, "bluetooth_upr"] = bt_upr
        key_df.loc[file_id, "bluetooth_std"] = bt_std
        key_df.loc[file_id, "bluetooth_skew"] = bt_skew
        
        key_df.loc[file_id, "accelerometer_1_avg"] = accelerometer_mean
        key_df.loc[file_id, "accelerometer_1_lwr"] = accelerometer_lwr
        key_df.loc[file_id, "accelerometer_1_upr"] = accelerometer_upr
        key_df.loc[file_id, "accelerometer_1_std"] = accelerometer_std
        key_df.loc[file_id, "accelerometer_1_skew"] = accelerometer_skew
        
        key_df.loc[file_id, "accelerometer_2_avg"] = accelerometer_2_mean
        key_df.loc[file_id, "accelerometer_2_lwr"] = accelerometer_2_lwr
        key_df.loc[file_id, "accelerometer_2_upr"] = accelerometer_2_upr
        key_df.loc[file_id, "accelerometer_2_std"] = accelerometer_2_std
        key_df.loc[file_id, "accelerometer_2_skew"] = accelerometer_2_skew
        
        key_df.loc[file_id, "gyro_1_avg"] = gyro_1_mean
        key_df.loc[file_id, "gyro_1_lwr"] = gyro_1_lwr
        key_df.loc[file_id, "gyro_1_upr"] = gyro_1_upr
        key_df.loc[file_id, "gyro_1_std"] = gyro_1_std
        
        key_df.loc[file_id, "gyro_2_avg"] = gyro_2_mean
        key_df.loc[file_id, "gyro_2_lwr"] = gyro_2_lwr
        key_df.loc[file_id, "gyro_2_upr"] = gyro_2_upr
        key_df.loc[file_id, "gyro_2_std"] = gyro_2_std
        
        key_df.loc[file_id, "gyro_3_avg"] = gyro_3_mean
        key_df.loc[file_id, "gyro_3_lwr"] = gyro_3_lwr
        key_df.loc[file_id, "gyro_3_upr"] = gyro_3_upr
        key_df.loc[file_id, "gyro_3_std"] = gyro_3_std
        
        #key_df.loc[file_id, "attitude_avg"] = attitude_mean
        #key_df.loc[file_id, "gravity_avg"] = gravity_mean
        
    #key_df = pd.get_dummies(key_df, columns=['coarse_grain'])
        
    return key_df

key_df = add_predictor_aggregated(dic, df_train_key)

In [5]:
key_df.columns

Index(['phone_carriage_state', 'distance_in_meters', 'step_size_in_sec',
       'coarse_grain', 'bluetooth_avg', 'bluetooth_lwr', 'bluetooth_upr',
       'bluetooth_std', 'bluetooth_skew', 'accelerometer_1_avg',
       'accelerometer_1_lwr', 'accelerometer_1_upr', 'accelerometer_1_std',
       'accelerometer_1_skew', 'accelerometer_2_avg', 'accelerometer_2_lwr',
       'accelerometer_2_upr', 'accelerometer_2_std', 'accelerometer_2_skew',
       'gyro_1_avg', 'gyro_1_lwr', 'gyro_1_upr', 'gyro_1_std', 'gyro_2_avg',
       'gyro_2_lwr', 'gyro_2_upr', 'gyro_2_std', 'gyro_3_avg', 'gyro_3_lwr',
       'gyro_3_upr', 'gyro_3_std'],
      dtype='object')

In [23]:
key_df.head()

Unnamed: 0_level_0,phone_carriage_state,distance_in_meters,step_size_in_sec,bluetooth_avg,bluetooth_lwr,bluetooth_upr,bluetooth_std,accelerometer_avg,accelerometer_lwr,accelerometer_upr,accelerometer_std,gyroscope_avg,gyroscope_avg_2,attitude_avg,gravity_avg,gyroscope_avg_3,coarse_grain_N,coarse_grain_Y
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
aaadbuvp_tc4tl20.csv,pocket_hand,1.8,50,-62.133482,-74.0,-52.0,7.57381,0.024843,-0.040276,0.05127,0.05127,0.007361,-0.001176,0.726321,0.025127,0.009136,0,1
aaamkcii_tc4tl20.csv,pocket_pocket,3.0,80,-55.255556,-60.0,-49.0,5.254757,0.187249,-0.218625,0.444652,0.444652,-0.01703,-0.019209,-0.410232,0.186804,0.015339,1,0
aabqtowt_tc4tl20.csv,hand_hand,1.2,60,-62.289089,-75.0,-52.0,8.674381,0.050939,0.024933,0.077966,0.077966,0.007156,-0.005191,0.95443,0.050841,-0.006214,1,0
aadkjwss_tc4tl20.csv,pocket_pocket,1.8,20,-67.938259,-78.0,-61.0,6.982572,-0.067156,-0.157225,0.111467,0.111467,0.028015,0.053077,1.337362,-0.069499,0.044304,1,0
aafzrgzt_tc4tl20.csv,hand_hand,1.8,60,-55.760797,-63.0,-48.0,5.289863,0.015989,-0.047058,0.087917,0.087917,-0.007063,-0.00621,0.292535,0.016077,-0.008596,0,1


In [7]:
#key_df.to_csv('aggregated.csv')

## Build model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
#-- Take just variables we want
#-- No Skew
predictor_columns = ['bluetooth_avg', 'bluetooth_lwr', 'bluetooth_upr',
                     'bluetooth_std', 'accelerometer_1_avg', 'accelerometer_1_lwr',
                     'accelerometer_1_upr', 'accelerometer_1_std', 'accelerometer_2_avg',
                     'accelerometer_2_lwr', 'accelerometer_2_upr', 'accelerometer_2_std',
                     'gyro_1_avg', 'gyro_1_lwr', 'gyro_1_upr', 'gyro_1_std', 'gyro_2_avg',
                     'gyro_2_lwr', 'gyro_2_upr', 'gyro_2_std', 'gyro_3_avg', 'gyro_3_lwr',
                     'gyro_3_upr', 'gyro_3_std']

#-- Skew
"""predictor_columns = ['bluetooth_avg', 'bluetooth_lwr', 'bluetooth_upr',
       'bluetooth_std', 'bluetooth_skew', 'accelerometer_1_avg',
       'accelerometer_1_lwr', 'accelerometer_1_upr', 'accelerometer_1_std',
       'accelerometer_1_skew', 'accelerometer_2_avg', 'accelerometer_2_lwr',
       'accelerometer_2_upr', 'accelerometer_2_std', 'accelerometer_2_skew',
       'gyro_1_avg', 'gyro_1_lwr', 'gyro_1_upr', 'gyro_1_std', 'gyro_2_avg',
       'gyro_2_lwr', 'gyro_2_upr', 'gyro_2_std', 'gyro_3_avg', 'gyro_3_lwr',
       'gyro_3_upr', 'gyro_3_std']"""


X = key_df[predictor_columns]
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

y = key_df["distance_in_meters"].apply(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=8)

In [17]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

"""
Fit all of the imported models and store predictions
"""

model1 = RandomForestClassifier()
model1.fit(X_train, y_train)
predictions1 = model1.predict(X_test)

model2 = KNeighborsClassifier()
model2.fit(X_train, y_train)
predictions2 = model2.predict(X_test)

model3 = AdaBoostClassifier()
model3.fit(X_train, y_train)
predictions3 = model3.predict(X_test)

model4 = GradientBoostingClassifier()
model4.fit(X_train, y_train)
predictions4 = model4.predict(X_test)

model5 = MLPClassifier()
model5.fit(X_train, y_train)
predictions5 = model5.predict(X_test)




In [18]:
#-- Print the MSE for each model
print("MSE for Random Forest : ", np.sum((predictions1.astype(float)-y_test.astype(float))**2))
print("MSE for K Neighbors : ", np.sum((predictions2.astype(float)-y_test.astype(float))**2))
print("MSE for Ada Boost : ", np.sum((predictions3.astype(float)-y_test.astype(float))**2))
print("MSE for Gradient Boosting : ", np.sum((predictions4.astype(float)-y_test.astype(float))**2))
print("MSE for Multi Layer Perceptron : ", np.sum((predictions5.astype(float)-y_test.astype(float))**2))

MSE for Random Forest :  528.9300000000001
MSE for K Neighbors :  1497.2400000000002
MSE for Ada Boost :  4862.160000000001
MSE for Gradient Boosting :  2392.7400000000007
MSE for Multi Layer Perceptron :  2486.07


In [11]:
predictions1.astype(float)

array([3. , 4.5, 4.5, ..., 1.8, 1.2, 3. ])

In [12]:
# True values for rough comparison
y_test

fileid
yqjxjjcj_tc4tl20.csv    3.0
oetvnfyd_tc4tl20.csv    4.5
owilsukc_tc4tl20.csv    4.5
dhzcxlwg_tc4tl20.csv    4.5
butaetck_tc4tl20.csv    1.8
                       ... 
ahpvtsni_tc4tl20.csv    1.8
crbdzqme_tc4tl20.csv    1.2
bbkbjbed_tc4tl20.csv    1.8
afyiwipu_tc4tl20.csv    1.2
vibvnhbe_tc4tl20.csv    3.0
Name: distance_in_meters, Length: 5133, dtype: object

# Apply to competition Dev data

In [14]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/dev/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_dev_key.tsv', sep='\t', index_col="fileid")

dev_dict = get_dict(dev_path)

dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

#-- !!!!!!! CHANGE model4 below to whatever model you want to get the output file for!
dev_predictions = predict_from_key_df(dev_key_df, model1, scaler)

df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_dev_trials.tsv', sep='\t', index_col="fileid")

df_dev_trials['distance'] = dev_predictions.astype(float)

df_dev_trials.to_csv('dev_system_output_RF.tsv', sep='\t')

In [39]:
df_dev_trials.head()

Unnamed: 0_level_0,distance
fileid,Unnamed: 1_level_1
abgikaek_tc4tl20.csv,1.2
acehqsss_tc4tl20.csv,3.0
adcmsfnp_tc4tl20.csv,1.8
adljjzjj_tc4tl20.csv,1.2
adzvqmmg_tc4tl20.csv,4.5


# Apply to test

In [None]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/test/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_metadata.tsv', sep='\t', index_col="fileid")

dev_dict = get_dict(dev_path)

dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

#-- !!!!!!! CHANGE model4 below to whatever model you want to get the output file for!
dev_predictions = predict_from_key_df(dev_key_df, model1, scaler)

df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_trials.tsv', sep='\t', index_col="fileid")

df_dev_trials['distance'] = dev_predictions.astype(float)

df_dev_trials.to_csv('test_system_output_RF_no_skew.tsv', sep='\t')

In [20]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/test/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_metadata.tsv', sep='\t', index_col="fileid")

dev_dict = get_dict(dev_path)

dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

#-- !!!!!!! CHANGE model4 below to whatever model you want to get the output file for!
dev_predictions = predict_from_key_df(dev_key_df, model2, scaler)

df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_trials.tsv', sep='\t', index_col="fileid")

df_dev_trials['distance'] = dev_predictions.astype(float)

df_dev_trials.to_csv('test_system_output_KNN_no_skew.tsv', sep='\t')

In [21]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/test/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_metadata.tsv', sep='\t', index_col="fileid")

dev_dict = get_dict(dev_path)

dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

#-- !!!!!!! CHANGE model4 below to whatever model you want to get the output file for!
dev_predictions = predict_from_key_df(dev_key_df, model3, scaler)

df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_trials.tsv', sep='\t', index_col="fileid")

df_dev_trials['distance'] = dev_predictions.astype(float)

df_dev_trials.to_csv('test_system_output_Ada_no_skew.tsv', sep='\t')

In [22]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/test/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_metadata.tsv', sep='\t', index_col="fileid")

dev_dict = get_dict(dev_path)

dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

#-- !!!!!!! CHANGE model4 below to whatever model you want to get the output file for!
dev_predictions = predict_from_key_df(dev_key_df, model4, scaler)

df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_trials.tsv', sep='\t', index_col="fileid")

df_dev_trials['distance'] = dev_predictions.astype(float)

df_dev_trials.to_csv('test_system_output_XGB_no_skew.tsv', sep='\t')

In [23]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/test/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_metadata.tsv', sep='\t', index_col="fileid")

dev_dict = get_dict(dev_path)

dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

#-- !!!!!!! CHANGE model4 below to whatever model you want to get the output file for!
dev_predictions = predict_from_key_df(dev_key_df, model5, scaler)

df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_test_trials.tsv', sep='\t', index_col="fileid")

df_dev_trials['distance'] = dev_predictions.astype(float)

df_dev_trials.to_csv('test_system_output_MLP_no_skew.tsv', sep='\t')