In [1]:
import pandas as pd
import numpy as np

import glob
import csv

In [2]:
# Set path to files
path = 'Data/tc4tl_train/data/train/'

# Read the .tsv key file
df_train_key = pd.read_csv('Data/tc4tl_train/docs/tc4tl_train_key.tsv', sep='\t', index_col="fileid")

In [3]:
def get_dict(path):
    # This bit will allow us to cycle through all files in the folder
    all_files = glob.glob(path + "/*.csv")

    """
    EDIT - Adapted from the first part of Owen's code
    Changed list to dictionary so that file name would be stored also
        - might be easier to then link with the key files?
        - "li" changed to "dic"
        
    Output should be a dictionary where each key value pair is as follows
        - key = file name
        - value = dataframe with bluetooth, accelerometer data etc
    """

    # Create an empty list
    dic = {}
    # Read in each file from the folder and add them to the list
    # WARNING This may not read in the CSVs in the order they were in in the folder
    for filename in all_files:
        df = pd.read_csv(filename, header=None, skiprows=7, sep='\n', skip_blank_lines=False, quoting=csv.QUOTE_NONE)
        df = df[0].str.split(',', expand=True)
        dic[filename[len(path):]] = df
    
        
    return dic

dic = get_dict(path)

In [4]:
# Printout to show an example of the dictionary
key = next(iter(dic))
value = dic[key]

print(key)
print("="*20)
print(value.head())

jpprpdob_tc4tl20.csv
       0               1                     2                      3  \
0  0.000   Accelerometer    0.0350799560546875    -0.9711151123046875   
1  0.002       Gyroscope  0.022057099267840385  -0.007673632353544235   
2  0.004        Attitude      1.34190080377603    0.14275428933014317   
3  0.005         Gravity   0.03228135406970978    -0.9739176034927368   
4  0.006  Magnetic-field    21.337322235107422    -28.389331817626953   

                     4     5     6     7  
0  -0.2176361083984375  None  None  None  
1  0.00640676636248827  None  None  None  
2  -0.7818029121247423  None  None  None  
3  -0.2245941013097763  None  None  None  
4   -22.16741943359375  high  None  None  


In [5]:
"""
This function will create a dataframe from the key tsv file provided by NIST

Then it will add a column with the average bluetooth, accelerometer etc. for each file

It also gets dummy variables for the coarse_grain variable
"""
def add_predictor_aggregated(dic, key_df):
    key_df = key_df.copy()
    """
    dic - output of the get_dict() function
    key_df - the dataframe obtained from reading the .tsv file provided by nist
    """
    # Get rough summary statistics for each file to create a baseline model and append to key dataframe
    key_df["bluetooth_avg"] = np.nan
    key_df["accelerometer_avg"] = np.nan
    key_df["gyroscope_avg"] = np.nan
    key_df["attitude_avg"] = np.nan
    key_df["gravity_avg"] = np.nan

    for file_id, value in dic.items():
        bt_mean = np.mean(value[value.iloc[:,1]=="Bluetooth"].iloc[:,2].astype(float))
        accelerometer_mean = np.mean(value[value.iloc[:,1]=="Accelerometer"].iloc[:,2].astype(float))
        gyroscope_mean = np.mean(value[value.iloc[:,1]=="Gyroscope"].iloc[:,2].astype(float))
        attitude_mean = np.mean(value[value.iloc[:,1]=="Attitude"].iloc[:,2].astype(float))
        gravity_mean = np.mean(value[value.iloc[:,1]=="Gravity"].iloc[:,2].astype(float))

        key_df.loc[file_id, "bluetooth_avg"] = bt_mean
        key_df.loc[file_id, "accelerometer_avg"] = accelerometer_mean
        key_df.loc[file_id, "gyroscope_avg"] = gyroscope_mean
        key_df.loc[file_id, "attitude_avg"] = attitude_mean
        key_df.loc[file_id, "gravity_avg"] = gravity_mean
        
    key_df = pd.get_dummies(key_df, columns=['coarse_grain'])
        
    return key_df

key_df = add_predictor_aggregated(dic, df_train_key)

In [6]:
key_df.head()

Unnamed: 0_level_0,phone_carriage_state,distance_in_meters,step_size_in_sec,bluetooth_avg,accelerometer_avg,gyroscope_avg,attitude_avg,gravity_avg,coarse_grain_N,coarse_grain_Y
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
aaadbuvp_tc4tl20.csv,pocket_hand,1.8,50,-62.133482,0.024843,0.007361,0.726321,0.025127,0,1
aaamkcii_tc4tl20.csv,pocket_pocket,3.0,80,-55.255556,0.187249,-0.01703,-0.410232,0.186804,1,0
aabqtowt_tc4tl20.csv,hand_hand,1.2,60,-62.289089,0.050939,0.007156,0.95443,0.050841,1,0
aadkjwss_tc4tl20.csv,pocket_pocket,1.8,20,-67.938259,-0.067156,0.028015,1.337362,-0.069499,1,0
aafzrgzt_tc4tl20.csv,hand_hand,1.8,60,-55.760797,0.015989,-0.007063,0.292535,0.016077,0,1


## Build model

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
#from sklearn.neighbors import KNeighborsRegressor
#from sklearn.neural_network import MLPRegressor

from sklearn.metrics import plot_roc_curve

In [8]:
key_df.head()

Unnamed: 0_level_0,phone_carriage_state,distance_in_meters,step_size_in_sec,bluetooth_avg,accelerometer_avg,gyroscope_avg,attitude_avg,gravity_avg,coarse_grain_N,coarse_grain_Y
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
aaadbuvp_tc4tl20.csv,pocket_hand,1.8,50,-62.133482,0.024843,0.007361,0.726321,0.025127,0,1
aaamkcii_tc4tl20.csv,pocket_pocket,3.0,80,-55.255556,0.187249,-0.01703,-0.410232,0.186804,1,0
aabqtowt_tc4tl20.csv,hand_hand,1.2,60,-62.289089,0.050939,0.007156,0.95443,0.050841,1,0
aadkjwss_tc4tl20.csv,pocket_pocket,1.8,20,-67.938259,-0.067156,0.028015,1.337362,-0.069499,1,0
aafzrgzt_tc4tl20.csv,hand_hand,1.8,60,-55.760797,0.015989,-0.007063,0.292535,0.016077,0,1


In [9]:
# Choose predictor columns for baseline model
# Results posted in slack were with and without the coarse_grain_N variable
predictor_columns = ["bluetooth_avg", "accelerometer_avg", "gyroscope_avg", 
                     "attitude_avg", "gravity_avg"]
#predictor_columns = ["bluetooth_avg", "accelerometer_avg", "gyroscope_avg", 
#                     "attitude_avg", "gravity_avg", "coarse_grain_N"]

X = key_df[predictor_columns]
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

y = key_df["distance_in_meters"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=8)

In [10]:
"""
SKLearn Models sample code for sklearn models

Models:
    o KNeighborsClassifier()
    o RandomForestClassifier()
    o MLPClassifier()
    o SVC()
"""
model = RandomForestRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [11]:
predictions

array([3.513, 3.651, 4.131, ..., 1.842, 1.344, 3.483])

In [12]:
# True values for rough comparison
y_test

fileid
yqjxjjcj_tc4tl20.csv    3.0
oetvnfyd_tc4tl20.csv    4.5
owilsukc_tc4tl20.csv    4.5
dhzcxlwg_tc4tl20.csv    4.5
butaetck_tc4tl20.csv    1.8
                       ... 
ahpvtsni_tc4tl20.csv    1.8
crbdzqme_tc4tl20.csv    1.2
bbkbjbed_tc4tl20.csv    1.8
afyiwipu_tc4tl20.csv    1.2
vibvnhbe_tc4tl20.csv    3.0
Name: distance_in_meters, Length: 5133, dtype: float64

# Apply to competition test data

In [13]:
# Set path to files
dev_path = 'Data/tc4tl_dev_test/data/dev/'

# Read the .tsv key file
df_dev_key = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_dev_key.tsv', sep='\t', index_col="fileid")

In [14]:
dev_dict = get_dict(dev_path)

In [15]:
dev_key_df = add_predictor_aggregated(dev_dict, df_dev_key)

In [16]:
# Printout to show an example of the dictionary
key = next(iter(dev_dict))
value = dev_dict[key]

print(key)
print("="*20)
print(value.head())

ndffkyyx_tc4tl20.csv
       0          1    2     3     4     5     6     7
0  0.000  Bluetooth  -54  None  None  None  None  None
1  0.001  Bluetooth  -54  None  None  None  None  None
2  0.034  Bluetooth  -59  None  None  None  None  None
3  0.035  Bluetooth  -59  None  None  None  None  None
4  0.110  Bluetooth  -58  None  None  None  None  None


In [17]:
dev_key_df.head()

Unnamed: 0_level_0,phone_carriage_state,distance_in_meters,step_size_in_sec,bluetooth_avg,accelerometer_avg,gyroscope_avg,attitude_avg,gravity_avg,coarse_grain_N,coarse_grain_Y
fileid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
abgikaek_tc4tl20.csv,unknown_unknown,1.2,60.0,-54.102249,-0.015496,0.024632,0.36858,-0.021319,1,0
acehqsss_tc4tl20.csv,pocket_pocket,4.5,80.0,-66.210706,0.174597,-0.001085,-1.334979,0.200371,0,1
adcmsfnp_tc4tl20.csv,unknown_unknown,3.0,70.0,-57.761044,0.007031,0.03637,0.473086,0.008276,1,0
adljjzjj_tc4tl20.csv,unknown_unknown,1.2,150.0,-62.629252,-0.642454,-0.004362,-0.861068,-0.643719,1,0
adzvqmmg_tc4tl20.csv,pocket_hand,1.8,30.0,-62.978268,0.01254,-0.014845,0.348743,0.002637,0,1


In [18]:
def predict_from_key_df(key_df, model, scaler):
    X = key_df[predictor_columns]
    X = scaler.transform(X)
    
    return model.predict(X)

dev_predictions = predict_from_key_df(dev_key_df, model, scaler)

In [19]:
df_dev_trials = pd.read_csv('Data/tc4tl_dev_test/docs/tc4tl_dev_trials.tsv', sep='\t', index_col="fileid")

In [20]:
df_dev_trials['distance'] = dev_predictions

In [21]:
df_dev_trials.head()

Unnamed: 0_level_0,distance
fileid,Unnamed: 1_level_1
abgikaek_tc4tl20.csv,1.665
acehqsss_tc4tl20.csv,4.191
adcmsfnp_tc4tl20.csv,2.661
adljjzjj_tc4tl20.csv,1.551
adzvqmmg_tc4tl20.csv,3.978


In [22]:
df_dev_trials.to_csv('rian_dev_system_output', sep='\t')