<a href="https://colab.research.google.com/github/njanwani/miniproject1/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from tqdm import tqdm
import csv, json
import scipy.stats

import urllib.request
# Fetch the train and test JSON files from the project's GitHub repository and
# save them next to the notebook as 'train.json' and 'test.json'.
# urlretrieve returns a (local_filename, headers) tuple; in a notebook the
# tuple of the last call is what gets displayed as the cell output.
urllib.request.urlretrieve('https://raw.githubusercontent.com/njanwani/miniproject1/main/train.json', 'train.json')
urllib.request.urlretrieve('https://raw.githubusercontent.com/njanwani/miniproject1/main/test.json', 'test.json')

('test.json', <http.client.HTTPMessage at 0x7f778f6bf970>)

# Helper functions

In [2]:
def mean_step_speed(coords):
    """Mean step speed of the entire track.

    The average per-step speed, i.e. the mean Euclidean distance between
    points that are adjacent in time.

    Parameters
    ----------
    coords : numpy.ndarray
        Track samples, one row per time point. Column 0 is skipped and the
        remaining columns are used as the position of the sample.

    Returns
    -------
    float
        The average step speed, or 0 when the track has fewer than two
        points (there is no step to measure).
    """
    # An empty or single-point track has no steps. This also covers the
    # 1-D array produced by np.array([]), which cannot be sliced by column.
    if coords.shape[0] < 2:
        return 0

    # Displacement between consecutive rows, position columns only.
    steps = np.diff(coords[:, 1:], axis=0)

    # Euclidean length of each step, then the average over all steps.
    return np.mean(np.linalg.norm(steps, axis=1))


def stddev_step_speed(coords):
    """Standard deviation of the step speed of the entire track.

    The (population) standard deviation, as computed by ``np.std``, of the
    Euclidean distances between points that are adjacent in time.

    Parameters
    ----------
    coords : numpy.ndarray
        Track samples, one row per time point. Column 0 is skipped and the
        remaining columns are used as the position of the sample.

    Returns
    -------
    float
        The stddev of the step speed, or 0 when the track has fewer than
        two points (there is no step to measure).
    """
    # An empty or single-point track has no steps. This also covers the
    # 1-D array produced by np.array([]), which cannot be sliced by column.
    if coords.shape[0] < 2:
        return 0

    # Displacement between consecutive rows, position columns only.
    steps = np.diff(coords[:, 1:], axis=0)

    # Euclidean length of each step, then the spread of those lengths.
    return np.std(np.linalg.norm(steps, axis=1))


def track_length(coords):
    """Total path length of the track.

    The sum of the Euclidean distances between points that are adjacent in
    time, i.e. the distance travelled along the track.

    Parameters
    ----------
    coords : numpy.ndarray
        Track samples, one row per time point. Column 0 is skipped and the
        remaining columns are used as the position of the sample.

    Returns
    -------
    float
        The length of the entire track; 0.0 when the track has fewer than
        two points.
    """
    # An empty or single-point track has no steps. This also covers the
    # 1-D array produced by np.array([]), which cannot be sliced by column.
    if coords.shape[0] < 2:
        return 0.0

    # Displacement between consecutive rows, position columns only.
    steps = np.diff(coords[:, 1:], axis=0)

    # Add up the Euclidean length of every step.
    return np.sum(np.linalg.norm(steps, axis=1))


def e2e_distance(coords):
    """End-to-end distance of the track.

    Straight-line (Euclidean) distance between the position in the first
    row and the position in the last row. Column 0 of each row is skipped;
    the remaining columns are the position.

    Returns
    -------
    float
        The end-to-end distance of the entire track.
    """
    # Net displacement from the first sample to the last one
    displacement = coords[-1, 1:] - coords[0, 1:]

    return np.linalg.norm(displacement)


def duration(coords):
    """Duration of the track.

    Difference between column 0 of the last row and column 0 of the first
    row, i.e. the time elapsed between the first and last sample.

    Returns
    -------
    int
        The end-to-end duration of the entire track.
    """
    # Column 0 holds the time stamp of each sample
    timestamps = coords[:, 0]

    return timestamps[-1] - timestamps[0]

def min_step(coords):
    """Smallest step of the entire track.

    The minimum Euclidean distance between points that are adjacent in time.

    Parameters
    ----------
    coords : numpy.ndarray
        Track samples, one row per time point. Column 0 is skipped and the
        remaining columns are used as the position of the sample.

    Returns
    -------
    float
        The shortest step, or 0 when the track has fewer than two points
        (there is no step to measure).
    """
    # Explicit guard instead of catching the error np.min raises on an empty
    # sequence. This also covers the 1-D array produced by np.array([]).
    if coords.shape[0] < 2:
        return 0

    # Displacement between consecutive rows, position columns only.
    steps = np.diff(coords[:, 1:], axis=0)

    # Euclidean length of each step; keep the smallest.
    return np.min(np.linalg.norm(steps, axis=1))


def max_step(coords):
    """Largest step of the entire track.

    The maximum Euclidean distance between points that are adjacent in time.

    Parameters
    ----------
    coords : numpy.ndarray
        Track samples, one row per time point. Column 0 is skipped and the
        remaining columns are used as the position of the sample.

    Returns
    -------
    float
        The longest step, or 0 when the track has fewer than two points
        (there is no step to measure).
    """
    # Explicit guard instead of catching the error np.max raises on an empty
    # sequence. This also covers the 1-D array produced by np.array([]).
    if coords.shape[0] < 2:
        return 0

    # Displacement between consecutive rows, position columns only.
    steps = np.diff(coords[:, 1:], axis=0)

    # Euclidean length of each step; keep the largest.
    return np.max(np.linalg.norm(steps, axis=1))

def predictability(coords):
    """Linear correlation between the first two columns of the track.

    Runs ``scipy.stats.linregress`` with column 0 as the independent
    variable and column 1 as the dependent variable and returns the
    correlation coefficient of the fit.

    Note that this is r (``rvalue``), not r^2: the sign is preserved, so the
    result lies in [-1, 1].

    Parameters
    ----------
    coords : numpy.ndarray
        Track samples, one row per time point.

    Returns
    -------
    float
        The correlation coefficient r, or 0 when the track has fewer than
        four rows.
    """
    # NOTE(review): only columns 0 and 1 are used; any further column is
    # ignored by this feature -- confirm that is intended.
    if len(coords) < 4:
        return 0

    return scipy.stats.linregress(coords[:, 0], coords[:, 1]).rvalue

# Pre-processing

In [11]:
# Feature extractors applied to every track, in output-column order.
FEATURE_LIST = [
    mean_step_speed,
    stddev_step_speed,
    track_length,
    e2e_distance,
    duration,
    min_step,
    max_step,
    predictability,
]
# Name of each feature, entry for entry with FEATURE_LIST.
FEATURE_STRINGS = [featfunc.__name__ for featfunc in FEATURE_LIST]

In [4]:
def process(json_file, OUTPUT_FILENAME):
    """Compute every feature in ``FEATURE_LIST`` per track and write a CSV.

    Parameters
    ----------
    json_file : str
        Path of a JSON file holding a mapping from track uid to a record.
        Each record is read for its ``'label'`` entry and its ``'txy'``
        entry; the latter is converted with ``np.array`` and handed to the
        feature functions.
    OUTPUT_FILENAME : str
        Path of the CSV file to write (opened in ``'w'`` mode).

    The CSV has the columns ``uid``, ``label`` and then one column per
    feature function, named after the function.
    """
    with open(json_file, 'r') as f:
        track_data = json.load(f)

    # Column order of the feature csv
    header = ['uid', 'label'] + [featfunc.__name__ for featfunc in FEATURE_LIST]

    features = []
    for uid, track in tqdm(track_data.items()):
        curr_row = {
            'uid': uid,
            'label': track['label']
        }

        # Build the array once per track instead of once per feature.
        coords = np.array(track['txy'])
        for featfunc in FEATURE_LIST:
            curr_row[featfunc.__name__] = featfunc(coords)

        features.append(curr_row)

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(OUTPUT_FILENAME, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        writer.writerows(features)

    print("Written to:", OUTPUT_FILENAME)

In [5]:
def get_raw(json_file, file_out):
    """Load the raw tracks from a JSON file and optionally dump them to CSV.

    Parameters
    ----------
    json_file : str
        Path of a JSON file holding a mapping from track uid to a record.
        Each record is read for its ``'label'`` entry and its ``'txy'``
        entry.
    file_out : str or None
        Path of a CSV file to write with the columns ``uid``, ``label`` and
        ``track``. Pass ``None`` to skip writing.

    Returns
    -------
    list of dict
        One dict per track with the keys ``'uid'``, ``'label'`` and
        ``'track'`` (the ``'txy'`` entry converted with ``np.array``).
    """
    print('Reading json...')
    with open(json_file, 'r') as f:
        track_data = json.load(f)

    header = ['uid', 'label', 'track']

    features = []
    for uid, track in tqdm(track_data.items()):
        features.append({
            'uid': uid,
            'label': track['label'],
            'track': np.array(track['txy'])
        })

    if file_out is not None:
        print(f'Writing to {file_out}...')
        # newline='' lets the csv module control line endings itself; without
        # it every row is followed by a blank line on Windows.
        # NOTE(review): the 'track' cell is the text form of a numpy array,
        # which numpy may abbreviate for long arrays -- confirm the CSV is
        # not expected to round-trip the coordinates.
        with open(file_out, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=header)
            writer.writeheader()
            for r in tqdm(features):
                writer.writerow(r)

        print("Written to:", file_out)
    return features

In [6]:
# process('test.json', 'test_features.csv')

In [7]:
# data = get_raw('train.json', None)

In [8]:
# import pandas as pd
# df = pd.DataFrame(data)

In [9]:
# import matplotlib.pyplot as plt
# print(df['label'][:50] == 1)
# fig = plt.figure(figsize=(10,100))
# alive, dead = 1, 1
# nrows = len(df['label'][:50] == 1)
# for i in range(1, 51):
#   if df['label'][i] == 0:
#     plt.subplot(nrows, 2, dead * 2 - 1)
#     plt.scatter(df['track'][i][:,1],df['track'][i][:,2], s = 1)
#     dead += 1
#   else:
#     plt.subplot(nrows, 2, alive * 2)
#     plt.scatter(df['track'][i][:,1],df['track'][i][:,2], s = 1)
#     alive += 1

In [10]:
# import numpy as np
# import matplotlib.pyplot as plt
# import matplotlib.animation as animation

# fig, ax = plt.subplots()

# track = df['track'][21]

# sp, = ax.plot(track[:,1][0],track[:,2][0])


# def animate(i):
#     ax.relim()
#     ax.autoscale_view()
#     sp.set_data(track[:,1][:i],track[:,2][:i])
#     return sp,

# ani = animation.FuncAnimation(fig, animate, repeat=True,
#                                     frames=len(track[:,1]) - 1, interval=1)

# # To save the animation using Pillow as a gif
# writer = animation.PillowWriter(fps=60,
#                                 metadata=dict(artist='Me'),
#                                 bitrate=1800)
# ani.save('scatter.gif', writer=writer)
# plt.close()
# from IPython.display import Image
# Image(open('scatter.gif','rb').read())