In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw tracking data (the CSV has no header row, so column names are supplied here).
columns = ['timestamp', 'tag_id', 'x_pos', 'y_pos', 'heading', 'direction', 'energy', 'speed', 'total_distance']
data = pd.read_csv('../data/input/raw/tromso_stromsgodset_first.csv', names=columns)

# Convert the timestamp to seconds elapsed since the first record.
# `.iloc[0]` is positional, so it does not depend on the index labels.
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['timestamp'] = (data['timestamp'] - data['timestamp'].iloc[0]).dt.total_seconds()

# Player (tag) ids in order of first appearance.
player_ids = data['tag_id'].unique()

# Mean x/y position per player, aligned position-by-position with `player_ids`.
position_means = data.groupby('tag_id')[['x_pos', 'y_pos']].mean().reindex(player_ids)
x_means = position_means['x_pos'].tolist()  # a list of x-position mean for each player
y_means = position_means['y_pos'].tolist()  # a list of y-position mean for each player

# Remove the goalkeeper and the substitute players.
# Both index sets are computed against the ORIGINAL `player_ids` and removed in a
# single np.delete call: deleting the goalkeeper first would shift every later
# position by one, so the substitute indices would point at the wrong players
# (or past the end of the array).
goalkeeper_idx = np.argsort(x_means)[:1]    # a goalkeeper has the lowest x-position mean
substitute_idx = np.argsort(y_means)[-3:]   # substitute players have the highest y-position mean
excluded_idx = np.union1d(goalkeeper_idx, substitute_idx)
player_ids = np.delete(player_ids, excluded_idx)
data = data[data['tag_id'].isin(player_ids)]

In [3]:
# Down-sample the tracking data: for every quarter-second window [t, t + 0.25)
# compute each player's mean x/y position, and keep only the windows in which
# every player has a value.
BIN_WIDTH = 0.25  # seconds; a power of two, so the floor-based binning below is exact in floating point

tag_ids = data['tag_id'].unique()

# A window is only considered when some record falls exactly on its start time.
timestamp_quarter = [time for time in data['timestamp'].unique() if time % BIN_WIDTH == 0]

# Label each record with the start of the window containing it, then average per
# (window, player) in one pass instead of re-scanning the whole frame for every
# window/player pair.
binned = data.assign(window_start=np.floor(data['timestamp'] / BIN_WIDTH) * BIN_WIDTH)
window_means = binned.groupby(['window_start', 'tag_id'])[['x_pos', 'y_pos']].mean()

# One row per candidate window, one column per player (same player order as `tag_ids`).
# Window/player pairs without any record come out as NaN.
x_wide = window_means['x_pos'].unstack('tag_id').reindex(index=timestamp_quarter, columns=tag_ids)
y_wide = window_means['y_pos'].unstack('tag_id').reindex(index=timestamp_quarter, columns=tag_ids)

# Keep only windows where every player has both coordinates.
# (`np.nan not in some_list` cannot be used for this: NaN never compares equal to
# itself, so the membership test does not detect missing values.)
complete = x_wide.notna().all(axis=1) & y_wide.notna().all(axis=1)
x_complete = x_wide.loc[complete]
y_complete = y_wide.loc[complete]

# Plain Python lists: one timestamp per kept window, and one per-player list of
# positions per kept window.
timestamp_new = x_complete.index.tolist()
x_new = x_complete.to_numpy().tolist()
y_new = y_complete.to_numpy().tolist()

In [10]:
# Flatten the per-window results into a long table with one row per (window, player).
# The number of players is taken from the data rather than hard-coded, and each
# timestamp is repeated in place (np.repeat) so rows stay aligned with x_new / y_new
# regardless of the order of timestamp_new.
tag_ids = data['tag_id'].unique()
n_players = len(tag_ids)

new_data = pd.DataFrame({'timestamp' : np.repeat(timestamp_new, n_players),
              'tag_id' : np.tile(tag_ids, len(timestamp_new)),
              'x_pos' : [x_pos for sublist in x_new for x_pos in sublist],
              'y_pos' : [y_pos for sublist in y_new for y_pos in sublist]})

In [11]:
# Preview the first rows of the resampled frame.
new_data.head()

Unnamed: 0,timestamp,tag_id,x_pos,y_pos
0,0.0,2,26.645492,29.491699
1,0.0,5,35.539222,30.331989
2,0.0,7,41.557287,38.651601
3,0.0,8,28.566362,39.527918
4,0.0,9,32.2538,12.7243


In [12]:
# Write the resampled frame to the processed-data CSV; index=False leaves out the row-index column.
new_data.to_csv('../data/input/processed/prepped_tromso_stromsgodset_first.csv', index=False)