## Importing libraries

In [1]:
import os
import glob
import pandas as pd
import numpy as np

### Reading data files

In [2]:
data = []
data_labels = []
l = None
label_index = -1
for f1,f2,f3 in os.walk("Geolife Trajectories 1.3/Data/"):
    for p in f3:
        if (p.startswith("label")):
            l = np.genfromtxt(os.path.join(f1,p),delimiter='\t',skip_header=1,dtype=np.object)
            if (l.shape == (3,)):
                l = l.reshape( (1,3) )
            data_labels.append(l)
            label_index += 1
        if(p.endswith(".plt") and l is not None):
                data.append( ( label_index,np.genfromtxt(os.path.join(f1,p),delimiter=',',skip_header=6,dtype=np.object)) )
    if (len(f2) == 0):
        l = None

### Columns: "Latitude","Longitude","Zeros","Altitude","DateInDays","DateString","TimeString"
#### Selecting only trajectories with more than 30 entries

In [3]:
bus, walk, car, taxi, bike = [], [], [], [], []
for label_index, entries in data:
    for label in data_labels[label_index]:
        start_time = label[0]
        end_time = label[1]
        transportation = label[2]
        trajectory = []
        for entry in entries:
            entry_time = entry[5].replace(b'-',b'/') + b' ' + entry[6]
            if (start_time <= entry_time <= end_time):
                    trajectory.append(entry)
        if (transportation == b'bus'):
            bus.append(np.array(trajectory))
        if (transportation == b'walk'):
            walk.append(np.array(trajectory))
        if (transportation == b'car'):
            car.append(np.array(trajectory))
        if (transportation == b'taxi'):
            taxi.append(np.array(trajectory))
        if (transportation == b'bike'):
            bike.append(np.array(trajectory))

In [4]:
print (len(bus), len(car), len(taxi), len(walk), len(bike))

1588698 1232085 916817 3979463 915317


In [5]:
l_w = [d for d in walk if d.shape[0] > 30]
len(l_w)

3535

In [6]:
l_b = [d for d in bus if d.shape[0] > 30]
len(l_b)

1808

In [7]:
l_c = [d for d in car if d.shape[0] > 30]
len(l_c)

788

In [8]:
l_t = [d for d in taxi if d.shape[0] > 30]
len(l_t)

468

In [9]:
l_bk = [d for d in bike if d.shape[0] > 30]
len(l_bk)

1527

### Creating trajectory windows composed by 30 entries each

In [47]:
index = len(l_b)
l_b_train = l_b[:int(index * 0.8)]
l_b_test = l_b[int(index * 0.8):]

In [48]:
sequences_train = []
for arq in l_b_train:
    for i in range(0,int(arq.shape[0]/30)):
        sequences_train.append(arq[i*30:(i*30)+30])
        
sequences_test = []
for arq in l_b_test:
    for i in range(0,int(arq.shape[0]/30)):
        sequences_test.append(arq[i*30:(i*30)+30])

### Checking results

In [50]:
print(sequences_train[1][0])
print(l_b[0][30])

[b'39.9365666' b'116.3036766' b'0' b'0' b'39535.3686574074' b'2008-03-28'
 b'08:50:52']
[b'39.9365666' b'116.3036766' b'0' b'0' b'39535.3686574074' b'2008-03-28'
 b'08:50:52']


### Verifying if the take funcion is capturing only useful columns

In [52]:
np.take(sequences_train[0][0],[0,1,3,4])

array([b'39.9620983', b'116.3015949', b'0', b'39535.3642361111'],
      dtype=object)

### Discarding unnecessary columns

In [53]:
useful_train = []
for s in sequences_train:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful_train.append(np.array(traj))
    
useful_test = []
for s in sequences_test:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful_test.append(np.array(traj))

### Model-ready dataset

In [54]:
len(useful_train)

30101

In [55]:
len(useful_test)

11798

In [56]:
useful_train = np.array(useful_train,dtype=np.float32)
useful_test = np.array(useful_test,dtype=np.float32)

In [57]:
useful_train[0][0]
data_array = []
for r1 in useful_train:
    for r2 in r1:
        data_array.append(r2)
data_array_train = np.array(data_array)

useful_test[0][0]
data_array = []
for r1 in useful_test:
    for r2 in r1:
        data_array.append(r2)
data_array_test = np.array(data_array)

### Applying MinMax Scaler on features to improve model performance

In [58]:
from sklearn.preprocessing import MinMaxScaler

In [59]:
mm = MinMaxScaler()
scaled_data_train = mm.fit_transform(data_array_train)
scaled_data_test = mm.transform(data_array_test)

### Storing preprocessed data into a file

In [60]:
scaled_data_train

array([[0.8998635 , 0.8851495 , 0.30708322, 0.20280075],
       [0.89916515, 0.8851869 , 0.30708322, 0.20280266],
       [0.89916277, 0.88518786, 0.30708322, 0.20280266],
       ...,
       [0.9090991 , 0.88569593, 0.30708322, 0.49342537],
       [0.90909886, 0.8856952 , 0.30708322, 0.49342537],
       [0.9090996 , 0.8856952 , 0.30708322, 0.49342537]], dtype=float32)

In [61]:
np.savetxt("preprocessed_Data_train.csv",scaled_data_train,delimiter=',')

In [62]:
np.savetxt("preprocessed_Data_test.csv",scaled_data_test,delimiter=',')

In [67]:
sequences_ = []
for arq in l_w:
    for i in range(0,int(arq.shape[0]/30)):
        sequences_.append(arq[i*30:(i*30)+30])
        
useful = []
for s in sequences_:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful.append(np.array(traj))
    
useful = np.array(useful,dtype=np.float32)

useful[0][0]
data_array = []
for r1 in useful:
    for r2 in r1:
        data_array.append(r2)
data_array_ = np.array(data_array)
scaled_data_ = mm.transform(data_array_)
np.savetxt("preprocessed_Data_walk.csv",scaled_data_,delimiter=',')

In [68]:
sequences_ = []
for arq in l_c:
    for i in range(0,int(arq.shape[0]/30)):
        sequences_.append(arq[i*30:(i*30)+30])
        
useful = []
for s in sequences_:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful.append(np.array(traj))
    
useful = np.array(useful,dtype=np.float32)

useful[0][0]
data_array = []
for r1 in useful:
    for r2 in r1:
        data_array.append(r2)
data_array_ = np.array(data_array)
scaled_data_ = mm.transform(data_array_)
np.savetxt("preprocessed_Data_car.csv",scaled_data_,delimiter=',')

In [69]:
sequences_ = []
for arq in l_bk:
    for i in range(0,int(arq.shape[0]/30)):
        sequences_.append(arq[i*30:(i*30)+30])
        
useful = []
for s in sequences_:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful.append(np.array(traj))
    
useful = np.array(useful,dtype=np.float32)

useful[0][0]
data_array = []
for r1 in useful:
    for r2 in r1:
        data_array.append(r2)
data_array_ = np.array(data_array)
scaled_data_ = mm.transform(data_array_)
np.savetxt("preprocessed_Data_bike.csv",scaled_data_,delimiter=',')

In [70]:
sequences_ = []
for arq in l_t:
    for i in range(0,int(arq.shape[0]/30)):
        sequences_.append(arq[i*30:(i*30)+30])
        
useful = []
for s in sequences_:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful.append(np.array(traj))
    
useful = np.array(useful,dtype=np.float32)

useful[0][0]
data_array = []
for r1 in useful:
    for r2 in r1:
        data_array.append(r2)
data_array_ = np.array(data_array)
scaled_data_ = mm.transform(data_array_)
np.savetxt("preprocessed_Data_taxi.csv",scaled_data_,delimiter=',')