## Importing libraries

In [1]:
import os
import glob
import pandas as pd
import numpy as np

### Reading data files

In [2]:
data = []
for f1,f2,f3 in os.walk("../Downloads/Geolife Trajectories 1.3/Geolife Trajectories 1.3/Data/"):
    for p in f3:
        if(p.endswith(".plt")):
            data.append(np.genfromtxt(os.path.join(f1,p),delimiter=',',skip_header=6,dtype=np.object))

### Columns: "Latitude","Longitude","Zeros","Altitude","DateInDays","DateString","TimeString"
#### Selecting only trajectories with more than 30 entries

In [3]:
l = [d for d in data if d.shape[0] > 30]
len(l)

4908

### Creating trajectory windows composed by 30 entries each

In [4]:
sequences = []
for arq in l:
    for i in range(0,int(arq.shape[0]/30)):
        sequences.append(arq[i*30:(i*30)+30])

### Checking results

In [5]:
print(sequences[1][0])
print(l[0][30])

[b'39.984621' b'116.313941' b'0' b'121' b'39744.1219328704' b'2008-10-23'
 b'02:55:35']
[b'39.984621' b'116.313941' b'0' b'121' b'39744.1219328704' b'2008-10-23'
 b'02:55:35']


### Verifying if the take funcion is capturing only useful columns

In [6]:
np.take(sequences[0][0],[0,1,3,4])

array([b'39.984702', b'116.318417', b'492', b'39744.1201851852'],
      dtype=object)

### Discarding unnecessary columns

In [7]:
useful = []
for s in sequences:
    traj = []
    for t in s:
        traj.append(np.take(t,[0,1,3,4]))
    useful.append(np.array(traj))

### Model-ready dataset

In [8]:
len(useful)

285211

In [9]:
useful = np.array(useful,dtype=np.float32)

In [10]:
useful[0][0]
data_array = []
for r1 in useful:
    for r2 in r1:
        data_array.append(r2)
data_array = np.array(data_array)

### Applying MinMax Scaler on features to improve model performance

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
mm = MinMaxScaler()
scaled_data = mm.fit_transform(data_array)

### Storing preprocessed data into a file

In [13]:
scaled_data

array([[0.0570538 , 0.75224483, 0.23634978, 0.31873703],
       [0.05705375, 0.7522453 , 0.23634978, 0.31873703],
       [0.05705376, 0.75224483, 0.23634978, 0.31873703],
       ...,
       [0.05704658, 0.75206816, 0.23501602, 0.42152977],
       [0.05704621, 0.7520684 , 0.23503958, 0.42152977],
       [0.05704588, 0.75206816, 0.23506312, 0.42152977]], dtype=float32)

In [14]:
np.savetxt("preprocessed_Data.csv",scaled_data,delimiter=',')