In [1]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    """Load one trajectory ``.plt`` file into a four-column DataFrame.

    The first six lines of the file are skipped and the remainder is parsed
    as header-less CSV. Columns 0, 1 and 3 are returned as ``lat``, ``lon``
    and ``alt``; columns 5 and 6 (date text and time text) are joined and
    parsed into ``time``.

    Parameters
    ----------
    plt_file : path or buffer accepted by ``pd.read_csv``

    Returns
    -------
    pd.DataFrame
        Columns ``lat``, ``lon``, ``alt``, ``time`` in that order. Timestamps
        that do not match ``%Y-%m-%d %H:%M:%S`` are returned as ``NaT``.
    """
    raw = pd.read_csv(plt_file, skiprows=6, header=None)

    # Date and time live in separate text columns; join them first so one
    # format string covers the whole timestamp.
    stamps = pd.to_datetime(
        raw[5] + ' ' + raw[6],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )

    # Assemble the result straight from the columns that are kept, already
    # under their final names.
    return pd.DataFrame({
        'lat': raw[0],
        'lon': raw[1],
        'alt': raw[3],
        'time': stamps,
    })

# Transport modes that can appear in a labels file. The position in this list
# determines the integer code assigned to each mode.
mode_names = [
    'walk', 'bike', 'bus', 'car', 'subway', 'train',
    'airplane', 'boat', 'run', 'motorcycle', 'taxi',
]
# Codes start at 1, so 0 never collides with a real mode.
mode_ids = dict(zip(mode_names, range(1, len(mode_names) + 1)))

def read_labels(labels_file):
    """Load a labels file into start/end timestamps plus an integer mode code.

    The first line of the file is skipped and the rest is split on runs of
    whitespace, giving five fields per row: start date, start time, end date,
    end time and mode name.

    Parameters
    ----------
    labels_file : path or buffer accepted by ``pd.read_csv``

    Returns
    -------
    pd.DataFrame
        Columns ``start_time``, ``end_time`` and ``label``. Timestamps that do
        not match ``%Y/%m/%d %H:%M:%S`` become ``NaT``. ``label`` is looked up
        in ``mode_ids``; names missing from that mapping are encoded as 0.
    """
    raw = pd.read_csv(labels_file, skiprows=1, header=None, sep=r'\s+')

    def _stamp(date_col, time_col):
        # Join the date and time fields, then parse them as one timestamp.
        return pd.to_datetime(
            raw[date_col] + ' ' + raw[time_col],
            format='%Y/%m/%d %H:%M:%S',
            errors='coerce',
        )

    return pd.DataFrame({
        'start_time': _stamp(0, 1),
        'end_time': _stamp(2, 3),
        # integer encoding of the mode name
        'label': [mode_ids.get(mode, 0) for mode in raw[4]],
    })

def apply_labels(points, labels):
    """Tag every point with the label of the time interval it falls in.

    Writes an integer ``label`` column onto ``points`` in place (an existing
    ``label`` column is overwritten). A point receives a row's ``label`` when
    ``start_time <= time <= end_time``, both ends inclusive. Points matched
    by no interval get 0. When intervals overlap, the row that comes later in
    ``labels`` wins because it is written last.

    Parameters
    ----------
    points : pd.DataFrame
        Frame whose ``time`` column is compared against the intervals.
        Modified in place.
    labels : pd.DataFrame
        Frame with ``start_time``, ``end_time`` and ``label`` columns.

    Returns
    -------
    None
    """
    point_labels = np.zeros(len(points), dtype=int)

    # itertuples yields lightweight row tuples; only the three fields below
    # are read, so any extra columns in `labels` are ignored.
    for row in labels.itertuples(index=False):
        in_interval = (points['time'] >= row.start_time) & (points['time'] <= row.end_time)
        point_labels[in_interval.to_numpy()] = row.label

    points['label'] = point_labels

def read_user(user_folder):
    """Load all trajectory points of one user, labelled where possible.

    Reads every ``*.plt`` file in ``<user_folder>/Trajectory`` with
    ``read_plt``, concatenates the results and orders them by ``time``. If
    ``<user_folder>/labels.txt`` exists, its intervals are applied with
    ``apply_labels``; otherwise every point gets label 0.

    Parameters
    ----------
    user_folder : str
        Path of a single user's folder.

    Returns
    -------
    pd.DataFrame
        Columns ``lat``, ``lon``, ``alt``, ``time``, ``label`` with a fresh
        0..n-1 index. An empty frame with those columns is returned when no
        ``.plt`` file is found.
    """
    # glob returns files in whatever order the filesystem lists them; sorting
    # makes the concatenation order the same on every run.
    plt_files = sorted(glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt')))

    if not plt_files:
        return pd.DataFrame(columns=['lat', 'lon', 'alt', 'time', 'label'])

    df = pd.concat([read_plt(f) for f in plt_files], ignore_index=True)

    # Order by time for labelling. A stable sort keeps rows that share a
    # timestamp in file order, so the result is reproducible.
    df = df.sort_values('time', kind='mergesort').reset_index(drop=True)

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        apply_labels(df, read_labels(labels_file))
    else:
        df['label'] = 0

    return df

def read_all_users(folder):
    """Load every user folder under ``folder`` into one DataFrame.

    Only sub-directories whose name consists of digits are treated as users.
    Each is read with ``read_user`` and gets a ``user`` column holding the
    folder name converted to ``int``. A user whose processing raises is
    reported on stdout and skipped, so one bad folder does not abort the run.

    Parameters
    ----------
    folder : str
        Directory that contains the per-user sub-folders.

    Returns
    -------
    pd.DataFrame
        All users concatenated with a fresh index. When nothing could be
        loaded, an empty frame with the columns ``lat``, ``lon``, ``alt``,
        ``time``, ``label``, ``user`` is returned, so column access still
        works downstream.

    Raises
    ------
    FileNotFoundError
        If ``folder`` does not exist.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder {folder} does not exist")

    # os.listdir gives entries in arbitrary order; sort so users are always
    # processed (and concatenated) in the same sequence.
    subfolders = sorted(
        sf for sf in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, sf)) and sf.isdigit()
    )

    dfs = []
    for i, sf in enumerate(subfolders, start=1):
        print(f'[{i}/{len(subfolders)}] processing user {sf}')
        try:
            df = read_user(os.path.join(folder, sf))
            df['user'] = int(sf)
            dfs.append(df)
        except Exception as e:
            # Deliberate best effort: report the failure and move on.
            print(f"Error processing user {sf}: {e}")

    if dfs:
        return pd.concat(dfs, ignore_index=True)

    # Same schema as a populated result, just without rows.
    return pd.DataFrame(columns=['lat', 'lon', 'alt', 'time', 'label', 'user'])

In [2]:
# Read every digit-named user folder under Data/Geolife into one DataFrame;
# progress is printed per user.
df = read_all_users('Data/Geolife')

[1/11] processing user 000
[2/11] processing user 001
[3/11] processing user 002
[4/11] processing user 003
[5/11] processing user 004
[6/11] processing user 005
[7/11] processing user 006
[8/11] processing user 007
[9/11] processing user 008
[10/11] processing user 009
[11/11] processing user 010


In [3]:
# Cache the combined frame on disk (presumably so later sessions can skip the
# folder scan above; confirm before relying on it).
df.to_pickle('geolife.pkl')  # saves df to 'geolife.pkl'

In [4]:
 # Reload the cached frame. Unpickling runs code embedded in the file, so only
 # load pickles you trust; this one is the file written by df.to_pickle above.
 df = pd.read_pickle('geolife.pkl')  # reads 'geolife.pkl' into df

In [5]:
# Display the combined frame; the rendered output abbreviates the middle rows.
df

Unnamed: 0,lat,lon,alt,time,label,user
0,39.984702,116.318417,492,2008-10-23 02:53:04,0,0
1,39.984683,116.318450,492,2008-10-23 02:53:10,0,0
2,39.984686,116.318417,492,2008-10-23 02:53:15,0,0
3,39.984688,116.318385,492,2008-10-23 02:53:20,0,0
4,39.984655,116.318263,492,2008-10-23 02:53:25,0,0
...,...,...,...,...,...,...
2781507,39.136261,117.218261,-59,2009-03-21 05:34:49,0,10
2781508,39.136256,117.218276,-59,2009-03-21 05:34:50,0,10
2781509,39.136256,117.218291,-59,2009-03-21 05:34:51,0,10
2781510,39.136256,117.218303,-59,2009-03-21 05:34:52,0,10
