In [1]:
import pandas as pd
import json
from datetime import datetime
import re

# Configurations

In [2]:
# Google Timeline JSON exports to ingest; each file is later given its own DeviceID.
timeline_files = ['LeHuy.json', 'DinhKhanh.json', 'TuanAnh.json']

# Parquet file that receives the processed, combined records.
OUTPUT_FILENAME = "danang_movement_google_timeline.parquet"

# Data Extraction and Processing Logic

In [3]:
def parse_lat_lon(point_str):
    """Parse a 'lat°, lon°' coordinate string into a (lat, lon) pair of floats.

    Degree symbols are optional and whitespace around the comma or the
    numbers is ignored, so '16.05°, 108.20°', '16.05°,108.20°' and
    ' 16.05 , 108.20 ' all yield the same pair.

    Args:
        point_str: Coordinate text in latitude-then-longitude order.

    Returns:
        (lat, lon) as floats, or (None, None) when point_str is not a string,
        does not hold two comma-separated numbers, or the numbers fall outside
        the valid ranges (latitude [-90, 90], longitude [-180, 180]).
    """
    try:
        # Split on the bare comma; float() itself discards surrounding whitespace,
        # so the exact spacing after the comma no longer matters.
        parts = point_str.replace('°', '').split(',')
        lat = float(parts[0])
        lon = float(parts[1])
    except (ValueError, IndexError, AttributeError, TypeError):
        # Non-string input (e.g. None), too few parts, or non-numeric text.
        return None, None

    # The range test also rejects NaN (every comparison with NaN is False)
    # and infinities, which float() would otherwise accept.
    if not (-90.0 <= lat <= 90.0 and -180.0 <= lon <= 180.0):
        return None, None
    return lat, lon

def _timeline_record(device_id, timestamp, lat, lon, description):
    """Build one output row in the key layout shared by every record type."""
    return {
        "DeviceID": device_id,
        "TimestampUTC": timestamp,
        "Latitude": lat,
        "Longitude": lon,
        "Description": description,
    }


def _dict_or_empty(value):
    """Return value when it is a dict, else {} so chained .get() calls survive null/odd JSON nodes."""
    return value if isinstance(value, dict) else {}


def process_timeline_file(filepath, device_id):
    """Process a single Google Timeline JSON file and return a list of records.

    Reads the 'semanticSegments' list of the export and emits one record per
    located, timestamped point:

    * 'visit' segments    -> a record at 'startTime' and one at 'endTime',
      both at the top candidate's place location;
    * 'activity' segments -> a record for the start point at 'startTime' and
      one for the end point at 'endTime';
    * 'timelinePath'      -> one record per path point, at the point's 'time'.

    A record is emitted only when both its coordinates and its timestamp are
    present. Malformed nodes (null or wrongly typed values) are skipped
    instead of raising, so one bad segment cannot abort the whole file.

    Args:
        filepath: Path of the JSON export to read (opened as UTF-8).
        device_id: Value stored in every record's "DeviceID" field.

    Returns:
        A list of dicts with keys "DeviceID", "TimestampUTC", "Latitude",
        "Longitude" and "Description". Returns [] (after printing a message)
        when the file is missing, unreadable, not valid JSON, or has no
        usable 'semanticSegments' list.
    """
    print(f"--- Processing file: {filepath} ---")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"ERROR: File not found at {filepath}")
        return []
    except json.JSONDecodeError:
        print(f"ERROR: Could not decode JSON from {filepath}. File might be corrupted.")
        return []
    except (OSError, UnicodeDecodeError) as exc:
        # Directory, permission problem, or a file that is not valid UTF-8.
        print(f"ERROR: Could not read {filepath}: {exc}")
        return []

    if not isinstance(data, dict) or 'semanticSegments' not in data:
        print(f"Warning: 'semanticSegments' key not found in {filepath}. Skipping.")
        return []

    segments = data['semanticSegments']
    if not isinstance(segments, list):
        print(f"Warning: 'semanticSegments' in {filepath} is not a list. Skipping.")
        return []

    processed_records = []
    for segment in segments:
        if not isinstance(segment, dict):
            continue

        start_time = segment.get('startTime')
        end_time = segment.get('endTime')

        # 'visit' segments: the place where the device stayed.
        if 'visit' in segment:
            visit_segment = _dict_or_empty(segment['visit'])
            top_candidate = _dict_or_empty(visit_segment.get('topCandidate'))
            location_data = _dict_or_empty(top_candidate.get('placeLocation'))

            # The coordinate string may be stored under 'latLng' or under 'point'.
            lat_lng_str = location_data.get('latLng') or location_data.get('point')
            if lat_lng_str:
                lat, lon = parse_lat_lon(lat_lng_str)
                if lat is not None:
                    if start_time is not None:
                        processed_records.append(_timeline_record(
                            device_id, start_time, lat, lon, "google_timeline_visit_start"))
                    # The end of the visit is recorded independently of the start,
                    # so a segment carrying only 'endTime' still contributes a point.
                    if end_time is not None:
                        processed_records.append(_timeline_record(
                            device_id, end_time, lat, lon, "google_timeline_visit_end"))

        # 'activity' segments represent travel between a start and an end point.
        if 'activity' in segment:
            activity_segment = _dict_or_empty(segment['activity'])
            endpoints = (
                ('start', start_time, "google_timeline_activity_start"),
                ('end', end_time, "google_timeline_activity_end"),
            )
            for endpoint_key, timestamp, description in endpoints:
                endpoint = _dict_or_empty(activity_segment.get(endpoint_key))
                lat_lng_str = endpoint.get('latLng')
                if lat_lng_str and timestamp is not None:
                    lat, lon = parse_lat_lon(lat_lng_str)
                    if lat is not None:
                        processed_records.append(_timeline_record(
                            device_id, timestamp, lat, lon, description))

        # 'timelinePath' contains the detailed route points.
        path_points = segment.get('timelinePath')
        if isinstance(path_points, list):
            for point_data in path_points:
                point_data = _dict_or_empty(point_data)
                lat_lng_str = point_data.get('point')
                timestamp = point_data.get('time')
                # Like the other branches, require a timestamp: a point without
                # one cannot be placed on the timeline.
                if lat_lng_str and timestamp is not None:
                    lat, lon = parse_lat_lon(lat_lng_str)
                    if lat is not None:
                        processed_records.append(_timeline_record(
                            device_id, timestamp, lat, lon, "google_timeline_path"))

    print(f"Found and processed {len(processed_records)} location points.")
    return processed_records

# Main execution

In [4]:
all_google_data = []
# Assign a unique 1-based ID per file so the users stay distinguishable
for i, file_path in enumerate(timeline_files):
    device_id = f"google_timeline_user_{i+1}"
    all_google_data.extend(process_timeline_file(file_path, device_id))

if not all_google_data:
    print("\nNo data was processed. Continuing with an empty DataFrame.")

# Always bind google_df, with the record columns named explicitly, so the
# following cells run on a fresh kernel even when no file yielded any point
# (previously google_df was undefined in that case and later cells raised NameError).
google_df = pd.DataFrame(
    all_google_data,
    columns=["DeviceID", "TimestampUTC", "Latitude", "Longitude", "Description"],
)

--- Processing file: LeHuy.json ---
Found and processed 3 location points.
--- Processing file: DinhKhanh.json ---
Found and processed 7870 location points.
--- Processing file: TuanAnh.json ---
Found and processed 6175 location points.


# Data cleaning and standardization

In [5]:
    # Column order of the target schema
    final_columns = [
        "LocationID", "DeviceID", "TimestampUTC", "Latitude", "Longitude",
        "Confidence", "Description", "StatusCode", "DBDatePublishedUTC",
        "EncryptedPayloadDB"
    ]

    google_df = (
        google_df
        # Parse timestamps as timezone-aware UTC; unparseable values become NaT
        .assign(
            TimestampUTC=lambda frame: pd.to_datetime(
                frame['TimestampUTC'], errors='coerce', utc=True
            )
        )
        # Discard rows whose timestamp could not be parsed
        .dropna(subset=['TimestampUTC'])
        # Order chronologically within each device
        .sort_values(by=['DeviceID', 'TimestampUTC'])
        # Placeholder columns so the frame matches the original movement data structure
        .assign(
            LocationID=-1,
            Confidence=100.0,  # Google data is generally high confidence
            StatusCode=0,
            DBDatePublishedUTC=None,
            EncryptedPayloadDB=None,
        )
    )

    # Arrange the columns in the target order
    google_df = google_df[final_columns]

    print("Data standardized. Final DataFrame info:")
    google_df.info()
    print("\nSample of processed Google Timeline data:")
    print(google_df.head())

Data standardized. Final DataFrame info:
<class 'pandas.core.frame.DataFrame'>
Index: 14048 entries, 1 to 14001
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   LocationID          14048 non-null  int64              
 1   DeviceID            14048 non-null  object             
 2   TimestampUTC        14048 non-null  datetime64[ns, UTC]
 3   Latitude            14048 non-null  float64            
 4   Longitude           14048 non-null  float64            
 5   Confidence          14048 non-null  float64            
 6   Description         14048 non-null  object             
 7   StatusCode          14048 non-null  int64              
 8   DBDatePublishedUTC  0 non-null      object             
 9   EncryptedPayloadDB  0 non-null      object             
dtypes: datetime64[ns, UTC](1), float64(3), int64(2), object(4)
memory usage: 1.2+ MB

Sample of processed Google Timeline 

# Save Processed Data

In [6]:
# Write the standardized frame to Parquet without the index; a failure is
# reported on stdout rather than raised.
print(f"\n--- Step 5: Saving to {OUTPUT_FILENAME} ---")
try:
    google_df.to_parquet(OUTPUT_FILENAME, index=False)
except Exception as save_error:
    print(f"Error saving data to Parquet: {save_error}")
else:
    print("Successfully saved processed Google Timeline data.")


--- Step 5: Saving to danang_movement_google_timeline.parquet ---
Successfully saved processed Google Timeline data.
