In [65]:
#imports
import json
import os
import time
from datetime import datetime
from math import radians, sin, cos, sqrt, atan2

import folium
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN


In [76]:
# Data Loading and Preprocessing
def _dig(node, *keys):
    """Follow `keys` through nested dicts; return None as soon as a level is missing or not a dict."""
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node


def load_timeline_data(filename):
    """
    Load the JSON timeline data from Google Takeout and return a DataFrame.

    Each usable record is parsed to extract:
      - startTime and endTime (converted to datetime objects)
      - duration_min (in minutes)
      - latitude and longitude (parsed from the "geo:lat,lon" coordinate string)
      - record_type: either 'visit' or 'activity'

    Records that are neither an activity nor a visit, or whose coordinate or
    timestamps are missing or unparsable, are skipped (best-effort loading).

    Parameters
    ----------
    filename : str or path-like
        Path to the JSON file. Its top-level value is iterated as a sequence
        of entries.

    Returns
    -------
    pandas.DataFrame
        One row per usable record. The six columns are always present, even
        when no record could be parsed, so downstream column access keeps working.
    """
    columns = ["startTime", "endTime", "duration_min", "latitude", "longitude", "record_type"]

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    for entry in data:
        if not isinstance(entry, dict):
            continue

        # An 'activity' record takes precedence over a 'visit' record.
        if "activity" in entry:
            # For activity records, use the "start" coordinate.
            record_type = "activity"
            coord_str = _dig(entry["activity"], "start")
        elif "visit" in entry:
            # For visit records, use the coordinate from "topCandidate" -> "placeLocation".
            record_type = "visit"
            coord_str = _dig(entry["visit"], "topCandidate", "placeLocation")
        else:
            continue

        # The coordinate string is in the format "geo:lat,lon". Remove the "geo:" prefix.
        try:
            lat_str, lon_str = coord_str.replace("geo:", "").split(",")
            lat = float(lat_str)
            lon = float(lon_str)
        except (AttributeError, TypeError, ValueError):
            # Missing, non-string, or malformed coordinate.
            continue

        # Convert startTime and endTime from string to datetime objects.
        try:
            start_time = datetime.fromisoformat(entry["startTime"])
            end_time = datetime.fromisoformat(entry["endTime"])
        except (KeyError, TypeError, ValueError):
            # Missing or malformed timestamp.
            continue

        records.append({
            "startTime": start_time,
            "endTime": end_time,
            # Duration in minutes.
            "duration_min": (end_time - start_time).total_seconds() / 60.0,
            "latitude": lat,
            "longitude": lon,
            "record_type": record_type,
        })

    return pd.DataFrame(records, columns=columns)


#Determine Earliest and Latest Dates in the Data
def print_date_range(df):
    """
    Report the time span covered by the data.

    Prints the smallest value of the "startTime" column and the largest value
    of the "endTime" column, then returns them as an (earliest, latest) tuple.
    """
    first_start = df["startTime"].min()
    last_end = df["endTime"].max()

    for caption, moment in (
        ("Earliest date in data:", first_start),
        ("Latest date in data:", last_end),
    ):
        print(caption, moment)

    return first_start, last_end


# Clustering Significant Locations using DBSCAN
def cluster_locations(df, eps_meters=100, min_samples=3):
    """
    Use DBSCAN to cluster the GPS coordinates in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude' and 'longitude' columns in degrees.
    eps_meters : float
        Neighbourhood radius in meters. It is converted to radians on a sphere
        of radius 6,371,000 m because the haversine metric works on angles.
    min_samples : int
        Passed straight through to DBSCAN.

    Returns
    -------
    tuple
        (clustered_df, db, eps_meters, min_samples), where `clustered_df` is a
        new DataFrame equal to `df` plus a 'cluster' column holding the DBSCAN
        labels (-1 marks noise) and `db` is the fitted DBSCAN estimator.
        The input frame is not modified, so filtered slices can be passed
        safely without triggering pandas' SettingWithCopyWarning.

    Raises
    ------
    ValueError
        If `df` has no rows.
    """
    if df.empty:
        raise ValueError("cluster_locations() needs at least one row to cluster")

    # Create array of coordinates; column order is (lat, lon), in radians.
    coords = df[['latitude', 'longitude']].to_numpy()
    coords_rad = np.radians(coords)

    # Earth's radius in meters.
    earth_radius = 6371000.0
    eps = eps_meters / earth_radius

    # Run DBSCAN clustering using the haversine metric.
    db = DBSCAN(eps=eps, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(coords_rad)

    # assign() returns a new frame instead of writing into the caller's object.
    clustered = df.assign(cluster=db.labels_)
    return clustered, db, eps_meters, min_samples

# decided to manually implement colors so that noise could be gray...
def get_cluster_colors(df):
    """
    Build the cluster-label -> color-name lookup used by the plots.

    Labels 0-9 get a fixed, hand-picked color each and the noise label (-1)
    is gray. Any other label found in df['cluster'] falls back to black.
    """
    named = ["red", "blue", "green", "purple", "orange",
             "brown", "pink", "black", "teal", "yellow"]

    # Labels 0..9 map onto the hand-picked colors in order; noise is added last.
    palette = dict(enumerate(named))
    palette[-1] = "gray"

    # Labels beyond the fixed palette all share the fallback color.
    for label in df['cluster'].unique():
        palette.setdefault(label, "black")  # fallback: black

    return palette


# Visualization of Clusters
def plot_clusters(df, eps_meters=None, min_samples=None, cluster_colors=None):
    """
    Scatter-plot the clustered points, one color and legend entry per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude', 'longitude' and 'cluster' columns.
    eps_meters, min_samples : optional
        When both are given, they are shown in an annotation box in the
        lower-left corner of the axes.
    cluster_colors : dict, optional
        Mapping of cluster label -> matplotlib color. Labels missing from the
        mapping are drawn in gray. When omitted, the mapping is built with
        get_cluster_colors(df).
    """
    # The default used to crash on `.get`; build the standard palette instead.
    if cluster_colors is None:
        cluster_colors = get_cluster_colors(df)

    fig, ax = plt.subplots(figsize=(10, 6))
    for cluster in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster]
        color = cluster_colors.get(cluster, 'gray')
        # DBSCAN marks points that belong to no cluster with -1.
        label = "Noise" if cluster == -1 else f"Cluster {cluster}"
        ax.scatter(cluster_data['longitude'], cluster_data['latitude'],
                   c=color, alpha=0.6, label=label)

    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.set_title("DBSCAN Clusters of Significant Locations")
    ax.legend()
    if eps_meters is not None and min_samples is not None:
        # Axes-relative coordinates keep the box in the corner at any zoom level.
        ax.text(0.01, 0.01, f"eps = {eps_meters} meters\nmin_samples = {min_samples}",
                transform=ax.transAxes, fontsize=10,
                bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5'))
    ax.grid(True)
    plt.show()
  


# Label Clusters Using Google Places API
# Place types too generic to say what kind of location a cluster is.
_GENERIC_PLACE_TYPES = ("locality", "political", "point_of_interest", "establishment", "premise")


def _infer_label_from_places(places):
    """Return the first non-generic type among `places` (scanned in order), or "Unknown"."""
    for place in places:
        # Only the first non-generic type of each place is considered.
        specific = next(
            (t for t in place.get("types", ["Unknown"]) if t not in _GENERIC_PLACE_TYPES),
            None,
        )
        if specific is not None and specific != "Unknown":
            return specific
    return "Unknown"


def label_clusters(df, api_key, radius=200, sleep_time=1.0, timeout=10.0):
    """
    Assign a label to each cluster based on the Google Places API.

    - Only queries once per non-noise cluster (at the centroid, i.e. the mean
      latitude/longitude of the cluster's rows).
    - The noise cluster (-1) is labeled "Noise" without any request.
    - A cluster whose lookup fails is labeled "Error"; one with no usable
      result is labeled "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude', 'longitude' and 'cluster' columns.
    api_key : str
        Google Places API key, sent as the `key` query parameter.
    radius : int
        Search radius sent as the `radius` query parameter.
    sleep_time : float
        Seconds to pause after each queried cluster.
    timeout : float
        Seconds to wait for the HTTP response, so that a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    tuple
        (labeled_df, cluster_labels, place_results):
        - labeled_df: a new DataFrame equal to `df` plus a 'cluster_label'
          column (the input frame is not modified);
        - cluster_labels: dict of cluster_id -> label;
        - place_results: dict of cluster_id -> list of up to five place
          records (keys: cluster_id, rank, place_name, types, rating,
          user_ratings_total). The value is always a list; it is empty when
          the lookup failed or returned nothing.
    """
    cluster_labels = {}
    place_results = {}
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    print("Querying Google Places API for cluster labels...")

    for cluster_id in sorted(df['cluster'].unique()):
        if cluster_id == -1:
            cluster_labels[cluster_id] = "Noise"
            continue

        cluster_df = df[df['cluster'] == cluster_id]
        centroid_lat = cluster_df['latitude'].mean()
        centroid_lon = cluster_df['longitude'].mean()
        params = {
            "location": f"{centroid_lat},{centroid_lon}",
            "radius": radius,
            "key": api_key,
        }

        label = "Unknown"
        top_places = []
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            if data.get("status") == "OK" and data.get("results"):
                print(f"\nCluster {cluster_id}: Top 5 Google Places Results")
                # Keep the top 5: the first hit is often just the city name.
                candidates = data["results"][:5]
                for rank, place in enumerate(candidates, start=1):
                    name = place.get("name", "Unnamed")
                    types = place.get("types", ["Unknown"])
                    print(f"  {rank}. {name} → {types}")
                    top_places.append({
                        "cluster_id": cluster_id,
                        "rank": rank,
                        "place_name": name,
                        "types": types,
                        "rating": place.get("rating", None),
                        "user_ratings_total": place.get("user_ratings_total", None),
                    })

                label = _infer_label_from_places(candidates)
                if label == "Unknown":
                    print("No specific type found - defaulting to unknown!")
            else:
                print(f"Cluster {cluster_id}: No results (status: {data.get('status', 'N/A')})")

        except Exception as e:
            # Deliberately broad: one failed lookup must not abort the remaining clusters.
            label = "Error"
            top_places = []  # keep the value a list so consumers can always iterate it
            print(f"Cluster {cluster_id}: API error - {e}")

        cluster_labels[cluster_id] = label
        place_results[cluster_id] = top_places
        time.sleep(sleep_time)

    # Attach the labels on a new frame rather than writing into the caller's object.
    labeled_df = df.assign(cluster_label=df['cluster'].map(cluster_labels))
    return labeled_df, cluster_labels, place_results


def plot_labeled_clusters(df):
    """
    Scatter-plot all records, one series (and legend entry) per distinct
    value of the 'cluster_label' column. Colors come from matplotlib's
    default cycle.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    labels = df['cluster_label']
    for name in labels.unique():
        members = df[labels == name]
        ax.scatter(members['longitude'], members['latitude'], label=name, alpha=0.6)

    ax.set_title("Clusters with Inferred Location Types")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.legend()
    ax.grid(True)
    plt.show()

# Zoom in to regions to see more clusters there
def plot_zoomed_regions(df, regions_dict, cluster_colors):
    """
    Draw one cluster scatter plot per named bounding box.

    `regions_dict` maps a region name to a dict with 'lat_min', 'lat_max',
    'lon_min' and 'lon_max' (bounds are inclusive). Regions containing no
    rows are reported with a message and skipped. `cluster_colors` maps a
    cluster label to a color; labels not in the mapping are drawn in gray.
    """
    for region_name, bounds in regions_dict.items():
        # between() is inclusive on both ends by default.
        inside_lat = df['latitude'].between(bounds['lat_min'], bounds['lat_max'])
        inside_lon = df['longitude'].between(bounds['lon_min'], bounds['lon_max'])
        zoom_df = df[inside_lat & inside_lon]

        if zoom_df.empty:
            print(f"No data in {region_name}")
            continue

        _, ax = plt.subplots(figsize=(8, 6))

        # One scatter series per cluster present inside the box.
        for cluster_id in zoom_df['cluster'].unique():
            members = zoom_df[zoom_df['cluster'] == cluster_id]
            if cluster_id == -1:
                legend_name = "Noise"
            else:
                legend_name = f"Cluster {cluster_id}"
            ax.scatter(members['longitude'], members['latitude'],
                       c=cluster_colors.get(cluster_id, 'gray'),
                       alpha=0.6, label=legend_name)

        ax.set_title(f"Zoomed View: {region_name}")
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        ax.legend()
        ax.grid(True)
        plt.show()


def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Great-circle distance in meters between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6,371,000 m.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance along the sphere's surface, in meters.
    """
    R = 6371000  # Earth radius in meters
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise ValueError; clamp it back into range.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

In [None]:
# Specify the file name
filename = "location_history_newer.json"

# Load and preprocess the data.
df = load_timeline_data(filename)
print("Sample loaded data:")
print(df.head(), "\n")
print("Total number of records loaded:", len(df))
print("Total number of data points (before filtering):", df.shape[0])


# Print earliest and latest dates in the dataset.
earliest, latest = print_date_range(df)


# Filter to keep only significant records.
# Here, we assume that a record is 'significant' if the duration is over 5 minutes OR it is a visit!
significant_df = df[(df['duration_min'] > 5) | (df['record_type'] == 'visit')]
print("Significant records for clustering (sample):")
print(significant_df.head(), "\n")
print("Number of significant records (after filtering):", significant_df.shape[0])


# Run the clustering.
# NOTE(review): this clusters the full `df`, not `significant_df`, even though
# the filter above and the plot title talk about "significant" locations.
# Confirm which input is intended before relying on the results.
clustered_df, db, eps_val, min_samples_val = cluster_locations(df, eps_meters=500, min_samples=10)
print("Clustered data (sample):")
print(clustered_df.head(), "\n")
cluster_colors = get_cluster_colors(clustered_df)


# Print size of each cluster
cluster_sizes = clustered_df['cluster'].value_counts().sort_index()
print("Cluster sizes (including noise as -1):")
print(cluster_sizes, "\n")


# Visualize the clusters.
plot_clusters(clustered_df, eps_meters=eps_val, min_samples=min_samples_val, cluster_colors=cluster_colors)


# Label each cluster using the Google Places API.
# The key is read from the GOOGLE_PLACES_API_KEY environment variable so it
# never has to be saved in the notebook; "YOUR-KEY" is only a placeholder.
api_key = os.environ.get("GOOGLE_PLACES_API_KEY", "YOUR-KEY")
labeled_df, cluster_labels, place_results = label_clusters(clustered_df, api_key)
print("Assigned cluster labels:")
print(cluster_labels, "\n")

# Flatten and save place metadata
place_columns = ["cluster_id", "rank", "place_name", "place_types", "rating", "user_ratings_total"]
rows = []
for cluster_id, places in place_results.items():
    # Entries that are not a list of place records (e.g. an error payload for
    # a failed lookup) have nothing to flatten; skip them instead of crashing.
    if not isinstance(places, list):
        continue
    for p in places:
        rows.append({
            "cluster_id": p["cluster_id"],
            "rank": p["rank"],
            "place_name": p["place_name"],
            "place_types": ", ".join(p["types"]),
            "rating": p["rating"],
            "user_ratings_total": p["user_ratings_total"]
        })

# Save to CSV (explicit columns keep the header row even when there are no places)
places_df = pd.DataFrame(rows, columns=place_columns)
places_df.to_csv("cluster_top5_places.csv", index=False)
print("Saved cluster_top5_places.csv")

label_df = pd.DataFrame(
    [{"cluster_id": cid, "label": label} for cid, label in cluster_labels.items()],
    columns=["cluster_id", "label"],
)
label_df.to_csv("cluster_labels.csv", index=False)
print("Saved cluster_labels.csv")

# Manually added regions to zoom into based on my data - redacted for safety
zoom_regions = {
    "Region 1 (North)": {"lat_min": 5, "lat_max": 5, "lon_min": 5, "lon_max": 5},
    "Region 2 (West)": {"lat_min": 5, "lat_max": 5, "lon_min": 5, "lon_max": 5},
    "Region 3 (Central)": {"lat_min": 5, "lat_max": 5, "lon_min": 5, "lon_max": 5},
}

plot_zoomed_regions(clustered_df, zoom_regions, cluster_colors)

In [None]:
# Plot my clusters on folium, real-world map for visualization!
def plot_clusters_on_map(df, cluster_colors):
    """
    Draw every row of `df` as a colored circle marker on a folium map.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'latitude', 'longitude' and 'cluster' columns.
    cluster_colors : dict
        Mapping of cluster label -> color; labels missing from the mapping
        are drawn in gray.

    Returns
    -------
    folium.Map
        Map centered on the mean latitude/longitude of the data, with all
        markers grouped in a MarkerCluster layer.

    Raises
    ------
    ValueError
        If `df` has no rows (the map center would be NaN).
    """
    if df.empty:
        raise ValueError("plot_clusters_on_map() needs at least one row to place on the map")

    # Use the center of the data as the map center
    center_lat = df['latitude'].mean()
    center_lon = df['longitude'].mean()

    m = folium.Map(location=[center_lat, center_lon], zoom_start=11)

    marker_cluster = MarkerCluster().add_to(m)

    # Iterate column-wise rather than with iterrows(): iterrows() builds one
    # Series per row and, on an all-numeric frame, upcasts the integer cluster
    # id to float, which would render popups such as "Cluster 3.0".
    for lat, lon, cluster_id in zip(df['latitude'], df['longitude'], df['cluster']):
        label = f"Cluster {cluster_id}" if cluster_id != -1 else "Noise"
        color = cluster_colors.get(cluster_id, 'gray')

        folium.CircleMarker(
            location=(lat, lon),
            radius=4,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            popup=f"{label}"
        ).add_to(marker_cluster)

    return m

# Leave out DBSCAN noise (label -1) so the map only shows real clusters.
filtered_df = labeled_df.loc[labeled_df['cluster'].ne(-1)]

# Build the interactive map and keep a standalone HTML copy next to the notebook.
map_view = plot_clusters_on_map(filtered_df, cluster_colors)
map_view.save("clustered_map.html")

# Last expression of the cell: renders the map inline.
map_view

