In [None]:
import pandas as pd

In [None]:
# Sample row of the raw file (user, timestamp, latitude, longitude, location_id):
# 0       2010-10-19T23:55:27Z    30.2359091167   -97.7951395833  22847

# Read the raw Gowalla check-ins and derive the per-check-in calendar date.
# The file has no header row, so the column names are supplied here;
# sep=None lets the python engine detect the delimiter.
df = (
    pd.read_csv(
        "../gowalla/loc-gowalla_totalCheckins.txt",
        sep=None,
        engine="python",
        header=None,
        names=["user", "timestamp", "latitude", "longitude", "location_id"],
    )
    # Parse the timestamp strings into datetimes ...
    .assign(timestamp=lambda checkins: pd.to_datetime(checkins["timestamp"]))
    # ... then keep only the date part in a separate column.
    .assign(date=lambda checkins: checkins["timestamp"].dt.date)
)

In [None]:
# Build the user-day presence matrix: one row per user, one column per date
# that occurs in the data, True where the user has at least one check-in.
user_day_matrix = (
    df.groupby(["user", "date"])
    .size()                 # number of check-ins per (user, date)
    .unstack(fill_value=0)  # dates become columns; missing pairs count as 0
    .gt(0)                  # counts -> True/False
)

In [None]:
def find_valid_streaks(user_series, m):
    """Find all non-overlapping streaks of exactly ``m`` consecutive present days.

    The entries are scanned in iteration order. A running streak grows with
    every truthy entry and is discarded when an entry is falsy. For date-like
    labels (anything exposing ``toordinal()``, e.g. ``datetime.date``) the
    running streak is also discarded when a label is not exactly one calendar
    day after the previous present label, so that days missing from the index
    altogether cannot be bridged. Labels without ``toordinal()`` are treated
    as consecutive whenever they are adjacent in the series.

    Args:
        user_series: Object exposing ``.items()`` that yields
            ``(day, present)`` pairs in chronological order, such as one row
            of a user-day presence matrix.
        m: Number of consecutive days that make up one streak.

    Returns:
        A list of streaks; each streak is a list of the ``m`` day labels it
        covers. No day label appears in more than one streak.
    """
    streaks = []
    current_streak = []
    previous_ordinal = None  # ordinal of the last present day, if date-like

    for day, present in user_series.items():
        if not present:
            current_streak = []  # a day without presence breaks the streak
            continue

        to_ordinal = getattr(day, "toordinal", None)
        ordinal = to_ordinal() if callable(to_ordinal) else None

        # Adjacent entries are not necessarily adjacent calendar days: the
        # index may simply lack some dates. Restart the streak in that case.
        if (
            current_streak
            and ordinal is not None
            and previous_ordinal is not None
            and ordinal - previous_ordinal != 1
        ):
            current_streak = []
        previous_ordinal = ordinal

        current_streak.append(day)
        if len(current_streak) == m:
            streaks.append(current_streak)
            current_streak = []  # start afresh so streaks never overlap

    return streaks

In [None]:
# Number of days a streak must span
m = 3

# Scan the presence matrix row by row: each user maps to the list of m-day streaks found
valid_streaks = user_day_matrix.apply(find_valid_streaks, axis=1, args=(m,))

# Users for whom at least one streak was found
valid_users = valid_streaks.loc[valid_streaks.map(len).gt(0)].index

In [None]:
# Total number of streaks found across all users that have at least one
valid_streaks.loc[valid_users].map(len).sum()

In [None]:
# Step 1: Create a lookup table with one row per streak.
# traj_id is a single running counter across all users, so every id
# identifies exactly one streak.
streak_lookup = []
traj_id = 0
for user, streaks in valid_streaks.items():
    for streak in streaks:
        streak_lookup.append({"user": user, "traj_id": traj_id, "streak_dates": streak})
        traj_id += 1

# Name the columns explicitly: if no streak was found the lookup list is empty,
# and a frame without a "streak_dates" column would make explode() below raise
# a KeyError instead of producing an empty result.
streak_df = pd.DataFrame(streak_lookup, columns=["user", "traj_id", "streak_dates"])

# Step 2: Merge the streak lookup with the original dataframe for efficient filtering
# Note: these two assignments modify df in place; "user" is a string column afterwards.
df["date"] = pd.to_datetime(df["timestamp"]).dt.date  # Ensure date format
df["user"] = df["user"].astype(str)  # Ensure user column is string type for merging

# Explode streak dates so that every (user, traj_id, day) gets its own row
exploded_streaks = streak_df.explode("streak_dates")
exploded_streaks["user"] = exploded_streaks["user"].astype(str)  # Ensure user column is string type for merging

# Keep only the check-ins whose (user, date) falls inside one of the streaks
merged_df = pd.merge(df, exploded_streaks, left_on=["user", "date"], right_on=["user", "streak_dates"], how="inner")

# Step 3: Drop duplicates (if any)
# NOTE(review): this keeps only the first row per (user, date), so a day with
# several check-ins contributes a single point to the trajectory and the other
# check-ins of that day are discarded - confirm this is the intended behaviour.
merged_df = merged_df.drop_duplicates(subset=["user", "date"])
merged_df = merged_df.drop(columns=["streak_dates"])

# This results in the final merged trajectories
merged_df.to_csv(f"../gowalla/merged_trajectories_length_{m}.csv", index=False)

In [None]:
import folium as fm

# Plot a single trajectory on a map
def plot_trajectory(trajectory, map):
    """Draw the points of a trajectory, joined by a line, onto a folium map.

    Args:
        trajectory: DataFrame with "latitude" and "longitude" columns. The
            rows are drawn in their existing order, so sort beforehand if the
            line should follow time.
        map: Map object that the markers and the line are added to. The name
            shadows the builtin ``map`` inside this function only.

    Returns:
        The same map object that was passed in.
    """
    coordinates = trajectory[["latitude", "longitude"]].values

    # One marker per point of the trajectory
    for latitude, longitude in coordinates:
        fm.Marker([latitude, longitude]).add_to(map)

    # A single line connecting the points in row order
    fm.PolyLine(coordinates, color="blue", weight=5, opacity=0.7).add_to(map)
    return map

# Select one trajectory and order its points by time. The selection is empty
# unless the chosen user actually owns the chosen traj_id.
traj = merged_df[(merged_df["user"] == "4") & (merged_df["traj_id"] == 0)].sort_values("timestamp")
if traj.empty:
    # Without this check the map centre below would be computed from NaN means.
    raise ValueError('No rows in merged_df for user "4" with traj_id 0; choose a (user, traj_id) pair that exists.')
print(traj)

# Centre the map on the mean position of the trajectory. The map is kept under
# its own name so that the builtin map() stays usable in the rest of the notebook.
trajectory_map = fm.Map(location=[traj["latitude"].mean(), traj["longitude"].mean()], zoom_start=10)
plot_trajectory(traj, trajectory_map)
trajectory_map