# Read Dataset

In [1]:
import ast

import geopandas as gpd
import numpy as np
import polars as pl

# Load the Porto sample CSV, drop the rows where MISSING_DATA is True, sort by
# TIMESTAMP, keep the first 1000 POLYLINE values and parse each text value into
# an array of 2-element float pairs.
# The POLYLINE text is parsed with ast.literal_eval instead of eval: it accepts
# the same literal syntax but never executes code found in the CSV.
original_trajs = (
    pl.read_csv("./resource/dataset/Porto/porto_sample.csv")
    .filter(pl.col("MISSING_DATA") == False)
    .sort("TIMESTAMP")["POLYLINE"]
    .limit(1000)
    .map_elements(
        lambda x: np.array(ast.literal_eval(x)),
        return_dtype=pl.List(pl.Array(pl.Float64, 2)),
    )
)
original_trajs.head(1)

POLYLINE
"list[array[f64, 2]]"
"[[-8.610291, 41.140746], [-8.6103, 41.140755], … [-8.60589, 41.145345]]"


In [3]:
import polars as pl
import numpy as np
from fedtraj.utils.trajclus import traclus_partition
from tqdm import tqdm


def cut_trajectorys_into_segments(original_trajs):
    """Split every trajectory into segments between consecutive split points.

    For each trajectory, the second value returned by ``traclus_partition`` is
    used as a mask; its true (non-zero) positions are the split points. Every
    pair of consecutive split points ``(start, end)`` yields the segment
    ``traj[start:end + 1]``, so neighbouring segments share their boundary
    point.

    Behaviour worth knowing:

    * A trajectory with no split point is kept whole, unless it is empty.
    * A trajectory with exactly one split point produces no segment at all.
    * Points before the first split point and after the last one are not part
      of any segment.

    :param original_trajs: iterable of trajectories; each one must support
        ``len()`` and slicing and be accepted by ``traclus_partition``.
    :return: the ``POLYLINE`` column of a new Polars DataFrame, one segment
        per row.
    """
    segments = []
    for traj in tqdm(original_trajs):
        # Only the split-point mask is needed; the first return value is ignored.
        _, split_points = traclus_partition(traj)
        split_indices = np.where(split_points)[0]

        if len(split_indices) == 0:
            # Nothing to cut: keep the trajectory as is, skipping empty ones.
            if len(traj) > 0:
                segments.append(traj)
            continue

        # np.where returns strictly increasing indices, so every slice below
        # spans at least two positions and needs no extra length check.
        # NOTE(review): a lone split point yields no pair here, so that
        # trajectory is dropped entirely - confirm this is intended.
        for start, end in zip(split_indices[:-1], split_indices[1:]):
            segments.append(traj[start: end + 1])

    # Build the column through a DataFrame so Polars infers the nested dtype.
    return pl.DataFrame({"POLYLINE": segments})["POLYLINE"]


# Segment every loaded trajectory; the trailing expression displays how many
# segments were produced.
cut_trajs = cut_trajectorys_into_segments(original_trajs)
len(cut_trajs)

100%|██████████| 1000/1000 [00:02<00:00, 496.99it/s]


15686

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import traj_dist.distance as tdist
from tqdm import tqdm

def compute_lcss_distance_matrix(trajectories, **lcss_kwargs):
    """Compute the symmetric pairwise LCSS distance matrix of the trajectories.

    :param trajectories: indexable sequence of N trajectories, each of which
        ``np.array`` can convert into an (L, 2) array of 2-D points.
    :param lcss_kwargs: optional keyword arguments forwarded unchanged to
        ``tdist.lcss``. When omitted, the call is identical to
        ``tdist.lcss(a, b)`` and the library defaults apply.
    :return: ``(N, N)`` float array with a zero diagonal, where entry
        ``[i, j]`` equals entry ``[j, i]``.
    """
    N = len(trajectories)
    # Convert each trajectory once up front instead of once per pair inside
    # the O(N^2) loop.
    arrays = [np.array(trajectories[i]) for i in range(N)]
    dist_matrix = np.zeros((N, N))
    # Number of unordered pairs (i < j); used as the progress-bar total.
    total_iterations = N * (N - 1) // 2
    # NOTE(review): traj_dist's lcss presumably defaults to a Euclidean point
    # distance with a large matching threshold (eps) - confirm against the
    # library; for longitude/latitude input in degrees, pass a suitable
    # eps/type_d through lcss_kwargs.
    # The context manager closes the bar even if the loop is interrupted.
    with tqdm(total=total_iterations, desc="计算 LCSS 距离矩阵") as progress_bar:
        for i in range(N):
            for j in range(i + 1, N):
                distance = tdist.lcss(arrays[i], arrays[j], **lcss_kwargs)
                # The matrix is symmetric, so fill both triangles at once.
                dist_matrix[i, j] = distance
                dist_matrix[j, i] = distance
                progress_bar.update(1)
    return dist_matrix

def cluster_trajectories(trajectories, eps=0.5, min_samples=5):
    """Cluster trajectories with DBSCAN over their pairwise LCSS distances.

    :param trajectories: sequence of N trajectories, handed unchanged to
        ``compute_lcss_distance_matrix``.
    :param eps: DBSCAN neighbourhood radius.
    :param min_samples: DBSCAN ``min_samples`` value.
    :return: ``(labels, silhouette_coef)``: the labels returned by
        ``fit_predict`` and the silhouette score computed from the same
        precomputed distances, or ``0`` when ``silhouette_score`` raises
        ``ValueError``.
    """
    pairwise = compute_lcss_distance_matrix(trajectories)

    # The matrix already holds distances, hence metric='precomputed' both for
    # the clustering and for the score.
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    labels = clusterer.fit_predict(pairwise)

    # The labels are scored exactly as DBSCAN produced them.
    try:
        score = silhouette_score(pairwise, labels, metric='precomputed')
    except ValueError:
        # Raised e.g. when there is a single cluster or every sample is
        # noise; the score is reported as 0 in that case.
        return labels, 0
    return labels, score


# Cluster the segments and report the labels and the silhouette coefficient.
# In the recorded run this call evaluated 123,017,455 trajectory pairs and was
# only 10% done after 06:40, so expect it to run for well over an hour.
labels, silhouette_coef = cluster_trajectories(cut_trajs)
print("聚类标签:", labels)
print("凝聚系数:", silhouette_coef)

计算 LCSS 距离矩阵:   0%|          | 0/123017455 [00:00<?, ?it/s]

计算 LCSS 距离矩阵:  10%|▉         | 11898028/123017455 [06:40<1:20:25, 23025.78it/s]