Skip to content

Commit

Permalink
FIX: changed DBSCAN minimum to 1 (#397)
Browse files Browse the repository at this point in the history
  • Loading branch information
henrymartin1 committed May 18, 2022
1 parent f039904 commit 629c3de
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 15 deletions.
2 changes: 1 addition & 1 deletion tests/analysis/test_tracking_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def test_tracking_quality_wrong_datamodel(self):
sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
_, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)
with pytest.raises(KeyError):
ti.analysis.tracking_quality.temporal_tracking_quality(locs)
Expand Down
2 changes: 1 addition & 1 deletion tests/model/test_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def testdata_locs():
sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
sp, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)
return locs

Expand Down
2 changes: 1 addition & 1 deletion tests/preprocessing/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def locs_from_geolife():

# cluster staypoints to locations
_, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)

# the projection needs to be defined: WGS84
Expand Down
24 changes: 12 additions & 12 deletions tests/preprocessing/test_staypoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def test_dbscan_hav_euc(self):

# haversine calculation
_, loc_har = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=100, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=100, num_samples=1, distance_metric="haversine", agg_level="dataset"
)
# WGS_1984
sp.crs = "epsg:4326"
Expand All @@ -170,7 +170,7 @@ def test_dbscan_hav_euc(self):

# euclidean calculation
_, loc_eu = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=100, num_samples=0, distance_metric="euclidean", agg_level="dataset"
method="dbscan", epsilon=100, num_samples=1, distance_metric="euclidean", agg_level="dataset"
)

assert len(loc_har) == len(loc_eu)
Expand All @@ -182,12 +182,12 @@ def test_dbscan_haversine(self):

# haversine calculation using sklearn.metrics.pairwise_distances
sp, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)

# calculate the pairwise haversine distance matrix and feed it to DBSCAN
sp_distance_matrix = calculate_distance_matrix(sp, dist_metric="haversine")
db = DBSCAN(eps=10, min_samples=0, metric="precomputed")
db = DBSCAN(eps=10, min_samples=1, metric="precomputed")
labels = db.fit_predict(sp_distance_matrix)

assert len(set(locs.index)) == len(set(labels))
Expand All @@ -197,7 +197,7 @@ def test_dbscan_loc(self):
sp_file = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
sp, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)

# create locations as grouped staypoints, another way to create locations
Expand Down Expand Up @@ -235,10 +235,10 @@ def test_dbscan_user_dataset(self):
# duplicate for a certain number
sp = pd.concat([sp] * 6, ignore_index=True)
_, locs_ds = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)
_, locs_us = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="user"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="user"
)
loc_dataset_num = len(locs_ds.index.unique())
loc_user_num = len(locs_us.index.unique())
Expand All @@ -253,10 +253,10 @@ def test_dbscan_min(self):
method="sliding", gap_threshold=1e6, dist_threshold=0, time_threshold=0
)
_, locs_user = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=1e-18, num_samples=0, agg_level="user"
method="dbscan", epsilon=1e-18, num_samples=1, agg_level="user"
)
_, locs_data = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=1e-18, num_samples=0, agg_level="dataset"
method="dbscan", epsilon=1e-18, num_samples=1, agg_level="dataset"
)
# With small hyperparameters, clustering should not reduce the number
assert len(locs_user) == len(sp)
Expand Down Expand Up @@ -321,15 +321,15 @@ def test_dtype_consistent(self):
sp = ti.read_staypoints_csv(sp_file, tz="utc", index_col="id")
#
sp, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)
assert sp["user_id"].dtype == locs["user_id"].dtype
assert sp["location_id"].dtype == "Int64"
assert locs.index.dtype == "int64"
# change the user_id to string
sp["user_id"] = sp["user_id"].apply(lambda x: str(x))
sp, locs = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric="haversine", agg_level="dataset"
method="dbscan", epsilon=10, num_samples=1, distance_metric="haversine", agg_level="dataset"
)
assert sp["user_id"].dtype == locs["user_id"].dtype
assert sp["location_id"].dtype == "Int64"
Expand All @@ -345,7 +345,7 @@ def test_index_start(self):
for distance_metric in distance_metric_ls:
for agg_level in agg_level_ls:
_, locations = sp.as_staypoints.generate_locations(
method="dbscan", epsilon=10, num_samples=0, distance_metric=distance_metric, agg_level=agg_level
method="dbscan", epsilon=10, num_samples=1, distance_metric=distance_metric, agg_level=agg_level
)
assert (locations.index == np.arange(len(locations))).any()

Expand Down

0 comments on commit 629c3de

Please sign in to comment.