diff --git a/lux/utils/utils.py b/lux/utils/utils.py
index 3ae4503d..283ff425 100644
--- a/lux/utils/utils.py
+++ b/lux/utils/utils.py
@@ -76,7 +76,7 @@ def check_if_id_like(df, attribute):
     # Strong signals
     # so that aggregated reset_index fields don't get misclassified
     high_cardinality = df.cardinality[attribute] > 500
-    attribute_contain_id = re.search(r"id", str(attribute)) is not None
+    attribute_contain_id = re.search(r"id|ID|iD|Id", str(attribute)) is not None
     almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df)
     is_string = pd.api.types.is_string_dtype(df[attribute])
     if is_string:
@@ -92,8 +92,15 @@ def check_if_id_like(df, attribute):
             and str_length_uniformity
         )
     else:
-        # TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
-        return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
+        if len(df) >= 2:
+            series = df[attribute]
+            diff = series.diff()
+            evenly_spaced = all(diff.iloc[1:] == diff.iloc[1])
+        else:
+            evenly_spaced = True
+        if attribute_contain_id:
+            almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len(df)
+        return high_cardinality and (almost_all_vals_unique or evenly_spaced)
 
 
 def like_nan(val):
diff --git a/tests/test_config.py b/tests/test_config.py
index 7f3b49ac..20c4bb8a 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -287,4 +287,4 @@ def test_sort(global_var):
 
 #     vis_code = df.recommendation["Correlation"][0].to_Altair()
 #     print (vis_code)
-#     assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
+#     assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
\ No newline at end of file
diff --git a/tests/test_type.py b/tests/test_type.py
index 5a22c517..a0dc634b 100644
--- a/tests/test_type.py
+++ b/tests/test_type.py
@@ -14,6 +14,7 @@
 
 from .context import lux
 import pytest
+import random
 import pandas as pd
 import warnings
 
@@ -238,3 +239,130 @@ def test_set_wrong_data_type():
     )
     df.set_data_type({"Year": "quantitative"})
     assert df.data_type["Year"] == "quantitative"
+
+
+def test_id_with_label():
+    df = pd.read_csv(
+        "https://github.com/lux-org/lux-datasets/blob/master/data/state_timeseries.csv?raw=true"
+    )
+    df.maintain_metadata()
+    assert df.data_type == {"Date": "temporal", "State": "nominal", "Value": "quantitative"}
+
+
+def test_ID_random():
+    """Tests that a column named 'ID' whose values do not satisfy the other ID properties is not classified as an id."""
+    values = [
+        {"ID": random.randint(0, 1000), "A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0}
+        for x in range(1000)
+    ]
+    df = pd.DataFrame(values)
+    df.maintain_metadata()
+    assert df.data_type == {
+        "ID": "quantitative",
+        "A": "nominal",
+        "B": "nominal",
+        "C": "nominal",
+        "D": "nominal",
+        "E": "nominal",
+        "F": "nominal",
+    }
+
+
+def test_ID():
+    """Tests that a sequentially increasing column named 'ID' is classified as an id."""
+    values = [{"ID": x, "A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0} for x in range(1000)]
+    df = pd.DataFrame(values)
+    df.maintain_metadata()
+    assert df.data_type == {
+        "ID": "id",
+        "A": "nominal",
+        "B": "nominal",
+        "C": "nominal",
+        "D": "nominal",
+        "E": "nominal",
+        "F": "nominal",
+    }
+
+
+def test_id_aug_test():
+    """Tests id detection on a different dataset.
+    Reference: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists
+    """
+    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/aug_test.csv?raw=true")
+    df.maintain_metadata()
+    assert df.data_type == {
+        "enrollee_id": "id",
+        "city": "nominal",
+        "city_development_index": "quantitative",
+        "gender": "nominal",
+        "relevent_experience": "nominal",
+        "enrolled_university": "nominal",
+        "education_level": "nominal",
+        "major_discipline": "nominal",
+        "experience": "nominal",
+        "company_size": "nominal",
+        "company_type": "nominal",
+        "last_new_job": "nominal",
+        "training_hours": "quantitative",
+    }
+
+
+def test_id_music_data():
+    """Tests, on a different dataset, that a column whose name does not contain 'id' is still recognized as an id.
+    Reference: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks
+    """
+    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/spotify.csv?raw=true")
+    df["unique_num"] = df["id"]
+    df.drop(columns=["id"])
+    df.maintain_metadata()
+    assert df.data_type == {
+        "valence": "quantitative",
+        "year": "temporal",
+        "acousticness": "quantitative",
+        "artists": "nominal",
+        "danceability": "quantitative",
+        "duration_ms": "quantitative",
+        "energy": "quantitative",
+        "explicit": "nominal",
+        "unique_num": "id",
+        "instrumentalness": "quantitative",
+        "key": "nominal",
+        "liveness": "quantitative",
+        "loudness": "quantitative",
+        "mode": "nominal",
+        "name": "nominal",
+        "popularity": "quantitative",
+        "release_date": "temporal",
+        "speechiness": "quantitative",
+        "tempo": "quantitative",
+        "id": "id",
+    }
+
+
+def test_id_absenteeism_data():
+    """Tests that a column named 'ID' is not classified as an id when its values do not behave like identifiers."""
+    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/absenteeism.csv?raw=true")
+    df.maintain_metadata()
+    assert df.data_type == {
+        "ID": "quantitative",
+        "Reason for absence": "quantitative",
+        "Month of absence": "nominal",
+        "Day of the week": "nominal",
+        "Seasons": "nominal",
+        "Transportation expense": "quantitative",
+        "Distance from Residence to Work": "quantitative",
+        "Service time": "nominal",
+        "Age": "quantitative",
+        "Work load Average/day ": "quantitative",
+        "Hit target": "nominal",
+        "Disciplinary failure": "nominal",
+        "Education": "nominal",
+        "Son": "nominal",
+        "Social drinker": "nominal",
+        "Social smoker": "nominal",
+        "Pet": "nominal",
+        "Weight": "quantitative",
+        "Height": "nominal",
+        "Body mass index": "nominal",
+        "Absenteeism time in hours": "nominal",
+    }
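Not part of the patch: a minimal standalone sketch of the evenly-spaced check that the new else-branch of check_if_id_like relies on, for reviewers who want to exercise it in isolation. The helper name looks_evenly_spaced and the sample series are illustrative only and do not appear in the diff.

import pandas as pd


def looks_evenly_spaced(series: pd.Series) -> bool:
    # Mirrors the patched logic: with fewer than 2 rows there is nothing to
    # compare, so the column is treated as evenly spaced by default.
    if len(series) < 2:
        return True
    diff = series.diff()
    # diff.iloc[0] is NaN, so compare every remaining difference against the
    # first real one; identical gaps (0, 1, 2, ...) suggest an id-like column.
    return bool((diff.iloc[1:] == diff.iloc[1]).all())


sequential = pd.Series(range(1000))          # behaves like a serial id
random_like = pd.Series([3, 7, 2, 9, 4, 1])  # ordinary numeric data
print(looks_evenly_spaced(sequential))   # True
print(looks_evenly_spaced(random_like))  # False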