ID function improved (#234)

* Id function added
* removed unnecessary datasets
* More accurate differentiation between an actual ID field and an identification field
* Done
* Only changing intended files
* Reverted to orig versions of dataset
* Reverted to orig versions of dataset
* reformat indentation issue, run black, revert college dataset
* broken data link
* Index columns support added for ID function
* black
* new changes
* revert plot config changes

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
lux-org · Jan 27, 2021 · 1c0e2eb · 1c0e2eb
1 parent 5e7d5c9
commit 1c0e2eb
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 4 deletions.
diff --git a/lux/utils/utils.py b/lux/utils/utils.py
@@ -77,7 +77,7 @@ def check_if_id_like(df, attribute):
     # Strong signals
     # so that aggregated reset_index fields don't get misclassified
     high_cardinality = df.cardinality[attribute] > 500
-    attribute_contain_id = re.search(r"id", str(attribute)) is not None
+    attribute_contain_id = re.search(r"id|ID|iD|Id", str(attribute)) is not None
     almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df)
     is_string = pd.api.types.is_string_dtype(df[attribute])
     if is_string:
@@ -93,8 +93,15 @@ def check_if_id_like(df, attribute):
             and str_length_uniformity
         )
     else:
-        # TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
-        return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
+        if len(df) >= 2:
+            series = df[attribute]
+            diff = series.diff()
+            evenly_spaced = all(diff.iloc[1:] == diff.iloc[1])
+        else:
+            evenly_spaced = True
+        if attribute_contain_id:
+            almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len(df)
+        return high_cardinality and (almost_all_vals_unique or evenly_spaced)
 
 
 def like_nan(val):

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -304,4 +304,4 @@ def test_sort(global_var):
 
 # 	vis_code = df.recommendation["Correlation"][0].to_Altair()
 # 	print (vis_code)
-# 	assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
+# 	assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
diff --git a/tests/test_type.py b/tests/test_type.py
@@ -14,6 +14,7 @@
 
 from .context import lux
 import pytest
+import random
 import pandas as pd
 import warnings
 
@@ -238,3 +239,130 @@ def test_set_wrong_data_type():
     )
     df.set_data_type({"Year": "quantitative"})
     assert df.data_type["Year"] == "quantitative"
+
+
+def test_id_with_label():
+    df = pd.read_csv(
+        "https://github.com/lux-org/lux-datasets/blob/master/data/state_timeseries.csv?raw=true"
+    )
+    df.maintain_metadata()
+    assert df.data_type == {"Date": "temporal", "State": "nominal", "Value": "quantitative"}
+
+
+def test_ID_random():
+    """Tests whether a ID column not satisfying other properties of an ID gets recognized."""
+    values = [
+        {"ID": random.randint(0, 1000), "A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0}
+        for x in range(1000)
+    ]
+    df = pd.DataFrame(values)
+    df.maintain_metadata()
+    assert df.data_type == {
+        "ID": "quantitative",
+        "A": "nominal",
+        "B": "nominal",
+        "C": "nominal",
+        "D": "nominal",
+        "E": "nominal",
+        "F": "nominal",
+    }
+
+
+def test_ID():
+    """Tests different ways of writing id"""
+    values = [{"ID": x, "A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0} for x in range(1000)]
+    df = pd.DataFrame(values)
+    df.maintain_metadata()
+    assert df.data_type == {
+        "ID": "id",
+        "A": "nominal",
+        "B": "nominal",
+        "C": "nominal",
+        "D": "nominal",
+        "E": "nominal",
+        "F": "nominal",
+    }
+
+
+def test_id_aug_test():
+    """Tests in a different dataset
+    Reference: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists
+    """
+    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/aug_test.csv?raw=true")
+    df.maintain_metadata()
+    assert df.data_type == {
+        "enrollee_id": "id",
+        "city": "nominal",
+        "city_development_index": "quantitative",
+        "gender": "nominal",
+        "relevent_experience": "nominal",
+        "enrolled_university": "nominal",
+        "education_level": "nominal",
+        "major_discipline": "nominal",
+        "experience": "nominal",
+        "company_size": "nominal",
+        "company_type": "nominal",
+        "last_new_job": "nominal",
+        "training_hours": "quantitative",
+    }
+
+
+def test_id_music_data():
+    """Tests in a different dataset if a column not named as an ID is recognized as an identification.
+    Reference: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks
+    """
+    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/spotify.csv?raw=true")
+    df["unique_num"] = df["id"]
+    df.drop(columns=["id"])
+    df.maintain_metadata()
+    assert df.data_type == {
+        "valence": "quantitative",
+        "year": "temporal",
+        "acousticness": "quantitative",
+        "artists": "nominal",
+        "danceability": "quantitative",
+        "duration_ms": "quantitative",
+        "energy": "quantitative",
+        "explicit": "nominal",
+        "unique_num": "id",
+        "instrumentalness": "quantitative",
+        "key": "nominal",
+        "liveness": "quantitative",
+        "loudness": "quantitative",
+        "mode": "nominal",
+        "name": "nominal",
+        "popularity": "quantitative",
+        "release_date": "temporal",
+        "speechiness": "quantitative",
+        "tempo": "quantitative",
+        "id": "id",
+    }
+
+
+def test_id_absenteeism_data():
+    """ Tests whether an id named column is not recognized because even though it is named an id, it is not with its nature. """
+    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/absenteeism.csv?raw=true")
+    df.maintain_metadata()
+    assert df.data_type == {
+        "ID": "quantitative",
+        "Reason for absence": "quantitative",
+        "Month of absence": "nominal",
+        "Day of the week": "nominal",
+        "Seasons": "nominal",
+        "Transportation expense": "quantitative",
+        "Distance from Residence to Work": "quantitative",
+        "Service time": "nominal",
+        "Age": "quantitative",
+        "Work load Average/day ": "quantitative",
+        "Hit target": "nominal",
+        "Disciplinary failure": "nominal",
+        "Education": "nominal",
+        "Son": "nominal",
+        "Social drinker": "nominal",
+        "Social smoker": "nominal",
+        "Pet": "nominal",
+        "Weight": "quantitative",
+        "Height": "nominal",
+        "Body mass index": "nominal",
+        "Absenteeism time in hours": "nominal",
+    }