Skip to content

Commit

Permalink
Id function improvised (#234)
Browse files Browse the repository at this point in the history
* Id function added

* removed unnecessary datasets

* More accurate differentiation between an actual ID field and an identification field

* Done

* Only changing intended files

* Reverted to orig versions of dataset

* Reverted to orig versions of dataset

* reformat indentation issue, run black, revert college dataset

* broken data link

* Index columns support added for ID function

* black

* new changes

* revert plot config changes

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
  • Loading branch information
Priyansdesai and dorisjlee committed Jan 27, 2021
1 parent 5e7d5c9 commit 1c0e2eb
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 4 deletions.
13 changes: 10 additions & 3 deletions lux/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def check_if_id_like(df, attribute):
# Strong signals
# so that aggregated reset_index fields don't get misclassified
high_cardinality = df.cardinality[attribute] > 500
attribute_contain_id = re.search(r"id", str(attribute)) is not None
attribute_contain_id = re.search(r"id|ID|iD|Id", str(attribute)) is not None
almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df)
is_string = pd.api.types.is_string_dtype(df[attribute])
if is_string:
Expand All @@ -93,8 +93,15 @@ def check_if_id_like(df, attribute):
and str_length_uniformity
)
else:
# TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
if len(df) >= 2:
series = df[attribute]
diff = series.diff()
evenly_spaced = all(diff.iloc[1:] == diff.iloc[1])
else:
evenly_spaced = True
if attribute_contain_id:
almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len(df)
return high_cardinality and (almost_all_vals_unique or evenly_spaced)


def like_nan(val):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,4 +304,4 @@ def test_sort(global_var):

# vis_code = df.recommendation["Correlation"][0].to_Altair()
# print (vis_code)
# assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
# assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
128 changes: 128 additions & 0 deletions tests/test_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from .context import lux
import pytest
import random
import pandas as pd
import warnings

Expand Down Expand Up @@ -238,3 +239,130 @@ def test_set_wrong_data_type():
)
df.set_data_type({"Year": "quantitative"})
assert df.data_type["Year"] == "quantitative"


def test_id_with_label():
    """Check the data types inferred for the state_timeseries dataset.

    The expected result is Date -> temporal, State -> nominal and
    Value -> quantitative; no column should be typed as an id.
    """
    dataset_url = "https://github.com/lux-org/lux-datasets/blob/master/data/state_timeseries.csv?raw=true"
    df = pd.read_csv(dataset_url)
    df.maintain_metadata()
    expected_types = {"Date": "temporal", "State": "nominal", "Value": "quantitative"}
    assert df.data_type == expected_types


def test_ID_random():
    """A column named "ID" filled with random integers must not be typed as id.

    The frame has 1000 rows; "ID" is drawn uniformly from [0, 1000] and the
    remaining columns are constant, so "ID" is expected to be quantitative
    and every other column nominal.
    """
    constant_columns = {"A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0}
    records = [{"ID": random.randint(0, 1000), **constant_columns} for _ in range(1000)]
    df = pd.DataFrame(records)
    df.maintain_metadata()

    expected_types = {"ID": "quantitative"}
    for column in constant_columns:
        expected_types[column] = "nominal"
    assert df.data_type == expected_types


def test_ID():
    """A column named "ID" holding the integers 0..999 must be typed as id.

    All other columns are constant and are expected to be nominal.
    """
    constant_columns = {"A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0}
    records = [{"ID": row_number, **constant_columns} for row_number in range(1000)]
    df = pd.DataFrame(records)
    df.maintain_metadata()

    expected_types = {"ID": "id"}
    for column in constant_columns:
        expected_types[column] = "nominal"
    assert df.data_type == expected_types


def test_id_aug_test():
    """Check id detection on the aug_test dataset ("enrollee_id" should be id).

    Reference: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists
    """
    dataset_url = "https://github.com/lux-org/lux-datasets/blob/master/data/aug_test.csv?raw=true"
    df = pd.read_csv(dataset_url)
    df.maintain_metadata()

    expected_types = dict(
        [
            ("enrollee_id", "id"),
            ("city", "nominal"),
            ("city_development_index", "quantitative"),
            ("gender", "nominal"),
            ("relevent_experience", "nominal"),
            ("enrolled_university", "nominal"),
            ("education_level", "nominal"),
            ("major_discipline", "nominal"),
            ("experience", "nominal"),
            ("company_size", "nominal"),
            ("company_type", "nominal"),
            ("last_new_job", "nominal"),
            ("training_hours", "quantitative"),
        ]
    )
    assert df.data_type == expected_types


def test_id_music_data():
    """Check that an identifier column whose name does not contain "id" is still typed as id.

    The Spotify "id" column is copied to "unique_num" and the original "id"
    column is then removed, so only the neutrally named copy remains.

    Reference: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks
    """
    df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/spotify.csv?raw=true")
    df["unique_num"] = df["id"]
    # DataFrame.drop returns a new frame; the result must be assigned, otherwise
    # the "id" column silently stays in place and the test no longer exercises
    # the "not named as an ID" scenario on its own.
    df = df.drop(columns=["id"])
    df.maintain_metadata()
    assert df.data_type == {
        "valence": "quantitative",
        "year": "temporal",
        "acousticness": "quantitative",
        "artists": "nominal",
        "danceability": "quantitative",
        "duration_ms": "quantitative",
        "energy": "quantitative",
        "explicit": "nominal",
        "unique_num": "id",
        "instrumentalness": "quantitative",
        "key": "nominal",
        "liveness": "quantitative",
        "loudness": "quantitative",
        "mode": "nominal",
        "name": "nominal",
        "popularity": "quantitative",
        "release_date": "temporal",
        "speechiness": "quantitative",
        "tempo": "quantitative",
    }


def test_id_absenteeism_data():
    """A column named "ID" that does not behave like an identifier must not be typed as id.

    In the absenteeism dataset the "ID" column is expected to be quantitative.
    """
    dataset_url = "https://github.com/lux-org/lux-datasets/blob/master/data/absenteeism.csv?raw=true"
    df = pd.read_csv(dataset_url)
    df.maintain_metadata()

    expected_types = dict(
        [
            ("ID", "quantitative"),
            ("Reason for absence", "quantitative"),
            ("Month of absence", "nominal"),
            ("Day of the week", "nominal"),
            ("Seasons", "nominal"),
            ("Transportation expense", "quantitative"),
            ("Distance from Residence to Work", "quantitative"),
            ("Service time", "nominal"),
            ("Age", "quantitative"),
            ("Work load Average/day ", "quantitative"),
            ("Hit target", "nominal"),
            ("Disciplinary failure", "nominal"),
            ("Education", "nominal"),
            ("Son", "nominal"),
            ("Social drinker", "nominal"),
            ("Social smoker", "nominal"),
            ("Pet", "nominal"),
            ("Weight", "quantitative"),
            ("Height", "nominal"),
            ("Body mass index", "nominal"),
            ("Absenteeism time in hours", "nominal"),
        ]
    )
    assert df.data_type == expected_types

0 comments on commit 1c0e2eb

Please sign in to comment.