Id function improvised #234

Merged (15 commits, Jan 27, 2021)
Changes from 12 commits
lux/utils/utils.py (14 changes: 11 additions & 3 deletions)

@@ -76,7 +76,7 @@ def check_if_id_like(df, attribute):
     # Strong signals
     # so that aggregated reset_index fields don't get misclassified
     high_cardinality = df.cardinality[attribute] > 500
-    attribute_contain_id = re.search(r"id", str(attribute)) is not None
+    attribute_contain_id = re.search(r"id|ID|iD|Id", str(attribute)) is not None
     almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df)
     is_string = pd.api.types.is_string_dtype(df[attribute])
     if is_string:
@@ -92,8 +92,16 @@ def check_if_id_like(df, attribute):
             and str_length_uniformity
         )
     else:
-        # TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
-        return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
+        if len(df) >= 2:
+            series = df[attribute]
+            diff = series.diff()
+            evenly_spaced = all(diff.iloc[1:] == diff.iloc[1])
+        else:
+            evenly_spaced = True
+        if attribute_contain_id:
+            almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len(df)
+        return high_cardinality and (almost_all_vals_unique or evenly_spaced)
Review comments on this hunk:

Contributor: this line can be deleted

Contributor (author): Do you want to delete all lines from 100 to 103?

Contributor (author): @jerry - Done the required changes and pushed as well.


def like_nan(val):
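For readers who want to try the new heuristic outside of Lux, the following is a minimal, runnable sketch of the numeric branch added above. It is an illustration, not part of the PR: the function name is_id_like_sketch is made up here, Series.nunique() stands in for Lux's cached df.cardinality, and re.IGNORECASE is used as shorthand for the PR's r"id|ID|iD|Id" pattern (it matches the same four spellings). Only the thresholds (more than 500 distinct values, 0.98 vs. 0.75 uniqueness) and the evenly-spaced check come from the diff.

import re

import pandas as pd


def is_id_like_sketch(series: pd.Series) -> bool:
    """Simplified, standalone sketch of the numeric branch of check_if_id_like."""
    # Strong signal: many distinct values (Lux reads this from df.cardinality;
    # nunique() is used here so the sketch runs on a plain pandas Series).
    high_cardinality = series.nunique() > 500

    # re.IGNORECASE covers the same spellings as the PR's r"id|ID|iD|Id" pattern.
    name_contains_id = re.search(r"id", str(series.name), re.IGNORECASE) is not None

    # New in this PR: evenly spaced values (e.g. 0, 1, 2, ...) count as ID-like.
    if len(series) >= 2:
        diffs = series.diff()
        evenly_spaced = bool((diffs.iloc[1:] == diffs.iloc[1]).all())
    else:
        evenly_spaced = True

    # Also new: an "id"-like column name relaxes the uniqueness threshold from 0.98 to 0.75.
    uniqueness_threshold = 0.75 if name_contains_id else 0.98
    almost_all_vals_unique = series.nunique() >= uniqueness_threshold * len(series)

    return high_cardinality and (almost_all_vals_unique or evenly_spaced)


if __name__ == "__main__":
    sequential_ids = pd.Series(range(1000), name="ID")
    repeated_codes = pd.Series([7, 3, 3, 9] * 250, name="ID")
    print(is_id_like_sketch(sequential_ids))  # True: high cardinality and evenly spaced
    print(is_id_like_sketch(repeated_codes))  # False: only 3 distinct values

The string branch of check_if_id_like, which additionally checks string-length uniformity, is unchanged by this PR and omitted from the sketch.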
tests/test_config.py (22 changes: 11 additions & 11 deletions)

@@ -274,17 +274,17 @@ def test_sort(global_var):

# TODO: This test does not pass in pytest but is working in Jupyter notebook.
# def test_plot_setting(global_var):
#     df = pytest.car_df
#     df["Year"] = pd.to_datetime(df["Year"], format='%Y')
#     def change_color_add_title(chart):
#         chart = chart.configure_mark(color="green")  # change mark color to green
#         chart.title = "Custom Title"  # add title to chart
#         return chart

#     df.plot_config = change_color_add_title

#     df._repr_html_()

#     vis_code = df.recommendation["Correlation"][0].to_Altair()
#     print(vis_code)
#     assert 'chart = chart.configure_mark(color="green")' in vis_code, "Exported chart does not have additional plot style setting."
tests/test_type.py (128 changes: 128 additions & 0 deletions)

@@ -14,6 +14,7 @@

from .context import lux
import pytest
import random
import pandas as pd
import warnings

@@ -238,3 +239,130 @@ def test_set_wrong_data_type():
    )
    df.set_data_type({"Year": "quantitative"})
    assert df.data_type["Year"] == "quantitative"


def test_id_with_label():
    df = pd.read_csv(
        "https://github.com/lux-org/lux-datasets/blob/master/data/state_timeseries.csv?raw=true"
    )
    df.maintain_metadata()
    assert df.data_type == {"Date": "temporal", "State": "nominal", "Value": "quantitative"}


def test_ID_random():
"""Tests whether a ID column not satisfying other properties of an ID gets recognized."""
values = [
{"ID": random.randint(0, 1000), "A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0}
for x in range(1000)
]
df = pd.DataFrame(values)
df.maintain_metadata()
assert df.data_type == {
"ID": "quantitative",
"A": "nominal",
"B": "nominal",
"C": "nominal",
"D": "nominal",
"E": "nominal",
"F": "nominal",
}


def test_ID():
"""Tests different ways of writing id"""
values = [{"ID": x, "A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0} for x in range(1000)]
df = pd.DataFrame(values)
df.maintain_metadata()
assert df.data_type == {
"ID": "id",
"A": "nominal",
"B": "nominal",
"C": "nominal",
"D": "nominal",
"E": "nominal",
"F": "nominal",
}


def test_id_aug_test():
"""Tests in a different dataset
Reference: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists
"""
df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/aug_test.csv?raw=true")
df.maintain_metadata()
assert df.data_type == {
"enrollee_id": "id",
"city": "nominal",
"city_development_index": "quantitative",
"gender": "nominal",
"relevent_experience": "nominal",
"enrolled_university": "nominal",
"education_level": "nominal",
"major_discipline": "nominal",
"experience": "nominal",
"company_size": "nominal",
"company_type": "nominal",
"last_new_job": "nominal",
"training_hours": "quantitative",
}


def test_id_music_data():
"""Tests in a different dataset if a column not named as an ID is recognized as an identification.
Reference: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks
"""
df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/spotify.csv?raw=true")
df["unique_num"] = df["id"]
df.drop(columns=["id"])
df.maintain_metadata()
assert df.data_type == {
"valence": "quantitative",
"year": "temporal",
"acousticness": "quantitative",
"artists": "nominal",
"danceability": "quantitative",
"duration_ms": "quantitative",
"energy": "quantitative",
"explicit": "nominal",
"unique_num": "id",
"instrumentalness": "quantitative",
"key": "nominal",
"liveness": "quantitative",
"loudness": "quantitative",
"mode": "nominal",
"name": "nominal",
"popularity": "quantitative",
"release_date": "temporal",
"speechiness": "quantitative",
"tempo": "quantitative",
"id": "id",
}


def test_id_absenteeism_data():
""" Tests whether an id named column is not recognized because even though it is named an id, it is not with its nature. """
df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/absenteeism.csv?raw=true")
df.maintain_metadata()
assert df.data_type == {
"ID": "quantitative",
"Reason for absence": "quantitative",
"Month of absence": "nominal",
"Day of the week": "nominal",
"Seasons": "nominal",
"Transportation expense": "quantitative",
"Distance from Residence to Work": "quantitative",
"Service time": "nominal",
"Age": "quantitative",
"Work load Average/day ": "quantitative",
"Hit target": "nominal",
"Disciplinary failure": "nominal",
"Education": "nominal",
"Son": "nominal",
"Social drinker": "nominal",
"Social smoker": "nominal",
"Pet": "nominal",
"Weight": "quantitative",
"Height": "nominal",
"Body mass index": "nominal",
"Absenteeism time in hours": "nominal",
}