-
Notifications
You must be signed in to change notification settings - Fork 362
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Id function improvised #234
Changes from 9 commits
283e8d1
e3d9a24
96d4b14
34e9152
830cf9a
83de78d
3b422d1
0220526
27be821
6e95143
f3157c6
69962f7
c56a489
41dd51a
d1a2f17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,7 +76,7 @@ def check_if_id_like(df, attribute): | |
# Strong signals | ||
# so that aggregated reset_index fields don't get misclassified | ||
high_cardinality = df.cardinality[attribute] > 500 | ||
attribute_contain_id = re.search(r"id", str(attribute)) is not None | ||
attribute_contain_id = re.search(r"id|ID|iD|Id", str(attribute)) is not None | ||
almost_all_vals_unique = df.cardinality[attribute] >= 0.98 * len(df) | ||
is_string = pd.api.types.is_string_dtype(df[attribute]) | ||
if is_string: | ||
|
@@ -92,8 +92,15 @@ def check_if_id_like(df, attribute): | |
and str_length_uniformity | ||
) | ||
else: | ||
# TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even) | ||
return high_cardinality and (attribute_contain_id or almost_all_vals_unique) | ||
if len(df) >= 2: | ||
diff = df[attribute].diff() | ||
evenly_spaced = all(diff.loc[1:] == diff.loc[1]) | ||
else: | ||
evenly_spaced = True | ||
if attribute_contain_id: | ||
almost_all_vals_unique = df.cardinality[attribute] >= 0.75 * len(df) | ||
return high_cardinality and (almost_all_vals_unique or evenly_spaced) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This line can be deleted. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you want to delete all lines from 100 to 103? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jerry - I have made the required changes and pushed them as well. |
||
return high_cardinality and (almost_all_vals_unique or evenly_spaced) | ||
|
||
|
||
def like_nan(val): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This line fails on examples with indexes.