Skip to content

Commit

Permalink
format with black
Browse files Browse the repository at this point in the history
  • Loading branch information
RobinL committed Dec 5, 2020
1 parent 84f8e71 commit a68084d
Showing 1 changed file with 33 additions and 67 deletions.
100 changes: 33 additions & 67 deletions splink/diagnostics.py
Expand Up @@ -23,9 +23,7 @@
"title": "Histogram of splink scores",
"width": 700,
"encoding": {
"tooltip": [
{"field": "count_rows", "title": "count", "type": "quantitative"}
],
"tooltip": [{"field": "count_rows", "title": "count", "type": "quantitative"}],
"x": {
"axis": {"title": "splink score"},
"bin": "binned",
Expand All @@ -44,33 +42,27 @@

@check_types
def _calc_probability_density(
df_e: DataFrame, spark: SparkSession, buckets=None, score_colname=None,
df_e: DataFrame,
spark: SparkSession,
buckets=None,
score_colname=None,
):

"""perform splink score histogram calculations / internal function
"""perform splink score histogram calculations / internal function
Compute a histogram using the provided buckets.
Args:
df_e (DataFrame): A dataframe of record comparisons containing a splink score,
e.g. as produced by the .... function
df_e (DataFrame): A dataframe of record comparisons containing a
splink score, e.g. as produced by the .... function
spark (SparkSession): SparkSession object
score_colname : is the score in another column? defaults to None
buckets : accepts either a list of split points or an integer number that is used
to create equally spaced split points. It defaults to 100 equally spaced split points
from 0.0 to 1.0
score_colname: is the score in another column? defaults to None
buckets: accepts either a list of split points or an integer number that is used
to create equally spaced split points. It defaults to 100 equally
spaced split points from 0.0 to 1.0
Returns:
(DataFrame) : pandas dataframe of histogram bins for appropriate splink score variable ready to be plotted.
"""

# If splits is a list then use it. If None, create the default. If an integer, create equal bins
Expand All @@ -96,11 +88,7 @@ def _calc_probability_density(
# Otherwise match_probability is used, or if that doesn't exist a warning is fired and the function exits

if score_colname:
hist = (
df_e.select(score_colname)
.rdd.flatMap(lambda x: x)
.histogram(buckets)
)
hist = df_e.select(score_colname).rdd.flatMap(lambda x: x).histogram(buckets)

elif "tf_adjusted_match_prob" in df_e.columns:

Expand All @@ -113,9 +101,7 @@ def _calc_probability_density(
elif "match_probability" in df_e.columns:

hist = (
df_e.select("match_probability")
.rdd.flatMap(lambda x: x)
.histogram(buckets)
df_e.select("match_probability").rdd.flatMap(lambda x: x).histogram(buckets)
)

else:
Expand All @@ -124,12 +110,8 @@ def _calc_probability_density(
# get bucket from and to points

hist[1].append(None)
hist_df = pd.DataFrame(
{"splink_score_bin_low": hist[0], "count_rows": hist[1]}
)
hist_df["splink_score_bin_high"] = hist_df["splink_score_bin_low"].shift(
-1
)
hist_df = pd.DataFrame({"splink_score_bin_low": hist[0], "count_rows": hist[1]})
hist_df["splink_score_bin_high"] = hist_df["splink_score_bin_low"].shift(-1)
hist_df = hist_df.drop(hist_df.tail(1).index)

# take into account the bin width
Expand All @@ -146,21 +128,14 @@ def _calc_probability_density(


def _create_probability_density_plot(hist_df):
"""plot score histogram
"""plot score histogram
Args:
hist_df (pandas DataFrame): A pandas dataframe of histogram bins
Args:
hist_df (pandas DataFrame): A pandas dataframe of histogram bins
as produced by the _calc_probability_density function
Returns:
if altair is installed a plot. if altair is not installed
Returns:
if altair is installed a plot. if altair is not installed
then it returns the vega lite chart spec as a dictionary
"""

data = hist_df.to_dict(orient="records")
Expand All @@ -173,35 +148,26 @@ def _create_probability_density_plot(hist_df):


def splink_score_histogram(
df_e: DataFrame, spark: SparkSession, buckets=None, score_colname=None,
df_e: DataFrame,
spark: SparkSession,
buckets=None,
score_colname=None,
):

"""splink score histogram diagnostic plot public API function
Compute a histogram using the provided buckets and plot the result.
Args:
df_e (DataFrame): A dataframe of record comparisons containing a splink score,
e.g. as produced by the `get_scored_comparisons` function
Args:
df_e (DataFrame): A dataframe of record comparisons containing a splink score,
e.g. as produced by the `get_scored_comparisons` function
spark (SparkSession): SparkSession object
score_colname : is the score in another column? defaults to None
buckets : accepts either a list of split points or an integer number that is used to
create equally spaced split points. It defaults to 100 equally spaced split points from 0.0 to 1.0
buckets : accepts either a list of split points or an integer number that is used to
create equally spaced split points. It defaults to 100 equally spaced split points from 0.0 to 1.0
Returns:
if altair library is installed this function returns a histogram plot. if altair is not installed
then it returns the vega lite chart spec as a dictionary
"""

pd_df = _calc_probability_density(
Expand Down

0 comments on commit a68084d

Please sign in to comment.