Skip to content

Commit

Permalink
Merge 04eb1b3 into ba983a4
Browse files Browse the repository at this point in the history
  • Loading branch information
zslade committed Feb 16, 2022
2 parents ba983a4 + 04eb1b3 commit f271607
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 0 deletions.
72 changes: 72 additions & 0 deletions splink/files/chart_defs/missingness_chart_def.json
@@ -0,0 +1,72 @@
{
"config": {
"view": {
"continuousWidth": 400,
"continuousHeight": 300,
"width": 400
},
"axis": {
"labelFontSize": 11
}
},
"title": "",
"layer": [
{
"mark": "bar",
"encoding": {
"color": {
"type": "quantitative",
"field": "percentage",
"legend": {
"format": ".0%"
},
"scale": {
"range": "heatmap"
},
"title": "Missingness"
},
"tooltip": [
{
"type": "nominal",
"field": "variable",
"title": "Column"
},
{
"type": "quantitative",
"field": "value",
"format": ".0f",
"title": "Count of nulls"
},
{
"type": "quantitative",
"field": "percentage",
"format": ".2%",
"title": "Percentage of nulls"
}
],
"x": {
"type": "quantitative",
"axis": {
"format": "%",
"title": "Percentage of nulls"
},
"field": "percentage"
},
"y": {
"type": "nominal",
"axis": {
"title": ""
},
"field": "variable",
"sort": "-x"
}
},
"title": ""
}
],
"data": {
"values": "",
"name": "data-0e7bce5a1d2f132e282789d6ef7780fe"
},
"$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json"
}
42 changes: 42 additions & 0 deletions splink/missingness.py
@@ -0,0 +1,42 @@
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col, count, when

from .charts import load_chart_definition, altair_if_installed_else_json

import pandas as pd


def missingness_chart(df: DataFrame):
    """Produce a bar chart of missingness (share of nulls) for each column.

    For every column of ``df`` the number of null values is counted and
    divided by the total number of records. The resulting table (one row
    per column, with fields ``variable``, ``value`` and ``percentage``) is
    injected into the ``missingness_chart_def.json`` chart definition.

    Args:
        df (DataFrame): Input Spark dataframe

    Returns:
        Bar chart of missingness, as returned by
        ``altair_if_installed_else_json`` for the populated chart definition
    """

    # Load JSON definition of missingness chart
    chart_path = "missingness_chart_def.json"
    missingness_chart_def = load_chart_definition(chart_path)

    # Count of nulls in each column, computed in a single Spark aggregation.
    # `when` without `otherwise` yields null for non-null cells, and `count`
    # skips nulls, so only the null cells of each column are counted.
    df_nulls = df.select(
        [count(when(col(c).isNull(), c)).alias(c) for c in df.columns]
    )

    # Reshape the single wide row into long format: one row per column,
    # with the column name in "variable" and its null count in "value".
    pd_nulls = pd.melt(df_nulls.toPandas())

    record_count = df.count()

    # The chart definition encodes "percentage" as a quantitative field and
    # formats it with d3 percent formats (".0%", ".2%", "%"), which multiply
    # by 100 themselves. The field must therefore hold a numeric fraction in
    # [0, 1], not a pre-formatted string such as "12.3%".
    if record_count:
        pd_nulls["percentage"] = pd_nulls["value"] / record_count
    else:
        # Empty dataframe: avoid 0/0 -> NaN leaking into the chart data.
        pd_nulls["percentage"] = 0.0

    # Add data to JSON chart definition
    missingness_chart_def["data"]["values"] = pd_nulls.to_dict("records")

    # Update chart title
    chart_title = "Missingness per column out of {} records".format(record_count)
    for layer in missingness_chart_def["layer"]:
        layer["title"] = chart_title

    return altair_if_installed_else_json(missingness_chart_def)

0 comments on commit f271607

Please sign in to comment.