# Check Nulls
This program computes the percentage of nulls in every column of every table in a given BigQuery dataset.

Reference:
https://dabblingwithdata.amedcalf.com/2021/05/17/a-quick-way-to-count-the-number-of-null-values-in-each-field-of-a-bigquery-table/

In [1]:
pip install --upgrade google-cloud-bigquery

Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-3.21.0-py2.py3-none-any.whl.metadata (8.9 kB)
Downloading google_cloud_bigquery-3.21.0-py2.py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-bigquery
  Attempting uninstall: google-cloud-bigquery
    Found existing installation: google-cloud-bigquery 2.34.4
    Uninstalling google-cloud-bigquery-2.34.4:
      Successfully uninstalled google-cloud-bigquery-2.34.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.1.6 which is incompatible.
google-cloud-aiplatform 0.6.0a1 requires google-api-core[grpc]<2.0.0dev,>=1.22.2, but you have google-api-core 2.

In [2]:
import numpy as np
import pandas as pd
import csv
import os
from collections import defaultdict
from google.cloud import bigquery

In [3]:
# Google Cloud project used when creating the BigQuery client and building table paths.
gc_project_id = 'kagglehomecredit'

# Dataset whose tables are scanned for nulls.
bq_dataset_source = 'homecredit_stage_1'

# Destination of the CSV report.
outputfilepath = "/kaggle/working/null_counts.csv"

# First row written to the CSV report; one label per field of each report row.
header_row = [
    'table_name',
    'column_name',
    'column_datatype',
    'row_count',
    'null_count',
    'null_percent',
]

In [4]:
def create_bq_client(gc_project_id):
    """Build a BigQuery client bound to the given Google Cloud project.

    Args:
        gc_project_id: Project ID passed to ``bigquery.Client`` as ``project``.

    Returns:
        The newly constructed ``bigquery.Client``.
    """
    return bigquery.Client(project=gc_project_id)


def get_table_count(gc_project_id, bq_dataset, client=None):
    """Return the row count of every table listed in a dataset's ``__TABLES__``.

    Args:
        gc_project_id: ID of the Google Cloud project that owns the dataset.
        bq_dataset: Name of the dataset to inspect.
        client: Object exposing ``query_and_wait(sql)`` (normally a
            ``bigquery.Client``). Defaults to the module-level ``bq_client``.

    Returns:
        A ``defaultdict(list)`` mapping each ``table_id`` to a list with one
        ``row_count`` entry per metadata row returned for that table.
    """
    if client is None:
        # Fall back to the notebook-wide client.
        client = bq_client

    # The dataset comes from the argument (not a global), so the function
    # inspects whichever dataset the caller asks for.
    sql_query = f"""SELECT * FROM `{gc_project_id}.{bq_dataset}.__TABLES__`; """

    table_and_count = defaultdict(list)
    for result in client.query_and_wait(sql_query):
        table_and_count[result["table_id"]].append(result["row_count"])

    return table_and_count


def get_table_column(gc_project_id, bq_dataset, client=None):
    """Return the columns of every table in a dataset, grouped by table.

    Reads the dataset's ``INFORMATION_SCHEMA.COLUMNS`` view.

    Args:
        gc_project_id: ID of the Google Cloud project that owns the dataset.
        bq_dataset: Name of the dataset to inspect.
        client: Object exposing ``query_and_wait(sql)`` (normally a
            ``bigquery.Client``). Defaults to the module-level ``bq_client``.

    Returns:
        A ``defaultdict(list)`` mapping each ``table_name`` to a list of
        ``(column_name, data_type)`` tuples, in the order the query returns
        them.
    """
    if client is None:
        # Fall back to the notebook-wide client.
        client = bq_client

    # The dataset comes from the argument (not a global), so the function
    # inspects whichever dataset the caller asks for.
    sql_query = f"""SELECT * FROM `{gc_project_id}.{bq_dataset}.INFORMATION_SCHEMA.COLUMNS`; """

    table_and_column = defaultdict(list)
    for result in client.query_and_wait(sql_query):
        table_and_column[result["table_name"]].append(
            (result["column_name"], result["data_type"])
        )

    return table_and_column


def get_null_count(gc_project_id, bq_dataset, table_name, client=None):
    """Count the null values in each column of one BigQuery table.

    Every row is serialised with ``TO_JSON_STRING`` and the query extracts
    each ``"<field>":null`` occurrence, so a column only appears in the
    result when it has at least one null.

    Args:
        gc_project_id: ID of the Google Cloud project that owns the dataset.
        bq_dataset: Name of the dataset containing the table.
        table_name: Name of the table to scan.
        client: Object exposing ``query_and_wait(sql)`` (normally a
            ``bigquery.Client``). Defaults to the module-level ``bq_client``.

    Returns:
        A list of ``(column_name, nulls_count)`` tuples in the order the
        query returns them (``nulls_count`` descending).
    """
    if client is None:
        # Fall back to the notebook-wide client.
        client = bq_client

    # Raw f-string: ``\w`` must reach BigQuery verbatim and is not a valid
    # Python escape. The table path is backtick-quoted like the other
    # queries in this file, and the row is referenced through an explicit
    # alias instead of relying on the implicit table-name alias.
    sql_query = rf"""
                    SELECT column_name, COUNT(1) AS nulls_count
                    FROM `{gc_project_id}.{bq_dataset}.{table_name}` AS null_check_row,
                    UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(null_check_row), r'"(\w+)":null')) column_name
                    GROUP BY column_name
                    ORDER BY nulls_count DESC
               """

    return [
        (result["column_name"], result["nulls_count"])
        for result in client.query_and_wait(sql_query)
    ]


def map_data(table_column, table_count, table_column_null_count):
    """Combine column metadata, row counts and null counts into report rows.

    Args:
        table_column: Mapping of table name to a list of
            ``(column_name, data_type)`` tuples.
        table_count: Mapping of table name to a list whose first element is
            the table's row count.
        table_column_null_count: Mapping of table name to a list of
            ``(column_name, null_count)`` tuples; columns that are absent
            count as zero nulls.

    Returns:
        A list of rows
        ``[table_name, column_name, data_type, row_count, null_count,
        null_percent]``, one per column of every table in ``table_count``.
        ``null_percent`` is rounded to two decimals and is ``0.0`` for
        tables with no rows.
    """
    map_info_list = []

    for table_name, row_count in table_count.items():
        total_rows = row_count[0]
        # ``.get`` tolerates tables missing from either mapping and does not
        # insert new keys when the mappings are defaultdicts.
        null_counts = dict(table_column_null_count.get(table_name, ()))

        for column_name, column_datatype in table_column.get(table_name, ()):
            null_count = null_counts.get(column_name, 0)
            # A table with zero rows would otherwise divide by zero.
            if total_rows:
                null_percent = round(null_count / total_rows * 100, 2)
            else:
                null_percent = 0.0
            map_info_list.append(
                [table_name, column_name, column_datatype, total_rows, null_count, null_percent]
            )

    return map_info_list


def write2csv(output_list, output_path=None, header=None):
    """Write report rows to a CSV file, preceded by a header row.

    Args:
        output_list: Iterable of rows (each an iterable of field values).
        output_path: File to write. Defaults to the module-level
            ``outputfilepath``.
        header: Header row. Defaults to the module-level ``header_row``.

    Returns:
        None. Any existing file at the destination is overwritten.
    """
    if output_path is None:
        output_path = outputfilepath
    if header is None:
        header = header_row

    # newline='' hands line-ending control to the csv module, as its
    # documentation requires; otherwise some platforms emit blank rows.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(output_list)

    return

In [5]:
# Module-level BigQuery client that the query helpers read as a global.
bq_client = create_bq_client(gc_project_id=gc_project_id)

In [6]:
# Table name -> list of (column_name, data_type) for the source dataset.
table_column = get_table_column(
    gc_project_id=gc_project_id,
    bq_dataset=bq_dataset_source,
)

In [7]:
# Table name -> [row_count] for the source dataset.
table_count = get_table_count(
    gc_project_id=gc_project_id,
    bq_dataset=bq_dataset_source,
)

In [8]:
# Table name -> list of (column_name, null_count); one query per table.
table_column_null_count = defaultdict(list)

# Only the table names are needed here, so iterate the keys directly.
for table_name in table_count:
    table_column_null_count[table_name] = get_null_count(gc_project_id, bq_dataset_source, table_name)

In [9]:
# Merge the three lookups into report rows, then write them to the CSV file.
null_count_list = map_data(
    table_column=table_column,
    table_count=table_count,
    table_column_null_count=table_column_null_count,
)
write2csv(output_list=null_count_list)