#### Enable Auto-reload
- Loads the autoreload extension to automatically reload modules when they change
- Useful for development


In [0]:
%load_ext autoreload
%autoreload 2

#### Install Required Library
- Installs the openpyxl library needed to read Excel files with pandas


In [0]:
pip install openpyxl

#### Define Excel Table Detection Functions
- Creates utility functions to automatically detect and extract multiple tables from an Excel sheet
- Finds contiguous data regions and determines table boundaries
- Sets first row as column headers
- Tracks visited cells to avoid duplicate extraction


In [0]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook

def detect_and_extract_tables(excel_file, sheet_name=0):
    """Scan one Excel sheet and return every table detected in it.

    The sheet is loaded without a header row and blank cells are normalised
    to empty strings. The grid is then walked row by row; each non-empty cell
    that no earlier table has claimed is handed to
    ``extract_table_from_position``, which returns the table found there and
    records its cells in the shared ``visited`` set.

    Args:
        excel_file: Anything ``pd.read_excel`` accepts as its first argument.
        sheet_name: Sheet label or zero-based position (default: first sheet).

    Returns:
        A list of DataFrames. A candidate is kept only when more than one row
        remains after its header row has been removed.
    """
    sheet = pd.read_excel(excel_file, sheet_name=sheet_name, header=None).fillna('')

    found_tables = []
    claimed_cells = set()
    n_rows, n_cols = sheet.shape

    for r in range(n_rows):
        for c in range(n_cols):
            # Skip blanks and cells already swallowed by a previous table.
            if (r, c) in claimed_cells or sheet.iloc[r, c] == '':
                continue

            candidate, _bounds = extract_table_from_position(sheet, r, c, claimed_cells)
            if len(candidate) > 1:
                found_tables.append(candidate)

    return found_tables

def extract_table_from_position(df, start_row, start_col, visited):
    """Carve out the table whose top-left corner is ``(start_row, start_col)``.

    The right edge is found first: the table keeps growing one column to the
    right while the next column contains any non-empty cell between
    ``start_row`` and the bottom of the sheet. The bottom edge is found next:
    the table keeps growing one row down while the next row contains any
    non-empty cell inside the column span just determined.

    Args:
        df: Whole-sheet DataFrame in which blank cells are empty strings.
        start_row: Row position of the top-left cell.
        start_col: Column position of the top-left cell.
        visited: Set of ``(row, col)`` tuples; every cell of the detected
            rectangle is added to it in place.

    Returns:
        A tuple ``(table, (start_row, start_col, max_row, max_col))`` where
        ``table`` uses the rectangle's first row as column labels and no
        longer contains that row.
    """
    bottom_limit = len(df) - 1
    right_limit = len(df.columns) - 1

    # Grow to the right while the neighbouring column still carries data.
    right = start_col
    while right < right_limit and has_data_in_column(df, right + 1, start_row, len(df)):
        right += 1

    # Grow downwards while the neighbouring row carries data within the span.
    bottom = start_row
    while bottom < bottom_limit and has_data_in_row(df, bottom + 1, start_col, right + 1):
        bottom += 1

    # Claim every cell of the rectangle so the caller does not rescan it.
    visited.update(
        (r, c)
        for r in range(start_row, bottom + 1)
        for c in range(start_col, right + 1)
    )

    # Promote the first row to the header, then remove it from the body.
    block = df.iloc[start_row:bottom + 1, start_col:right + 1]
    block.columns = block.iloc[0]
    block = block.drop(block.index[0])

    return block, (start_row, start_col, bottom, right)

def has_data_in_column(df, col, start_row, end_row):
    """Tell whether column ``col`` has a non-empty cell in rows [start_row, end_row).

    ``end_row`` is clamped to the number of rows in ``df``. A cell counts as
    empty only when it equals the empty string.
    """
    stop = min(end_row, len(df))
    for r in range(start_row, stop):
        if df.iloc[r, col] != '':
            return True
    return False

def has_data_in_row(df, row, start_col, end_col):
    """Tell whether row ``row`` has a non-empty cell in columns [start_col, end_col).

    ``end_col`` is clamped to the number of columns in ``df``. A cell counts
    as empty only when it equals the empty string.
    """
    stop = min(end_col, len(df.columns))
    for c in range(start_col, stop):
        if df.iloc[row, c] != '':
            return True
    return False



#### Define DataFrame Conversion Function
- Converts extracted tables to cleaned DataFrames
- Normalizes column names (lowercase, non-alphanumeric characters replaced with underscores)
- Removes empty columns and auto-detects numeric columns
- Creates PySpark DataFrames for Databricks


In [0]:
def tables_to_dfs(tables):
    """Clean each extracted pandas table and convert it to a Spark DataFrame.

    For every table the function:
      1. casts all cells to strings,
      2. normalises the column labels (lowercased and stripped, every
         non-alphanumeric character replaced by ``_``, trailing underscores
         removed, runs of underscores collapsed),
      3. drops columns whose cells are all empty strings,
      4. converts columns containing at least one parseable number to a
         numeric dtype (nullable ``Int64`` when every parsed value is whole,
         float otherwise).

    Relies on a global ``spark`` object being available in the notebook.

    Args:
        tables: Iterable of pandas DataFrames.

    Returns:
        A list with one Spark DataFrame per input table, in the same order.
    """
    import re

    def _normalise(label):
        # Same three substitutions, in the same order, as the original one-liner.
        name = re.sub(r'[^a-zA-Z0-9]', '_', str(label).lower().strip())
        name = re.sub(r'[^a-zA-Z0-9]+$', '', name)
        return re.sub(r'_+', '_', name)

    tables_dfs = []
    for table in tables:
        # Convert all columns to string to avoid type issues
        table_str = table.astype(str)
        table_str.columns = [_normalise(col) for col in table_str.columns]

        # Remove empty columns. The explicit copy makes the result an
        # independent frame, so the column assignments below never write into
        # a slice of another frame (avoids SettingWithCopyWarning).
        table_str = table_str.loc[:, (table_str != '').any(axis=0)].copy()

        # Convert numeric columns
        # NOTE(review): a column is converted as soon as one value parses as a
        # number, so any non-numeric entries in that column become null -
        # confirm this is intended for mixed text/number columns.
        for col in table_str.columns:
            numeric_col = pd.to_numeric(table_str[col], errors='coerce')
            if numeric_col.isna().all():
                continue  # nothing numeric here; keep the strings

            # Whole-number test via modulo instead of int(x): int() raises
            # OverflowError on infinite values, whereas inf % 1 is NaN and
            # simply fails the comparison.
            if (numeric_col.dropna() % 1 == 0).all():
                table_str[col] = numeric_col.astype('Int64')  # Nullable integer (capital I)
            else:
                table_str[col] = numeric_col

        tables_dfs.append(spark.createDataFrame(table_str))
    return tables_dfs

#### Extract Tables from Excel File
- Reads the NHMACS/NMACS schizophrenia Excel file and extracts all tables from all sheets
- Currently limited to first sheet with `break` statement


In [0]:
# Get all sheet names first
excel_file_path = '/Volumes/mc/teva/files/YW_NHMACS_NMACS_MEPS_SCHIZ_CLEAN.xlsx'
tables = []

# Open the workbook in a `with` block so its file handle is closed when the
# cell finishes, and hand the already-open workbook to the detector so the
# file is not reopened for every sheet.
with pd.ExcelFile(excel_file_path) as xl_file:
    for sheet_name in xl_file.sheet_names:
        sheet_tables = detect_and_extract_tables(xl_file, sheet_name)
        print(sheet_name)
        tables.extend(sheet_tables)
        break  # keep only first sheet : Report_Format

#### Convert Tables to DataFrames
- Processes all extracted tables through the cleaning and conversion pipeline
- Returns a list of PySpark DataFrames


In [0]:
# Clean every extracted pandas table and turn it into a Spark DataFrame.
tables_dfs = tables_to_dfs(tables)
# Number of tables converted (last expression of the cell).
len(tables_dfs)

#### Display All DataFrames
- Iterates through all converted DataFrames, displaying sample data and schema information for each table


In [0]:
# Render each converted table and print its schema for a visual check.
for df in tables_dfs:
    df.display()
    df.printSchema()

#### Define Table Names
- Creates a list of descriptive table names that will be used to save each DataFrame to the catalog
- Names indicate: survey type (NHMACS vs NMACS), scope (national vs regional), condition (schizophrenia), and imputation status (imputed vs unimputed race/ethnicity)


In [0]:
# Target table names, generated from their components.
# Order (outermost loop first): plain vs. SAS-raw suffix, national vs.
# regional scope, unimputed vs. imputed, then NHMACS vs. NMACS.
table_names = [
    f'{survey}_{scope}_schiz_avg_annual_freq_race_ethnic_{status}{suffix}'
    for suffix in ('', '_SAS_raw_format')
    for scope in ('national', 'regional')
    for status in ('unimputed', 'imputed')
    for survey in ('NHMACS', 'NMACS')
]

#### Save All Tables to Delta
- Writes each DataFrame to a corresponding Delta table in the mc.teva catalog with the defined names


In [0]:
# Names are matched to DataFrames purely by position, so a count mismatch
# would either fail midway through the writes or leave tables saved under the
# wrong name. Check up front, before anything is written.
if len(tables_dfs) != len(table_names):
    raise ValueError(
        f"Extracted {len(tables_dfs)} tables but {len(table_names)} names are defined; "
        "refusing to save because names and tables would be misaligned."
    )

for name, df in zip(table_names, tables_dfs):
    df.write.format("delta").mode("overwrite").saveAsTable(f'mc.teva.{name}')

#### Define Clean Table Names
- Creates a subset list containing only the final cleaned table names (excluding SAS raw format versions)


In [0]:
# Final (non-raw) table names, generated from their components.
# Order (outermost loop first): national vs. regional scope, unimputed vs.
# imputed, then NHMACS vs. NMACS.
table_names_clean = [
    f'{survey}_{scope}_schiz_avg_annual_freq_race_ethnic_{status}'
    for scope in ('national', 'regional')
    for status in ('unimputed', 'imputed')
    for survey in ('NHMACS', 'NMACS')
]

#### Define Raw Format Table Names
- Creates a list of table names that were in SAS raw format (to be cleaned up/removed)


In [0]:
# SAS raw-format table names, generated from their components.
# Order (outermost loop first): national vs. regional scope, unimputed vs.
# imputed, then NHMACS vs. NMACS.
table_names_raw = [
    f'{survey}_{scope}_schiz_avg_annual_freq_race_ethnic_{status}_SAS_raw_format'
    for scope in ('national', 'regional')
    for status in ('unimputed', 'imputed')
    for survey in ('NHMACS', 'NMACS')
]

#### Drop Raw Format Tables
- Removes the SAS raw format tables from the catalog as they're not needed for analysis


In [0]:
# Drop each raw-format table; IF EXISTS makes the statement a no-op for tables
# that are already gone, so the cell can be re-run.
for name in table_names_raw:
    spark.sql(f"DROP TABLE IF EXISTS mc.teva.{name}")

#### Save Table Names Reference
- Creates a reference table storing all the clean table names for easy iteration and documentation


In [0]:
import pandas as pd

# One-column frame (`table_name`) listing the clean table names.
table_names_df = pd.DataFrame({'table_name': table_names_clean})
# Persist it as a Delta table, replacing any previous contents.
spark.createDataFrame(table_names_df).write.format("delta").mode("overwrite").saveAsTable("mc.teva.table_names")

#### View Table Names Reference
- Displays the reference table containing all clean table names


In [0]:
%sql
select * from mc.teva.table_names

#### Display All Tables (Verification)
- Loops through all table names and displays sample data and schema for verification purposes


In [0]:
# Read the list of table names back from the saved reference table.
table_names_df = spark.table("mc.teva.table_names").toPandas()
for name in table_names_df['table_name']:
    print(name)
    df = spark.table(f"mc.teva.{name}")
    # Show the first 5 rows and the schema of each table.
    df.show(5)
    df.printSchema()

#### Build Context String for AI
- Creates a comprehensive context string containing all table schemas and sample data
- Used as input for AI-based documentation generation


In [0]:
import contextlib
import io
import sys

# Collect one text chunk per table and join once at the end.
context_parts = []
table_names_df = spark.table("mc.teva.table_names").toPandas()
for name in table_names_df['table_name']:
    df = spark.table(f"mc.teva.{name}")

    # Capture show() and printSchema() output. redirect_stdout restores the
    # real stdout even if one of the calls raises, which the previous manual
    # swap of sys.stdout did not guarantee.
    buffer = io.StringIO()
    with contextlib.redirect_stdout(buffer):
        df.show(5)
        df.printSchema()

    # Same layout as before: table name, sample rows, schema, blank line.
    context_parts.append(f"\nmc.teva.{name}\n{buffer.getvalue()}\n")

context = "".join(context_parts)

print(context)

#### Install Databricks SDK
- Installs the Databricks SDK with OpenAI integration for accessing AI models


In [0]:
pip install databricks-sdk[openai]

#### Restart Python Runtime
- Restarts the Python interpreter to ensure the newly installed library is properly loaded


In [0]:
# NOTE(review): restarting the Python process presumably discards every
# variable defined in earlier cells (e.g. `context`), so those cells would
# need to be re-run before the cells below - confirm against the Databricks
# documentation for restartPython.
dbutils.library.restartPython()

#### Generate Table Documentation with AI
- Uses Databricks Claude Sonnet 4 to automatically generate comprehensive documentation for all tables
- Includes explanation of acronyms (NHMACS, NMACS), complete table descriptions, and detailed column descriptions
- Output is formatted as Python dictionary structure


In [0]:
from databricks.sdk import WorkspaceClient

# This cell consumes `context`, which is built in an earlier cell. Fail with
# an actionable message if it is missing (e.g. after the Python restart above)
# instead of a bare NameError deep inside the request construction.
if "context" not in globals():
    raise NameError(
        "`context` is not defined; re-run the 'Build Context String for AI' "
        "cell before running this one."
    )

w = WorkspaceClient()
openai_client = w.serving_endpoints.get_open_ai_client()

# Prompt sent ahead of the table schemas and sample rows.
instructions = """
I will provide you with a list of tables and their schemas. For each table, provide a short description for each table. 
The description should 
- Explain every Acronym used in the table name.
- be complete and exhaustive. 
- be written in a way that is accessible to a broad audience, including those without any background. 
- NOT analyse the data. Just provide a description of the content of the table and explain what it contains
- inside the text use ` instead of ' for quotation marks.
ONLY OUTPUT allowed from your part should be formatted like this: 

table_descriptions = [
    {
        "table_name": "enter table name here",
        "table_description": "enter description here",
        "columns": {
            "column_1_name": "enter column 1 description here",
            "column_2_name": "enter column 2 description here"
            # etc...
        }
    },
    # etc...
]
    
    here are the tables and their schemas : """

# NOTE(review): max_tokens caps the length of the reply; 5000 may truncate the
# output when many tables are described - confirm it is sufficient.
response = openai_client.chat.completions.create(
    model="databricks-claude-sonnet-4",
    messages=[
      {"role": "user", "content": [
          {"type": "text", "text": f"{instructions}{context}"},
          ]}
    ],
    max_tokens=5000
)

# Text of the first (only requested) completion.
table_descriptions = response.choices[0].message.content

print(table_descriptions)

#### Define Table Descriptions (Manual)
- Manually defines the complete table and column descriptions in Python dictionary format
- Hardcoded from AI generation in previous step for reproducibility


In [0]:
table_descriptions = [
    {
        "table_name": "mc.teva.NHMACS_national_schiz_avg_annual_freq_race_ethnic_unimputed",
        "table_description": "This table contains national-level statistics from the NHMACS (National Hospital Medical Care Survey) study focusing on schizophrenia patients. The data shows average annual frequency distributions by race and ethnicity categories where missing race/ethnicity data has NOT been statistically filled in (`unimputed`). `Schiz` is an abbreviation for schizophrenia, a mental health condition. The table presents weighted frequency counts, statistical measures of variability, and reliability indicators for different racial groups in the dataset.",
        "columns": {
            "race_unimputed": "The racial category of patients as originally reported in the data, without any statistical estimation for missing values. Categories include `White Only`, `Black/African American`, `Asian Only`, `Native Hawaiian/Other Pacific Islander`, and `Blank` for missing data.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each racial category after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each racial category represents out of the total sample, showing the distribution across different racial groups.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NMACS_national_schiz_avg_annual_freq_race_ethnic_unimputed",
        "table_description": "This table contains national-level statistics from the NMACS (National Medical Care Survey) study focusing on schizophrenia patients. Similar to the NHMACS table, this presents average annual frequency distributions by race and ethnicity categories where missing data has NOT been statistically filled in (`unimputed`). `Schiz` is an abbreviation for schizophrenia. The table shows how schizophrenia cases are distributed across different racial groups at the national level.",
        "columns": {
            "race_unimputed": "The racial category of patients as originally reported in the data, without any statistical estimation for missing values. Categories include `White Only`, `Black/African American`, `Asian Only`, `Native Hawaiian/Other Pacific Islander`, and `Blank` for missing data.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each racial category after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each racial category represents out of the total sample, showing the distribution across different racial groups.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NHMACS_national_schiz_avg_annual_freq_race_ethnic_imputed",
        "table_description": "This table contains national-level statistics from the NHMACS (National Hospital Medical Care Survey) study focusing on schizophrenia patients. Unlike the unimputed version, this data shows average annual frequency distributions by race and ethnicity categories where missing race/ethnicity data HAS been statistically estimated and filled in (`imputed`). `Schiz` is an abbreviation for schizophrenia. The categories combine both race and ethnicity information (such as `Non-Hispanic White` and `Hispanic`) rather than race alone.",
        "columns": {
            "race_ethnicity_imputed": "The combined race and ethnicity category where missing values have been statistically estimated and filled in. Categories include `Non-Hispanic White`, `Non-Hispanic Black`, `Hispanic`, `Non-Hispanic Other`, and `Total` for the overall sum.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each race/ethnicity category after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each race/ethnicity category represents out of the total sample, showing the distribution across different groups.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NMACS_national_schiz_avg_annual_freq_race_ethnic_imputed",
        "table_description": "This table contains national-level statistics from the NMACS (National Medical Care Survey) study focusing on schizophrenia patients. This version shows average annual frequency distributions by race and ethnicity categories where missing race/ethnicity data HAS been statistically estimated and filled in (`imputed`). `Schiz` is an abbreviation for schizophrenia. The data combines both race and ethnicity information to provide a more complete picture of the demographic distribution.",
        "columns": {
            "race_ethnicity_imputed": "The combined race and ethnicity category where missing values have been statistically estimated and filled in. Categories include `Non-Hispanic White`, `Non-Hispanic Black`, `Hispanic`, `Non-Hispanic Other`, and `Total` for the overall sum.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each race/ethnicity category after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each race/ethnicity category represents out of the total sample, showing the distribution across different groups.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NHMACS_regional_schiz_avg_annual_freq_race_ethnic_unimputed",
        "table_description": "This table contains regional-level statistics from the NHMACS (National Hospital Medical Care Survey) study focusing on schizophrenia patients. The data shows average annual frequency distributions broken down by both geographic region and race/ethnicity categories where missing race/ethnicity data has NOT been statistically filled in (`unimputed`). `Schiz` is an abbreviation for schizophrenia. This allows for comparison of schizophrenia case distributions across different regions of the country (such as Northeast, South, Midwest, West).",
        "columns": {
            "region": "The geographic region of the United States where the cases were recorded, such as `Northeast`, `South`, `Midwest`, or `West`.",
            "race_unimputed": "The racial category of patients as originally reported in the data, without any statistical estimation for missing values. Categories include `White Only`, `Black/African American`, `Asian Only`, `Native Hawaiian/Other Pacific Islander`, and `Blank` for missing data.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each region-race combination after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each race category represents within that specific region, showing the racial distribution within each geographic area.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NMACS_regional_schiz_avg_annual_freq_race_ethnic_unimputed",
        "table_description": "This table contains regional-level statistics from the NMACS (National Medical Care Survey) study focusing on schizophrenia patients. The data shows average annual frequency distributions broken down by both geographic region and race/ethnicity categories where missing race/ethnicity data has NOT been statistically filled in (`unimputed`). `Schiz` is an abbreviation for schizophrenia. This provides insight into how schizophrenia cases are distributed across different racial groups within specific regions of the country.",
        "columns": {
            "region": "The geographic region of the United States where the cases were recorded, such as `Northeast`, `South`, `Midwest`, or `West`.",
            "race_unimputed": "The racial category of patients as originally reported in the data, without any statistical estimation for missing values. Categories include `White Only`, `Black/African American`, `Asian Only`, `Native Hawaiian/Other Pacific Islander`, and `Blank` for missing data.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each region-race combination after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each race category represents within that specific region, showing the racial distribution within each geographic area.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NHMACS_regional_schiz_avg_annual_freq_race_ethnic_imputed",
        "table_description": "This table contains regional-level statistics from the NHMACS (National Hospital Medical Care Survey) study focusing on schizophrenia patients. The data shows average annual frequency distributions broken down by both geographic region and race/ethnicity categories where missing race/ethnicity data HAS been statistically estimated and filled in (`imputed`). `Schiz` is an abbreviation for schizophrenia. This version provides a more complete demographic picture by combining race and ethnicity information and filling in missing data statistically.",
        "columns": {
            "region": "The geographic region of the United States where the cases were recorded, such as `Northeast`, `South`, `Midwest`, or `West`.",
            "race_ethnicity_imputed": "The combined race and ethnicity category where missing values have been statistically estimated and filled in. Categories include `Non-Hispanic White`, `Non-Hispanic Black`, `Hispanic`, `Non-Hispanic Other`, and `Total` for the regional sum.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each region-race/ethnicity combination after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each race/ethnicity category represents within that specific region, showing the demographic distribution within each geographic area.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    },
    {
        "table_name": "mc.teva.NMACS_regional_schiz_avg_annual_freq_race_ethnic_imputed",
        "table_description": "This table contains regional-level statistics from the NMACS (National Medical Care Survey) study focusing on schizophrenia patients. The data shows average annual frequency distributions broken down by both geographic region and race/ethnicity categories where missing race/ethnicity data HAS been statistically estimated and filled in (`imputed`). `Schiz` is an abbreviation for schizophrenia. This allows for comprehensive analysis of schizophrenia case patterns across different regions and demographic groups with more complete data coverage.",
        "columns": {
            "region": "The geographic region of the United States where the cases were recorded, such as `Northeast`, `South`, `Midwest`, or `West`.",
            "race_ethnicity_imputed": "The combined race and ethnicity category where missing values have been statistically estimated and filled in. Categories include `Non-Hispanic White`, `Non-Hispanic Black`, `Hispanic`, `Non-Hispanic Other`, and `Total` for the regional sum.",
            "wgtfreq": "Weighted frequency - the estimated number of cases in each region-race/ethnicity combination after applying statistical weights to make the sample representative of the broader population.",
            "stddev": "Standard deviation - a measure of how much the weighted frequency values vary or spread out from the average, indicating the precision of the estimate.",
            "rowpercent": "The percentage that each race/ethnicity category represents within that specific region, showing the demographic distribution within each geographic area.",
            "rowstderr": "Row standard error - a measure of uncertainty in the percentage calculation, indicating how precise the percentage estimate is.",
            "flag_reliability": "A reliability indicator where 1 typically means the estimate meets reliability standards and 0 means it may be unreliable due to small sample size or high variability."
        }
    }
]

#### Apply Documentation to Tables
- Iterates through all table descriptions and applies them to the actual Delta tables using SQL COMMENT statements
- Adds both table-level and column-level documentation


In [0]:
def _sql_string_literal(text):
    """Return ``text`` wrapped as a single-quoted SQL string literal.

    Backslashes and single quotes are backslash-escaped so free text (for
    example an apostrophe in a description) cannot end the literal early.
    NOTE(review): assumes the default Spark SQL parser behaviour in which
    backslash escapes inside string literals are honoured - confirm for this
    workspace.
    """
    escaped = str(text).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Attach the table-level and column-level descriptions to the catalog tables.
for table in table_descriptions:
    table_name = table["table_name"]
    table_desc = table["table_description"]

    table_literal = _sql_string_literal(table_desc)
    spark.sql(f"COMMENT ON TABLE {table_name} IS {table_literal}")

    for col_name, col_desc in table["columns"].items():
        col_literal = _sql_string_literal(col_desc)
        spark.sql(f"COMMENT ON COLUMN {table_name}.{col_name} IS {col_literal}")