Dataset: https://www.kaggle.com/datasets/wyattowalsh/basketball

In [None]:
# Install the Kaggle CLI used by the download command at the end of this cell.
!pip install kaggle
# NOTE(review): the code below imports the standard-library `sqlite3` module,
# not `pysqlite3`, so this install may be unnecessary — confirm before removing.
!pip install pysqlite3

import os
import sqlite3
import pandas as pd
import random

# Kaggle credentials, presumably read by the `kaggle` CLI call below. The "..."
# values are placeholders: fill in a real username and API key locally and do
# not commit them.
os.environ["KAGGLE_USERNAME"] = "..."
os.environ["KAGGLE_KEY"] = "..."

# Download the dataset archive into ./dataset, then extract it in the same
# directory.
!kaggle datasets download -d wyattowalsh/basketball -p dataset
!unzip dataset/basketball.zip -d dataset

In [None]:
def find_min_none_instance(table_instances, num_elements):
    """Return up to ``num_elements`` rows that have the fewest missing values.

    A value counts as missing when it is ``None`` or the empty string. Rows
    with the same number of missing values come back in random order, so
    repeated calls may surface different examples.

    Args:
        table_instances: Rows to choose from; each row is an iterable of
            column values. The input is left unmodified.
        num_elements: Maximum number of rows to return.

    Returns:
        A new list of at most ``num_elements`` rows, ordered from fewest to
        most missing values.
    """
    def missing_count(instance):
        return sum(1 for value in instance if value is None or value == "")

    # Shuffle a copy rather than the caller's list: the shuffle only exists to
    # randomize tie-breaking (the sort below is stable), and reordering the
    # argument in place would be a surprising side effect.
    candidates = list(table_instances)
    random.shuffle(candidates)
    candidates.sort(key=missing_count)
    return candidates[:num_elements]


def print_data(data_path: str, num_examples: int):
    """Print and return the schema and a few example rows for every table.

    For each table listed in ``sqlite_master`` the output contains the
    table's ``CREATE TABLE`` statement followed by ``num_examples`` rows
    chosen from a random sample of up to 1000 rows, preferring rows with the
    fewest ``None``/empty values.

    Args:
        data_path: Path to the SQLite database file.
        num_examples: Number of example rows to include per table.

    Returns:
        The full report as a single string (the same text that is printed).
    """
    output = []

    connection = sqlite3.connect(data_path)
    try:
        cursor = connection.cursor()
        cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()

        for table_name, table_sql in tables:
            # Identifiers cannot be bound as SQL parameters, so quote the name
            # (doubling embedded quotes) to keep names containing spaces,
            # keywords or quote characters from breaking the statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(
                f"SELECT * FROM {quoted_name} ORDER BY RANDOM() LIMIT 1000;"
            )
            random_instances = cursor.fetchall()

            examples = find_min_none_instance(random_instances, num_examples)

            output.append("Table:\n")
            output.append(table_sql)
            output.append("\nExamples:\n")
            for example in examples:
                output.append(str(example))
                output.append("\n")
            output.append("\n")
    finally:
        # Release the database handle even when a query above raises.
        connection.close()

    output_str = "".join(output)
    print(output_str)
    return output_str

def remove_tables(data_path: str, tables_to_remove: list):
    """Drop the given tables from a SQLite database.

    Tables that do not exist are skipped silently (``DROP TABLE IF EXISTS``).
    All drops are committed together once every statement has succeeded.

    Args:
        data_path: Path to the SQLite database file.
        tables_to_remove: Names of the tables to drop.
    """
    connection = sqlite3.connect(data_path)
    try:
        cursor = connection.cursor()
        for table in tables_to_remove:
            # Identifiers cannot be bound as SQL parameters, so quote the name
            # (doubling embedded quotes) instead of splicing it in raw.
            quoted_name = '"' + table.replace('"', '""') + '"'
            cursor.execute(f"DROP TABLE IF EXISTS {quoted_name};")
        connection.commit()
    finally:
        # Release the database handle even when a statement above raises.
        connection.close()


sql_data_path = "dataset/nba.sqlite"

# Tables to drop from the database before its structure is dumped.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge two table names into one
# non-existent name that DROP TABLE IF EXISTS then ignores.
to_remove = [
    'game_summary',
    'line_score',
    'other_stats',
    'inactive_players',
    'officials',
    'play_by_play',
    'draft_combine_stats',
    'team_info_common',
]

remove_tables(sql_data_path, to_remove)
print_output = print_data(sql_data_path, 2)

# Write with an explicit encoding so the dump does not depend on the platform
# default when rows contain non-ASCII text.
with open("sql_structure.txt", "w", encoding="utf-8") as file:
    file.write(print_output)