In [2]:
import os
import csv
import json
from litellm import completion

In [None]:
def generate_column_description(column_name, column_data):
    """Ask the LLM to describe a single table column.

    The column name and its sample values are sent to the
    ``ollama/llama3`` model through LiteLLM's ``completion`` call, and the
    reply text is returned with surrounding whitespace removed.

    Args:
        column_name: Header name of the column.
        column_data: Sample values taken from that column.

    Returns:
        The generated description, or a placeholder sentence naming the
        column when the request or the reply parsing raises an
        ``Exception`` (the error is printed to stdout in that case).
    """
    instructions = "You are a helpful assistant that generates brief descriptions for database columns. Given a column name and sample data, provide a comprehensive description of what the column represents, the type of data it holds, and its potential values. Return only the description and nothing else. You do not need to include special formatting or the column name in the description."
    prompt = f"Name: {column_name} Data: {column_data}"
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": prompt},
    ]

    try:
        reply = completion(model="ollama/llama3", messages=conversation)
        # The generated text is the message content of the first choice.
        return reply['choices'][0]['message']['content'].strip()
    except Exception as e:
        # Best effort: one failed column must not abort the whole table.
        print(f"Error generating description for column {column_name}: {e}")
        return f"Description for {column_name} could not be generated."

def convert_csv_to_json(csv_file_path):
    """Load a CSV file into the table dictionary stored in the schema file.

    Args:
        csv_file_path: Path to a UTF-8 encoded CSV file whose first row
            holds the column names.

    Returns:
        A dict with the keys ``name`` (file name without its extension),
        ``columns`` (the header names), ``column_descriptions`` (one
        generated description per column), ``primary_key`` (the first
        column name, or ``None`` when the file has no header row) and
        ``data`` (every row as a list of values).
    """
    # Read everything first so the file is closed before the (slow)
    # description requests are made.
    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        rows = list(reader)
        # DictReader leaves fieldnames as None for a completely empty file.
        columns = list(reader.fieldnames or [])

    data = [list(row.values()) for row in rows]

    # Only the first rows are shown to the model as sample values.
    sample_rows = data[:10]
    column_descriptions = [
        generate_column_description(
            column, [row[index] for row in sample_rows]
        )
        for index, column in enumerate(columns)
    ]

    # Put data together including all relevant data.
    return {
        "name": os.path.splitext(os.path.basename(csv_file_path))[0],
        "columns": columns,
        "column_descriptions": column_descriptions,
        # The first column is treated as the key; an empty file has none.
        "primary_key": columns[0] if columns else None,
        "data": data,
    }

# Validate datasets: the CSV inputs are expected in ./Databases.
databases_dir = "./Databases"
if not os.path.isdir(databases_dir):
    # Raise a real exception; a bare `raise` here only produces
    # "RuntimeError: No active exception to reraise".
    raise FileNotFoundError(
        "Please add your files to a directory labelled 'Databases'."
    )

csv_files = [f for f in os.listdir(databases_dir) if f.endswith('.csv')]

if not csv_files:
    raise FileNotFoundError("No CSV files found in the directory.")

all_tables = []

# Convert every CSV file into its table dictionary, in listing order.
for csv_file in csv_files:
    csv_file_path = os.path.join(databases_dir, csv_file)
    print(f"Processing {csv_file_path}")

    table_data = convert_csv_to_json(csv_file_path)
    all_tables.append(table_data)

# Pick the lowest schema number whose output file does not exist yet.
x = 1
output_file_path = os.path.join("./", f"schema_{x}.json")
while os.path.exists(output_file_path):
    x += 1
    output_file_path = os.path.join("./", f"schema_{x}.json")

# The schema id uses the number zero-padded to at least three digits.
x_str = f"{x:03d}"

# Ask the model for a short domain label covering all converted tables.
system_message = "You are a helpful assistant that generates a domain based on data contained within a table. For example, a domain can be \"Video Games\", \"Flowers\", or \"Meteorological Patterns\". Given the information, you will return only the domain and nothing else. Keep the domain to 1-3 words at most."
# NOTE(review): this embeds every row of every table in the prompt;
# consider sending a sample if the model's input limit is exceeded.
user_message = f"Data: {all_tables}"

try:
    response = completion(
        model="ollama/llama3",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )

    # Extract the domain from the response
    domain = response['choices'][0]['message']['content'].strip()
except Exception as e:
    # Same best-effort policy as the column descriptions: a failed request
    # must not discard the tables that were already processed.
    print(f"Error generating domain: {e}")
    domain = "Unknown"

schema_data = {
    "schema_id": "schema_r" + x_str,
    "domain": domain,
    "tables": all_tables
}

# Write the combined schema next to the notebook as pretty-printed JSON.
with open(output_file_path, 'w', encoding='utf-8') as jsonfile:
    json.dump(schema_data, jsonfile, indent=4)

Processing ./Databases\datasetV2.csv
Processing ./Databases\GeneralEsportData.csv
Processing ./Databases\HistoricalEsportData.csv
Processing ./Databases\vgsales.csv
