In [None]:
from difflib import SequenceMatcher as SM

def best_match(qmap, school_name, zipcode):
    """Find the candidate school in a zip code whose name best matches.

    Args:
        qmap: Mapping of zip code -> list of candidate dicts; each candidate
            is expected to carry a "school_name" entry.
        school_name: The name to look up.
        zipcode: Zip code used to select the candidate list from ``qmap``.

    Returns:
        ``{"best_match": <candidate dict>, "ratio": <float in (0, 1]>}`` for
        the first candidate with the highest similarity ratio, or ``None``
        when there is nothing usable to report: the zip code is unknown, the
        query name is empty, or no candidate shares anything with the name.
    """
    # An empty/None query cannot be compared meaningfully (and None would
    # make SequenceMatcher raise), so treat it as "no match".
    if not school_name:
        return None

    candidates = qmap.get(zipcode)
    if not candidates:
        return None

    best_ratio = 0
    best_school = None

    for school in candidates:
        candidate_name = school.get("school_name")
        # Skip candidates without a name instead of crashing on None.
        if not candidate_name:
            continue
        ratio = SM(None, school_name, candidate_name).ratio()
        # Strict '>' keeps the first candidate on ties.
        if ratio > best_ratio:
            best_ratio = ratio
            best_school = school

    # Never hand back a dict whose "best_match" is None: callers index into
    # it right after a truthiness check on the result.
    if best_school is None:
        return None
    return {"best_match": best_school, "ratio": best_ratio}


In [None]:
import hashlib

def generate_hash(input_string: str) -> str:
    """Return the SHA-256 digest of *input_string* as a lowercase hex string.

    The text is encoded as UTF-8 before hashing.
    """
    encoded = input_string.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

In [None]:
from dotenv import load_dotenv
import psycopg2
import os
import json
import csv

# Script: load reference schools from the database, fuzzy-match every school
# in schoolList.json against the reference schools sharing its zip code, and
# write the results to matches.csv plus an annotated schoolList_hashed.json.
load_dotenv()

db_password = os.getenv("DB_PASSWORD")
db_user = os.getenv("DB_USER")
db_name = os.getenv("DB_NAME")

connection = psycopg2.connect(
    dbname=db_name, user=db_user, password=db_password, host="localhost", port="5432"
)

# Always release the connection (and cursor), even if the query fails.
try:
    with connection.cursor() as cursor:
        # SFUSD = '68478'
        cursor.execute(
            "SELECT school_code, school_name, zip_code from entities where county_code = '38';"
        )
        results = cursor.fetchall()
finally:
    connection.close()

# Group the reference schools by zip code: zip -> list of candidate dicts.
qmap = {}
for scode, sname, szip in results:
    qmap.setdefault(szip, []).append({"school_code": scode, "school_name": sname})


#### read the json file
with open("schoolList.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# First row is the CSV header.
csv_output = [
    [
        "school_hash",
        "school_uid",
        "schoolLabel",
        "schoolCode",
        "zip",
        "match_school_id",
        "match_school_name",
        "match_ratio",
    ]
]
for school in data:
    label = school["schoolLabel"]

    # A missing key and an empty list are both "no geolocations"; indexing
    # [0] on an empty list would otherwise raise IndexError.
    geolocations = school.get("geolocations")
    if not geolocations:
        print(f"no geolocations for {label}")
        continue

    location = geolocations[0]["addressDetails"]
    if "PostalCode" not in location:
        print(f"no zip for {label}")
        continue

    # Keep only the part before the first "-" (drops a ZIP+4 style suffix).
    zip_code = location["PostalCode"].split("-")[0]
    school_code = school.get("schoolCode", "<missing>")

    best = best_match(qmap, label, zip_code)
    # best may be None (unknown zip) or may carry no matched school at all;
    # in both cases the match columns are left blank.
    matched = best["best_match"] if best else None
    if matched:
        match_school_id = matched["school_code"]
        match_school_name = matched["school_name"]
        match_ratio = best["ratio"]
    else:
        match_school_id = ""
        match_school_name = ""
        match_ratio = ""

    school_unique_id = f"{zip_code}-{label}"
    school_hash = generate_hash(school_unique_id)

    csv_output.append(
        [
            school_hash,
            school_unique_id,
            label,
            school_code,
            zip_code,
            match_school_id,
            match_school_name,
            match_ratio,
        ]
    )
    # Annotate the source record so the hashed JSON carries the same ids.
    school["school_hash"] = school_hash
    school["school_unique_id"] = school_unique_id
    print(f"added {school_unique_id} ({school_hash})")

# Write the CSV
with open("matches.csv", mode="w", newline="", encoding="utf-8") as file:
    csv.writer(file).writerows(csv_output)

print("Data written to CSV file")

# Write the JSON
with open("schoolList_hashed.json", "w", encoding="utf-8") as file:
    json.dump(data, file, indent=2)