In [1]:
import pandas as pd

# --- Read the raw export and discard the columns not used for FD/IND analysis ---
# errors="ignore" tolerates exports in which some of these columns are absent.
df = pd.read_csv("Food_Inspections_20250216.csv").drop(
    columns=["Violations", "Latitude", "Longitude", "Location"],
    errors="ignore",
)

# --- Keep only fully populated rows (a NaN in ANY remaining column drops the row) ---
df = df.dropna()

# --- Render float columns as integer-valued strings so they act as discrete labels ---
# astype(int) would raise on NaN; dropna() above has already removed them.
float_cols = df.select_dtypes(include=["float64"]).columns
df[float_cols] = df[float_cols].astype(int).astype(str)

# --- Normalise text: trim surrounding whitespace, then lowercase, per object column ---
object_cols = df.select_dtypes(include=["object"]).columns
for col in object_cols:
    trimmed = df[col].str.strip()
    df[col] = trimmed.str.lower()

# --- Persist the cleaned table for the FD/IND cells ---
df.to_csv("Food_Inspections_Cleaned.csv", index=False)


ModuleNotFoundError: No module named 'pandas'

In [None]:
import pandas as pd
import time
from tane import TANE, read_db

# === Step 1: Load cleaned dataset and prepare sample ===
df = pd.read_csv("Food_Inspections_Cleaned.csv")
columns_to_use = [
    "Inspection ID", "License #", "DBA Name", "Facility Type",
    "Risk", "City", "State", "Results"
]
# Only the first 2000 rows of the selected columns are handed to TANE.
df_sample = df[columns_to_use].head(2000)
sample_csv = "Food_Inspections_Sample.csv"
df_sample.to_csv(sample_csv, index=False)

# === Step 2: Load actual column names from saved CSV ===
# nrows=0 reads just the header row.
column_names = pd.read_csv(sample_csv, nrows=0).columns.tolist()

# === Step 3: Run TANE ===
T = read_db(sample_csv)
tane = TANE(T)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
tane.run()
end = time.perf_counter()

# === Step 4: Print results safely ===
print(f"\n Execution Time: {end - start:.2f} seconds")
print(f" {len(tane.rules)} FDs found:\n")

# Rules are treated as (lhs, rhs) pairs of positional column indices
# (NOTE(review): inferred from how they are indexed below; confirm against the
# tane module). A rule that points past the known columns cannot be named, so
# it is counted and reported instead of being dropped without trace.
n_columns = len(column_names)
skipped = 0
for lhs, rhs in tane.rules:
    if rhs < n_columns and all(i < n_columns for i in lhs):
        lhs_names = [column_names[i] for i in lhs]
        rhs_name = column_names[rhs]
        print(f"{', '.join(lhs_names)} → {rhs_name}")
    else:
        skipped += 1

if skipped:
    print(f"\n {skipped} FDs skipped (column index outside the sample's columns)")


In [None]:
import pandas as pd
import time
from itertools import combinations

def auto_fd_search(df, max_lhs=2):
    """Brute-force search for functional dependencies ``LHS -> RHS`` in ``df``.

    For every column taken as the right-hand side, every combination of
    exactly ``max_lhs`` of the other columns is tested as the left-hand side.
    Despite the parameter name, smaller left-hand sides are NOT tried; call
    the function once per size to cover them.

    A dependency holds when each distinct LHS value combination is paired
    with exactly one RHS value. Missing values count as ordinary values: rows
    with NaN in the LHS are kept in the check, and NaN in the RHS is a value
    of its own. (With pandas' defaults such rows are ignored, which reports
    dependencies that the data actually violates.)

    Args:
        df: pandas DataFrame to analyse.
        max_lhs: number of columns in each candidate left-hand side.

    Returns:
        List of ``(lhs, rhs)`` tuples, where ``lhs`` is a list of column
        names and ``rhs`` a single column name, ordered by RHS column and
        then by LHS combination. The runtime and every dependency found are
        also printed.
    """
    results = []
    # perf_counter() is monotonic, unlike time.time(), so the interval is reliable.
    start = time.perf_counter()
    columns = df.columns.tolist()

    for rhs in columns:
        lhs_candidates = [col for col in columns if col != rhs]
        for lhs in combinations(lhs_candidates, max_lhs):
            # dropna=False on both calls keeps rows with missing values in the
            # check; sort=False skips ordering the group keys, which the
            # max() below does not need.
            distinct_rhs = df.groupby(list(lhs), dropna=False, sort=False)[rhs].nunique(dropna=False)
            if distinct_rhs.max() == 1:
                results.append((list(lhs), rhs))

    end = time.perf_counter()
    print(f"\n max_lhs = {max_lhs} |  Runtime: {end - start:.2f} seconds")
    print(f" {len(results)} functional dependencies found:\n")

    for lhs, rhs in results:
        print(f"{', '.join(lhs)} → {rhs}")
    return results


In [None]:
df = pd.read_csv("Food_Inspections_Sample.csv")

# Left-hand sides of one, two and three columns (the three-column pass is
# optional and slower than the first two).
for lhs_size in (1, 2, 3):
    auto_fd_search(df, max_lhs=lhs_size)

# Four-column left-hand sides (optional, slower still). Kept as the cell's
# final bare expression so the notebook continues to echo its return value.
auto_fd_search(df, max_lhs=4)


In [None]:
import pandas as pd
import time
from itertools import combinations

# Step 1: Brute-force FD discovery
def auto_fd_search(df, max_lhs=2):
    """Brute-force search for functional dependencies ``LHS -> RHS`` in ``df``.

    Silent variant: nothing is printed. It reuses the name of the printing
    variant defined in an earlier cell and therefore replaces it once this
    cell has run.

    For every column taken as the right-hand side, every combination of
    exactly ``max_lhs`` of the other columns is tested as the left-hand side.
    Despite the parameter name, smaller left-hand sides are NOT tried.

    A dependency holds when each distinct LHS value combination is paired
    with exactly one RHS value. Missing values count as ordinary values: rows
    with NaN in the LHS are kept in the check, and NaN in the RHS is a value
    of its own. (With pandas' defaults such rows are ignored, which reports
    dependencies that the data actually violates.)

    Args:
        df: pandas DataFrame to analyse.
        max_lhs: number of columns in each candidate left-hand side.

    Returns:
        List of ``(lhs, rhs)`` tuples, where ``lhs`` is a list of column
        names and ``rhs`` a single column name, ordered by RHS column and
        then by LHS combination.
    """
    results = []
    columns = df.columns.tolist()
    for rhs in columns:
        lhs_candidates = [col for col in columns if col != rhs]
        for lhs in combinations(lhs_candidates, max_lhs):
            # dropna=False on both calls keeps rows with missing values in the
            # check; sort=False skips ordering the group keys, which the
            # max() below does not need.
            distinct_rhs = df.groupby(list(lhs), dropna=False, sort=False)[rhs].nunique(dropna=False)
            if distinct_rhs.max() == 1:
                results.append((list(lhs), rhs))
    return results

# Step 2: Filter for only minimal FDs
def filter_minimal_fds(fds):
    """Keep only the dependencies whose left-hand side is minimal.

    A dependency ``(lhs, rhs)`` is discarded when ``fds`` also contains a
    dependency with the same ``rhs`` whose left-hand side is a proper subset
    of ``lhs``. Left-hand sides are compared as sets, so two entries that
    list the same columns in a different order are treated as equal and never
    eliminate each other.

    Args:
        fds: iterable-once-safe sequence of ``(lhs, rhs)`` pairs, where
            ``lhs`` is a list of hashable column names and ``rhs`` is a
            hashable column name.

    Returns:
        A new list with the surviving ``(lhs, rhs)`` pairs, in their original
        order and with the original ``lhs`` list objects.
    """
    # Collect the left-hand sides per right-hand side once, so each dependency
    # is only compared against candidates that share its right-hand side.
    lhs_sets_by_rhs = {}
    for lhs, rhs in fds:
        lhs_sets_by_rhs.setdefault(rhs, []).append(frozenset(lhs))

    minimal = []
    for lhs, rhs in fds:
        lhs_set = frozenset(lhs)
        # `<` on sets is the proper-subset test: equal sets do not count.
        if not any(other < lhs_set for other in lhs_sets_by_rhs[rhs]):
            minimal.append((lhs, rhs))
    return minimal

# Step 3: Run across multiple LHS sizes and combine
def discover_minimal_fds(df, max_lhs=4):
    """Collect FDs for every LHS size from 1 to ``max_lhs`` and prune them.

    Runs ``auto_fd_search`` once per left-hand-side size, pools all results,
    passes the pool through ``filter_minimal_fds`` and prints the survivors.

    Args:
        df: DataFrame to analyse.
        max_lhs: largest left-hand-side size to search (inclusive).

    Returns:
        The list of ``(lhs, rhs)`` pairs returned by ``filter_minimal_fds``.
    """
    candidates = []
    for lhs_size in range(1, max_lhs + 1):
        print(f" Searching FDs with max_lhs = {lhs_size} ...")
        candidates += auto_fd_search(df, max_lhs=lhs_size)

    minimal = filter_minimal_fds(candidates)
    print(f"\n Final set of minimal FDs (after pruning redundant ones): {len(minimal)}\n")

    for determinant, dependent in minimal:
        print(f"{', '.join(determinant)} → {dependent}")
    return minimal

df = pd.read_csv("Food_Inspections_Sample.csv")

# perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
minimal_fds = discover_minimal_fds(df, max_lhs=5)
total_time = time.perf_counter() - start_time

print(f"\n Total Runtime (all steps): {total_time:.2f} seconds")


In [None]:
import pandas as pd

# Load the full cleaned dataset written by the cleaning cell (every row and
# column, not the 2000-row sample used for the FD searches).
df = pd.read_csv("Food_Inspections_Cleaned.csv")

# Function to find unary INDs using a value index
def find_unary_inds_via_index(df):
    """Find unary inclusion dependencies ``lhs ⊆ rhs`` between columns of ``df``.

    An inverted index maps every distinct non-null value to the set of
    columns that contain it. Column ``lhs`` is included in column ``rhs``
    when each distinct non-null value of ``lhs`` lists ``rhs`` in the index.
    A column with no non-null values is therefore reported as included in
    every other column.

    Args:
        df: DataFrame whose columns are compared pairwise.

    Returns:
        List of ``(lhs, rhs)`` column-name pairs, ordered by ``lhs`` and then
        ``rhs`` in column order; a column is never paired with itself.
    """
    # Distinct non-null values per column, computed once and reused below.
    distinct = {col: df[col].dropna().unique() for col in df.columns}

    # Inverted index: value -> set of columns in which it occurs.
    value_index = {}
    for col, values in distinct.items():
        for val in values:
            value_index.setdefault(val, set()).add(col)

    no_columns = frozenset()
    return [
        (lhs, rhs)
        for lhs, lhs_values in distinct.items()
        for rhs in distinct
        if lhs != rhs
        and all(rhs in value_index.get(val, no_columns) for val in lhs_values)
    ]

# Discover the unary INDs of the cleaned dataset, then list them one per line
inds = find_unary_inds_via_index(df)

print("Unary INDs found (LHS ⊆ RHS):")
for included, containing in inds:
    print(f"{included} ⊆ {containing}")
