# IRIS Data Analysis - Structural Schema & Relationship Mapping

This notebook focuses on identifying the database schema, Primary Keys (PK), and Foreign Keys (FK) to determine table relationships and isolate irrelevant (junk) files.

In [None]:
import pandas as pd
import glob
import os
import itertools

# Lift pandas' column display limit so wide tables are shown in full
# when previewed later in the notebook.
pd.set_option('display.max_columns', None)

## 1. Load and Clean Data
Load every CSV file in the dataset folder, drop the `index` column if present, and strip surrounding whitespace from the column names.

In [None]:
DATA_PATH = 'Sales Dataset'


def load_csv_table(path):
    """Read one CSV file into a DataFrame with cleaned-up column labels.

    The file is decoded as UTF-8 first; if that fails with a
    ``UnicodeDecodeError`` it is re-read as ISO-8859-1.

    Column labels are stripped of surrounding whitespace *before* looking
    for the ``index`` column, so a padded header such as ``' index '`` is
    dropped as well. Letter case of the labels is left untouched.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, without any column labelled ``index``.

    Raises:
        Whatever ``pd.read_csv`` raises for an unreadable or malformed file,
        other than the ``UnicodeDecodeError`` handled by the fallback.
    """
    try:
        df = pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding='ISO-8859-1')

    # Strip first: the 'index' test below must see the cleaned labels.
    df.columns = [c.strip() for c in df.columns]

    # Drop 'index' column if it exists
    if 'index' in df.columns:
        df = df.drop(columns=['index'])

    return df


# glob makes no ordering promise; sorting keeps the load order (and every
# later report that iterates `dataframes`) the same on every machine.
csv_files = sorted(glob.glob(os.path.join(DATA_PATH, "*.csv")))
dataframes = {}

for file in csv_files:
    filename = os.path.basename(file)
    try:
        df = load_csv_table(file)
    except Exception as e:
        # Best-effort exploration: report the broken file and keep going.
        print(f"Error loading {filename}: {e}")
    else:
        dataframes[filename] = df
        print(f"Loaded {filename}: {df.shape}")

## 2. Identify Primary Keys (PK)
A Primary Key must be unique and non-null. We check every column of every dataframe; only single-column candidates are detected, so composite keys will not appear here.

In [None]:
# For each loaded file, collect the columns that could act as a
# single-column key: every value distinct and none missing.
potential_pks = {}

print("--- Potential Primary Keys ---")
for name, df in dataframes.items():
    pks = [
        col
        for col in df.columns
        if df[col].is_unique and df[col].notna().all()
    ]
    potential_pks[name] = pks
    print(f"{name}: {pks}")

## 3. Identify Relationships (Foreign Keys)
We look for columns whose names match (ignoring letter case) in two tables and whose values overlap.

In [None]:
print("--- Relationship Mapping (Shared Columns) ---")
# Every unordered pair of loaded files is compared exactly once.
file_pairs = itertools.combinations(dataframes.keys(), 2)
# One (file, file) entry per confirmed shared column, so a pair linked by
# several columns appears several times.
connections = []

for name1, name2 in file_pairs:
    # Map lower-cased label -> original label so 'SKU' and 'Sku' pair up.
    # Only letter case is normalised; 'SKU' vs 'SKU Code' is NOT matched.
    # Built from the column Index in order, so if one table has two labels
    # differing only by case, the right-most one is used every time.
    cols1_lower = {c.lower(): c for c in dataframes[name1].columns}
    cols2_lower = {c.lower(): c for c in dataframes[name2].columns}

    # Sorted so the printed report is identical from run to run.
    common_lower = sorted(cols1_lower.keys() & cols2_lower.keys())

    for c_lower in common_lower:
        c1 = cols1_lower[c_lower]
        c2 = cols2_lower[c_lower]

        # Verify content overlap to confirm it's a real relationship.
        # NOTE: values are compared as loaded, so an id read as the number 1
        # in one file and the text '1' in the other does not count as overlap.
        vals1 = set(dataframes[name1][c1].dropna().unique())
        vals2 = set(dataframes[name2][c2].dropna().unique())

        overlap = vals1 & vals2
        if overlap:
            print(f"{name1} ({c1}) <-> {name2} ({c2}) | Overlap: {len(overlap)}")
            connections.append((name1, name2))
        else:
            print(f"{name1} ({c1}) <-> {name2} ({c2}) | No content overlap (False Positive)")

## 4. Isolate Junk Tables
Tables that have NO connections to others are likely junk or standalone reference files.

In [None]:
# Any file that appears on either side of a detected relationship is
# considered part of the core schema.
connected_files = {table for pair in connections for table in pair}

# Whatever was loaded but never linked to another file is left over.
all_files = set(dataframes)
junk_files = all_files.difference(connected_files)

print("\n--- Connected Files (Core Schema) ---")
for linked in connected_files:
    print(linked)

print("\n--- Isolated Files (Potential Junk) ---")
for orphan in junk_files:
    print(orphan)
    print(f"Preview of {orphan}:")
    # Show the first rows so the file can be judged by eye.
    display(dataframes[orphan].head())