# In [None]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz, process
from collections import Counter, defaultdict
import re
from string import capwords

# Load the raw inspection records into a DataFrame.
# The path is relative, so the CSV is looked up in the current working directory.
df = pd.read_csv('Food_Inspections_20250216.csv')

# 1. Enhanced Data Preprocessing with Typo Detection - Improved version
def detect_typos(text_series, min_word_length=4, min_count=10, similarity_threshold=90):
    """Build a ``{typo: correction}`` dictionary for the words of a text column.

    The result combines a hand-maintained table of known misspellings with
    candidates found by fuzzy-matching rare words against frequent ones.

    Args:
        text_series: pandas Series of free-text values; missing values are
            dropped before tokenising.
        min_word_length: Tokens shorter than this are ignored.
        min_count: A lower-cased word seen at least this many times is treated
            as correctly spelled ("common").
        similarity_threshold: Minimum fuzzywuzzy score (0-100) a rare word must
            reach against a common word to be reported as a typo of it.

    Returns:
        dict mapping a lower-cased typo to its lower-cased correction. The
        known-typo table is always included, even when the series is empty.
    """
    token_pattern = re.compile(r"\b[\w']+\b")

    # Count lower-cased tokens, skipping short tokens and pure numbers
    word_counts = Counter(
        token.lower()
        for text in text_series.dropna()
        for token in token_pattern.findall(str(text))
        if len(token) >= min_word_length and not token.isdigit()
    )
    common_words = {word for word, count in word_counts.items() if count >= min_count}

    # Known common typos in facility types
    common_typos = {
        'childern': 'children',
        'assissted': 'assisted',
        'restuarant': 'restaurant',
        'commisary': 'commissary',
        'convnience': 'convenience',
        'liquore': 'liquor',
        'facilty': 'facility',
        'nutriton': 'nutrition',
        'herbalcal': 'herbalife',
        'cafetaria': 'cafeteria',
        'poulty': 'poultry',
        'hooka': 'hookah'
    }

    potential_typos = {}
    if common_words:
        # A sorted list (instead of the set) makes the fuzzy search see the
        # candidates in the same order on every run; set order depends on
        # string hashing, which would make equal-score ties non-reproducible.
        candidates = sorted(common_words)
        for word in word_counts:
            if word in common_typos or word in common_words:
                continue
            matches = process.extract(word, candidates, limit=1)
            if not matches:
                continue
            best_word, best_score = matches[0][0], matches[0][1]
            # Additional validation - first letter should match
            if best_score >= similarity_threshold and word[0] == best_word[0]:
                potential_typos[word] = best_word

    return {**common_typos, **potential_typos}

# Build the typo dictionary from the raw 'Facility Type' column, then list
# every typo → correction pair (typo left-aligned in a 20-character field).
# typo_dict is kept at module level; preprocess_facility_type reads it later.
typo_dict = detect_typos(df['Facility Type'])
print("Detected Typos and Corrections:")
for typo, correction in typo_dict.items():
    print(f"{typo: <20} → {correction}")

# 2. Enhanced Preprocessing with Similar Terms - Updated to handle 4-digit codes
# Whole-word, case-insensitive replacements applied after typo correction.
# Order matters: entries are applied top to bottom.
_FACILITY_CORRECTIONS = tuple(
    (re.compile(rf'\b{wrong}\b', re.IGNORECASE), right)
    for wrong, right in (
        ('Childern', 'Children'),
        ('Tavern', 'Restaurant'),
        ('Grocery Store Store', 'Grocery Store'),
        ('Assissted', 'Assisted'),
        ('Restuarant', 'Restaurant'),
        ('Commisary', 'Commissary'),
        ('Hooka', 'Hookah'),
        ('Parlor', 'Shop'),
        ('Cart', 'Station'),
        ('Liqour', 'Liquor'),
        ('Children S Services Facility', "Children's Services Facility"),
    )
)

# Near-synonym phrases collapsed to one canonical label. Only the first
# pattern that matches is applied.
_FACILITY_SIMILAR_TERMS = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r'Hookah (Bar|Lounge)', 'Hookah Lounge'),
        (r'Ice Cream (Parlor|Shop|Store)', 'Ice Cream Shop'),
        (r'Hot Dog (Cart|Station)', 'Hot Dog Station'),
        # The separating space belongs to the optional group so that a
        # following word is not glued onto "Care".
        (r'Long[\s-]Term Care( Facility)?', 'Long Term Care'),
        (r'Day ?Care', 'Day Care'),
        (r'Banquet (Hall|Room|Facility)', 'Banquet Hall'),
        (r'Gas ?Station', 'Gas Station'),
        # Swallow an existing "User" so it is not duplicated.
        (r'Shared Kitchen( User)?', 'Shared Kitchen User'),
        (r'Mobile (Food|Frozen)', 'Mobile Food'),
    )
)


def preprocess_facility_type(text, typo_map=None):
    """Normalise one raw facility-type value.

    Steps: drop stand-alone 4-digit codes, tokenise into lower-case words
    (punctuation, including wrapping parentheses, is discarded), fix typos
    word by word, apply the manual corrections and the similar-term
    standardisation, and finally title-case the result.

    Args:
        text: Raw value; anything ``pd.isna`` considers missing yields NaN.
        typo_map: Optional ``{typo: correction}`` dictionary of lower-case
            words. When omitted, the module-level ``typo_dict`` is used.

    Returns:
        The cleaned, title-cased string, or ``np.nan`` when the input is
        missing or nothing is left after cleaning.
    """
    if pd.isna(text):
        return np.nan
    if typo_map is None:
        typo_map = typo_dict  # module-level dictionary built by detect_typos

    # Remove any 4-digit numbers completely
    text = re.sub(r'\b\d{4}\b', '', str(text).strip()).strip()

    # Tokenise in lower case; words containing an apostrophe are kept as-is so
    # possessives are not "corrected" into something else.
    tokens = re.findall(r"\b[\w']+\b", text.lower())
    text = ' '.join(
        token if "'" in token else typo_map.get(token, token)
        for token in tokens
    )
    if not text:
        return np.nan

    # Manually fix common case issues with apostrophes
    text = re.sub(r"Children'S", "Children's", text, flags=re.IGNORECASE)
    text = re.sub(r"Childrens", "Children's", text, flags=re.IGNORECASE)

    for pattern, right in _FACILITY_CORRECTIONS:
        text = pattern.sub(right, text)

    for pattern, replacement in _FACILITY_SIMILAR_TERMS:
        text, replaced = pattern.subn(replacement, text)
        if replaced:
            break  # Only apply one standardization

    # str.title() capitalises the letter after an apostrophe ("Children'S");
    # lower-case a single trailing letter again so possessives stay intact.
    titled = text.title()
    return re.sub(r"(?<=')[A-Z]\b", lambda m: m.group().lower(), titled)

# Apply preprocessing to every raw 'Facility Type' value and store the result
# in a new 'Facility_Type_Clean' column (missing or empty results become NaN).
df['Facility_Type_Clean'] = df['Facility Type'].apply(preprocess_facility_type)

# 3. Targeted Facility Type Consolidation (protected terms + regex mappings; no fuzzy matching is used in this step)

def consolidate_facility_types(series):
    """Collapse cleaned facility-type strings into canonical categories.

    Each value is resolved in this order:

    1. exact (case-insensitive) match with a protected term;
    2. a protected term contained in the text, longest term first, so that
       e.g. "Charter School Cafeteria" becomes "Charter School" rather than
       the generic "School";
    3. the first matching regex in ``general_mappings``;
    4. otherwise the normalised text itself (Roman numerals upper-cased,
       ``GROCERY/`` removed).

    Args:
        series: pandas Series of facility-type strings; missing values are
            passed through as NaN.

    Returns:
        pandas Series of the same length with the consolidated labels.
    """
    # Canonical labels that must come out exactly as spelled here
    protected_terms = (
        'School',
        'Grocery Store',
        'Charter School',
        'Private School',
        'Culinary School',
        'Pastry School',
        'Teaching School',
        "Children's Services Facility",
        'Long Term Care',
        'Ice Cream Shop',
        'Hookah Lounge',
        'Hot Dog Station',
        'Convenience Store',
        'Banquet Hall',
        'After School Program',
        'Liquor',
        'Dollar Store',
        'Drug Store',
        'Event Venue',
        'Fitness Center',
        'Gas Station',
    )
    exact_lookup = {term.lower(): term for term in protected_terms}
    # Longest first (stable for equal lengths) so specific terms win over
    # the generic ones they contain.
    # NOTE(review): the bare 'School' term still captures every remaining text
    # containing "school", so the school-specific regexes below that themselves
    # contain "School" are never reached - confirm that priority is intended.
    contained_lookup = sorted(exact_lookup.items(), key=lambda item: -len(item[0]))

    # General cases, tried in priority order (all case-insensitive)
    general_mappings = [
        # School-related mappings
        (r'Charter School.*', 'Charter School'),
        (r'After School (Care|Program)', 'After School Program'),
        (r'Before And After School Program', 'After School Program'),
        (r'.*Culinary.*', 'Culinary School'),  # Anything with "Culinary"

        # Day care mappings
        (r'DAY\s?CARE\s?2-14', 'Day Care 2 Yrs To 14 Yrs'),
        # Label now says 6 weeks, matching the "6 WKS" the pattern looks for
        (r'DAY\s?CARE\s?6\s?WKS-5\s?YRS', 'Day Care 6 Weeks To 5 Yrs'),
        (r'Day Care.*2.*6', 'Day Care (2 - 6 Years)'),
        (r'Day Care.*Under 2', 'Day Care (Under 2 Years)'),
        (r'Day Care.*Combo', 'Day Care Combo'),

        # Store types
        (r'Convenient Store', 'Convenience Store'),
        (r'Convenience(?! Store)', 'Convenience Store'),
        (r'.*Dollar.*', 'Dollar Store'),  # Anything with "Dollar"
        (r'.*Drug.*', 'Drug Store'),      # Anything with "Drug"

        # Event and fitness
        (r'.*Event.*', 'Event Venue'),
        (r'.*Fitness.*', 'Fitness Center'),

        # Gas stations
        (r'Gas\s?[Ss]tation', 'Gas Station'),
        (r'Gas Mini Mart', 'Gas Station'),

        # Other facility types
        (r'(Banquet|Banquet Dining)', 'Banquet Hall'),
        (r'TAVERN/RESTAURANT', 'Restaurant'),
        (r'TAVERN/LIQUOR', 'Liquor'),
        (r'Mobile.*Food.*Prepar', 'Mobile Prepared Food Vendor'),
        (r'Mobile.*Food.*Dispens', 'Mobile Food Dispenser'),
        (r'Long.*Term.*Care', 'Long Term Care'),
        (r'Ice Cream.*', 'Ice Cream Shop'),
        (r'Hookah.*', 'Hookah Lounge'),
        (r'Hot Dog.*', 'Hot Dog Station'),
        (r'Paleteria.*', 'Ice Cream Shop'),
        (r'(?<!\S)School(?!\S)', 'School'),
    ]
    compiled_mappings = [
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in general_mappings
    ]

    # Case-insensitive: title-cased input spells these "Ii", "Iii", "Iv"
    roman_numerals = re.compile(r'\b(II|III|IV)\b', re.IGNORECASE)
    # Handled here, case-insensitively, rather than as a general mapping:
    # a mapping to '' would turn the whole value into an empty label.
    grocery_prefix = re.compile(r'GROCERY/', re.IGNORECASE)

    def mapper(text):
        if pd.isna(text):
            return np.nan

        text = str(text).strip()
        text = roman_numerals.sub(lambda m: m.group(0).upper(), text)

        # Remove GROCERY/ unless that would leave nothing at all
        without_prefix = grocery_prefix.sub('', text).strip()
        if without_prefix:
            text = without_prefix

        lowered = text.lower()

        # 1. Exact protected term
        exact = exact_lookup.get(lowered)
        if exact is not None:
            return exact

        # 2. Protected term contained in the text, most specific first
        for term_lower, term in contained_lookup:
            if term_lower in lowered:
                return term

        # 3. General mappings in priority order
        for pattern, replacement in compiled_mappings:
            if pattern.search(text):
                return replacement

        # 4. No rule applied: keep the normalised text
        return text

    return series.apply(mapper)

# Keep only rows that still have a cleaned facility type. The explicit copy
# gives an independent frame, so adding a column raises no SettingWithCopyWarning.
df_clean = df[df['Facility_Type_Clean'].notna()].copy()
df_clean['Facility_Type_Final'] = consolidate_facility_types(
    df_clean['Facility_Type_Clean']
)

# 4. Generate Comparison Word Clouds - Fixed matplotlib warning
def generate_wordcloud(text, title, color_scheme, ax=None):
    """Draw a word cloud built from ``text`` onto a matplotlib axes.

    Args:
        text: String handed to ``WordCloud.generate``.
        title: Title shown above the image.
        color_scheme: Name passed to ``WordCloud`` as ``colormap``.
        ax: Axes to draw on; the current axes (``plt.gca()``) when omitted.
    """
    cloud_settings = {
        'width': 1000,
        'height': 600,
        'background_color': 'white',
        'max_words': 200,
        'colormap': color_scheme,
        'collocations': False,
    }
    cloud_image = WordCloud(**cloud_settings).generate(text)

    # Resolve the fallback axes only after the cloud has been generated
    target_ax = plt.gca() if ax is None else ax
    target_ax.imshow(cloud_image, interpolation='bilinear')
    target_ax.axis('off')
    target_ax.set_title(title, pad=20, fontsize=16)

# Flatten each column into one space-separated string for the word clouds
original_text = ' '.join(str(value) for value in df_clean['Facility Type'].dropna())
cleaned_text = ' '.join(str(value) for value in df_clean['Facility_Type_Final'].dropna())

# Side-by-side panels: raw labels on the left, consolidated labels on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
generate_wordcloud(original_text, "Original Facility Types", color_scheme='Reds', ax=ax1)
generate_wordcloud(cleaned_text, "Cleaned Facility Types", color_scheme='Greens', ax=ax2)
fig.tight_layout()
plt.show()

# 5. Top Facility Types Comparison
print("\nTop 20 Facility Types (Before vs After Cleaning)")
print("="*60)
original_counts = Counter(df_clean['Facility Type'].dropna().astype(str))
cleaned_counts = Counter(df_clean['Facility_Type_Final'].dropna().astype(str))

# Get top 20 from each
top_original = original_counts.most_common(20)
top_cleaned = cleaned_counts.most_common(20)

# Consolidation can leave fewer than 20 distinct cleaned types. The DataFrame
# constructor needs equally long columns, so pad the shorter side with blanks.
table_rows = max(len(top_original), len(top_cleaned))
padded_original = top_original + [('', '')] * (table_rows - len(top_original))
padded_cleaned = top_cleaned + [('', '')] * (table_rows - len(top_cleaned))

# Create comparison table
comparison_df = pd.DataFrame({
    'Original Type': [label for label, _ in padded_original],
    'Original Count': [total for _, total in padded_original],
    'Cleaned Type': [label for label, _ in padded_cleaned],
    'Cleaned Count': [total for _, total in padded_cleaned]
})

print(comparison_df.to_string(index=False))

# 6. Most Significant Changes
print("\nMost Impactful Standardizations:")
print("="*60)

# Collect (original label, final label, row count) for every raw label that
# ended up under a differently spelled final label.
changes = []
for typ in df_clean['Facility_Type_Final'].unique():
    if pd.notna(typ):
        original_variants = df_clean.loc[
            df_clean['Facility_Type_Final'] == typ, 'Facility Type'
        ].value_counts()
        changes.extend(
            (orig, typ, count)
            for orig, count in original_variants.items()
            if orig != typ
        )

# Largest groups first; the sort is stable, so ties keep collection order
changes.sort(key=lambda change: change[2], reverse=True)
for original, cleaned, count in changes[:20]:
    print(f"{original: <40} → {cleaned: <40} ({count} records)")

# 7. Similar Terms Analysis
print("\nSimilar Terms Consolidated:")
print("="*60)

# For each group, the spellings whose occurrences in the final labels are counted
similar_terms_groups = {
    'Hookah': ['Hooka Bar', 'Hookah Bar', 'Hooka Lounge', 'Hookah Lounge'],
    'Ice Cream': ['Ice Cream', 'Ice Cream Parlor', 'Ice Cream Shop', 'Ice Cream Store'],
    'Hot Dog': ['Hot Dog Cart', 'Hot Dog Station'],
    'Long Term Care': ['Long Term Care', 'Long-Term Care', 'Long Term Care Facility', 'Long-Term Care Facility'],
    'Banquet': ['Banquet', 'Banquet Hall', 'Banquet Room', 'Banquet Facility'],
    'Mobile Food': ['Mobile Food', 'Mobile Food Preparer', 'Mobile Food Dispenser', 'Mobile Food Vendor'],
    'School': ['School', 'Charter School', 'Private School', 'Culinary School', 'Elementary School', 'High School']
}

for group_name, terms in similar_terms_groups.items():
    print(f"\n{group_name} Group:")
    for term in terms:
        # Case-insensitive "contains" match; summing the boolean mask counts the rows
        count = int(
            df_clean['Facility_Type_Final']
            .str.contains(term, case=False, na=False)
            .sum()
        )
        print(f"  - {term: <30} → {count: >5} records")

# Persist the cleaned frame (index included) for later use
df_clean.to_csv('clean.csv')