# Restaurant Data Cleaning Notebook


In [2]:
# Step 1: Load and convert ARFF file to DataFrame
# ARFF to CSV converter adapted from MIT-licensed code by haloboy777
# Original: https://github.com/haloboy777/arff-to-csv
# License: MIT

import pandas as pd
from io import StringIO

def arff_to_csv(filepath):
    """
    Convert ARFF file to CSV format.
    
    Extracts column names from @ATTRIBUTE lines and data from @DATA section.
    Returns pandas DataFrame.
    
    Args:
        filepath: Path to .arff file
    
    Returns:
        DataFrame with restaurant data
    """
    data = False
    header = ""
    csv_content = []
    
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            
            # Extract column names from @ATTRIBUTE/@attribute lines
            if "@attribute" in line.lower():
                attributes = line.split()
                attri_idx = next(i for i, x in enumerate(attributes) 
                               if x.lower() == "@attribute")
                column_name = attributes[attri_idx + 1]
                header = header + column_name + ","
            
            # Start data collection after @DATA marker
            elif "@data" in line.lower():
                data = True
                header = header.rstrip(',') + '\n'
                csv_content.append(header)
            
            # Collect data rows
            elif data and line.strip():
                csv_content.append(line + '\n')
    
    # Parse CSV content into DataFrame
    csv_string = ''.join(csv_content)
    df = pd.read_csv(StringIO(csv_string), quotechar='"')
    
    return df

# Load restaurant data
df = arff_to_csv("datasets/restaurant/fz.arff")
print(f"Loaded {len(df)} restaurants")
print(f"Columns: {df.columns.tolist()}")
df.head()

Loaded 864 restaurants
Columns: ['name', 'addr', 'city', 'phone', 'type', 'class']


Unnamed: 0,name,addr,city,phone,type,class
0,arnie morton's of chicago,"""435 s. la cienega blv.""","""los angeles""","""310/246-1501""","""american""",'0'
1,arnie morton's of chicago,"""435 s. la cienega blvd.""","""los angeles""","""310-246-1501""","""steakhouses""",'0'
2,art's delicatessen,"""12224 ventura blvd.""","""studio city""","""818/762-1221""","""american""",'1'
3,art's deli,"""12224 ventura blvd.""","""studio city""","""818-762-1221""","""delis""",'1'
4,hotel bel-air,"""701 stone canyon rd.""","""bel air""","""310/472-1211""","""californian""",'2'


In [4]:
# Step 2: Extract area code from phone number
# Area code is the first 3 digits of the phone number (after country code)
# Phone numbers in ARFF have surrounding quotes that need to be removed

def extract_area_code(phone):
    """
    Extract area code (first 3 digits) from phone number.
    
    Handles various phone formats:
    - "310/246-1501" (with quotes)
    - 310/246-1501
    - 310-246-1501
    - 3102461501
    
    Args:
        phone: Phone number string
    
    Returns:
        3-digit area code string, or None if invalid
    """
    if pd.isna(phone) or phone is None:
        return None
    
    # Convert to string, strip whitespace and quotes
    phone_str = str(phone).strip().strip('"').strip("'")
    
    # Remove formatting characters
    phone_digits = phone_str.replace('-', '').replace('/', '').replace(' ', '')
    
    # Extract first 3 digits if we have enough digits
    if len(phone_digits) >= 3:
        area_code = phone_digits[:3]
        # Verify it's actually digits
        if area_code.isdigit():
            return area_code
    
    return None

# Apply area code extraction to all restaurants
df['area_code'] = df['phone'].apply(extract_area_code)

# Check results
print("Area Code Extraction Summary:")
print(f"  Total restaurants: {len(df)}")
print(f"  Restaurants with area code: {df['area_code'].notna().sum()}")
print(f"  Missing area codes: {df['area_code'].isna().sum()}")
print(f"\nArea Code Distribution:")
print(df['area_code'].value_counts().head(10))

# Show sample
print("\nSample data with area codes:")
print(df[['name', 'phone', 'area_code']].head(10))

Area Code Extraction Summary:
  Total restaurants: 864
  Restaurants with area code: 864
  Missing area codes: 0

Area Code Distribution:
area_code
212    338
415    148
404    114
310     93
702     63
213     55
818     25
770     15
718     11
805      1
Name: count, dtype: int64

Sample data with area codes:
                        name            phone area_code
0  arnie morton's of chicago   "310/246-1501"       310
1  arnie morton's of chicago   "310-246-1501"       310
2         art's delicatessen   "818/762-1221"       818
3                 art's deli   "818-762-1221"       818
4              hotel bel-air   "310/472-1211"       310
5              bel-air hotel   "310-472-1211"       310
6                 cafe bizou   "818/788-3536"       818
7                 cafe bizou   "818-788-3536"       818
8                  campanile   "213/938-1447"       213
9                  campanile   "213-938-1447"       213


In [5]:
# Step 3: Build similarity network
import Levenshtein

def string_distance(str_a, str_b):
    """
    Calculate edit distance between two strings using Levenshtein distance.
    
    Args:
        str_a: First string
        str_b: Second string
    
    Returns:
        Integer distance (0 = identical, higher = more different)
    """
    return Levenshtein.distance(str(str_a), str(str_b))

# Build similarity edges
similarity_edges = []
address_threshold = 7  # Maximum edit distance for "similar" addresses

print("Building similarity network...")
print(f"  Comparing {len(df)} restaurants")
print(f"  Address similarity threshold: {address_threshold}")

for i in range(len(df)):
    for j in range(i + 1, len(df)):  # Avoid duplicate pairs
        restaurant_i = df.iloc[i]
        restaurant_j = df.iloc[j]
        
        # Only create edge if same city
        if restaurant_i['city'] != restaurant_j['city']:
            continue
        
        # Calculate address similarity
        addr_distance = string_distance(restaurant_i['addr'], restaurant_j['addr'])
        
        # Create edge if addresses are similar enough
        if addr_distance < address_threshold:
            similarity_edges.append((i, j))

# Report results
print(f"\nSimilarity Network Summary:")
print(f"  Total restaurants: {len(df)}")
print(f"  Total similarity edges: {len(similarity_edges)}")
if len(df) > 0:
    avg_degree = 2 * len(similarity_edges) / len(df)
    print(f"  Average degree: {avg_degree:.2f}")

# Show sample edges
print(f"\nSample similarity edges (first 10):")
for idx, (i, j) in enumerate(similarity_edges[:10], 1):
    name_i = df.iloc[i]['name']
    name_j = df.iloc[j]['name']
    city = df.iloc[i]['city']
    addr_dist = string_distance(df.iloc[i]['addr'], df.iloc[j]['addr'])
    print(f"  {idx}. {name_i} <-> {name_j} (city: {city}, addr_dist: {addr_dist})")

Building similarity network...
  Comparing 864 restaurants
  Address similarity threshold: 7

Similarity Network Summary:
  Total restaurants: 864
  Total similarity edges: 5307
  Average degree: 12.28

Sample similarity edges (first 10):
  1. arnie morton's of chicago <-> arnie morton's of chicago (city:  "los angeles", addr_dist: 1)
  2. arnie morton's of chicago <-> l'orangerie (city:  "los angeles", addr_dist: 5)
  3. arnie morton's of chicago <-> drai's (city:  "los angeles", addr_dist: 4)
  4. arnie morton's of chicago <-> l'orangerie (city:  "los angeles", addr_dist: 4)
  5. arnie morton's of chicago <-> drai's (city:  "los angeles", addr_dist: 3)
  6. art's delicatessen <-> art's deli (city:  "studio city", addr_dist: 0)
  7. art's delicatessen <-> pinot bistro (city:  "studio city", addr_dist: 3)
  8. art's delicatessen <-> sushi nozawa (city:  "studio city", addr_dist: 3)
  9. art's deli <-> pinot bistro (city:  "studio city", addr_dist: 3)
  10. art's deli <-> sushi nozawa (

In [6]:
# Step 4: Save cleaned restaurant data and similarity edges

from pathlib import Path
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

# Save restaurant catalog (id, name, area_code, address, city)
restaurants_file = Path("datasets/temp") / f"restaurants_{timestamp}.txt"
with open(restaurants_file, "w", encoding="utf-8") as f:
    f.write("id\tname\tarea_code\taddr\tcity\n")  # Header
    for idx, row in df.iterrows():
        f.write(f"{idx}\t{row['name']}\t{row['area_code']}\t{row['addr']}\t{row['city']}\n")

# Save similarity network (pairs of restaurants that should have same area code)
similarities_file = Path("datasets/temp") / f"restaurant_similarities_{timestamp}.txt"
with open(similarities_file, "w", encoding="utf-8") as f:
    for i, j in similarity_edges:
        f.write(f"({i},{j})\n")

print(f"\nSaved cleaned data:")
print(f"  Restaurants: {restaurants_file}")
print(f"  Similarities: {similarities_file}")
print(f"\nSummary:")
print(f"  Total restaurants: {len(df)}")
print(f"  Total similarity edges: {len(similarity_edges)}")
print(f"  Unique area codes: {df['area_code'].nunique()}")


Saved cleaned data:
  Restaurants: datasets\temp\restaurants_20260107-125146.txt
  Similarities: datasets\temp\restaurant_similarities_20260107-125146.txt

Summary:
  Total restaurants: 864
  Total similarity edges: 5307
  Unique area codes: 11
