In [3]:
import sys
from pathlib import Path
sys.path.append('../src')

import pandas as pd
import numpy as np
import networkx as nx
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
import random
from sklearn.model_selection import train_test_split

from config import config, hardware_config

In [4]:
# Both input tables live in the same directory; the file names themselves
# come from the project config object.
DATA_DIR = Path("../data")
listings_df = pd.read_csv(DATA_DIR / config.listings_file)
duplicates_df = pd.read_csv(DATA_DIR / config.duplicates_file)

In [5]:
# Rebuild connected components graph from EDA: every labelled duplicate pair
# becomes an undirected edge, so one connected component is one group of
# listings that are (transitively) marked as duplicates of each other.
G = nx.Graph()
# Feed the two ID columns straight into the graph. Unlike iterrows(), this
# does not build a Series per row and does not upcast the IDs to a common
# per-row dtype when duplicates_df carries additional columns.
G.add_edges_from(zip(duplicates_df['listing_id_1'], duplicates_df['listing_id_2']))

# Get connected components (clusters).
# Nodes are only ever added through edges, so a component of size 1 can only
# come from a row whose two IDs are identical (a self-loop).
clusters = list(nx.connected_components(G))
cluster_sizes = [len(cluster) for cluster in clusters]

print(f"\nFound {len(clusters)} duplicate clusters")
print(f"Cluster size distribution: {Counter(cluster_sizes)}")

# Sort clusters by size (largest first) for investigation
clusters_sorted = sorted(clusters, key=len, reverse=True)
print(f"\nLargest clusters: {[len(c) for c in clusters_sorted[:10]]}")


Found 279 duplicate clusters
Cluster size distribution: Counter({2: 98, 3: 53, 4: 33, 6: 27, 5: 16, 7: 12, 8: 11, 10: 6, 9: 6, 11: 4, 12: 3, 13: 3, 1: 2, 18: 2, 14: 1, 22: 1, 24: 1})

Largest clusters: [24, 22, 18, 18, 14, 13, 13, 13, 12, 12]


In [6]:
def investigate_cluster(cluster, listings_df, cluster_id, cv_threshold=0.3, max_room_counts=2):
    """Print a diagnostic report for one duplicate cluster and judge its consistency.

    Parameters
    ----------
    cluster : collection
        Listing IDs that belong to the cluster.
    listings_df : pandas.DataFrame
        Listings table. The columns ``listing_id``, ``current_price``,
        ``surface_m2``, ``room_count``, ``floor``, ``floor_count`` and
        ``description`` are read.
    cluster_id : int
        Label used only in the printed header.
    cv_threshold : float, default 0.3
        A price or surface coefficient of variation above this value marks
        the cluster as suspicious.
    max_room_counts : int, default 2
        More distinct room counts than this marks the cluster as suspicious.

    Returns
    -------
    bool
        True when none of the checks flags the cluster. False when the
        cluster is suspicious or when none of its IDs is present in
        ``listings_df``.
    """
    def _coefficient_of_variation(values):
        """Return std / mean, treating the undefined cases as 0 (or inf)."""
        if len(values) < 2:
            return 0  # a single value has no spread to measure
        mean = values.mean()
        std = values.std()
        if mean == 0:
            # Avoid 0/0 (NaN plus a RuntimeWarning): identical zero values
            # are perfectly consistent, anything else is unbounded variation.
            return 0 if std == 0 else float('inf')
        return std / mean

    print(f"CLUSTER #{cluster_id} - Size: {len(cluster)} duplicates")

    # Get cluster listings
    cluster_listings = listings_df[listings_df['listing_id'].isin(cluster)]

    if len(cluster_listings) == 0:
        print("No listings found for this cluster (data quality issue)")
        return False

    print(f"\n Found {len(cluster_listings)} actual listings in data")

    # Compute each statistic once and reuse it for the report and the verdict
    prices = cluster_listings['current_price'].dropna()
    surfaces = cluster_listings['surface_m2'].dropna()
    price_cv = _coefficient_of_variation(prices)
    surface_cv = _coefficient_of_variation(surfaces)

    print(f"\nPRICE ANALYSIS:")
    if len(prices) > 0:
        print(f"   Price range: €{prices.min():,.0f} - €{prices.max():,.0f}")
    if len(prices) > 1:  # std of a single value is NaN, so skip the line
        print(f"   Price std: €{prices.std():,.0f} ({100*price_cv:.1f}% CV)")

    print(f"\nSURFACE ANALYSIS:")
    if len(surfaces) > 0:
        print(f"   Surface range: {surfaces.min():.0f}m² - {surfaces.max():.0f}m²")
    if len(surfaces) > 1:
        print(f"   Surface std: {surfaces.std():.1f}m² ({100*surface_cv:.1f}% CV)")

    print(f"\nPROPERTY CHARACTERISTICS:")
    room_counts = cluster_listings['room_count'].dropna().unique()
    floors = cluster_listings['floor'].dropna().unique()
    floor_counts = cluster_listings['floor_count'].dropna().unique()

    print(f"   Room counts: {sorted(room_counts) if len(room_counts) > 0 else 'No data'}")
    print(f"   Floors: {sorted(floors) if len(floors) > 0 else 'No data'}")
    print(f"   Floor counts: {sorted(floor_counts) if len(floor_counts) > 0 else 'No data'}")

    # Sample descriptions (first 150 characters of up to three listings)
    descriptions = cluster_listings['description'].dropna()
    print(f"\nDESCRIPTION SAMPLES:")
    for i, desc in enumerate(descriptions.head(3)):
        print(f"   Sample {i+1}: {desc[:150]}...")

    # Quality assessment: any single failed check makes the cluster suspicious
    price_flag = price_cv > cv_threshold
    surface_flag = surface_cv > cv_threshold
    room_flag = len(room_counts) > max_room_counts
    is_suspicious = price_flag or surface_flag or room_flag

    print(f"\nQUALITY ASSESSMENT:")
    print(f"   Price CV: {price_cv:.3f} {'⚠️' if price_flag else '✅'}")
    print(f"   Surface CV: {surface_cv:.3f} {'⚠️' if surface_flag else '✅'}")
    print(f"   Room diversity: {'⚠️' if room_flag else '✅'}")
    print(f"   Overall: {'SUSPICIOUS' if is_suspicious else '✅ LEGITIMATE'}")

    return not is_suspicious


In [7]:
# Run the detailed report on the five largest clusters and file each one
# under the bucket that matches the verdict it gets back.
legitimate_clusters, suspicious_clusters = [], []

for rank, members in enumerate(clusters_sorted[:5], start=1):
    if investigate_cluster(members, listings_df, rank):
        legitimate_clusters.append(members)
    else:
        suspicious_clusters.append(members)

print(f"\nINVESTIGATION SUMMARY:")
print(f"   Legitimate clusters: {len(legitimate_clusters)}")
print(f"   Suspicious clusters: {len(suspicious_clusters)}")

CLUSTER #1 - Size: 24 duplicates

 Found 24 actual listings in data

PRICE ANALYSIS:
   Price range: €259,900 - €269,900
   Price std: €4,945 (1.9% CV)

SURFACE ANALYSIS:
   Surface range: 82m² - 82m²
   Surface std: 0.0m² (0.0% CV)

PROPERTY CHARACTERISTICS:
   Room counts: [4.0]
   Floors: [3.0]
   Floor counts: [3.0]

DESCRIPTION SAMPLES:
   Sample 1: * EXCLUSIVTE / CONFLANS* ! SECTEUR de FIN D'OISE, situé à deux pas GARE RER (800 mètres) proche commodités, sur secteur PRISE et RESIDENTIEL.!! dans R...
   Sample 2: * EXCLUSIVTE / CONFLANS* ! SECTEUR de FIN D'OISE, situé à deux pas GARE RER (800 mètres) proche commodités, sur secteur PRISE et RESIDENTIEL.!! dans R...
   Sample 3: * EXCLUSIVTE / CONFLANS* ! SECTEUR de FIN D'OISE, situé à deux pas GARE RER (800 mètres) proche commodités, sur secteur PRISE et RESIDENTIEL.!! dans R...

QUALITY ASSESSMENT:
   Price CV: 0.019 ✅
   Surface CV: 0.000 ✅
   Room diversity: ✅
   Overall: ✅ LEGITIMATE
CLUSTER #2 - Size: 22 duplicates

 Found 22 

In [8]:
def assess_cluster_quality(cluster, listings_df, max_price_cv=0.4, max_surface_cv=0.3,
                           max_room_counts=2, max_cluster_size=10):
    """Assess if a cluster is suitable for training data.

    The checks run in a fixed order and the first failure decides the reason.

    Parameters
    ----------
    cluster : collection
        Listing IDs that belong to the cluster.
    listings_df : pandas.DataFrame
        Listings table. The columns ``listing_id``, ``current_price``,
        ``surface_m2`` and ``room_count`` are read.
    max_price_cv : float, default 0.4
        Largest accepted coefficient of variation (std / mean) of the price.
    max_surface_cv : float, default 0.3
        Largest accepted coefficient of variation of the surface.
    max_room_counts : int, default 2
        Largest accepted number of distinct room counts.
    max_cluster_size : int, default 10
        Largest accepted number of IDs in ``cluster``.

    Returns
    -------
    tuple[bool, str]
        ``(True, "legitimate")`` when every check passes, otherwise
        ``(False, reason)`` with reason one of ``"insufficient_data"``,
        ``"price_inconsistent"``, ``"surface_inconsistent"``,
        ``"room_inconsistent"`` or ``"too_large"``.
    """
    def _coefficient_of_variation(values):
        """Return std / mean, treating the undefined cases as 0 (or inf)."""
        if len(values) < 2:
            return 0  # a single value has no spread to measure
        mean = values.mean()
        std = values.std()
        if mean == 0:
            # Avoid 0/0 (NaN plus a RuntimeWarning): identical zero values
            # are perfectly consistent, anything else is unbounded variation.
            return 0 if std == 0 else float('inf')
        return std / mean

    cluster_listings = listings_df[listings_df['listing_id'].isin(cluster)]

    # Fewer than two matching rows means there is no pair to learn from
    if len(cluster_listings) < 2:
        return False, "insufficient_data"

    # Price consistency check
    if _coefficient_of_variation(cluster_listings['current_price'].dropna()) > max_price_cv:
        return False, "price_inconsistent"

    # Surface consistency check
    if _coefficient_of_variation(cluster_listings['surface_m2'].dropna()) > max_surface_cv:
        return False, "surface_inconsistent"

    # Room count consistency (nunique ignores missing values)
    if cluster_listings['room_count'].nunique() > max_room_counts:
        return False, "room_inconsistent"

    # Cluster size limit (avoid overly complex clusters); measured on the ID
    # collection itself, not on the rows found in listings_df
    if len(cluster) > max_cluster_size:
        return False, "too_large"

    return True, "legitimate"


In [9]:
clean_clusters = []
filtered_stats = Counter()  # outcome reason -> number of clusters

print("🔍 Filtering all clusters for training data quality...")

for cluster in clusters:
    is_clean, reason = assess_cluster_quality(cluster, listings_df)
    filtered_stats[reason] += 1

    if is_clean:
        clean_clusters.append(cluster)

print(f"\nFILTERING RESULTS:")
for reason, count in filtered_stats.items():
    result = "Good" if reason == "legitimate" else "Not good"
    print(f"   {result} {reason}: {count} clusters")

# Guard the ratio so an empty cluster list does not raise ZeroDivisionError
filter_rate = 100 * (1 - len(clean_clusters) / len(clusters)) if clusters else 0.0

print(f"\nTRAINING DATA SUMMARY:")
print(f"   Total original clusters: {len(clusters)}")
print(f"   Clean clusters for training: {len(clean_clusters)}")
print(f"   Filter rate: {filter_rate:.1f}%")

# Calculate clean training pairs: a cluster of n listings yields n*(n-1)/2
# unordered pairs, so count them directly instead of materialising them
total_clean_pairs = sum(len(cluster) * (len(cluster) - 1) // 2 for cluster in clean_clusters)
print(f"   Available positive pairs: {total_clean_pairs:,}")

🔍 Filtering all clusters for training data quality...

FILTERING RESULTS:
   Good legitimate: 260 clusters
   Not good insufficient_data: 2 clusters
   Not good too_large: 15 clusters
   Not good price_inconsistent: 2 clusters

TRAINING DATA SUMMARY:
   Total original clusters: 279
   Clean clusters for training: 260
   Filter rate: 6.8%
   Available positive pairs: 2,060


  surface_cv = surfaces.std() / surfaces.mean()


In [11]:
def create_positive_pairs(clean_clusters):
    """Build every within-cluster pair and remember which cluster produced it.

    Returns two parallel lists: the pairs (2-tuples of cluster members) and,
    for each pair, the position of its cluster in ``clean_clusters``.
    """
    pairs = []
    source_cluster = []

    for position, members in enumerate(clean_clusters):
        # Every unordered combination of two members is one positive example
        for pair in combinations(members, 2):
            pairs.append(pair)
            source_cluster.append(position)

    return pairs, source_cluster

positive_pairs, pair_cluster_labels = create_positive_pairs(clean_clusters)

n_positive = len(positive_pairs)
print(f"Generated {n_positive:,} positive training pairs")
print(f"Average pairs per cluster: {n_positive/len(clean_clusters):.1f}")

# Tabular form of the pairs: one row per pair, label fixed to 1, plus the
# index of the cluster the pair was generated from
positive_df = pd.DataFrame({
    'listing_id_1': [first for first, _ in positive_pairs],
    'listing_id_2': [second for _, second in positive_pairs],
    'label': 1,
    'cluster_id': pair_cluster_labels
})

print(f"Positive pairs DataFrame shape: {positive_df.shape}")
print(positive_df.head())

Generated 2,060 positive training pairs
Average pairs per cluster: 7.9
Positive pairs DataFrame shape: (2060, 4)
   listing_id_1  listing_id_2  label  cluster_id
0      64728971      65459581      1           0
1      64728971      64721495      1           0
2      65459581      64721495      1           0
3      98429480      98430083      1           1
4     117949690     117274350      1           2


In [12]:
def create_negative_pairs(positive_pairs, listings_df, ratio=3, exclude_pairs=None, random_state=None):
    """
    Generate negative pairs with random sampling strategy.

    Two distinct listing IDs are drawn uniformly at random until
    ``len(positive_pairs) * ratio`` unique pairs have been collected or the
    attempt budget (ten times that target) is exhausted, so fewer pairs than
    requested may be returned.

    - positive_pairs: known duplicate pairs; never emitted as negatives
    - listings_df: DataFrame whose ``listing_id`` column is the sampling pool
    - ratio: negative to positive ratio (3:1 recommended)
    - exclude_pairs: optional extra pairs that must not be sampled, e.g.
      duplicate pairs that were left out of ``positive_pairs``
    - random_state: optional seed; when given, sampling uses a private
      ``random.Random`` and is reproducible, otherwise the global ``random``
      module state is used

    Returns a list of ``(id1, id2)`` tuples.
    """
    print(f"Creating negative samples (ratio={ratio}:1)")

    # Order-independent lookup of pairs that must not become negatives
    forbidden = {tuple(sorted(pair)) for pair in positive_pairs}
    print(f"Avoiding {len(forbidden):,} existing positive pairs")
    if exclude_pairs is not None:
        forbidden.update(tuple(sorted(pair)) for pair in exclude_pairs)

    # De-duplicate the pool: a repeated listing_id could otherwise be drawn
    # twice by random.sample and yield a pair of a listing with itself
    all_listings = listings_df['listing_id'].drop_duplicates().tolist()
    print(f"Total available listings for sampling: {len(all_listings):,}")

    rng = random if random_state is None else random.Random(random_state)

    target = len(positive_pairs) * ratio
    max_attempts = target * 10  # Safety limit
    negative_pairs = []
    attempts = 0

    if len(all_listings) < 2:
        # random.sample needs at least two candidates to form a pair
        print("Not enough distinct listings to form a pair")
        target = 0

    while len(negative_pairs) < target and attempts < max_attempts:
        attempts += 1

        # random sampling from all listings
        id1, id2 = rng.sample(all_listings, 2)

        pair_key = tuple(sorted([id1, id2]))
        if pair_key not in forbidden:
            negative_pairs.append((id1, id2))
            forbidden.add(pair_key)  # Prevent duplicates in negatives too

    print(f"Generated {len(negative_pairs):,} negative pairs in {attempts:,} attempts")
    if attempts:  # nothing to report (and nothing to divide by) otherwise
        print(f"Success rate: {100*len(negative_pairs)/attempts:.1f}%")
    return negative_pairs

# Sample the negatives; the negative:positive ratio is taken from the config
negative_pairs = create_negative_pairs(
    positive_pairs,
    listings_df,
    ratio=config.negative_sampling_ratio,
)

# Same column layout as the positive frame, with label 0
negative_df = pd.DataFrame({
    'listing_id_1': [first for first, _ in negative_pairs],
    'listing_id_2': [second for _, second in negative_pairs],
    'label': 0,
    'cluster_id': -1  # No cluster for negatives
})

print(f"Negative pairs DataFrame shape: {negative_df.shape}")

Creating negative samples (ratio=3:1)
Avoiding 2,060 existing positive pairs
Total available listings for sampling: 1,428
Generated 6,180 negative pairs in 6,214 attempts
Success rate: 99.5%
Negative pairs DataFrame shape: (6180, 4)


In [13]:
# Stack positives on top of negatives and renumber the rows from zero
training_pairs_df = pd.concat([positive_df, negative_df], ignore_index=True)

n_total = len(training_pairs_df)
n_pos = len(positive_df)
n_neg = len(negative_df)

print(f"\nFINAL TRAINING DATASET:")
print(f"   Total pairs: {n_total:,}")
print(f"   Positive pairs: {n_pos:,} ({100*n_pos/n_total:.1f}%)")
print(f"   Negative pairs: {n_neg:,} ({100*n_neg/n_total:.1f}%)")


FINAL TRAINING DATASET:
   Total pairs: 8,240
   Positive pairs: 2,060 (25.0%)
   Negative pairs: 6,180 (75.0%)


In [14]:
## 6. Cluster-Aware Train/Test Split

def cluster_aware_split(training_pairs_df, test_size=0.2, random_state=42):
    """
    Split pairs into train and test so that no positive cluster is on both sides.

    The distinct ``cluster_id`` values of the positive pairs (``label == 1``)
    are divided with ``train_test_split`` and every non-negative pair follows
    its cluster. Negative pairs (``label == 0``) have no cluster; each one is
    sent to the test set independently with probability ``test_size``.
    NOTE(review): the leakage guarantee therefore covers positive pairs only;
    a listing can still occur in negative pairs on both sides.

    Parameters
    ----------
    training_pairs_df : pandas.DataFrame
        Must contain the columns ``label`` and ``cluster_id``. It is not
        modified.
    test_size : float, default 0.2
        Passed to ``train_test_split`` for the clusters and used as the test
        probability for negative pairs.
    random_state : int, default 42
        Seed for the cluster split and for the global ``random`` module,
        which is reseeded before the negative pairs are assigned.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(train_df, test_df)``, copies of the matching input rows.
    """
    print(f"Performing cluster-aware train/test split (test_size={test_size})")

    # Get unique clusters (only for positive pairs)
    positive_mask = training_pairs_df['label'] == 1
    unique_clusters = training_pairs_df.loc[positive_mask, 'cluster_id'].unique()

    print(f"Found {len(unique_clusters)} unique clusters to split")

    # Split clusters into train/test
    train_clusters, test_clusters = train_test_split(
        unique_clusters,
        test_size=test_size,
        random_state=random_state
    )

    print(f"   Train clusters: {len(train_clusters)}")
    print(f"   Test clusters: {len(test_clusters)}")

    # Cluster-based assignment for every row, computed in one vectorised pass
    # on a private mask so the caller's DataFrame is left untouched
    in_test = training_pairs_df['cluster_id'].isin(test_clusters).to_numpy(copy=True)

    # Negative pairs are overridden with a random assignment. One draw per
    # negative row, in row order, after seeding the global RNG for
    # reproducibility.
    negative_mask = (training_pairs_df['label'] == 0).to_numpy()
    random.seed(random_state)
    draws = np.array([random.random() for _ in range(int(negative_mask.sum()))])
    in_test[negative_mask] = draws < test_size

    # Create train/test DataFrames
    train_df = training_pairs_df[~in_test].copy()
    test_df = training_pairs_df[in_test].copy()

    print(f"\nSPLIT RESULTS:")
    print(f"   Train set: {len(train_df):,} pairs")
    print(f"   Test set: {len(test_df):,} pairs")

    # Verify no cluster overlap among the positive pairs
    train_pos_clusters = set(train_df[train_df['label']==1]['cluster_id'].unique())
    test_pos_clusters = set(test_df[test_df['label']==1]['cluster_id'].unique())
    overlap = train_pos_clusters.intersection(test_pos_clusters)

    print(f"   Cluster overlap: {len(overlap)} ({'NONE' if len(overlap)==0 else 'FOUND'})")

    # Label distribution
    print(f"\nLABEL DISTRIBUTION:")
    print("   Train set:")
    print(f"     Positive: {len(train_df[train_df['label']==1]):,}")
    print(f"     Negative: {len(train_df[train_df['label']==0]):,}")
    print("   Test set:")
    print(f"     Positive: {len(test_df[test_df['label']==1]):,}")
    print(f"     Negative: {len(test_df[test_df['label']==0]):,}")

    return train_df, test_df

# Run the split with the test fraction and the seed taken from the config
split_fraction = config.train_test_split
split_seed = config.random_state
train_pairs, test_pairs = cluster_aware_split(
    training_pairs_df, test_size=split_fraction, random_state=split_seed
)

Performing cluster-aware train/test split (test_size=0.2)
Found 260 unique clusters to split
   Train clusters: 208
   Test clusters: 52

SPLIT RESULTS:
   Train set: 6,677 pairs
   Test set: 1,563 pairs
   Cluster overlap: 0 (NONE)

LABEL DISTRIBUTION:
   Train set:
     Positive: 1,742
     Negative: 4,935
   Test set:
     Positive: 318
     Negative: 1,245


In [15]:
# Create output directory (and any missing parent directories)
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Save training pairs
train_pairs.to_csv(output_dir / "train_pairs.csv", index=False)
test_pairs.to_csv(output_dir / "test_pairs.csv", index=False)

# Save cluster information for reference. cluster_id is the position of the
# cluster in clean_clusters.
cluster_info = pd.DataFrame({
    'cluster_id': range(len(clean_clusters)),
    'cluster_size': [len(cluster) for cluster in clean_clusters],
    # The lists are written to the CSV as their text representation. Convert
    # NumPy scalars to built-in Python values first so the cell contains a
    # plain literal such as "[1, 2]" instead of "[np.int64(1), np.int64(2)]",
    # which is how NumPy >= 2 prints its scalars.
    'listing_ids': [
        [member.item() if hasattr(member, 'item') else member for member in cluster]
        for cluster in clean_clusters
    ]
})
cluster_info.to_csv(output_dir / "clean_clusters.csv", index=False)

print("💾 SAVED TRAINING DATA:")
print(f"   {output_dir}/train_pairs.csv ({len(train_pairs):,} pairs)")
print(f"   {output_dir}/test_pairs.csv ({len(test_pairs):,} pairs)")
print(f"   {output_dir}/clean_clusters.csv ({len(clean_clusters)} clusters)")

💾 SAVED TRAINING DATA:
   ../data/processed/train_pairs.csv (6,677 pairs)
   ../data/processed/test_pairs.csv (1,563 pairs)
   ../data/processed/clean_clusters.csv (260 clusters)
