Seongmin Hwang 20102127

Marion SCHMITT 25170158

Seungwon Jeon 16102288

**Data Science Practice : Project (MODEL COMPARISON VERSION)**

# Analyzing the Relationship between News Bias and Audience Influence

**Objective: Analyze the relationship between media bias and audience influence through multiple methodological approaches**

In [1]:
import pandas as pd 
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score, davies_bouldin_score

import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib text settings for Korean labels: select the 'Malgun Gothic' font
# family and turn off the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

## Note: Dictionary Construction

**This notebook assumes that bias dictionaries have already been created using different methods:**
- Method 1: N-gram (2,3) + CountVectorizer
- Method 2: TF-IDF + lexical_units (1-gram)
- Method 3: N-gram (2,3) + TF-IDF

**Input Data Expected:**
- `bias_scores_method1.csv` - Bias scores using N-gram CountVectorizer
- `bias_scores_method2.csv` - Bias scores using TF-IDF lexical_units
- `bias_scores_method3.csv` - Bias scores using N-gram TF-IDF
- `influence_data.csv` - Influence scores for media outlets

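Each bias score file must contain exactly two columns, in this order: `언론사` (media outlet name) and `Bias_Score`; the loader renames them by position. `influence_data.csv` must contain the columns `언론사` and `Influence_Score`.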

## 1. Load Pre-computed Bias Scores

In [2]:
# Load the pre-computed bias scores produced by each dictionary method,
# plus the influence scores they will be compared against.
# NOTE: Replace with actual file paths from your team
#
# Every bias file is renamed positionally to ('언론사', 'Bias_Score_MethodN'),
# so each file must hold exactly two columns in that order.

# Method 1: N-gram CountVectorizer (baseline)
bias_method1 = pd.read_csv('bias_scores_method1.csv')
bias_method1.columns = ['언론사', 'Bias_Score_Method1']

# Method 2: TF-IDF lexical_units
bias_method2 = pd.read_csv('bias_scores_method2.csv')
bias_method2.columns = ['언론사', 'Bias_Score_Method2']

# Method 3: N-gram TF-IDF (optional)
# Only a missing file means "method not available". Any other failure
# (malformed CSV, unexpected column count, interrupt) is a real error and
# must surface instead of being silently reported as an absent method.
try:
    bias_method3 = pd.read_csv('bias_scores_method3.csv')
except FileNotFoundError:
    print("Method 3 not available, proceeding with 2 methods")
    has_method3 = False
else:
    bias_method3.columns = ['언론사', 'Bias_Score_Method3']
    has_method3 = True

# Load influence data
influence_df = pd.read_csv('influence_data.csv')

print("Data loaded successfully")
print(f"Method 1 media count: {len(bias_method1)}")
print(f"Method 2 media count: {len(bias_method2)}")
print(f"Influence data count: {len(influence_df)}")

FileNotFoundError: [Errno 2] No such file or directory: 'bias_scores_method1.csv'

## 2. Merge All Data

In [None]:
# Build one comparison table: inner-join every available bias table on the
# outlet name, then attach the influence score. Inner joins keep only the
# outlets that appear in every input.
extra_bias_frames = [bias_method2]
if has_method3:
    extra_bias_frames.append(bias_method3)

df_comparison = bias_method1
for bias_frame in extra_bias_frames:
    df_comparison = df_comparison.merge(bias_frame, on='언론사', how='inner')

influence_subset = influence_df[['언론사', 'Influence_Score']]
df_comparison = df_comparison.merge(influence_subset, on='언론사', how='inner')

print(f"\nFinal dataset: {len(df_comparison)} media outlets")
print("\nColumns:", df_comparison.columns.tolist())
print("\nSample data:")
print(df_comparison.head())

# PART 1: Dictionary Method Comparison

## 3. Compare Correlations Across Methods

In [None]:
# Correlation between each method's bias score and the influence score
# (pandas Series.corr with its default settings).
method_columns = [
    ('Method 1 (N-gram CV)', 'Bias_Score_Method1'),
    ('Method 2 (TF-IDF LU)', 'Bias_Score_Method2'),
]
if has_method3:
    method_columns.append(('Method 3 (N-gram TF-IDF)', 'Bias_Score_Method3'))

influence_scores = df_comparison['Influence_Score']
correlations = {
    label: df_comparison[column].corr(influence_scores)
    for label, column in method_columns
}

banner = "=" * 70
print(banner)
print("CORRELATION COMPARISON")
print(banner)
for method, corr in correlations.items():
    magnitude = abs(corr)
    # Thresholds: |r| < 0.3 weak, |r| < 0.7 moderate, otherwise strong
    strength = "Weak" if magnitude < 0.3 else ("Moderate" if magnitude < 0.7 else "Strong")
    print(f"\n{method}")
    print(f"  Correlation: {corr:.4f}")
    print(f"  Strength: {strength}")

# The "best" method is the one with the largest absolute correlation;
# best_method is a (label, correlation) tuple.
best_method = max(correlations.items(), key=lambda item: abs(item[1]))
print(f"\n{banner}")
print(f"BEST METHOD (by correlation): {best_method[0]}")
print(f"Correlation: {best_method[1]:.4f}")
print(banner)

In [None]:
# One scatter panel per dictionary method, each with a fitted straight line
n_panels = len(correlations)
fig, axes = plt.subplots(1, n_panels, figsize=(6*n_panels, 5))
if n_panels == 1:
    # plt.subplots returns a bare Axes for a single panel; wrap it for iteration
    axes = [axes]

# Method labels and their bias columns, kept in the same order (reused by later cells)
methods_list = list(correlations.keys())
bias_cols = ['Bias_Score_Method1', 'Bias_Score_Method2']
if has_method3:
    bias_cols.append('Bias_Score_Method3')

influence_values = df_comparison['Influence_Score']
for ax, method, bias_col in zip(axes, methods_list, bias_cols):
    bias_values = df_comparison[bias_col]

    # Raw points
    ax.scatter(bias_values, influence_values,
               alpha=0.6, s=100, edgecolors='black')

    # Degree-1 least-squares fit drawn across the observed bias range
    trend = np.poly1d(np.polyfit(bias_values, influence_values, 1))
    x_line = np.linspace(bias_values.min(), bias_values.max(), 100)
    ax.plot(x_line, trend(x_line), 'r--', linewidth=2,
            label=f'r = {correlations[method]:.3f}')

    ax.set_xlabel('Bias Score', fontsize=12)
    ax.set_ylabel('Influence Score', fontsize=12)
    ax.set_title(method, fontsize=13, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('correlation_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Compare Bias Score Distributions

In [None]:
# Descriptive statistics of the bias score produced by each method
banner = "=" * 70
print(banner)
print("BIAS SCORE DISTRIBUTION COMPARISON")
print(banner)

for method, bias_col in zip(methods_list, bias_cols):
    scores = df_comparison[bias_col]
    mean_score = scores.mean()
    std_score = scores.std()

    print(f"\n{method}:")
    print(f"  Mean: {mean_score:.4f}")
    print(f"  Median: {scores.median():.4f}")
    print(f"  Std: {std_score:.4f}")
    print(f"  Range: [{scores.min():.4f}, {scores.max():.4f}]")
    # Coefficient of variation = std / mean (unstable when the mean is near 0)
    print(f"  CV (Coefficient of Variation): {std_score / mean_score:.4f}")

In [None]:
# Two stacked panels: overlaid histograms on top, box plots underneath
fig, axes = plt.subplots(2, 1, figsize=(12, 10))
ax1, ax2 = axes

# Top panel: one semi-transparent histogram per method
for bias_col, method in zip(bias_cols, methods_list):
    ax1.hist(df_comparison[bias_col], bins=30, alpha=0.5, label=method, edgecolor='black')
ax1.set_xlabel('Bias Score', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.set_title('Bias Score Distribution Comparison', fontsize=14, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(True, alpha=0.3)

# Bottom panel: one box per method; spaces in the labels become line breaks
# so the method names wrap under each box
box_labels = [m.replace(' ', '\n') for m in methods_list]
data_to_plot = [df_comparison[col] for col in bias_cols]
bp = ax2.boxplot(data_to_plot, labels=box_labels, patch_artist=True)
for box_patch in bp['boxes']:
    box_patch.set_facecolor('lightblue')
ax2.set_ylabel('Bias Score', fontsize=12)
ax2.set_title('Bias Score Distribution (Box Plot)', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('distribution_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Method Agreement Analysis

In [None]:
# How similarly do the dictionary methods order the media outlets?
banner = "=" * 70
print(banner)
print("METHOD AGREEMENT ANALYSIS")
print(banner)

# Add one rank column per method; rank 1 is the highest bias score
rank_cols = []
for bias_col in bias_cols:
    rank_col = f'Rank_{bias_col}'
    df_comparison[rank_col] = df_comparison[bias_col].rank(ascending=False)
    rank_cols.append(rank_col)

# Spearman correlation between the rank columns of each pair of methods
from scipy.stats import spearmanr

print("\nRanking Correlations (Spearman):")
method_pairs = []
if len(bias_cols) >= 2:
    method_pairs.append((0, 1))
if has_method3:
    method_pairs.extend([(0, 2), (1, 2)])

for first, second in method_pairs:
    rho, p_value = spearmanr(df_comparison[rank_cols[first]],
                             df_comparison[rank_cols[second]])
    print(f"  Method {first + 1} vs Method {second + 1}: {rho:.4f} (p={p_value:.4f})")

# List the five highest- and five lowest-scoring outlets for each method
print("\n" + banner)
for bias_col, method in zip(bias_cols, methods_list):
    print(f"\n{method}:")
    extremes = (
        ("\n  Top 5 (Highest Bias):", df_comparison.nlargest),
        ("\n  Bottom 5 (Lowest Bias):", df_comparison.nsmallest),
    )
    for heading, pick in extremes:
        print(heading)
        selected = pick(5, bias_col)[['언론사', bias_col]]
        for _, row in selected.iterrows():
            print(f"    {row['언론사']:15s} {row[bias_col]:.4f}")

# PART 2: Clustering Algorithm Comparison

## 6. Prepare Data for Clustering

We will use the **best dictionary method** determined from Part 1 for clustering comparison.

In [None]:
# Pick the bias column that belongs to the best-correlated method.
# correlations and bias_cols share the same ordering, so the position of the
# winning label in the dict is also the position of its column.
best_method_idx = list(correlations).index(best_method[0])
best_bias_col = bias_cols[best_method_idx]

print(f"Using {best_method[0]} for clustering comparison")
print(f"Column: {best_bias_col}")

# Two clustering features: (bias, influence). X keeps the original units for
# plotting; X_scaled is the standardized copy fed to the clustering algorithms.
feature_cols = [best_bias_col, 'Influence_Score']
X = df_comparison[feature_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

bias_scaled = X_scaled[:, 0]
influence_scaled = X_scaled[:, 1]
print(f"\nData shape: {X_scaled.shape}")
print(f"Feature 1 (Bias): mean={bias_scaled.mean():.4f}, std={bias_scaled.std():.4f}")
print(f"Feature 2 (Influence): mean={influence_scaled.mean():.4f}, std={influence_scaled.std():.4f}")

## 7. Determine Optimal Number of Clusters

In [None]:
# Scan candidate cluster counts with K-Means and score each one.
#
# Silhouette and Davies-Bouldin need at least 2 clusters and at most
# n_samples - 1, so K runs from 2 up to min(8, n_samples - 1) INCLUSIVE.
# (range() excludes its stop value, hence the "+ 1"; without it the largest
# admissible K was never evaluated.)
max_k = min(8, len(df_comparison) - 1)
K_range = range(2, max_k + 1)

if not K_range:
    # Fail early with a clear message instead of an opaque argmax error below
    raise ValueError(
        f"Need at least 3 media outlets to compare cluster counts; "
        f"got {len(df_comparison)}"
    )

inertias = []
silhouette_scores = []
davies_bouldin_scores = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, labels))
    davies_bouldin_scores.append(davies_bouldin_score(X_scaled, labels))

# Plot the three evaluation metrics side by side:
# (values, line style, y-axis label, panel title)
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
metric_panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouette_scores, 'go-', 'Silhouette Score',
     'Silhouette Score (Higher is Better)'),
    (davies_bouldin_scores, 'ro-', 'Davies-Bouldin Index',
     'Davies-Bouldin Index (Lower is Better)'),
]
for ax, (values, line_style, y_label, title) in zip(axes, metric_panels):
    ax.plot(K_range, values, line_style, linewidth=2, markersize=8)
    ax.set_xlabel('Number of Clusters (K)', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('optimal_k_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Best K per metric: highest silhouette, lowest Davies-Bouldin
best_k_silhouette = K_range[np.argmax(silhouette_scores)]
best_k_db = K_range[np.argmin(davies_bouldin_scores)]

print("\n" + "="*70)
print("OPTIMAL K ANALYSIS")
print("="*70)
print(f"\nBest K by Silhouette Score: {best_k_silhouette} (score: {max(silhouette_scores):.4f})")
print(f"Best K by Davies-Bouldin Index: {best_k_db} (score: {min(davies_bouldin_scores):.4f})")

# The silhouette winner is the K used by every later cell
optimal_k = best_k_silhouette
print(f"\n→ Selected optimal K: {optimal_k}")

## 8. Compare Clustering Algorithms

In [None]:
# Run each clustering algorithm on the standardized features and collect the
# label arrays in clustering_results (algorithm name -> labels).
clustering_results = {}

# 1. K-Means (kept in `kmeans`; the plotting cell reads its cluster centers)
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clustering_results['K-Means'] = kmeans.fit_predict(X_scaled)

# 2. Hierarchical Clustering (Agglomerative)
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
clustering_results['Hierarchical'] = hierarchical.fit_predict(X_scaled)

# 3. DBSCAN (only kept if it produces a usable partition, see below)
from sklearn.neighbors import NearestNeighbors

# Derive eps from the k-distance distribution. n_neighbors is clamped to the
# sample count because NearestNeighbors raises when asked for more neighbours
# than there are samples.
n_neighbors = min(4, len(X_scaled))
neighbors = NearestNeighbors(n_neighbors=n_neighbors)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)
# Last column = distance to the farthest of the returned neighbours
distances = np.sort(distances[:, -1], axis=0)

# Use median distance as eps
eps_value = np.median(distances)

if eps_value > 0:
    dbscan = DBSCAN(eps=eps_value, min_samples=2)
    dbscan_labels = dbscan.fit_predict(X_scaled)
else:
    # DBSCAN requires eps > 0 (a zero median happens with heavily duplicated
    # points). Mark everything as noise so the check below rejects DBSCAN
    # instead of the cell crashing.
    dbscan_labels = np.full(len(X_scaled), -1)

# Keep DBSCAN only if it found more than one cluster and flagged fewer than
# 30% of the outlets as noise (label -1)
n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
n_noise = list(dbscan_labels).count(-1)

if n_clusters_dbscan > 1 and n_noise < len(df_comparison) * 0.3:
    clustering_results['DBSCAN'] = dbscan_labels
    print(f"DBSCAN: {n_clusters_dbscan} clusters, {n_noise} noise points")
else:
    print(f"DBSCAN not suitable: {n_clusters_dbscan} clusters, {n_noise} noise points")
    print("Proceeding with K-Means and Hierarchical only")

print(f"\nClustering algorithms applied: {list(clustering_results.keys())}")

## 9. Evaluate Clustering Performance

In [None]:
# Score every clustering with silhouette and Davies-Bouldin.
# evaluation_results maps algorithm name -> {'n_clusters', 'silhouette', 'davies_bouldin'}.
evaluation_results = {}

for name, labels in clustering_results.items():
    # Drop DBSCAN noise points (label -1); for the other algorithms the mask
    # keeps every row
    keep = labels != -1
    X_eval = X_scaled[keep]
    labels_eval = labels[keep]

    # Both metrics are undefined for fewer than 2 clusters
    n_clusters = len(set(labels_eval))
    if n_clusters <= 1:
        continue

    evaluation_results[name] = {
        'n_clusters': n_clusters,
        'silhouette': silhouette_score(X_eval, labels_eval),
        'davies_bouldin': davies_bouldin_score(X_eval, labels_eval),
    }

# Tabulate the metrics
banner = "=" * 70
print(banner)
print("CLUSTERING ALGORITHM COMPARISON")
print(banner)
print(f"\nOptimal K used: {optimal_k}")
print("\nPerformance Metrics:")
print(f"{'Algorithm':<20} {'N Clusters':<12} {'Silhouette':<15} {'Davies-Bouldin':<15}")
print("-" * 70)

for name, metrics in evaluation_results.items():
    print(f"{name:<20} {metrics['n_clusters']:<12} "
          f"{metrics['silhouette']:<15.4f} {metrics['davies_bouldin']:<15.4f}")

# Winner per metric: highest silhouette, lowest Davies-Bouldin
best_by_silhouette = max(
    evaluation_results,
    key=lambda alg: evaluation_results[alg]['silhouette'],
)
best_by_db = min(
    evaluation_results,
    key=lambda alg: evaluation_results[alg]['davies_bouldin'],
)

print("\n" + banner)
print(f"BEST ALGORITHM by Silhouette Score: {best_by_silhouette}")
print(f"BEST ALGORITHM by Davies-Bouldin Index: {best_by_db}")

# Silhouette decides in either case; the message only reports whether the
# two metrics agree
best_algorithm = best_by_silhouette
if best_by_silhouette == best_by_db:
    print(f"\n→ OVERALL BEST: {best_by_silhouette} (consistent across both metrics)")
else:
    print(f"\n→ Metrics disagree. Using Silhouette Score as primary: {best_by_silhouette}")

print(banner)

In [None]:
# Bar charts of both quality metrics, with each metric's winner drawn in green
algorithms = list(evaluation_results.keys())
silhouette_vals = [evaluation_results[alg]['silhouette'] for alg in algorithms]
db_vals = [evaluation_results[alg]['davies_bouldin'] for alg in algorithms]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, winning algorithm, colour of the other bars, y label, title,
#  vertical offset of the value annotation)
panels = [
    (silhouette_vals, best_by_silhouette, 'lightblue', 'Silhouette Score',
     'Silhouette Score Comparison\n(Higher is Better)', 0.01),
    (db_vals, best_by_db, 'lightcoral', 'Davies-Bouldin Index',
     'Davies-Bouldin Index Comparison\n(Lower is Better)', 0.05),
]

for ax, (values, winner, base_color, y_label, title, offset) in zip(axes, panels):
    colors = ['green' if alg == winner else base_color for alg in algorithms]
    ax.bar(algorithms, values, color=colors, edgecolor='black', linewidth=1.5)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')
    # Print each value just above its bar
    for i, v in enumerate(values):
        ax.text(i, v + offset, f'{v:.3f}', ha='center', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('clustering_metrics_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Visualize Clustering Results

In [None]:
# One scatter panel per clustering algorithm, in the original (unscaled)
# feature units, with points coloured by cluster label.
n_algorithms = len(clustering_results)
fig, axes = plt.subplots(1, n_algorithms, figsize=(7*n_algorithms, 6))

if n_algorithms == 1:
    # plt.subplots returns a bare Axes for a single panel; wrap it for iteration
    axes = [axes]

# Eight distinct colours: the K search can select up to 8 clusters, and a
# shorter palette would wrap around so that different clusters share a colour.
# The modulo below stays as a safety net for algorithms (e.g. DBSCAN) that
# may produce even more clusters.
colors_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8',
                  '#C77DFF', '#F7DC6F', '#6C5B7B']
noise_color = 'gray'  # DBSCAN noise points carry the label -1

for ax, (name, labels) in zip(axes, clustering_results.items()):
    colors = [
        noise_color if label == -1 else colors_palette[label % len(colors_palette)]
        for label in labels
    ]

    # Scatter plot of the outlets
    ax.scatter(X[:, 0], X[:, 1], c=colors, s=150,
               alpha=0.6, edgecolors='black', linewidths=1.5)

    # K-Means centers live in standardized space; map them back to the
    # original units before drawing them on top of the points
    if name == 'K-Means':
        centers = scaler.inverse_transform(kmeans.cluster_centers_)
        ax.scatter(centers[:, 0], centers[:, 1], marker='X', s=500,
                   c='red', edgecolors='black', linewidths=2,
                   label='Centers', zorder=10)

    ax.set_xlabel(f'{best_method[0]} Bias Score', fontsize=11)
    ax.set_ylabel('Influence Score', fontsize=11)

    # Highlight best algorithm
    title = name
    if name == best_algorithm:
        title += ' ⭐ BEST'
    ax.set_title(title, fontsize=13, fontweight='bold')

    # Show the quality metrics in a text box below the axes
    if name in evaluation_results:
        sil = evaluation_results[name]['silhouette']
        db = evaluation_results[name]['davies_bouldin']
        ax.text(0.5, -0.15, f'Silhouette: {sil:.3f} | DB: {db:.3f}',
                ha='center', transform=ax.transAxes, fontsize=10,
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Only the K-Means panel has a labelled artist (the centers)
    if name == 'K-Means':
        ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('clustering_comparison_visual.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Detailed Cluster Analysis (Best Algorithm)

In [None]:
# Store the winning algorithm's labels on the table and describe each cluster
df_comparison['Cluster_Best'] = clustering_results[best_algorithm]

banner = "=" * 70
print(banner)
print(f"DETAILED CLUSTER ANALYSIS - {best_algorithm}")
print(banner)

# Dataset-wide medians are the High/Low reference for every cluster
median_bias = df_comparison[best_bias_col].median()
median_influence = df_comparison['Influence_Score'].median()

for cluster_id in sorted(df_comparison['Cluster_Best'].unique()):
    # Label -1 is DBSCAN's noise bucket rather than a real cluster
    cluster_name = "Noise Points" if cluster_id == -1 else f"Cluster {cluster_id}"
    cluster_data = df_comparison[df_comparison['Cluster_Best'] == cluster_id]

    avg_bias = cluster_data[best_bias_col].mean()
    avg_influence = cluster_data['Influence_Score'].mean()

    print(f"\n{cluster_name}:")
    print(f"  Size: {len(cluster_data)} media outlets")
    print(f"  Avg Bias Score: {avg_bias:.4f}")
    print(f"  Avg Influence Score: {avg_influence:.4f}")
    print(f"  Media outlets: {', '.join(cluster_data['언론사'].tolist())}")

    # A cluster is "High" on a dimension when its mean exceeds the overall median
    bias_char = "High" if avg_bias > median_bias else "Low"
    influence_char = "High" if avg_influence > median_influence else "Low"

    print(f"  → Characteristics: {bias_char} Bias + {influence_char} Influence")

print("\n" + banner)

# SUMMARY: Model Comparison Results

In [None]:
# Text report that recaps both comparisons (dictionary methods, clustering
# algorithms) and the clusters found by the winning algorithm.
banner = "=" * 70
rule = "-" * 70

print(banner)
print("FINAL MODEL COMPARISON SUMMARY")
print(banner)

# --- Section 1: dictionary methods ---
print("\n1. DICTIONARY METHOD COMPARISON")
print(rule)
print("\nMethods tested:")
for position, method in enumerate(methods_list, 1):
    print(f"  {position}. {method}: r = {correlations[method]:.4f}")

best_label, best_corr = best_method
print(f"\n→ BEST METHOD: {best_label}")
print(f"  Correlation: {best_corr:.4f}")
print("  Reasoning: Strongest correlation with influence score")

# --- Section 2: clustering algorithms ---
print("\n2. CLUSTERING ALGORITHM COMPARISON")
print(rule)
print(f"\nOptimal number of clusters: {optimal_k}")
print("\nAlgorithms tested:")
for position, (name, metrics) in enumerate(evaluation_results.items(), 1):
    print(f"  {position}. {name}:")
    print(f"     Silhouette Score: {metrics['silhouette']:.4f}")
    print(f"     Davies-Bouldin Index: {metrics['davies_bouldin']:.4f}")

winner_metrics = evaluation_results[best_algorithm]
print(f"\n→ BEST ALGORITHM: {best_algorithm}")
print(f"  Silhouette Score: {winner_metrics['silhouette']:.4f}")
print(f"  Davies-Bouldin Index: {winner_metrics['davies_bouldin']:.4f}")
print("  Reasoning: Best clustering quality metrics")

# --- Section 3: headline numbers ---
print("\n3. KEY FINDINGS")
print(rule)
overall_corr = df_comparison[best_bias_col].corr(df_comparison['Influence_Score'])
print(f"\nOverall correlation (best method): {overall_corr:.4f}")
print(f"Number of clusters identified: {winner_metrics['n_clusters']}")
print(f"Total media outlets analyzed: {len(df_comparison)}")

# Per-cluster size and averages; the noise bucket (-1) is left out
print("\nCluster characteristics:")
for cluster_id in sorted(df_comparison['Cluster_Best'].unique()):
    if cluster_id == -1:
        continue
    cluster_data = df_comparison[df_comparison['Cluster_Best'] == cluster_id]
    avg_bias = cluster_data[best_bias_col].mean()
    avg_influence = cluster_data['Influence_Score'].mean()
    print(f"  Cluster {cluster_id}: {len(cluster_data)} outlets, "
          f"Bias={avg_bias:.4f}, Influence={avg_influence:.4f}")

print("\n" + banner)
print("END OF MODEL COMPARISON ANALYSIS")
print(banner)

## 12. Export Results

In [None]:
# Write the full comparison table (bias scores, ranks, cluster labels) to CSV.
# utf-8-sig adds a BOM to the file.
output_file = 'model_comparison_results.csv'
df_comparison.to_csv(output_file, index=False, encoding='utf-8-sig')
print(f"✓ Results exported to '{output_file}'")

# Collect the headline numbers as (metric, value) rows; values are stored as
# strings, numbers formatted to four decimals.
summary_rows = [
    (f'{method} Correlation', f'{corr:.4f}')
    for method, corr in correlations.items()
]
summary_rows.append(('Best Dictionary Method', best_method[0]))

for name, metrics in evaluation_results.items():
    summary_rows.append((f'{name} Silhouette', f"{metrics['silhouette']:.4f}"))
    summary_rows.append((f'{name} Davies-Bouldin', f"{metrics['davies_bouldin']:.4f}"))

summary_rows.append(('Best Clustering Algorithm', best_algorithm))
summary_rows.append(('Optimal K', str(optimal_k)))

# Same two-column layout as before: 'Metric' then 'Value'
summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('model_comparison_summary.csv', index=False, encoding='utf-8-sig')
print("✓ Summary exported to 'model_comparison_summary.csv'")

print("\n" + "="*70)
print("All results have been exported successfully!")
print("="*70)