# EDA - Reviews Analytics

This notebook provides comprehensive analytics for classified bank reviews.

**Sections:**
1. **Overview** — Business stats table, avg ratings histogram
2. **Rating Distribution** — Stacked bar chart (% of 1–5 stars per business)
3. **Category Analysis** — Heatmaps for negative & positive categories with ranks
4. **Regional/City View** — Performance by geography
5. **Temporal View** — Trends over time

**Output:** All plots saved to `data/0_analysis/eda/` as HTML (and PNG if Kaleido installed)



In [None]:
# 
# SETUP & IMPORTS
# 
import os
import sys
from pathlib import Path
from datetime import datetime, date
import math
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

from IPython.display import display, HTML

# 
# Paths
# 
# Default locations, derived from the notebook's working directory.
NB_DIR = Path.cwd()
PROJECT_ROOT = NB_DIR.parent
DATA_DIR = PROJECT_ROOT / "data"
ANALYSIS_DIR = DATA_DIR / "0_analysis"
PROCESSED_DIR = DATA_DIR / "0_processed"

# Category lists; they stay empty unless the project config can be imported.
CATEGORIES = []
POSITIVE_CATEGORIES = []
NEGATIVE_CATEGORIES = []

# Expected layout of config.CATEGORIES: positives first, then negatives,
# then neutral/other entries that belong to neither list.
N_POSITIVE_CATEGORIES = 7
N_NEGATIVE_CATEGORIES = 7

try:
    sys.path.insert(0, str(PROJECT_ROOT / "src"))
    import review_analyzer.config as config
except Exception as exc:
    # Best effort: the notebook still runs with the default paths and the
    # heuristic category detection, but say so instead of failing silently.
    print(f" review_analyzer.config not loaded ({exc.__class__.__name__}); using default paths")
else:
    # Each setting falls back independently so a partial config cannot leave
    # the three directories pointing at different roots by accident.
    DATA_DIR = getattr(config, "DATA_DIR", DATA_DIR)
    ANALYSIS_DIR = getattr(config, "ANALYSIS_DIR", ANALYSIS_DIR)
    PROCESSED_DIR = getattr(config, "PROCESSED_DIR", PROCESSED_DIR)
    CATEGORIES = list(getattr(config, "CATEGORIES", []))

# Create the output directory only after the config had a chance to override
# it, so no stray default directory is left behind.
ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

# Split by position. The negative slice is bounded so the trailing
# neutral/other categories do not end up in the negative heatmap.
if CATEGORIES:
    POSITIVE_CATEGORIES = CATEGORIES[:N_POSITIVE_CATEGORIES]
    NEGATIVE_CATEGORIES = CATEGORIES[
        N_POSITIVE_CATEGORIES:N_POSITIVE_CATEGORIES + N_NEGATIVE_CATEGORIES
    ]

# 
# BCG Theme
# 
# Brand palette used by every chart in this notebook.
# NOTE(review): several hex strings below are not 3 or 6 hex digits long
# (e.g. "#0B6EF", "#D678"), so they look truncated in this copy of the
# notebook. Restore them from the original before relying on the theme.
BCG_GREEN = "#0B6EF"
BCG_DARK_GREEN = "#00D"
BCG_LIGHT_GREEN = "#A7C97"
BCG_TEAL = "#F7A8C"
BCG_GRAY = "#C7C89"
BCG_RED = "#D678"
BCG_ORANGE = "#FF7F0E"
BCG_YELLOW = "#FFD700"

# Rating colors, red through green.
# NOTE(review): the dict keys are missing in this copy. Presumably they are
# the star ratings 1-5 (RATING_COLORS is indexed by rating elsewhere) - confirm.
RATING_COLORS = {
 : "#D678", # Red
 : "#FF7F0E", # Orange
 : "#FFD700", # Yellow
 : "#A7C97", # Light green
 : "#0B6EF", # BCG green
}

# Default trace color cycle and font stack for the template below.
BCG_COLORWAY = [BCG_GREEN, BCG_TEAL, BCG_LIGHT_GREEN, BCG_DARK_GREEN, BCG_GRAY, BCG_ORANGE]
BCG_FONT = "Inter, 'Helvetica Neue', Arial, sans-serif"

# Register a Plotly template named "bcg" and make it the default.
# NOTE(review): `size=` has no value and `color="#0F"` / `y=.0` look
# truncated as well - confirm the intended numbers.
pio.templates["bcg"] = go.layout.Template(
 layout=go.Layout(
 font=dict(family=BCG_FONT, size=, color="#0F"),
 colorway=BCG_COLORWAY,
 paper_bgcolor="white",
 plot_bgcolor="white",
 margin=dict(l=60, r=0, t=80, b=60),
 legend=dict(orientation="h", yanchor="bottom", y=.0, x=0, bgcolor="rgba(0,0,0,0)"),
 xaxis=dict(gridcolor="#EAEAEA", zerolinecolor="#EAEAEA", ticks="outside"),
 yaxis=dict(gridcolor="#EAEAEA", zerolinecolor="#EAEAEA", ticks="outside"),
 )
)
pio.templates.default = "bcg"

# 
# Helper Functions
# 
def _ensure_dir(path: Path) -> Path:
 path.mkdir(parents=True, exist_ok=True)
 return path

# PNG export is optional: it only works when the kaleido package imports.
try:
    import kaleido  # noqa: F401  (imported only to probe availability)
except Exception:
    # Covers a missing package as well as a broken install, without
    # swallowing KeyboardInterrupt/SystemExit the way a bare `except:` would.
    HAS_KALEIDO = False
else:
    HAS_KALEIDO = True

def save_fig(fig, name: str, subdir: str = "eda", width=1200, height=700) -> dict:
    """Write *fig* to ``ANALYSIS_DIR/subdir`` as HTML and, when possible, PNG.

    Args:
        fig: Plotly figure to export.
        name: File stem (no extension).
        subdir: Sub-directory of ``ANALYSIS_DIR`` to write into.
        width: PNG width in pixels.
        height: PNG height in pixels.

    Returns:
        ``{"html": Path, "png": Path | None}``; ``png`` is ``None`` when
        kaleido is unavailable or the image export failed.
    """
    # NOTE(review): the `width` default and the PNG `scale` were truncated in
    # the source; 1200 and 2 are assumed values - confirm.
    out_dir = _ensure_dir(ANALYSIS_DIR / subdir)
    html_path = out_dir / f"{name}.html"
    fig.write_html(str(html_path), include_plotlyjs="cdn")

    png_path = None
    if HAS_KALEIDO:
        candidate = out_dir / f"{name}.png"
        try:
            fig.write_image(str(candidate), scale=2, width=width, height=height)
        except Exception as exc:
            # PNG is a nice-to-have; report the failure and keep the HTML.
            print(f" PNG export skipped for {name}: {exc}")
        else:
            png_path = candidate
    return {"html": html_path, "png": png_path}

def show_fig(fig, name: str, subdir: str = "eda"):
    """Save *fig* via ``save_fig`` and render it inline; return the saved paths."""
    saved = save_fig(fig, name=name, subdir=subdir)
    # Embed the chart as raw HTML rather than a rich output to sidestep
    # nbformat issues.
    embedded = fig.to_html(include_plotlyjs="cdn", full_html=False)
    display(HTML(embedded))
    return saved

def save_table(df: pd.DataFrame, name: str, subdir: str = "eda") -> Path:
    """Write *df* (without its index) to ``ANALYSIS_DIR/subdir/<name>.csv`` and return the path."""
    target = _ensure_dir(ANALYSIS_DIR / subdir) / f"{name}.csv"
    df.to_csv(target, index=False)
    return target

def shorten_category(cat: str, max_len: int = 30) -> str:
    """Shorten a category name for display.

    Everything from the first "(" onwards is dropped. If the remainder is
    still longer than *max_len*, it is cut and "..." is appended so the
    result is exactly *max_len* characters long.

    Args:
        cat: Full category name.
        max_len: Maximum length of the returned string.

    Returns:
        The shortened name.
    """
    # NOTE(review): the default for `max_len` was truncated in the source;
    # 30 is an assumed value - confirm.
    ellipsis = "..."
    short = cat.split("(")[0].strip()
    if len(short) <= max_len:
        return short
    if max_len <= len(ellipsis):
        # No room for an ellipsis: hard cut instead of a negative slice.
        return short[:max(max_len, 0)]
    return short[:max_len - len(ellipsis)] + ellipsis

def build_category_legend(cols: list) -> pd.DataFrame:
    """Build a legend table for category columns.

    Args:
        cols: Full category names, in display order.

    Returns:
        DataFrame with one row per category: ``ID`` (``C01``, ``C02``, ...),
        ``Short`` (via ``shorten_category``) and ``Full``.
    """
    cols = list(cols)
    return pd.DataFrame({
        # 1-based, zero-padded so IDs sort correctly as text.
        "ID": [f"C{position:02d}" for position in range(1, len(cols) + 1)],
        "Short": [shorten_category(c) for c in cols],
        "Full": cols,
    })

print(" Setup complete")



In [None]:
# 
# LOAD DATA
# 

# Set path to your classified reviews CSV (or leave empty to auto-detect)
INPUT_CSV_PATH = ""

# Directories are scanned in this priority order; inside each one the most
# recently modified CSV comes first.
search_dirs = [
    PROCESSED_DIR / "classification" / "latest",
    PROCESSED_DIR / "classification",
    PROCESSED_DIR,
]

candidate_files = [
    csv_file
    for folder in search_dirs
    if folder.exists()
    for csv_file in sorted(folder.glob("*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
]

# An explicit path wins; otherwise take the first auto-detected candidate.
if INPUT_CSV_PATH:
    csv_path = Path(INPUT_CSV_PATH).expanduser()
elif candidate_files:
    csv_path = candidate_files[0]
else:
    csv_path = None

if csv_path is None or not csv_path.exists():
    raise FileNotFoundError(f"No CSV found. Set INPUT_CSV_PATH or place a CSV under {PROCESSED_DIR}")

# Robust CSV reading
def read_csv_robust(path):
    """Read a CSV, trying a few common encodings in turn.

    Args:
        path: Location of the CSV file.

    Returns:
        ``(DataFrame, encoding)`` for the first encoding that decodes the file.

    Raises:
        RuntimeError: If none of the encodings can decode the file.
    """
    # "cp1252" and "latin-1" are the real codec names; the truncated
    # "cp" / "latin-" would raise LookupError instead of falling through.
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        try:
            frame = pd.read_csv(path, encoding=enc, low_memory=False)
        except UnicodeDecodeError:
            continue
        return frame, enc
    raise RuntimeError("Failed to read CSV")

# Load the selected file and report what was read.
print(f" Loading: {csv_path.name}")
df, encoding = read_csv_robust(csv_path)
n_rows, n_cols = len(df), len(df.columns)
print(f" Encoding: {encoding}")
print(f" Rows: {n_rows:,} | Columns: {n_cols}")

# 
# Detect columns
# 
def detect_col(df, candidates):
    """Return the first name in *candidates* that is a column of *df*.

    Args:
        df: DataFrame whose columns are searched.
        candidates: Column names in order of preference.

    Returns:
        The first matching column name, or ``None`` when none is present.
    """
    return next((name for name in candidates if name in df.columns), None)

# Map logical roles to actual column names (first candidate present wins).
business_col = detect_col(df, ["_business", "_bank", "business", "bank", "brand"])
city_col = detect_col(df, ["_city", "city", "ville", "commune", "region"])
date_col = detect_col(df, ["month", "created_at", "date", "time"])
rating_col = detect_col(df, ["rating", "note", "stars", "score"])
text_col = detect_col(df, ["review_text", "text", "content", "review"])
place_id_col = detect_col(df, ["_place_id", "place_id", "agency_id", "branch_id"])

# Detect category columns
if CATEGORIES:
    category_cols = [c for c in CATEGORIES if c in df.columns]
else:
    # Heuristic: columns whose non-null values are all 0/1/True/False.
    exclude = {business_col, city_col, date_col, rating_col, text_col, "sentiment", place_id_col}

    def _looks_like_flag(series):
        values = series.dropna()
        # An entirely empty column carries no flags; without this check
        # `.all()` on an empty Series would be vacuously True.
        return (not values.empty) and bool(values.isin([0, 1, True, False]).all())

    category_cols = [c for c in df.columns if c not in exclude and _looks_like_flag(df[c])]

# Positive/negative subsets come from the config-derived lists; they stay
# empty when no config was loaded.
pos_cats = [c for c in POSITIVE_CATEGORIES if c in df.columns]
neg_cats = [c for c in NEGATIVE_CATEGORIES if c in df.columns]

print(f"\n Detected columns:")
print(f" Business: {business_col}")
print(f" City: {city_col}")
print(f" Date: {date_col}")
print(f" Rating: {rating_col}")
print(f" Place ID: {place_id_col}")
print(f" Categories: {len(category_cols)} total ({len(pos_cats)} pos, {len(neg_cats)} neg)")

# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
if date_col:
    # Unparseable dates become NaT and are dropped by the temporal view.
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce", dayfirst=True)
    df["_month"] = df[date_col].dt.to_period("M").dt.to_timestamp()
    df["_year"] = df[date_col].dt.year

if rating_col:
    df[rating_col] = pd.to_numeric(df[rating_col], errors="coerce")

# Normalize business names. Missing values are left missing: a plain
# `astype(str)` would turn them into the literal business name "nan".
if business_col:
    raw_names = df[business_col]
    df[business_col] = raw_names.where(raw_names.isna(), raw_names.astype(str).str.strip())

print(f"\n Data loaded and preprocessed")



---
## Overview — Business Statistics

- **Table**: Reviews count, agencies count, avg reviews per agency, avg rating
- **Histogram**: Average rating by business



In [None]:
# 
# OVERVIEW — BUSINESS STATISTICS TABLE
# 

# One row per business: volume, agency coverage, average rating, sentiment split.
# NOTE(review): rounding precision and the heading level were truncated in the
# source; 1-2 decimals and <h3> are assumed - confirm.
if business_col:
    has_sentiment = "sentiment" in df.columns
    has_places = bool(place_id_col) and place_id_col in df.columns

    def _pct_label(mask):
        """Format the share of True values in *mask* as a percentage string."""
        return f"{mask.mean() * 100:.1f}%"

    stats_rows = []
    # groupby skips missing business names and yields groups in sorted order.
    for biz, sub in df.groupby(business_col, sort=True):
        n_reviews = len(sub)
        n_agencies = int(sub[place_id_col].nunique()) if has_places else 0
        avg_rating = sub[rating_col].mean() if rating_col else np.nan

        stats_rows.append({
            "Business": biz,
            "Reviews": n_reviews,
            "Agencies": n_agencies if has_places else "-",
            "Avg Reviews/Agency": round(n_reviews / n_agencies, 1) if n_agencies else "-",
            "Avg Rating": round(avg_rating, 2) if not pd.isna(avg_rating) else "-",
            "% Positive": _pct_label(sub["sentiment"] == "Positif") if has_sentiment else "-",
            "% Negative": _pct_label(sub["sentiment"] == "Négatif") if has_sentiment else "-",
        })

    if stats_rows:
        stats_df = pd.DataFrame(stats_rows).sort_values("Reviews", ascending=False)

        display(HTML("<h3>Business Overview</h3>"))
        display(stats_df)
        saved_path = save_table(stats_df, "overview_business_stats", "eda")
        # Report the real location; ANALYSIS_DIR may come from the project config.
        print(f"\n Saved to: {saved_path}")
    else:
        print(" No business rows available for the overview table")



In [None]:
# 
# HISTOGRAM — AVERAGE RATING BY BUSINESS
# 

# Horizontal bar chart of the mean rating per business, worst at the bottom.
# NOTE(review): chart height and label precision were truncated in the
# source; the values below are assumed - confirm.
if business_col and rating_col:
    avg_ratings = (
        df.groupby(business_col)[rating_col]
        .mean()
        .dropna()  # businesses without any numeric rating have nothing to plot
        .sort_values(ascending=True)
        .reset_index()
    )
    avg_ratings.columns = ["Business", "Avg Rating"]

    fig = px.bar(
        avg_ratings,
        x="Avg Rating",
        y="Business",
        orientation="h",
        title="<b>Average Rating by Business</b>",
        color="Avg Rating",
        # Red -> yellow -> green across the 1-5 rating range.
        color_continuous_scale=[[0, BCG_RED], [0.5, BCG_YELLOW], [1, BCG_GREEN]],
        range_color=[1, 5],
    )
    fig.update_layout(
        height=max(400, len(avg_ratings) * 40),
        xaxis_title="Average Rating (1-5)",
        yaxis_title="",
        coloraxis_showscale=False,
        xaxis=dict(range=[0, 5]),
    )
    fig.update_traces(texttemplate="%{x:.2f}", textposition="outside")

    saved = show_fig(fig, "overview_avg_rating_by_business", "eda")
    print(f" Saved to: {saved['html'].parent}")



---
## Rating Distribution — Stacked Bar Chart

Shows the percentage of 1-, 2-, 3-, 4-, and 5-star reviews for each business.



In [None]:
# 
# STACKED BAR CHART — RATING DISTRIBUTION BY BUSINESS
# 

# Stacked horizontal bars: share of each star level per business.
# NOTE(review): label threshold, font size, chart height and the star suffix
# in labels were lost in the source; the values below are assumed - confirm.
if business_col and rating_col:
    star_levels = [1, 2, 3, 4, 5]
    min_label_pct = 5  # segments thinner than this get no inline label

    # Row-normalize the business x rating counts to percentages.
    rating_counts = df.groupby([business_col, rating_col]).size().unstack(fill_value=0)
    rating_pct = rating_counts.div(rating_counts.sum(axis=1), axis=0) * 100

    # Make sure every star level has a column, then fix the column order.
    for level in star_levels:
        if level not in rating_pct.columns:
            rating_pct[level] = 0
    rating_pct = rating_pct[star_levels]

    # Order businesses by average rating, best first.
    avg_order = df.groupby(business_col)[rating_col].mean().sort_values(ascending=False).index
    rating_pct = rating_pct.reindex(avg_order)

    fig = go.Figure()
    for level in star_levels:
        shares = rating_pct[level]
        fig.add_trace(go.Bar(
            name=f"{level}★",
            y=rating_pct.index,
            x=shares,
            orientation="h",
            marker_color=RATING_COLORS[level],
            text=[f"{v:.0f}%" if v >= min_label_pct else "" for v in shares],
            textposition="inside",
            textfont=dict(color="white", size=11),
        ))

    fig.update_layout(
        barmode="stack",
        title="<b>Rating Distribution by Business</b><br><sup>Percentage of 1★ to 5★ reviews</sup>",
        xaxis_title="Percentage (%)",
        yaxis_title="",
        height=max(400, len(rating_pct) * 60),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, x=0.5, xanchor="center"),
        xaxis=dict(range=[0, 100]),
    )

    saved = show_fig(fig, "rating_distribution_stacked", "eda")

    # Save the underlying percentages next to the chart.
    rating_pct_export = rating_pct.reset_index()
    rating_pct_export.columns = ["Business"] + [f"{level}★ %" for level in star_levels]
    save_table(rating_pct_export, "rating_distribution_by_business", "eda")
    print(f" Saved to: {saved['html'].parent}")



---
## Category Analysis — BCG-Style Heatmaps

Heatmaps matching the BCG slide format:
- **X-axis**: Banks/Businesses
- **Y-axis**: Category labels (short names)
- **Cells**: Rank in bold + percentage — e.g. "**1** (7.5%)"
- **Colors**: Based on rank (darker = worse for negative, darker = better for positive)

**Negative**: Lower % = better rank (rank 1 = least complaints)
**Positive**: Higher % = better rank (rank 1 = most praise)



In [None]:
# 
# CATEGORY HEATMAPS — NEGATIVE & POSITIVE WITH RANKS (BCG Style)
# 

# Short display names for categories
CATEGORY_SHORT_NAMES = {
    # Positive
    "Accueil chaleureux et personnel attentionné": "Accueil chaleureux",
    "Service client réactif et à l'écoute": "Service réactif",
    "Conseil personnalisé et professionnalisme des équipes": "Conseil personnalisé",
    "Efficacité et rapidité de traitement": "Efficacité/rapidité",
    "Accessibilité et proximité des services": "Accessibilité",
    "Satisfaction sans détails spécifiques": "Satisfaction générique",
    "Expérience digitale et services en ligne pratiques": "Digital",
    # Negative
    "Attente interminable et lenteur en agence": "Attente / Lenteur",
    "Service client injoignable ou non réactif": "Service injoignable",
    "Réclamations ignorées ou mal suivies": "Réclamations ignorées",
    "Incidents techniques et erreurs récurrentes": "Incidents techniques",
    "Frais bancaires jugés abusifs ou non justifiés": "Frais Abusifs",
    "Insatisfaction sans détails spécifiques": "Insatisfaction générique",
    "Manque de considération ou attitude peu professionnelle": "Attitude négative",
    # Other
    "Autre (positif)": "Autre (Positif)",
    "Autre (négatif)": "Autre (Négatif)",
    "Autre (neutre)": "Autre (Neutre)",
}

# Keyword fallback rules, checked in order: (any_of, all_of, short_name).
# A rule fires when at least one `any_of` keyword AND every `all_of` keyword
# occurs in the lower-cased label. Negative rules come first because their
# wording contains positive stems ("insatisfaction" contains "satisfaction",
# "non réactif" contains "réactif", "peu professionnelle" contains "professionn").
_CATEGORY_KEYWORD_RULES = (
    (("insatisfaction",), (), "Insatisfaction générique"),
    (("injoignable",), (), "Service injoignable"),
    (("considération", "attitude"), (), "Attitude négative"),
    (("attente", "lenteur"), (), "Attente / Lenteur"),
    (("réclamation", "ignoré"), (), "Réclamations ignorées"),
    (("incident", "technique"), (), "Incidents techniques"),
    (("frais", "abusif"), (), "Frais Abusifs"),
    (("accueil", "chaleureux"), (), "Accueil chaleureux"),
    (("réactif", "écoute"), (), "Service réactif"),
    (("conseil", "professionn"), (), "Conseil personnalisé"),
    (("efficac", "rapid"), (), "Efficacité/rapidité"),
    (("accessib", "proxim"), (), "Accessibilité"),
    (("satisfaction",), ("sans",), "Satisfaction générique"),
    (("digital", "en ligne"), (), "Digital"),
    (("positif",), ("autre",), "Autre (Positif)"),
    (("négatif",), ("autre",), "Autre (Négatif)"),
    (("autre",), (), "Autre (Neutre)"),
)

# NOTE(review): the last-resort cut length was truncated in the source
# (`cat[:0]`, which always yields ""); 30 is an assumed value - confirm.
_CATEGORY_FALLBACK_MAX_LEN = 30


def get_short_category_name(cat: str) -> str:
    """Return the short display name for a category label.

    Resolution order:
      1. exact key of ``CATEGORY_SHORT_NAMES``;
      2. a known full name contained in the label (longest one wins, so
         "Insatisfaction ..." is not captured by "Satisfaction ...");
      3. the label contained in exactly one known full name;
      4. keyword rules (``_CATEGORY_KEYWORD_RULES``);
      5. the label itself, cut to ``_CATEGORY_FALLBACK_MAX_LEN`` characters.

    Args:
        cat: Category label as found in the data.

    Returns:
        The short display name.
    """
    if cat in CATEGORY_SHORT_NAMES:
        return CATEGORY_SHORT_NAMES[cat]

    cat_lower = cat.lower().strip()
    if cat_lower:
        # Label is a known name plus extras, e.g. a "(négatif)" suffix.
        contained = [full for full in CATEGORY_SHORT_NAMES if full.lower() in cat_lower]
        if contained:
            return CATEGORY_SHORT_NAMES[max(contained, key=len)]

        # Label is an abbreviation of a known name; accept only if unambiguous.
        containing = [full for full in CATEGORY_SHORT_NAMES if cat_lower in full.lower()]
        if len(containing) == 1:
            return CATEGORY_SHORT_NAMES[containing[0]]

        for any_of, all_of, short_name in _CATEGORY_KEYWORD_RULES:
            if any(kw in cat_lower for kw in any_of) and all(kw in cat_lower for kw in all_of):
                return short_name

    # Last resort: leading characters of the label, no truncation symbol.
    return cat[:_CATEGORY_FALLBACK_MAX_LEN]


def create_category_heatmap_bcg(df, business_col, category_subset, title, is_negative=True, subtitle=""):
    """
    Create a rank-annotated heatmap of category shares per business.

    Layout:
    - X-axis: businesses (sorted by name)
    - Y-axis: categories (short names, first category at the top)
    - Cells: "<b>rank</b> (pct%)", colored by rank on a dark background

    Ranking is done per category across businesses with ties sharing the
    smallest rank. For negative categories the lowest share is rank 1; for
    positive categories the highest share is rank 1.

    Args:
        df: Reviews with one 0/1 (or boolean) column per category.
        business_col: Name of the business column in *df*.
        category_subset: Category columns to include; missing ones are skipped.
        title: Chart title.
        is_negative: True for complaint categories, False for praise.
        subtitle: Optional second title line.

    Returns:
        ``(figure, export_df)`` where ``export_df`` holds the percentages
        (categories x businesses), or ``(None, None)`` when there is nothing
        to plot.
    """
    # NOTE(review): numeric layout literals (font sizes, margins, gaps,
    # annotation offsets), the background color and the dark text color were
    # truncated in the source. The palettes match the 5-class ColorBrewer
    # Reds/Greens; the remaining values are assumed - confirm visually.
    if not category_subset or not business_col:
        return None, None

    # Filter to existing columns
    cat_cols = [c for c in category_subset if c in df.columns]
    if not cat_cols:
        print(f" No matching category columns found for: {title}")
        return None, None

    businesses = sorted(df[business_col].dropna().unique())
    if not businesses:
        return None, None
    n_businesses = len(businesses)
    n_categories = len(cat_cols)

    # ---- Data: share of a business's reviews flagged with each category ----
    pct_matrix = np.zeros((n_categories, n_businesses))
    for j, biz in enumerate(businesses):
        sub = df[df[business_col] == biz]
        if len(sub) == 0:
            continue
        shares = sub[cat_cols].fillna(0).astype(int).mean() * 100
        pct_matrix[:, j] = shares.round(1).to_numpy()

    # ---- Ranks per category (row-wise) ----
    # Negative: ascending, lowest share gets rank 1.
    # Positive: descending, highest share gets rank 1.
    rank_matrix = (
        pd.DataFrame(pct_matrix)
        .rank(axis=1, ascending=bool(is_negative), method="min")
        .astype(int)
        .to_numpy()
    )
    # Scale ranks to 0..1 for the colorscale (a single business maps to 0).
    rank_normalized = (rank_matrix - 1) / max(1, n_businesses - 1)

    # ---- Colors ----
    if is_negative:
        # Rank 1 (best) = light cream ... rank N (worst) = very dark red
        palette = ["#FFF5F0", "#FCBBA1", "#FB6A4A", "#CB181D", "#67000D"]
        title_color = "#C0392B"
        legend_text = "Plus le % est <b style='color:#FF6B6B'>bas</b>, meilleur est le rang"
    else:
        # Rank 1 (best) = dark green ... rank N (worst) = almost white
        palette = ["#00441B", "#238B45", "#74C476", "#C7E9C0", "#F7FCF5"]
        title_color = "#27AE60"
        legend_text = "Plus le % est <b style='color:#27AE60'>haut</b>, meilleur est le rang"
    colorscale = [[k / (len(palette) - 1), color] for k, color in enumerate(palette)]
    bg_color = "#1a3f48"  # dark teal background
    dark_text, light_text = "#1a1a1a", "white"
    font_family = "Trebuchet MS, sans-serif"

    # The light half of the scale is the low ranks for negative charts and
    # the high ranks for positive ones; light cells need dark text.
    if is_negative:
        on_light_cell = rank_normalized < 0.5
    else:
        on_light_cell = rank_normalized >= 0.5

    short_cat_names = [get_short_category_name(c) for c in cat_cols]

    # ---- Heatmap ----
    fig = go.Figure(data=go.Heatmap(
        z=rank_normalized,
        x=businesses,
        y=short_cat_names,
        colorscale=colorscale,
        # Pin the color range so cell colors agree with the text-color rule
        # above even when ties leave part of the 0..1 range unused.
        zmin=0,
        zmax=1,
        # z is the normalized rank; the real rank travels in customdata so
        # the hover shows it instead of a 0..1 value.
        customdata=rank_matrix,
        hovertemplate="<b>%{y}</b><br>Business: %{x}<br>Rank: %{customdata}<extra></extra>",
        showscale=False,  # a hand-built rank bar replaces the colorbar
        xgap=2,
        ygap=2,
    ))

    # Per-cell labels as annotations, because each needs its own text color.
    cell_annotations = [
        dict(
            x=businesses[j],
            y=short_cat_names[i],
            text=f"<b>{rank_matrix[i, j]}</b> ({pct_matrix[i, j]:.1f}%)",
            showarrow=False,
            font=dict(
                size=10,
                color=dark_text if on_light_cell[i, j] else light_text,
                family=font_family,
            ),
            xref="x",
            yref="y",
        )
        for i in range(n_categories)
        for j in range(n_businesses)
    ]

    # Category labels on the left, in paper coordinates. Row i is centered at
    # 1 - (i + 0.5) / n because the y-axis is reversed (first category on top).
    category_label_annotations = [
        dict(
            text=f"<b>{cat_name}</b>",
            xref="paper", yref="paper",
            x=-0.01, y=1.0 - (i + 0.5) / n_categories,
            xanchor="right", yanchor="middle",
            showarrow=False,
            font=dict(size=12, color="white", family=font_family),
        )
        for i, cat_name in enumerate(short_cat_names)
    ]

    # Rank bar geometry (paper coordinates, right of the plot area).
    bar_x0, bar_x1 = 1.04, 1.08
    bar_center = (bar_x0 + bar_x1) / 2

    def _paper_note(text, x, y, xanchor, yanchor="middle", **extra):
        """Build one white paper-anchored annotation dict."""
        return dict(
            text=text, xref="paper", yref="paper", x=x, y=y,
            xanchor=xanchor, yanchor=yanchor, showarrow=False,
            font=dict(size=11, color="white"), **extra,
        )

    static_annotations = [
        _paper_note(legend_text, 0.98, 1.08, "right", yanchor="bottom"),
        _paper_note("1", bar_center, 0.95, "center"),                 # top of the rank bar
        _paper_note(str(n_businesses), bar_center, 0.05, "center"),   # bottom of the rank bar
        _paper_note("<b>Rang</b>", 1.10, 0.5, "left", textangle=90),
    ]

    title_text = f"<span style='color:{title_color}; font-size:22px; font-weight:bold'>{title}</span>"
    if subtitle:
        title_text += f"<br><sup>{subtitle}</sup>"

    fig.update_layout(
        title=dict(
            text=title_text,
            x=0.02,
            xanchor="left",
            font=dict(family=font_family),
        ),
        paper_bgcolor=bg_color,
        plot_bgcolor=bg_color,
        font=dict(family=font_family, size=12, color="white"),
        height=max(500, n_categories * 60 + 200),
        width=max(1000, n_businesses * 95 + 400),
        margin=dict(l=250, r=150, t=120, b=100),  # wide left margin for category labels
        xaxis=dict(
            side="bottom",
            tickangle=-45,
            tickfont=dict(size=11, color="#88ccaa"),
            showgrid=False,
            title="",
        ),
        yaxis=dict(
            autorange="reversed",
            showticklabels=False,  # labels are drawn as annotations instead
            showgrid=False,
            title="",
        ),
        annotations=cell_annotations + category_label_annotations + static_annotations,
    )

    # Vertical rank bar: one rectangle per palette color, rank 1 at the top.
    n_steps = len(palette)
    for step, color in enumerate(palette):
        fig.add_shape(
            type="rect",
            xref="paper", yref="paper",
            x0=bar_x0, x1=bar_x1,
            y0=1.0 - (step + 1) / n_steps,
            y1=1.0 - step / n_steps,
            fillcolor=color,
            line=dict(width=0),
        )

    # Percentages for export (categories x businesses).
    export_data = pd.DataFrame(pct_matrix, index=short_cat_names, columns=businesses)
    export_data.index.name = "Category"

    return fig, export_data


# 
# NEGATIVE CATEGORIES HEATMAP
# 

# Identify negative category columns
# Negative category columns: config-derived list first, keyword fallback second.
neg_cat_cols = [c for c in neg_cats if c in df.columns]

if not neg_cat_cols:
    # Fallback: category columns whose name contains a negative-sounding keyword.
    neg_keywords = ["mauvais", "lent", "problème", "négatif", "attente", "incompétent", "erreur", "refus"]
    neg_cat_cols = [c for c in category_cols if any(kw in c.lower() for kw in neg_keywords)]

if neg_cat_cols:
    # NOTE(review): heading level and color were truncated in the source;
    # <h3> and #C0392B are assumed - confirm.
    display(HTML("<h3 style='color:#C0392B; font-family: Trebuchet MS;'>Facteurs Négatifs (+)</h3>"))

    fig_neg, data_neg = create_category_heatmap_bcg(
        df, business_col, neg_cat_cols,
        title="Facteurs Négatifs (+)",
        is_negative=True,
    )

    if fig_neg:
        saved = show_fig(fig_neg, "negative_categories_heatmap", "eda")
        save_table(data_neg.reset_index(), "negative_categories_data", "eda")
        # Report the real location; ANALYSIS_DIR may come from the project config.
        print(f"\n Saved to: {saved['html']}")
else:
    print(" No negative category columns detected")



In [None]:
# 
# POSITIVE CATEGORIES HEATMAP
# 

# Identify positive category columns
# Positive category columns: config-derived list first, keyword fallback second.
pos_cat_cols = [c for c in pos_cats if c in df.columns]

if not pos_cat_cols:
    # Fallback: category columns whose name contains a positive-sounding keyword.
    pos_keywords = ["bon", "excellent", "rapide", "positif", "accueil", "professionnel", "efficace", "qualité"]
    pos_cat_cols = [c for c in category_cols if any(kw in c.lower() for kw in pos_keywords)]

if pos_cat_cols:
    # NOTE(review): heading level and color were truncated in the source;
    # <h3> and #27AE60 are assumed - confirm.
    display(HTML("<h3 style='color:#27AE60; font-family: Trebuchet MS;'>Facteurs Positifs (+)</h3>"))

    fig_pos, data_pos = create_category_heatmap_bcg(
        df, business_col, pos_cat_cols,
        title="Facteurs Positifs (+)",
        is_negative=False,
    )

    if fig_pos:
        saved = show_fig(fig_pos, "positive_categories_heatmap", "eda")
        save_table(data_pos.reset_index(), "positive_categories_data", "eda")
        # Report the real location; ANALYSIS_DIR may come from the project config.
        print(f"\n Saved to: {saved['html']}")
else:
    print(" No positive category columns detected")


---
## Regional / City View

- **Bar Chart**: Average rating by city (top 0)
- **Table**: Per-city KPIs (reviews, avg rating, % positive/negative)
- **Heatmap**: Business × City performance matrix


In [None]:
# 
# REGIONAL / CITY VIEW
# 

# Thresholds for the regional view, named so chart titles cannot drift from
# the filters actually applied.
# NOTE(review): the original numeric thresholds, chart sizes and rounding were
# truncated in the source; every value below is an assumption - confirm.
MIN_REVIEWS_PER_CITY = 10     # cities below this are left out of the bar chart
TOP_N_CITIES_CHART = 20       # bars shown in the city ranking
CITY_KPI_PREVIEW_ROWS = 20    # rows displayed inline (the CSV has all rows)
TOP_N_CITIES_MATRIX = 10      # columns of the business x city matrix
MIN_REVIEWS_PER_CELL = 5      # cells with fewer reviews are left blank

if city_col:
    display(HTML("<h3>Regional Analysis</h3>"))
    has_sentiment = "sentiment" in df.columns

    # ------------------------------------------------------------------
    # Average rating by city (best-rated cities with enough reviews)
    # ------------------------------------------------------------------
    if rating_col:
        city_stats = (
            df.groupby(city_col)
            .agg(avg_rating=(rating_col, "mean"), n_reviews=(rating_col, "size"))
            .reset_index()
        )
        city_stats = city_stats[city_stats["n_reviews"] >= MIN_REVIEWS_PER_CITY]
        # Ascending sort + tail keeps the best cities, with the best bar on top.
        city_stats = city_stats.sort_values("avg_rating", ascending=True).tail(TOP_N_CITIES_CHART)

        if city_stats.empty:
            print(f" No city has at least {MIN_REVIEWS_PER_CITY} reviews; city ranking skipped")
        else:
            fig = px.bar(
                city_stats,
                x="avg_rating",
                y=city_col,
                orientation="h",
                title=(
                    "<b>Average Rating by City</b><br>"
                    f"<sup>Top {TOP_N_CITIES_CHART} cities with ≥{MIN_REVIEWS_PER_CITY} reviews</sup>"
                ),
                color="avg_rating",
                color_continuous_scale=[[0, BCG_RED], [0.5, BCG_YELLOW], [1, BCG_GREEN]],
                range_color=[1, 5],
                text="n_reviews",
            )
            fig.update_traces(texttemplate="%{x:.2f} (n=%{text})", textposition="outside")
            fig.update_layout(
                height=max(400, len(city_stats) * 25),
                xaxis_title="Average Rating",
                yaxis_title="",
                coloraxis_showscale=False,
                xaxis=dict(range=[0, 5.5]),  # head-room for the outside labels
            )
            show_fig(fig, "city_avg_rating", "eda")

    # ------------------------------------------------------------------
    # City KPIs table
    # ------------------------------------------------------------------
    city_kpi_rows = []
    # groupby skips rows without a city, like the original dropna().unique().
    for city, sub in df.groupby(city_col, sort=False):
        avg_rating = sub[rating_col].mean() if rating_col else np.nan
        city_kpi_rows.append({
            "City": city,
            "Reviews": len(sub),
            "Businesses": int(sub[business_col].nunique()) if business_col else "-",
            "Avg Rating": round(avg_rating, 2) if not pd.isna(avg_rating) else "-",
            "% Positive": f"{(sub['sentiment'] == 'Positif').mean() * 100:.1f}%" if has_sentiment else "-",
            "% Negative": f"{(sub['sentiment'] == 'Négatif').mean() * 100:.1f}%" if has_sentiment else "-",
        })

    if city_kpi_rows:
        city_kpi_df = pd.DataFrame(city_kpi_rows).sort_values("Reviews", ascending=False)
        display(HTML("<b>City KPIs (sorted by review count):</b>"))
        display(city_kpi_df.head(CITY_KPI_PREVIEW_ROWS))
        save_table(city_kpi_df, "city_kpis", "eda")

    # ------------------------------------------------------------------
    # Business x city performance matrix
    # ------------------------------------------------------------------
    if business_col and rating_col:
        top_cities = df[city_col].value_counts().head(TOP_N_CITIES_MATRIX).index.tolist()
        businesses = sorted(df[business_col].dropna().unique())

        if top_cities and businesses:
            # One grouped pass instead of filtering the frame per cell.
            cell_stats = (
                df[df[city_col].isin(top_cities)]
                .groupby([business_col, city_col])[rating_col]
                .agg(["mean", "size"])
            )
            # Blank out cells backed by too few reviews.
            cell_means = cell_stats["mean"].where(cell_stats["size"] >= MIN_REVIEWS_PER_CELL).round(2)
            matrix_df = cell_means.unstack(city_col).reindex(index=businesses, columns=top_cities)
            matrix_df.index.name = "Business"
            matrix_df.columns.name = None

            fig = go.Figure(data=go.Heatmap(
                z=matrix_df.values,
                x=matrix_df.columns.tolist(),
                y=matrix_df.index.tolist(),
                colorscale=[[0, BCG_RED], [0.5, BCG_YELLOW], [1, BCG_GREEN]],
                zmin=1, zmax=5,
                text=[[f"{v:.2f}" if not pd.isna(v) else "-" for v in row] for row in matrix_df.values],
                texttemplate="%{text}",
                textfont=dict(size=10),
                hovertemplate="Business: %{y}<br>City: %{x}<br>Rating: %{z:.2f}<extra></extra>",
                colorbar=dict(title="Avg Rating"),
            ))

            fig.update_layout(
                title=(
                    "<b>Business × City Performance Matrix</b><br>"
                    f"<sup>Average rating (min {MIN_REVIEWS_PER_CELL} reviews per cell)</sup>"
                ),
                xaxis_title="City",
                yaxis_title="",
                height=max(400, len(businesses) * 40 + 200),
                xaxis=dict(tickangle=-45),
            )

            show_fig(fig, "business_city_heatmap", "eda")
            save_table(matrix_df.reset_index(), "business_city_matrix", "eda")
else:
    print(" City column not detected — Regional view unavailable")


---
## Temporal View

- **Line Chart**: Average rating trend over time (monthly)
- **Line Chart**: Review volume over time (monthly)
- **Multi-line**: Rating trends by business
- **Heatmap**: Monthly sentiment distribution


In [None]:
# 
# TEMPORAL VIEW
# 

if date_col and "_month" in df.columns:
 display(HTML("<h> Temporal Analysis</h>"))
 
 # Filter out rows with invalid dates
 df_temporal = df.dropna(subset=["_month"]).copy()
 
 # 
 # Overall Monthly Trends
 # 
 monthly_agg = df_temporal.groupby("_month").agg(
 avg_rating=(rating_col, "mean") if rating_col else ("_month", "size"),
 n_reviews=(rating_col, "size") if rating_col else ("_month", "size"),
 ).reset_index()
 monthly_agg = monthly_agg.sort_values("_month")
 
 # Average Rating Trend
 if rating_col:
 fig = px.line(
 monthly_agg,
 x="_month",
 y="avg_rating",
 title="<b>Average Rating Over Time</b>",
 markers=True,
 )
 fig.update_traces(line=dict(color=BCG_GREEN, width=), marker=dict(size=8))
 fig.add_hline(y=monthly_agg["avg_rating"].mean(), line_dash="dash", line_color=BCG_GRAY,
 annotation_text=f"Overall avg: {monthly_agg['avg_rating'].mean():.f}")
 fig.update_layout(
 xaxis_title="Month",
 yaxis_title="Average Rating",
 yaxis=dict(range=[, ]),
 height=00,
 )
 show_fig(fig, "temporal_avg_rating", "eda")
 
 # Review Volume Trend
 fig = px.bar(
 monthly_agg,
 x="_month",
 y="n_reviews",
 title="<b>Review Volume Over Time</b>",
 )
 fig.update_traces(marker_color=BCG_TEAL)
 fig.update_layout(
 xaxis_title="Month",
 yaxis_title="Number of Reviews",
 height=00,
 )
 show_fig(fig, "temporal_volume", "eda")
 
 # 
 # Rating Trends by Business
 # 
 if business_col and rating_col:
 monthly_by_biz = df_temporal.groupby(["_month", business_col]).agg(
 avg_rating=(rating_col, "mean"),
 n_reviews=(rating_col, "size"),
 ).reset_index()
 
 fig = px.line(
 monthly_by_biz,
 x="_month",
 y="avg_rating",
 color=business_col,
 title="<b>Rating Trends by Business</b>",
 markers=True,
 )
 fig.update_traces(line=dict(width=), marker=dict(size=))
 fig.update_layout(
 xaxis_title="Month",
 yaxis_title="Average Rating",
 yaxis=dict(range=[, ]),
 height=00,
 legend=dict(orientation="h", yanchor="bottom", y=.0, x=0),
 )
 show_fig(fig, "temporal_rating_by_business", "eda")
 save_table(monthly_by_biz, "temporal_rating_by_business", "eda")
 
 # 
 # Monthly Sentiment Distribution (Stacked Area)
 # 
 if "sentiment" in df_temporal.columns:
 sentiment_monthly = df_temporal.groupby(["_month", "sentiment"]).size().unstack(fill_value=0)
 sentiment_monthly_pct = sentiment_monthly.div(sentiment_monthly.sum(axis=), axis=0) * 00
 sentiment_monthly_pct = sentiment_monthly_pct.reset_index()
 
 # Melt for plotly
 sentiment_long = sentiment_monthly_pct.melt(id_vars="_month", var_name="Sentiment", value_name="Percentage")
 
 sentiment_colors = {
 "Positif": BCG_GREEN,
 "Négatif": BCG_RED,
 "Neutre": BCG_GRAY,
 }
 
 fig = px.area(
 sentiment_long,
 x="_month",
 y="Percentage",
 color="Sentiment",
 color_discrete_map=sentiment_colors,
 title="<b>Sentiment Distribution Over Time</b>",
 )
 fig.update_layout(
 xaxis_title="Month",
 yaxis_title="Percentage (%)",
 yaxis=dict(range=[0, 00]),
 height=00,
 legend=dict(orientation="h", yanchor="bottom", y=.0, x=0., xanchor="center"),
 )
 show_fig(fig, "temporal_sentiment", "eda")
 save_table(sentiment_monthly_pct, "temporal_sentiment_distribution", "eda")
 
 # 
 # INDIVIDUAL BUSINESS TEMPORAL CHARTS (Volume + Rating Dual Axis)
 # 
 # Requires both a business column and a rating column to have been detected.
 if business_col and rating_col:
 # NOTE(review): the heading tag reads `<h>` here; the heading level digit
 # appears to be missing — confirm against the original notebook.
 display(HTML("<h> Individual Business Temporal Analysis</h>"))
 display(HTML("<p><i>Volume bars colored by year, with rating trend line overlay</i></p>"))
 
 # Determine business type from column name
 # ("Bank" when the column name contains "bank", case-insensitively;
 # otherwise the generic "Business"). Used only in the chart titles below.
 business_type = "Bank" if "bank" in business_col.lower() else "Business"
 
 # Define year-based color palette (similar to the reference image)
 # Older years = green, middle = yellow, recent = cyan, most recent months = red outline
 def get_year_color(year, min_year, max_year):
     """Return the bar color for ``year`` based on its position in the year range.

     The position is the linear ratio
     ``(year - min_year) / (max_year - min_year)``: the oldest part of the
     range is green, the middle part yellow and the most recent part light
     blue.

     Args:
         year: Year of the bar being colored.
         min_year: Earliest year present in the data being plotted.
         max_year: Latest year present in the data being plotted.

     Returns:
         A color string usable as a Plotly marker color.
     """
     # Single-year data: there is no range to position the year in.
     if max_year == min_year:
         return BCG_TEAL

     ratio = (year - min_year) / (max_year - min_year)

     # NOTE(review): the lower cut-off read `0.` in this copy of the notebook,
     # which made the green branch unreachable (the ratio is never negative
     # for year >= min_year). 0.3 is restored as the mirror of the 0.7 upper
     # cut-off — confirm against the original notebook.
     if ratio < 0.3:
         return BCG_GREEN  # Older years
     if ratio < 0.7:
         return BCG_YELLOW  # Middle years
     # NOTE(review): this copy had "#6BE9", which is not a 3- or 6-digit hex
     # color; restored as sky blue "#56B4E9" — confirm against the original.
     return "#56B4E9"  # Recent years (cyan/light blue)
 
 # Get current date for highlighting recent months
 # "Recent" is measured from the moment the notebook runs (as written, the
 # last 6 months before pd.Timestamp.now()), not from the latest review date.
 current_date = pd.Timestamp.now()
 recent_cutoff = current_date - pd.DateOffset(months=6)
 
 # Process each business
 # (sorted, de-duplicated, missing names dropped)
 businesses = sorted(df_temporal[business_col].dropna().unique())
 
 for biz_name in businesses:
 biz_data = df_temporal[df_temporal[business_col] == biz_name].copy()
 
 # NOTE(review): the minimum-review threshold after `<` is missing in this
 # copy — restore it from the original notebook.
 if len(biz_data) < : # Skip businesses with too few reviews
 continue
 
 # Aggregate by month
 # (mean rating and review count per month, in chronological order)
 biz_monthly = biz_data.groupby("_month").agg(
 avg_rating=(rating_col, "mean"),
 n_reviews=(rating_col, "size"),
 ).reset_index().sort_values("_month")
 
 # NOTE(review): the minimum-month threshold after `<` is missing as well.
 if len(biz_monthly) < : # Skip if too few months
 continue
 
 # Add year column for coloring
 # (`.dt` access means "_month" must be a datetime-like column)
 biz_monthly["_year"] = biz_monthly["_month"].dt.year
 min_year = biz_monthly["_year"].min()
 max_year = biz_monthly["_year"].max()
 
 # Assign colors based on year
 # NOTE(review): "bar_color" is not read again in the visible code; the bar
 # traces below recompute the color per year instead.
 biz_monthly["bar_color"] = biz_monthly["_year"].apply(
 lambda y: get_year_color(y, min_year, max_year)
 )
 
 # Mark recent months for red outline
 # (assumes "_month" is timezone-naive like recent_cutoff — TODO confirm)
 biz_monthly["is_recent"] = biz_monthly["_month"] >= recent_cutoff
 
 # Create figure with secondary y-axis
 # (primary axis: review volume bars; secondary axis: average rating line)
 fig = make_subplots(specs=[[{"secondary_y": True}]])
 
 # Format x-axis labels as YYYY-MM
 biz_monthly["_month_str"] = biz_monthly["_month"].dt.strftime("%Y-%m")
 
 # Add bars for volume - group by year for legend
 years_in_data = sorted(biz_monthly["_year"].unique())
 
 for year in years_in_data:
 year_data = biz_monthly[biz_monthly["_year"] == year]
 color = get_year_color(year, min_year, max_year)
 
 # Check if any of these months are recent (for red outline)
 # Each month becomes its own single-bar trace so the outline can differ
 # per bar; every trace is named after its year and hidden from the legend.
 # NOTE(review): the outline `width=` value for recent bars and the text
 # color `"#"` are truncated in this copy — restore the intended values.
 # The doubled braces in hovertemplate are f-string escapes that leave
 # Plotly's own %{x} / %{y} placeholders intact.
 for _, row in year_data.iterrows():
 is_recent = row["is_recent"]
 fig.add_trace(
 go.Bar(
 x=[row["_month_str"]],
 y=[row["n_reviews"]],
 name=str(year),
 marker=dict(
 color=color,
 line=dict(
 color=BCG_RED if is_recent else color,
 width= if is_recent else 0
 )
 ),
 text=[int(row["n_reviews"])],
 textposition="outside",
 textfont=dict(size=9, color="#"),
 showlegend=False,
 hovertemplate=f"Month: %{{x}}<br>Volume: %{{y}}<br>Year: {year}<extra></extra>",
 ),
 secondary_y=False,
 )
 
 # Add line for average rating
 # Text labels are thinned out: a point is labeled when its index falls on
 # a regular step, or when it is the maximum or minimum monthly average.
 # NOTE(review): the format precision in `{r:.f}` / `%{y:.f}`, the first
 # argument of `max(, ...)`, the line `width=` and `textfont size=0` are
 # missing or truncated in this copy — restore them before running.
 fig.add_trace(
 go.Scatter(
 x=biz_monthly["_month_str"],
 y=biz_monthly["avg_rating"],
 mode="lines+markers+text",
 name="Rating Moyen",
 line=dict(color=BCG_LIGHT_GREEN, width=),
 marker=dict(size=8, color=BCG_LIGHT_GREEN),
 text=[f"{r:.f}" if i % max(, len(biz_monthly)//8) == 0 or r == biz_monthly["avg_rating"].max() or r == biz_monthly["avg_rating"].min() else "" 
 for i, r in enumerate(biz_monthly["avg_rating"])],
 textposition="top center",
 textfont=dict(size=0, color=BCG_LIGHT_GREEN),
 hovertemplate="Month: %{x}<br>Avg Rating: %{y:.f}<extra></extra>",
 ),
 secondary_y=True,
 )
 
 # Update layout with dark theme similar to reference
 # NOTE(review): "#af8" parses as a 3-digit hex (a light green), which does
 # not match the "dark teal" description — the color looks truncated. The
 # title/font sizes, `x=0.`, `height=0` and `bargap=0.` look truncated too.
 # BCG_FONT is defined outside this section of the notebook.
 fig.update_layout(
 title=dict(
 text=f"<b>Business Type: {business_type} | Business Name: {biz_name}</b>",
 font=dict(size=6, color="white"),
 x=0.,
 xanchor="center",
 ),
 paper_bgcolor="#af8", # Dark teal background
 plot_bgcolor="#af8",
 font=dict(family=BCG_FONT, size=, color="white"),
 height=0,
 margin=dict(l=60, r=60, t=80, b=80),
 showlegend=False,
 bargap=0.,
 )
 
 # Update axes
 # NOTE(review): `tickangle=-`, the `rgba(,,,0.)` grid colors, the axis
 # title `size=` values and the secondary `range=[0, ]` upper bound are
 # missing in this copy — restore them from the original notebook.
 fig.update_xaxes(
 tickangle=-,
 tickfont=dict(size=9, color="#88ccaa"),
 gridcolor="rgba(,,,0.)",
 showgrid=False,
 )
 
 # Primary (left) axis: review volume.
 fig.update_yaxes(
 title_text="Rating Volume",
 title_font=dict(color=BCG_LIGHT_GREEN, size=),
 tickfont=dict(color=BCG_LIGHT_GREEN),
 gridcolor="rgba(,,,0.)",
 secondary_y=False,
 )
 
 # Secondary (right) axis: average rating.
 fig.update_yaxes(
 title_text="Rating Moyen",
 title_font=dict(color=BCG_LIGHT_GREEN, size=),
 tickfont=dict(color=BCG_LIGHT_GREEN),
 range=[0, ],
 gridcolor="rgba(,,,0.)",
 secondary_y=True,
 )
 
 # Sanitize business name for filename
 # (every non-alphanumeric character becomes "_", then lowercased;
 # assumes biz_name is a string — TODO confirm for numeric business ids)
 safe_name = "".join(c if c.isalnum() else "_" for c in biz_name).lower()
 show_fig(fig, f"temporal_business_{safe_name}", "eda")
 
 print(f"\n Individual business temporal charts saved to: data/0_analysis/eda/")
 
 print(f"\n All temporal charts saved to: data/0_analysis/eda/")
else:
 print(" Date column not detected — Temporal view unavailable")


---
## Summary — Outputs Generated

All visualizations and data tables have been saved to `data/0_analysis/eda/`:

| File | Description |
|------|-------------|
| `overview_business_stats.csv` | Business statistics table |
| `overview_avg_rating_by_business.html/png` | Average rating histogram |
| `rating_distribution_stacked.html/png` | Stacked bar chart (% of 1–5 star ratings per business) |
| `rating_distribution_by_business.csv` | Rating distribution data |
| `negative_categories_heatmap.html/png` | Negative categories heatmap |
| `negative_categories_legend.csv` | Legend for negative category IDs |
| `positive_categories_heatmap.html/png` | Positive categories heatmap |
| `positive_categories_legend.csv` | Legend for positive category IDs |
| `city_avg_rating.html/png` | City ratings bar chart |
| `city_kpis.csv` | City KPIs table |
| `business_city_heatmap.html/png` | Business × City matrix |
| `temporal_*.html/png` | Temporal trend charts |
| `temporal_business_*.html/png` | Individual business temporal charts (volume + rating) |
