diff --git a/README.md b/README.md
index 3655b59..8e1a3b7 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ The MCP servers in this demo highlight how each tool can light up widgets by com
 - `pizzaz_server_node/` – MCP server implemented with the official TypeScript SDK.
 - `pizzaz_server_python/` – Python MCP server that returns the Pizzaz widgets.
 - `solar-system_server_python/` – Python MCP server for the 3D solar system widget.
+- `data_explorer_server_python/` – Python MCP server that powers the Data Explorer widget (CSV uploads, filters, charts).
 - `build-all.mts` – Vite build orchestrator that produces hashed bundles for every widget entrypoint.

 ## Prerequisites
@@ -73,7 +74,7 @@ pnpm run dev

 ## Serve the static assets

-All of the MCP servers expect the bundled HTML, JS, and CSS to be served from the local static file server. After every build, start the server before launching any MCP processes:
+All of the MCP servers (except the Data Explorer server) expect the bundled HTML, JS, and CSS to be served from the local static file server. After every build, start the server before launching any MCP processes:

 ```bash
 pnpm run serve
@@ -83,12 +84,15 @@ The assets are exposed at [`http://localhost:4444`](http://localhost:4444) with

 > **Note:** The Python Pizzaz server caches widget HTML with `functools.lru_cache`. If you rebuild or manually edit files in `assets/`, restart the MCP server so it picks up the updated markup.

+> **Note:** The Data Explorer server reads the built widget assets directly from the `assets/` directory, so you still need to run `pnpm run build` whenever you change the frontend, but you don't have to start `pnpm run serve` while that server is running.
+
 ## Run the MCP servers

 The repository ships several demo MCP servers that highlight different widget bundles:

 - **Pizzaz (Node & Python)** – pizza-inspired collection of tools and components
 - **Solar system (Python)** – 3D solar system viewer
+- **Data Explorer (Python)** – interactive CSV upload, profiling, preview, and charting

 ### Pizzaz Node server

@@ -117,6 +121,35 @@ uvicorn solar-system_server_python.main:app --port 8000

 You can reuse the same virtual environment for all Python servers—install the dependencies once and run whichever entry point you need.

+### Data Explorer Python server
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install -r data_explorer_server_python/requirements.txt
+pnpm run build
+uvicorn data_explorer_server_python.main:app --port 8001 --reload
+```
+
+This server accepts CSV uploads, profiles dataset metadata, exposes filtered preview tables, and generates chart-ready aggregates. Built assets are served directly by the MCP server, so rerun `pnpm run build` whenever you update the widget bundle.
+
+#### Data Explorer security configuration
+
+- `DATA_EXPLORER_ALLOWED_UPLOAD_ROOTS` (required for `filePath`/`fileUri` uploads) – list of directories separated by the OS path separator (e.g., `/tmp:/Users/me/datasets`). Path-based uploads are disabled unless this allowlist is set.
+- `DATA_EXPLORER_AUTH_TOKEN` – when set, every HTTP request (including MCP transport) must send `Authorization: Bearer <token>` (see the quick check below).
+- `DATA_EXPLORER_CORS_ALLOW_ORIGINS` – comma-delimited list of origins (e.g., `https://platform.openai.com,https://studio.openai.com`) that should receive CORS headers. CORS is disabled when this variable is unset.
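+
+A quick way to sanity-check the token gate against the built-in `/health` route (the token value below is only a placeholder):
+
+```bash
+export DATA_EXPLORER_AUTH_TOKEN="replace-with-a-long-random-string"
+uvicorn data_explorer_server_python.main:app --port 8001 &
+
+curl http://localhost:8001/health                               # 401 without the header
+curl -H "Authorization: Bearer $DATA_EXPLORER_AUTH_TOKEN" \
+     http://localhost:8001/health                               # {"status": "ok"}
+```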
+ +If you expose the server over the public internet, configure all three variables to avoid leaking local files or running an unauthenticated, cross-origin-accessible endpoint. + +For local development you can continue testing path uploads by pointing the allowlist at directories you control, for example: + +```bash +export DATA_EXPLORER_ALLOWED_UPLOAD_ROOTS="$(pwd)/sample-data:/tmp" +uvicorn data_explorer_server_python.main:app --port 8001 --reload +``` + +Inline (`csvText`) and chunked uploads do not require any of the security environment variables, so you can omit them when doing quick experiments. + ## Testing in ChatGPT To add these apps to ChatGPT, enable [developer mode](https://platform.openai.com/docs/guides/developer-mode), and add your apps in Settings > Connectors. @@ -143,7 +176,7 @@ You can then invoke tools by asking something related. For example, for the Pizz ## Next steps -- Customize the widget data: edit the handlers in `pizzaz_server_node/src`, `pizzaz_server_python/main.py`, or the solar system server to fetch data from your systems. +- Customize the widget data: edit the handlers in `pizzaz_server_node/src`, `pizzaz_server_python/main.py`, `solar-system_server_python`, or `data_explorer_server_python` to fetch data from your systems. - Create your own components and add them to the gallery: drop new entries into `src/` and they will be picked up automatically by the build script. ### Deploy your MCP server diff --git a/data_explorer_server_python/README.md b/data_explorer_server_python/README.md new file mode 100644 index 0000000..f05e1e9 --- /dev/null +++ b/data_explorer_server_python/README.md @@ -0,0 +1,64 @@ +## Data Explorer MCP Server + +This FastMCP server backs the Data Explorer demo widget. It accepts CSV uploads, profiles column metadata, serves preview rows with optional filters, and produces chart-ready aggregates. + +### Prerequisites + +- Python 3.10 or later +- `uv` (recommended) or `pip` +- Frontend assets built via `pnpm run build` (the server loads `assets/data-explorer-*.html`) + +### Setup + +```bash +cd data_explorer_server_python +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +### Run + +```bash +uvicorn data_explorer_server_python.main:app --port 8001 --reload +``` + +Once built, the server serves the widget's HTML, JS, and CSS directly over MCP resource requests, +so you don't need to run a separate static asset host. Re-run `pnpm run build` whenever you update +the frontend code to refresh the embedded assets. + +When developing & uploading files from your local environment, add allowlist directories you own before starting the server: + +```bash +export DATA_EXPLORER_ALLOWED_UPLOAD_ROOTS="$(pwd)/sample-data:/tmp" +uvicorn data_explorer_server_python.main:app --port 8001 --reload +``` + +If you only use inline (`csvText`) uploads, you can skip that variable. + +Interactive tooling (ChatGPT Apps SDK, `mcp-client`, etc.) can then call the following tools: + +- `data-explorer.open` – returns the widget template and recent dataset summaries. +- `data-explorer.uploadInit` – begin a chunked upload session for large CSVs (returns an `uploadId`). +- `data-explorer.uploadChunk` – append CSV text to a session; mark the final chunk with `isFinal=true` to trigger profiling. +- `data-explorer.upload` – store and profile an uploaded CSV. Supply either `csvText` (inline + string data) or a `filePath`/`fileUri` pointing to a local file when the dataset is already on + disk. 
Path-based uploads require `DATA_EXPLORER_ALLOWED_UPLOAD_ROOTS` to include the directory + that holds the CSV. +- `data-explorer.preview` – fetch filtered table rows with pagination. +- `data-explorer.chart` – build datasets for bar, scatter, or histogram charts. + +Restart the server to clear in-memory datasets. + +### Security hardening + +The server ships with conservative defaults: + +- Path-based uploads are disabled until you set `DATA_EXPLORER_ALLOWED_UPLOAD_ROOTS` (use the OS + path separator, e.g., `:/` on Unix or `;` on Windows) to the directories that should be readable. +- Set `DATA_EXPLORER_AUTH_TOKEN` to require every HTTP request to include `Authorization: Bearer `. +- Provide `DATA_EXPLORER_CORS_ALLOW_ORIGINS` (comma-separated list) to opt into CORS headers for + trusted origins. Leave it empty to block cross-origin callers. + +Always combine these knobs with your preferred network isolation (VPN, tunnel, etc.) before exposing +the MCP server to the wider internet. diff --git a/data_explorer_server_python/__init__.py b/data_explorer_server_python/__init__.py new file mode 100644 index 0000000..d0cfab4 --- /dev/null +++ b/data_explorer_server_python/__init__.py @@ -0,0 +1 @@ +"""Data explorer MCP server package.""" diff --git a/data_explorer_server_python/charts.py b/data_explorer_server_python/charts.py new file mode 100644 index 0000000..024789f --- /dev/null +++ b/data_explorer_server_python/charts.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from typing import Dict, List + +import numpy as np +import pandas as pd + +from .schemas import ( + Aggregation, + BarChartSeries, + ChartConfig, + ChartResponse, + ChartType, + HistogramBin, + ScatterPoint, +) +from .utils import ensure_column_exists, to_python_value + + +def _bar_chart(dataframe: pd.DataFrame, config: ChartConfig) -> List[Dict]: + ensure_column_exists(dataframe, config.x) + group_keys = [config.x] + if config.color: + ensure_column_exists(dataframe, config.color) + group_keys.append(config.color) + + working = dataframe.dropna(subset=[config.x]) + + if config.aggregation == Aggregation.COUNT: + grouped = ( + working.groupby(group_keys, dropna=False).size().reset_index(name="value") + ) + else: + if config.y is None: + raise ValueError("Bar charts with sum/avg require `y` column.") + ensure_column_exists(dataframe, config.y) + numeric_y = pd.to_numeric(working[config.y], errors="coerce") + working = working.assign(**{config.y: numeric_y}).dropna(subset=[config.y]) + grouped = working.groupby(group_keys, dropna=False)[config.y] + if config.aggregation == Aggregation.SUM: + grouped = grouped.sum().reset_index(name="value") + else: + grouped = grouped.mean().reset_index(name="value") + + records: List[Dict] = [] + for _, row in grouped.iterrows(): + base_record = { + "category": to_python_value(row[config.x]), + "value": float(row["value"]) if row["value"] is not None else None, + } + if config.color: + base_record["color"] = to_python_value(row[config.color]) + records.append(base_record) + return records + + +def _scatter_points( + dataframe: pd.DataFrame, config: ChartConfig, limit: int = 500 +) -> List[Dict]: + if config.y is None: + raise ValueError("Scatter charts require `y` column.") + + series_x = pd.to_numeric(ensure_column_exists(dataframe, config.x), errors="coerce") + series_y = pd.to_numeric(ensure_column_exists(dataframe, config.y), errors="coerce") + + working = pd.DataFrame({config.x: series_x, config.y: series_y}) + if config.color and config.color in dataframe.columns: + 
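+        # carry the color column into the working frame so each point keeps its group label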
working[config.color] = dataframe[config.color] + + working = working.dropna(subset=[config.x, config.y]) + working = working.iloc[:limit] + working = working.sort_values(by=config.x) + + points: List[Dict] = [] + for _, row in working.iterrows(): + point = { + "x": float(row[config.x]), + "y": float(row[config.y]), + } + if config.color and config.color in working.columns: + point["color"] = to_python_value(row[config.color]) + points.append(point) + return points + + +def _histogram_bins(dataframe: pd.DataFrame, config: ChartConfig) -> List[Dict]: + series = pd.to_numeric(ensure_column_exists(dataframe, config.x), errors="coerce") + numeric = series.dropna() + if numeric.empty: + return [] + + bin_count = config.bin_count or 10 + counts, bin_edges = np.histogram(numeric, bins=bin_count) + + bins: List[Dict] = [] + for idx in range(len(counts)): + bins.append( + { + "binStart": float(bin_edges[idx]), + "binEnd": float(bin_edges[idx + 1]), + "count": int(counts[idx]), + } + ) + return bins + + +def build_chart_response( + dataframe: pd.DataFrame, config: ChartConfig, dataset_id: str +) -> ChartResponse: + if config.chart_type == ChartType.BAR: + data = _bar_chart(dataframe, config) + series = [BarChartSeries(**item) for item in data] + return ChartResponse( + dataset_id=dataset_id, + chart_type=config.chart_type, + series=series, + config=config, + ) + + if config.chart_type == ChartType.SCATTER: + data = _scatter_points(dataframe, config) + points = [ScatterPoint(**item) for item in data] + return ChartResponse( + dataset_id=dataset_id, + chart_type=config.chart_type, + points=points, + config=config, + ) + + if config.chart_type == ChartType.HISTOGRAM: + data = _histogram_bins(dataframe, config) + bins = [HistogramBin(**item) for item in data] + return ChartResponse( + dataset_id=dataset_id, + chart_type=config.chart_type, + bins=bins, + config=config, + ) + + raise ValueError(f"Unsupported chart type: {config.chart_type}") diff --git a/data_explorer_server_python/filters.py b/data_explorer_server_python/filters.py new file mode 100644 index 0000000..470f408 --- /dev/null +++ b/data_explorer_server_python/filters.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import Iterable + +import pandas as pd + +from .schemas import EqualsFilter, Filter, RangeFilter +from .utils import coerce_value_for_series, ensure_column_exists + + +def apply_filters(dataframe: pd.DataFrame, filters: Iterable[Filter]) -> pd.DataFrame: + filters_list = list(filters) if filters is not None else [] + if not filters_list: + return dataframe + + mask = pd.Series(True, index=dataframe.index) + + for raw_filter in filters_list: + try: + series = ensure_column_exists(dataframe, raw_filter.column) + except KeyError: + # Ignore filters that reference non-existent columns. 
+ continue + + if raw_filter.type == "equals": + equals_filter = ( + raw_filter + if isinstance(raw_filter, EqualsFilter) + else EqualsFilter.model_validate(raw_filter.model_dump()) + ) + value = coerce_value_for_series(series, equals_filter.value) + if value is None or pd.isna(value): + mask &= series.isna() + else: + mask &= series == value + elif raw_filter.type == "range": + range_filter = ( + raw_filter + if isinstance(raw_filter, RangeFilter) + else RangeFilter.model_validate(raw_filter.model_dump()) + ) + if range_filter.min is not None: + min_value = coerce_value_for_series(series, range_filter.min) + mask &= series >= min_value + if range_filter.max is not None: + max_value = coerce_value_for_series(series, range_filter.max) + mask &= series <= max_value + + return dataframe[mask] diff --git a/data_explorer_server_python/main.py b/data_explorer_server_python/main.py new file mode 100644 index 0000000..1a9b652 --- /dev/null +++ b/data_explorer_server_python/main.py @@ -0,0 +1,962 @@ +from __future__ import annotations + +import io +import json +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from threading import Lock +from typing import Any, Dict, List, Optional +from uuid import uuid4 +import logging +import os +import re +import secrets +from urllib.parse import urlparse, unquote + +import pandas as pd +import mcp.types as types +from fastapi import HTTPException +from mcp.server.fastmcp import FastMCP +from pandas.errors import EmptyDataError, ParserError +from pydantic import ValidationError + +from .charts import build_chart_response +from .filters import apply_filters +from .profiling import profile_dataframe +from .schemas import ( + ChartInput, + DatasetProfile, + DatasetSummary, + OpenResponse, + PreviewInput, + PreviewResponse, + UploadChunkInput, + UploadDatasetInput, + UploadDatasetResponse, + UploadInitInput, + UploadInitResponse, +) +from .store import DatasetRecord, DatasetStore +from .utils import dataframe_preview +from starlette.requests import Request +from starlette.responses import JSONResponse + + +ASSET_LOGGER = logging.getLogger(__name__) + +ASSETS_DIR = Path(__file__).resolve().parent.parent / "assets" +COMPONENT_NAME = "data-explorer" +TEMPLATE_URI = f"ui://widget/{COMPONENT_NAME}.html" +SCRIPT_URI = f"ui://widget/{COMPONENT_NAME}.js" +STYLE_URI = f"ui://widget/{COMPONENT_NAME}.css" +MIME_TYPE = "text/html+skybridge" +EMPTY_INPUT_SCHEMA: Dict[str, Any] = { + "type": "object", + "properties": {}, + "additionalProperties": False, +} +MAX_UPLOAD_BYTES = 10 * 1024 * 1024 # 10 MB default limit +UPLOAD_SESSION_TTL_SECONDS = 20 * 60 # 20 minutes +PATH_ALLOWLIST_ENV = "DATA_EXPLORER_ALLOWED_UPLOAD_ROOTS" +AUTH_TOKEN_ENV = "DATA_EXPLORER_AUTH_TOKEN" +CORS_ORIGINS_ENV = "DATA_EXPLORER_CORS_ALLOW_ORIGINS" +PATH_UPLOAD_DISABLED_MESSAGE = ( + "filePath/fileUri uploads are disabled. Provide csvText or set " + f"{PATH_ALLOWLIST_ENV} to allow specific directories." 
+) + +SECURITY_LOGGER = logging.getLogger("data_explorer_server.security") + +_SCRIPT_TAG_PATTERN = re.compile( + r"]*src=\"[^\"]*/(?P[\w.-]+\.js)\"[^>]*>", + re.IGNORECASE, +) + +_STYLESHEET_LINK_PATTERN = re.compile( + r"]*href=\"[^\"]*/(?P[\w.-]+\.css)\"[^>]*/?>", + re.IGNORECASE, +) + + +def _parse_path_allowlist(raw_value: Optional[str]) -> tuple[Path, ...]: + if not raw_value: + return () + + roots: List[Path] = [] + for entry in raw_value.split(os.pathsep): + candidate = entry.strip() + if not candidate: + continue + try: + resolved = Path(candidate).expanduser().resolve(strict=False) + except OSError as exc: + SECURITY_LOGGER.warning( + "Ignoring invalid path in %s (%s): %s", + PATH_ALLOWLIST_ENV, + candidate, + exc, + ) + continue + roots.append(resolved) + return tuple(roots) + + +def _ensure_path_within_allowlist(resolved_path: Path) -> None: + if not _ALLOWED_UPLOAD_ROOTS: + raise HTTPException(status_code=400, detail=PATH_UPLOAD_DISABLED_MESSAGE) + + for root in _ALLOWED_UPLOAD_ROOTS: + try: + resolved_path.relative_to(root) + return + except ValueError: + continue + + raise HTTPException( + status_code=403, + detail="Requested file path is outside the configured allowlist.", + ) + + +def _load_widget_assets() -> tuple[str, Optional[str], Optional[str]]: + base_path = ASSETS_DIR / f"{COMPONENT_NAME}.html" + html_source: Optional[str] = None + + if base_path.exists(): + html_source = base_path.read_text(encoding="utf8") + else: + candidates = sorted(ASSETS_DIR.glob(f"{COMPONENT_NAME}-*.html")) + if candidates: + html_source = candidates[-1].read_text(encoding="utf8") + + if html_source is None: + # Provide minimal shell if assets missing. + return ( + """\n\n \n \n \n \n \n
\n \n \n\n""", + None, + None, + ) + + script_match = _SCRIPT_TAG_PATTERN.search(html_source) + style_match = _STYLESHEET_LINK_PATTERN.search(html_source) + script_text: Optional[str] = None + style_text: Optional[str] = None + + if script_match: + script_filename = script_match.group("filename") + try: + script_text = (ASSETS_DIR / script_filename).read_text(encoding="utf8") + except FileNotFoundError: + ASSET_LOGGER.warning("Script asset missing: %s", script_filename) + except OSError as exc: + ASSET_LOGGER.warning("Failed to read script %s: %s", script_filename, exc) + if script_text is not None: + html_source = _SCRIPT_TAG_PATTERN.sub( + f'', + html_source, + count=1, + ) + + if style_match: + style_filename = style_match.group("filename") + try: + style_text = (ASSETS_DIR / style_filename).read_text(encoding="utf8") + except FileNotFoundError: + ASSET_LOGGER.warning("Stylesheet asset missing: %s", style_filename) + except OSError as exc: + ASSET_LOGGER.warning( + "Failed to read stylesheet %s: %s", style_filename, exc + ) + if style_text is not None: + html_source = _STYLESHEET_LINK_PATTERN.sub( + f'', + html_source, + count=1, + ) + + return html_source, script_text, style_text + + +def _format_size_limit_error(max_bytes: int) -> str: + return f"Upload exceeds maximum size of {max_bytes // (1024 * 1024)} MB." + + +@dataclass +class UploadSession: + upload_id: str + dataset_name: str + delimiter: Optional[str] + has_header: bool + filename: Optional[str] + created_at: datetime + buffer: io.StringIO + byte_count: int = 0 + chunk_count: int = 0 + + def append_chunk( + self, chunk: str, chunk_index: Optional[int], max_bytes: int + ) -> int: + if chunk_index is not None and chunk_index != self.chunk_count: + raise ValueError( + f"Unexpected chunkIndex {chunk_index}; expected {self.chunk_count}." 
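+                # chunks must arrive in order; a mismatched index means one was skipped, repeated, or resent out of sequence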
+ ) + + encoded = chunk.encode("utf-8") + new_total = self.byte_count + len(encoded) + if new_total > max_bytes: + raise HTTPException( + status_code=413, + detail=_format_size_limit_error(max_bytes), + ) + + self.buffer.write(chunk) + self.byte_count = new_total + self.chunk_count += 1 + return self.byte_count + + +class UploadSessionManager: + def __init__(self, max_bytes: int, ttl_seconds: int) -> None: + self._max_bytes = max_bytes + self._ttl_seconds = ttl_seconds + self._sessions: Dict[str, UploadSession] = {} + self._lock = Lock() + + def _cleanup_expired(self) -> None: + now = datetime.now(timezone.utc) + with self._lock: + expired = [ + upload_id + for upload_id, session in self._sessions.items() + if (now - session.created_at).total_seconds() > self._ttl_seconds + ] + for upload_id in expired: + del self._sessions[upload_id] + + def create_session( + self, + dataset_name: str, + delimiter: Optional[str], + has_header: bool, + filename: Optional[str], + ) -> UploadSession: + self._cleanup_expired() + upload_id = str(uuid4()) + session = UploadSession( + upload_id=upload_id, + dataset_name=dataset_name, + delimiter=delimiter, + has_header=has_header, + filename=filename, + created_at=datetime.now(timezone.utc), + buffer=io.StringIO(), + ) + with self._lock: + self._sessions[upload_id] = session + return session + + def append_chunk( + self, upload_id: str, chunk: str, chunk_index: Optional[int] + ) -> UploadSession: + with self._lock: + session = self._sessions.get(upload_id) + if session is None: + raise KeyError(upload_id) + try: + session.append_chunk(chunk, chunk_index, self._max_bytes) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + return session + + def finalize_session(self, upload_id: str) -> UploadSession: + with self._lock: + session = self._sessions.pop(upload_id, None) + if session is None: + raise KeyError(upload_id) + return session + + +def _summary_from_record(record: DatasetRecord) -> DatasetSummary: + profile = DatasetProfile.model_validate(record.profile) + return DatasetSummary( + dataset_id=record.dataset_id, + dataset_name=record.name, + row_count=record.row_count, + column_count=record.column_count, + created_at=record.created_at.isoformat(), + filename=record.filename, + profile=profile, + ) + + +def _json_content(payload: Dict[str, Any]) -> List[types.Content]: + text = json.dumps(payload, default=str) + return [ + types.TextContent( + type="text", + text=text, + ) + ] + + +def _ensure_inline_size_within_limit(csv_text: str, max_bytes: int) -> None: + encoded = csv_text.encode("utf-8") + if len(encoded) > max_bytes: + raise HTTPException(status_code=413, detail=_format_size_limit_error(max_bytes)) + + +def _extract_path_from_payload(payload: UploadDatasetInput) -> Optional[Path]: + if payload.file_path: + return Path(payload.file_path).expanduser() + + if payload.file_uri: + parsed = urlparse(payload.file_uri) + if parsed.scheme not in ("", "file"): + raise HTTPException( + status_code=400, + detail="Only local file URIs (file://) are supported for uploads.", + ) + + allowed_hosts = ("", "localhost", "127.0.0.1") + netloc = parsed.netloc + path_component = unquote(parsed.path or "") + + if os.name == "nt" and netloc not in allowed_hosts: + # Windows file URIs may use the drive letter as the authority component + if len(netloc) == 2 and netloc[1] == ":": + path_component = f"{netloc}{path_component}" + netloc = "" + else: + raise HTTPException( + status_code=400, + detail="File URIs must reference the local 
machine.", + ) + + if netloc not in allowed_hosts: + raise HTTPException( + status_code=400, + detail="File URIs must reference the local machine.", + ) + + if not path_component: + raise HTTPException(status_code=400, detail="fileUri is missing a path.") + + if ( + os.name == "nt" + and path_component.startswith("/") + and len(path_component) > 2 + and path_component[2] == ":" + ): + path_component = path_component.lstrip("/") + + return Path(path_component).expanduser() + + return None + + +def _read_csv_from_path(path: Path, desired_encoding: Optional[str]) -> str: + candidate = path.expanduser() + try: + resolved_path = candidate.resolve(strict=True) + except FileNotFoundError: + raise HTTPException(status_code=404, detail=f"File not found: {candidate}") + except OSError as exc: + raise HTTPException( + status_code=400, detail=f"Unable to resolve upload path: {exc}" + ) from exc + + _ensure_path_within_allowlist(resolved_path) + + if not resolved_path.is_file(): + raise HTTPException( + status_code=400, + detail=f"Upload target is not a file: {resolved_path}", + ) + + file_size = resolved_path.stat().st_size + if file_size == 0: + raise HTTPException(status_code=400, detail="Uploaded file is empty.") + if file_size > MAX_UPLOAD_BYTES: + raise HTTPException( + status_code=413, detail=_format_size_limit_error(MAX_UPLOAD_BYTES) + ) + + raw_bytes = resolved_path.read_bytes() + # Probe encodings in priority order, skipping duplicates + candidate_encodings = [] + if desired_encoding: + candidate_encodings.append(desired_encoding) + candidate_encodings.extend(["utf-8", "utf-8-sig", "latin-1"]) + + seen: set[str] = set() + for encoding in candidate_encodings: + normalized = encoding.lower() + if normalized in seen: + continue + seen.add(normalized) + try: + return raw_bytes.decode(encoding) + except UnicodeDecodeError: + continue + except LookupError: + continue + + raise HTTPException( + status_code=400, + detail=( + "Failed to decode uploaded file. Specify an encoding via the encoding field" + ), + ) + + +def _resolve_upload_source(payload: UploadDatasetInput) -> tuple[str, Optional[str]]: + if payload.csv_text: + _ensure_inline_size_within_limit(payload.csv_text, MAX_UPLOAD_BYTES) + return payload.csv_text, payload.filename + + path = _extract_path_from_payload(payload) + if path is None: + raise HTTPException( + status_code=400, detail="No CSV source provided for upload." 
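+            # defensive: the input model validator already requires one of csvText/filePath/fileUri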
+ ) + + csv_text = _read_csv_from_path( + path, payload.encoding.strip() if payload.encoding else None + ) + filename = payload.filename or path.name + return csv_text, filename + + +def _csv_to_dataframe(csv_text: str, payload: UploadDatasetInput) -> pd.DataFrame: + buffer = io.StringIO(csv_text) + read_kwargs: Dict[str, Any] = { + "sep": payload.delimiter if payload.delimiter else None, + "engine": "python", + } + if not payload.has_header: + read_kwargs["header"] = None + try: + dataframe = pd.read_csv(buffer, **read_kwargs) + except (EmptyDataError, ParserError) as exc: + raise HTTPException( + status_code=400, detail=f"Failed to parse CSV: {exc}" + ) from exc + + if dataframe.empty: + raise HTTPException(status_code=400, detail="Uploaded CSV is empty.") + + dataframe = dataframe.convert_dtypes() + if not payload.has_header: + dataframe.columns = [f"column_{idx + 1}" for idx in range(dataframe.shape[1])] + + return dataframe + + +_WIDGET_HTML, _WIDGET_SCRIPT, _WIDGET_STYLE = _load_widget_assets() + +_ALLOWED_UPLOAD_ROOTS = _parse_path_allowlist(os.getenv(PATH_ALLOWLIST_ENV)) +_AUTH_TOKEN = os.getenv(AUTH_TOKEN_ENV) +_CORS_ALLOWED_ORIGINS = [ + origin.strip() + for origin in (os.getenv(CORS_ORIGINS_ENV) or "").split(",") + if origin.strip() +] + +store = DatasetStore() +upload_manager = UploadSessionManager(MAX_UPLOAD_BYTES, UPLOAD_SESSION_TTL_SECONDS) +mcp = FastMCP(name="data-explorer", stateless_http=True) +WIDGET_HTML = _WIDGET_HTML +logger = logging.getLogger("data_explorer_server") +logging.basicConfig(level=logging.INFO) + + +def _tool_meta() -> Dict[str, Any]: + return { + "openai/outputTemplate": TEMPLATE_URI, + "openai/resultCanProduceWidget": True, + "openai/widgetAccessible": True, + "openai/toolInvocation/invoking": "Profiling dataset", + "openai/toolInvocation/invoked": "Dataset ready", + } + + +@mcp._mcp_server.list_tools() +async def _list_tools() -> List[types.Tool]: + def build_tool( + name: str, title: str, description: str, input_schema: Dict[str, Any] + ) -> types.Tool: + return types.Tool( + name=name, + title=title, + description=description, + inputSchema=json.loads(json.dumps(input_schema)), + _meta=_tool_meta(), + annotations={ + "destructiveHint": False, + "openWorldHint": False, + "readOnlyHint": False, + }, + ) + + return [ + build_tool( + "data-explorer.open", + "Open Data Explorer", + "Mount the data exploration widget.", + EMPTY_INPUT_SCHEMA, + ), + build_tool( + "data-explorer.uploadInit", + "Begin Chunked Upload", + "Start a chunked CSV upload session.", + UploadInitInput.model_json_schema(), + ), + build_tool( + "data-explorer.uploadChunk", + "Upload CSV Chunk", + "Append a CSV chunk to an existing upload session.", + UploadChunkInput.model_json_schema(), + ), + build_tool( + "data-explorer.upload", + "Upload CSV", + "Upload a CSV dataset for profiling and exploration.", + UploadDatasetInput.model_json_schema(), + ), + build_tool( + "data-explorer.preview", + "Get Preview", + "Fetch a filtered preview of the dataset.", + PreviewInput.model_json_schema(), + ), + build_tool( + "data-explorer.chart", + "Build Chart", + "Create chart-ready data using current filters.", + ChartInput.model_json_schema(), + ), + ] + + +@mcp._mcp_server.list_resources() +async def _list_resources() -> List[types.Resource]: + resources = [ + types.Resource( + name="Data Explorer Widget", + title="Data Explorer Widget", + uri=TEMPLATE_URI, + description="HTML bundle for the data explorer widget.", + mimeType=MIME_TYPE, + _meta=_tool_meta(), + ) + ] + if _WIDGET_SCRIPT is not 
None: + resources.append( + types.Resource( + name="Data Explorer Script", + title="Data Explorer Script", + uri=SCRIPT_URI, + description="JavaScript bundle for the data explorer widget.", + mimeType="text/javascript", + _meta=_tool_meta(), + ) + ) + if _WIDGET_STYLE is not None: + resources.append( + types.Resource( + name="Data Explorer Styles", + title="Data Explorer Styles", + uri=STYLE_URI, + description="Stylesheet for the data explorer widget.", + mimeType="text/css", + _meta=_tool_meta(), + ) + ) + return resources + + +@mcp._mcp_server.list_resource_templates() +async def _list_resource_templates() -> List[types.ResourceTemplate]: + templates = [ + types.ResourceTemplate( + name="Data Explorer Widget", + title="Data Explorer Widget", + uriTemplate=TEMPLATE_URI, + description="HTML bundle for the data explorer widget.", + mimeType=MIME_TYPE, + _meta=_tool_meta(), + ) + ] + if _WIDGET_SCRIPT is not None: + templates.append( + types.ResourceTemplate( + name="Data Explorer Script", + title="Data Explorer Script", + uriTemplate=SCRIPT_URI, + description="JavaScript bundle for the data explorer widget.", + mimeType="text/javascript", + _meta=_tool_meta(), + ) + ) + if _WIDGET_STYLE is not None: + templates.append( + types.ResourceTemplate( + name="Data Explorer Styles", + title="Data Explorer Styles", + uriTemplate=STYLE_URI, + description="Stylesheet for the data explorer widget.", + mimeType="text/css", + _meta=_tool_meta(), + ) + ) + return templates + + +async def _read_resource(req: types.ReadResourceRequest) -> types.ServerResult: + uri = str(req.params.uri) + if uri == TEMPLATE_URI: + return types.ServerResult( + types.ReadResourceResult( + contents=[ + types.TextResourceContents( + text=WIDGET_HTML, + uri=TEMPLATE_URI, + mimeType=MIME_TYPE, + _meta=_tool_meta(), + ) + ] + ) + ) + + if uri == SCRIPT_URI and _WIDGET_SCRIPT is not None: + return types.ServerResult( + types.ReadResourceResult( + contents=[ + types.TextResourceContents( + text=_WIDGET_SCRIPT, + uri=SCRIPT_URI, + mimeType="text/javascript", + _meta=_tool_meta(), + ) + ] + ) + ) + + if uri == STYLE_URI and _WIDGET_STYLE is not None: + return types.ServerResult( + types.ReadResourceResult( + contents=[ + types.TextResourceContents( + text=_WIDGET_STYLE, + uri=STYLE_URI, + mimeType="text/css", + _meta=_tool_meta(), + ) + ] + ) + ) + + return types.ServerResult( + types.ReadResourceResult( + contents=[], + _meta={"error": f"Unknown resource: {req.params.uri}"}, + ) + ) + + +def _handle_open() -> OpenResponse: + datasets = [_summary_from_record(record) for record in store.list_recent()] + response = OpenResponse( + datasets=datasets, + active_dataset_id=datasets[0].dataset_id if datasets else None, + supports_chunk_upload=True, + max_upload_bytes=MAX_UPLOAD_BYTES, + ) + return response + + +def _process_dataset_upload(payload: UploadDatasetInput) -> UploadDatasetResponse: + csv_text, inferred_filename = _resolve_upload_source(payload) + dataframe = _csv_to_dataframe(csv_text, payload) + resolved_filename = payload.filename or inferred_filename + profile = profile_dataframe(dataframe) + record = store.create( + name=payload.dataset_name, + filename=resolved_filename, + dataframe=dataframe, + profile=profile, + ) + summary = _summary_from_record(record) + preview_rows = dataframe_preview(record.dataframe, limit=20) + + return UploadDatasetResponse( + dataset=summary, + preview=preview_rows, + columns=list(record.dataframe.columns), + ) + + +def _handle_upload(args: Dict[str, Any]) -> UploadDatasetResponse: + payload = 
UploadDatasetInput.model_validate(args) + return _process_dataset_upload(payload) + + +def _handle_upload_init(args: Dict[str, Any]) -> UploadInitResponse: + logger.info("Handling uploadInit with args keys: %s", list(args.keys())) + if "csvText" in args: + logger.info( + "uploadInit received csvText; treating request as direct upload for backwards compatibility" + ) + dataset_args = { + "datasetName": args.get("datasetName"), + "csvText": args.get("csvText"), + "delimiter": args.get("delimiter"), + "hasHeader": args.get("hasHeader", True), + "filename": args.get("filename"), + } + dataset_payload = UploadDatasetInput.model_validate(dataset_args) + return _process_dataset_upload(dataset_payload) + + payload = UploadInitInput.model_validate(args) + dataset_name = (payload.dataset_name or "").strip() + session = upload_manager.create_session( + dataset_name=dataset_name, + delimiter=payload.delimiter, + has_header=payload.has_header, + filename=payload.filename, + ) + return UploadInitResponse(uploadId=session.upload_id) + + +def _handle_upload_chunk(args: Dict[str, Any]) -> Dict[str, Any]: + payload = UploadChunkInput.model_validate(args) + try: + session = upload_manager.append_chunk( + payload.upload_id, payload.chunk_text, payload.chunk_index + ) + except KeyError: + raise HTTPException( + status_code=404, detail=f"Unknown uploadId: {payload.upload_id}" + ) from None + + total_bytes = session.byte_count + + if not payload.is_final: + return { + "uploadId": session.upload_id, + "receivedBytes": total_bytes, + "isFinalized": False, + } + + finalized_session = upload_manager.finalize_session(payload.upload_id) + dataset_payload = UploadDatasetInput( + datasetName=finalized_session.dataset_name, + csvText=finalized_session.buffer.getvalue(), + delimiter=finalized_session.delimiter, + hasHeader=finalized_session.has_header, + filename=finalized_session.filename, + ) + response = _process_dataset_upload(dataset_payload) + result = response.model_dump(by_alias=True) + result["uploadId"] = finalized_session.upload_id + result["isFinalized"] = True + result["receivedBytes"] = total_bytes + return result + + +def _handle_preview(args: Dict[str, Any]) -> PreviewResponse: + payload = PreviewInput.model_validate(args) + record = store.get(payload.dataset_id) + filtered = apply_filters(record.dataframe, payload.filters) + preview_rows = dataframe_preview( + filtered, limit=payload.limit, offset=payload.offset + ) + + response = PreviewResponse( + dataset_id=record.dataset_id, + total_rows=int(filtered.shape[0]), + rows=preview_rows, + columns=list(filtered.columns), + applied_filters=payload.filters, + ) + return response + + +def _handle_chart(args: Dict[str, Any]) -> Dict[str, Any]: + payload = ChartInput.model_validate(args) + record = store.get(payload.dataset_id) + filtered = apply_filters(record.dataframe, payload.filters) + if filtered.empty: + chart_response = { + "datasetId": record.dataset_id, + "chartType": payload.config.chart_type.value, + "config": payload.config.model_dump(by_alias=True), + "series": [], + "points": [], + "bins": [], + } + else: + chart_model = build_chart_response(filtered, payload.config, record.dataset_id) + chart_response = chart_model.model_dump(by_alias=True) + + return chart_response + + +async def _on_read_resource(req: types.ReadResourceRequest) -> types.ServerResult: + return await _read_resource(req) + + +async def _on_call_tool(req: types.CallToolRequest) -> types.ServerResult: + tool_name = req.params.name + args = req.params.arguments or {} + logger.info( + 
"Received tool call: name=%s args=%s", tool_name, json.dumps(args, default=str) + ) + + try: + if tool_name == "data-explorer.open": + response = _handle_open() + payload = response.model_dump(by_alias=True) + elif tool_name == "data-explorer.uploadInit": + response = _handle_upload_init(args) + payload = response.model_dump(by_alias=True) + elif tool_name == "data-explorer.uploadChunk": + payload = _handle_upload_chunk(args) + elif tool_name == "data-explorer.upload": + response = _handle_upload(args) + payload = response.model_dump(by_alias=True) + elif tool_name == "data-explorer.preview": + response = _handle_preview(args) + payload = response.model_dump(by_alias=True) + elif tool_name == "data-explorer.chart": + payload = _handle_chart(args) + else: + return types.ServerResult( + types.CallToolResult( + content=[ + types.TextContent( + type="text", text=f"Unknown tool: {tool_name}" + ) + ], + isError=True, + ) + ) + except ValidationError as exc: + logger.warning( + "Validation error while handling tool '%s': %s", + tool_name, + exc.errors(), + ) + return types.ServerResult( + types.CallToolResult( + content=[ + types.TextContent( + type="text", + text=json.dumps( + {"error": "validation_error", "details": exc.errors()} + ), + ) + ], + isError=True, + ) + ) + except HTTPException as exc: + logger.warning( + "HTTPException while handling tool '%s': %s", + tool_name, + exc.detail, + ) + return types.ServerResult( + types.CallToolResult( + content=[ + types.TextContent( + type="text", + text=json.dumps({"error": exc.detail}), + ) + ], + isError=True, + ) + ) + except Exception as exc: # pragma: no cover - defensive + logger.exception("Unhandled error while handling tool '%s'", tool_name) + return types.ServerResult( + types.CallToolResult( + content=[ + types.TextContent( + type="text", + text=json.dumps( + {"error": "internal_error", "details": str(exc)} + ), + ) + ], + isError=True, + ) + ) + + return types.ServerResult( + types.CallToolResult( + content=_json_content(payload), + structuredContent=payload, + _meta=_tool_meta(), + ) + ) + + +mcp._mcp_server.request_handlers[types.ReadResourceRequest] = _on_read_resource +mcp._mcp_server.request_handlers[types.CallToolRequest] = _on_call_tool + +app = mcp.streamable_http_app() + + +@app.middleware("http") +async def _enforce_bearer_token(request: Request, call_next): + if not _AUTH_TOKEN: + return await call_next(request) + + header_value = request.headers.get("authorization") + if not header_value or not header_value.startswith("Bearer "): + return JSONResponse({"error": "Missing bearer token"}, status_code=401) + + provided = header_value.split(" ", 1)[1].strip() + if not provided or not secrets.compare_digest(provided, _AUTH_TOKEN): + return JSONResponse({"error": "Invalid bearer token"}, status_code=403) + + return await call_next(request) + + +async def _health_endpoint(request) -> JSONResponse: + return JSONResponse({"status": "ok"}) + + +@app.middleware("http") +async def _log_requests(request: Request, call_next): + response = await call_next(request) + logger.info("%s %s -> %s", request.method, request.url.path, response.status_code) + return response + + +app.add_route("/health", _health_endpoint, methods=["GET"]) + +try: + from starlette.middleware.cors import CORSMiddleware + + if _CORS_ALLOWED_ORIGINS: + app.add_middleware( + CORSMiddleware, + allow_origins=_CORS_ALLOWED_ORIGINS, + allow_methods=["*"], + allow_headers=["*"], + allow_credentials=False, + ) +except Exception: # pragma: no cover - optional dependency + pass + + 
+__all__ = ["app", "mcp", "store"] + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run("data_explorer_server_python.main:app", host="0.0.0.0", port=8001) diff --git a/data_explorer_server_python/profiling.py b/data_explorer_server_python/profiling.py new file mode 100644 index 0000000..e2320de --- /dev/null +++ b/data_explorer_server_python/profiling.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +from typing import Any, Dict, List + +import pandas as pd + +from .utils import infer_role, series_sample, to_python_value, top_value_counts + + +def _numeric_stats(series: pd.Series) -> Dict[str, Any]: + clean = series.dropna().astype("float64") + if clean.empty: + return { + "min": None, + "max": None, + "mean": None, + "median": None, + "stdDev": None, + } + return { + "min": to_python_value(clean.min()), + "max": to_python_value(clean.max()), + "mean": to_python_value(clean.mean()), + "median": to_python_value(clean.median()), + "stdDev": to_python_value(clean.std(ddof=0)), + } + + +def _datetime_stats(series: pd.Series) -> Dict[str, Any]: + clean = pd.to_datetime(series.dropna(), errors="coerce") + clean = clean.dropna() + if clean.empty: + return {"min": None, "max": None} + return { + "min": to_python_value(clean.min()), + "max": to_python_value(clean.max()), + } + + +def profile_dataframe(dataframe: pd.DataFrame) -> Dict[str, Any]: + frame = dataframe.copy() + frame = frame.convert_dtypes() + columns: List[Dict[str, Any]] = [] + + total_rows = int(frame.shape[0]) + + for column in frame.columns: + series = frame[column] + role = infer_role(series) + missing = int(series.isna().sum()) + non_null = total_rows - missing + distinct = int(series.nunique(dropna=True)) + column_profile: Dict[str, Any] = { + "name": column, + "role": role, + "dtype": str(series.dtype), + "nonNullCount": non_null, + "missingCount": missing, + "missingProportion": float(missing / total_rows) if total_rows else 0.0, + "distinctCount": distinct, + "sampleValues": series_sample(series, limit=5), + } + + if role == "numeric": + column_profile["stats"] = _numeric_stats(series) + elif role == "datetime": + column_profile["stats"] = _datetime_stats(series) + + if role in {"categorical", "text", "boolean"}: + column_profile["topValues"] = top_value_counts(series, limit=5) + + columns.append(column_profile) + + profile = { + "rowCount": total_rows, + "columnCount": int(frame.shape[1]), + "columns": columns, + "memoryUsageBytes": int(frame.memory_usage(deep=True).sum()), + } + + return profile diff --git a/data_explorer_server_python/requirements.txt b/data_explorer_server_python/requirements.txt new file mode 100644 index 0000000..0b4fc1f --- /dev/null +++ b/data_explorer_server_python/requirements.txt @@ -0,0 +1,6 @@ +fastapi>=0.115.0 +mcp[fastapi]>=0.1.0 +numpy>=1.26.0 +pandas>=2.2.0 +python-dateutil>=2.9.0 +uvicorn>=0.30.0 diff --git a/data_explorer_server_python/schemas.py b/data_explorer_server_python/schemas.py new file mode 100644 index 0000000..a324837 --- /dev/null +++ b/data_explorer_server_python/schemas.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +from enum import Enum +from typing import Annotated, Any, Dict, List, Literal, Optional, Union + +from pydantic import BaseModel, ConfigDict, Field, model_validator + + +class ChartType(str, Enum): + BAR = "bar" + SCATTER = "scatter" + HISTOGRAM = "histogram" + + +class Aggregation(str, Enum): + COUNT = "count" + SUM = "sum" + AVG = "avg" + + +class UploadDatasetInput(BaseModel): + dataset_name: str = Field(alias="datasetName", 
min_length=1) + csv_text: Optional[str] = Field(default=None, alias="csvText", min_length=1) + file_path: Optional[str] = Field(default=None, alias="filePath", min_length=1) + file_uri: Optional[str] = Field(default=None, alias="fileUri", min_length=1) + delimiter: Optional[str] = Field(default=None) + has_header: bool = Field(default=True, alias="hasHeader") + filename: Optional[str] = Field(default=None) + encoding: Optional[str] = Field(default=None) + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + @model_validator(mode="after") + def _ensure_source(self) -> UploadDatasetInput: # type: ignore[override] + if not (self.csv_text or self.file_path or self.file_uri): + raise ValueError( + "Provide csvText, filePath, or fileUri when uploading a dataset." + ) + return self + + +class UploadInitInput(BaseModel): + dataset_name: str = Field(alias="datasetName", min_length=1) + delimiter: Optional[str] = Field(default=None) + has_header: bool = Field(default=True, alias="hasHeader") + filename: Optional[str] = Field(default=None) + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + +class UploadInitResponse(BaseModel): + upload_id: str = Field(alias="uploadId") + + model_config = ConfigDict(populate_by_name=True) + + +class UploadChunkInput(BaseModel): + upload_id: str = Field(alias="uploadId", min_length=1) + chunk_text: str = Field(alias="chunkText", min_length=1) + is_final: bool = Field(alias="isFinal") + chunk_index: Optional[int] = Field(default=None, alias="chunkIndex", ge=0) + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + +class UploadChunkStatus(BaseModel): + upload_id: str = Field(alias="uploadId") + received_bytes: int = Field(alias="receivedBytes", ge=0) + is_finalized: bool = Field(alias="isFinalized") + dataset: Optional[DatasetSummary] = None + preview: Optional[List[Dict[str, Any]]] = None + columns: Optional[List[str]] = None + + model_config = ConfigDict(populate_by_name=True) + + +class BaseFilter(BaseModel): + column: str + type: str + + model_config = ConfigDict(extra="ignore") + + +class EqualsFilter(BaseFilter): + type: Literal["equals"] = "equals" + value: Any + + +class RangeFilter(BaseFilter): + type: Literal["range"] = "range" + min: Optional[float | str] = None + max: Optional[float | str] = None + + +Filter = Annotated[ + Union[EqualsFilter, RangeFilter], + Field(discriminator="type"), +] + + +class ChartConfig(BaseModel): + chart_type: ChartType = Field(alias="chartType") + x: str + y: Optional[str] = None + color: Optional[str] = None + bin_count: Optional[int] = Field(default=10, alias="binCount") + aggregation: Aggregation = Aggregation.COUNT + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + +class DatasetColumnProfile(BaseModel): + name: str + role: str + dtype: str + non_null_count: int = Field(alias="nonNullCount") + missing_count: int = Field(alias="missingCount") + missing_proportion: float = Field(alias="missingProportion") + distinct_count: int = Field(alias="distinctCount") + sample_values: List[Any] = Field(alias="sampleValues") + stats: Optional[Dict[str, Any]] = None + top_values: Optional[List[Dict[str, Any]]] = Field(default=None, alias="topValues") + + model_config = ConfigDict(populate_by_name=True) + + +class DatasetProfile(BaseModel): + row_count: int = Field(alias="rowCount") + column_count: int = Field(alias="columnCount") + columns: List[DatasetColumnProfile] + memory_usage_bytes: int = Field(alias="memoryUsageBytes") + + model_config = 
ConfigDict(populate_by_name=True) + + +class DatasetSummary(BaseModel): + dataset_id: str = Field(alias="datasetId") + dataset_name: str = Field(alias="datasetName") + row_count: int = Field(alias="rowCount") + column_count: int = Field(alias="columnCount") + created_at: str = Field(alias="createdAt") + filename: Optional[str] = None + + profile: DatasetProfile + + model_config = ConfigDict(populate_by_name=True) + + +class UploadDatasetResponse(BaseModel): + dataset: DatasetSummary + preview: List[Dict[str, Any]] + columns: List[str] + + +class PreviewInput(BaseModel): + dataset_id: str = Field(alias="datasetId") + limit: int = Field(default=20, ge=1, le=200) + offset: int = Field(default=0, ge=0) + filters: List[Filter] = Field(default_factory=list) + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + +class PreviewResponse(BaseModel): + dataset_id: str = Field(alias="datasetId") + total_rows: int = Field(alias="totalRows") + rows: List[Dict[str, Any]] + columns: List[str] + applied_filters: List[Filter] = Field(alias="appliedFilters") + + model_config = ConfigDict(populate_by_name=True) + + +class ChartInput(BaseModel): + dataset_id: str = Field(alias="datasetId") + config: ChartConfig + filters: List[Filter] = Field(default_factory=list) + + model_config = ConfigDict(populate_by_name=True, extra="ignore") + + +class BarChartSeries(BaseModel): + category: Any + value: float + color: Optional[str] = None + + +class ScatterPoint(BaseModel): + x: float + y: float + color: Optional[str] = None + + +class HistogramBin(BaseModel): + bin_start: float = Field(alias="binStart") + bin_end: float = Field(alias="binEnd") + count: int + + +class ChartResponse(BaseModel): + dataset_id: str = Field(alias="datasetId") + chart_type: ChartType = Field(alias="chartType") + series: Optional[List[BarChartSeries]] = None + points: Optional[List[ScatterPoint]] = None + bins: Optional[List[HistogramBin]] = None + config: ChartConfig + + model_config = ConfigDict(populate_by_name=True) + + +class OpenResponse(BaseModel): + datasets: List[DatasetSummary] + active_dataset_id: Optional[str] = Field(default=None, alias="activeDatasetId") + supports_chunk_upload: bool = Field(default=False, alias="supportsChunkUpload") + max_upload_bytes: int = Field(default=0, alias="maxUploadBytes") + + model_config = ConfigDict(populate_by_name=True) diff --git a/data_explorer_server_python/store.py b/data_explorer_server_python/store.py new file mode 100644 index 0000000..ebda2a2 --- /dev/null +++ b/data_explorer_server_python/store.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, timezone +from threading import Lock +from typing import Dict, Iterable, Optional +from uuid import uuid4 + +import pandas as pd + + +@dataclass(slots=True) +class DatasetRecord: + dataset_id: str + name: str + filename: Optional[str] + created_at: datetime + updated_at: datetime + dataframe: pd.DataFrame + profile: Dict + row_count: int + column_count: int + + +class DatasetStore: + """In-memory dataset registry keyed by dataset id.""" + + def __init__(self) -> None: + self._datasets: Dict[str, DatasetRecord] = {} + self._lock = Lock() + + def create( + self, + *, + name: str, + filename: Optional[str], + dataframe: pd.DataFrame, + profile: Dict, + ) -> DatasetRecord: + dataset_id = str(uuid4()) + now = datetime.now(timezone.utc) + record = DatasetRecord( + dataset_id=dataset_id, + name=name, + filename=filename, + created_at=now, + updated_at=now, + 
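+            # the parsed frame is kept in memory; preview and chart tools slice it on demand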
dataframe=dataframe, + profile=profile, + row_count=int(dataframe.shape[0]), + column_count=int(dataframe.shape[1]), + ) + with self._lock: + self._datasets[dataset_id] = record + return record + + def get(self, dataset_id: str) -> DatasetRecord: + with self._lock: + record = self._datasets.get(dataset_id) + if record is None: + raise KeyError(dataset_id) + return record + + def update_profile(self, dataset_id: str, profile: Dict) -> None: + with self._lock: + if dataset_id not in self._datasets: + raise KeyError(dataset_id) + record = self._datasets[dataset_id] + record.profile = profile + record.row_count = int(record.dataframe.shape[0]) + record.column_count = int(record.dataframe.shape[1]) + record.updated_at = datetime.now(timezone.utc) + + def list_recent(self, limit: int = 5) -> Iterable[DatasetRecord]: + with self._lock: + records = list(self._datasets.values()) + records.sort(key=lambda rec: rec.updated_at, reverse=True) + return records[:limit] + + def clear(self) -> None: + with self._lock: + self._datasets.clear() diff --git a/data_explorer_server_python/tests/test_server.py b/data_explorer_server_python/tests/test_server.py new file mode 100644 index 0000000..0203443 --- /dev/null +++ b/data_explorer_server_python/tests/test_server.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import math +import tempfile +import unittest +from pathlib import Path + +from fastapi import HTTPException + +from data_explorer_server_python import main +from data_explorer_server_python.schemas import ChartInput, PreviewInput + + +class DataExplorerServerTests(unittest.TestCase): + def setUp(self) -> None: + main.store.clear() + + def test_health_endpoint(self) -> None: + from fastapi.testclient import TestClient + + client = TestClient(main.app) + response = client.get("/health") + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json(), {"status": "ok"}) + + def test_widget_html_inlines_assets(self) -> None: + html = main.WIDGET_HTML + self.assertNotIn("http://localhost:4444", html) + if main._WIDGET_SCRIPT is not None: + self.assertIn(main.SCRIPT_URI, html) + else: + self.assertIn(" None: + payload = { + "datasetId": "abc123", + "filters": [ + {"type": "equals", "column": "city", "value": "SF"}, + {"type": "range", "column": "value", "min": 5, "max": 15}, + ], + } + + model = PreviewInput.model_validate(payload) + self.assertEqual(len(model.filters), 2) + self.assertEqual(model.filters[0].type, "equals") + self.assertEqual(model.filters[1].type, "range") + + def test_upload_preview_and_chart_flow(self) -> None: + upload_response = main._handle_upload( + { + "datasetName": "Sample Cities", + "csvText": "city,value\nSF,10\nNYC,20\nSF,12\nLA,8\n", + "filename": "cities.csv", + } + ) + + dataset_id = upload_response.dataset.dataset_id + self.assertEqual(upload_response.dataset.row_count, 4) + self.assertIn("city", upload_response.columns) + self.assertIn("value", upload_response.columns) + + preview_response = main._handle_preview( + { + "datasetId": dataset_id, + "limit": 10, + "offset": 0, + "filters": [{"type": "equals", "column": "city", "value": "SF"}], + } + ) + self.assertEqual(preview_response.total_rows, 2) + self.assertTrue(all(row["city"] == "SF" for row in preview_response.rows)) + + chart_payload = ChartInput.model_validate( + { + "datasetId": dataset_id, + "config": { + "chartType": "bar", + "x": "city", + "aggregation": "count", + }, + "filters": [], + } + ) + + chart_response = main._handle_chart(chart_payload.model_dump(by_alias=True)) + 
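+        # chart handlers return plain dicts keyed by the camelCase aliases used on the wire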
self.assertEqual(chart_response["chartType"], "bar") + categories = { + item["category"]: item["value"] for item in chart_response["series"] + } + self.assertEqual(categories["SF"], 2) + + histogram_response = main._handle_chart( + { + "datasetId": dataset_id, + "config": { + "chartType": "histogram", + "x": "value", + "binCount": 2, + }, + "filters": [], + } + ) + self.assertEqual(histogram_response["chartType"], "histogram") + total_counts = sum(bin_["count"] for bin_ in histogram_response["bins"]) + self.assertEqual(total_counts, 4) + self.assertTrue( + all(math.isfinite(bin_["binStart"]) for bin_ in histogram_response["bins"]) + ) + + def test_chunked_upload_flow(self) -> None: + init_response = main._handle_upload_init( + { + "datasetName": "Chunked Dataset", + "filename": "chunked.csv", + "hasHeader": True, + } + ) + upload_id = init_response.upload_id + status = main._handle_upload_chunk( + { + "uploadId": upload_id, + "chunkText": "city,value\nSF,10\nNYC", + "isFinal": False, + "chunkIndex": 0, + } + ) + self.assertFalse(status["isFinalized"]) + self.assertGreater(status["receivedBytes"], 0) + + final = main._handle_upload_chunk( + { + "uploadId": upload_id, + "chunkText": ",20\nLA,8\n", + "isFinal": True, + "chunkIndex": 1, + } + ) + self.assertTrue(final["isFinalized"]) + self.assertEqual(final["dataset"]["datasetName"], "Chunked Dataset") + self.assertEqual(final["dataset"]["rowCount"], 3) + + def test_chunked_upload_ignores_future_fields(self) -> None: + init_response = main._handle_upload_init( + { + "datasetName": "Future Fields", + "filename": "chunked.csv", + "hasHeader": True, + "chunkSize": 1234, + "someNewField": "value", + } + ) + status = main._handle_upload_chunk( + { + "uploadId": init_response.upload_id, + "chunkText": "city,value\nSF,10\n", + "isFinal": True, + "chunkIndex": 0, + "newFlag": True, + } + ) + self.assertTrue(status["isFinalized"]) + self.assertEqual(status["dataset"]["datasetName"], "Future Fields") + + def test_upload_via_file_path(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = Path(tmpdir) / "cities.csv" + csv_path.write_text("city,value\nSF,10\nNYC,20\n", encoding="utf-8") + + original_roots = main._ALLOWED_UPLOAD_ROOTS + main._ALLOWED_UPLOAD_ROOTS = (Path(tmpdir).resolve(),) + try: + upload_response = main._handle_upload( + { + "datasetName": "Path Upload", + "filePath": str(csv_path), + } + ) + finally: + main._ALLOWED_UPLOAD_ROOTS = original_roots + + self.assertEqual(upload_response.dataset.dataset_name, "Path Upload") + self.assertEqual(upload_response.dataset.filename, "cities.csv") + self.assertEqual(upload_response.dataset.row_count, 2) + + def test_upload_via_file_path_rejected_without_allowlist(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + csv_path = Path(tmpdir) / "cities.csv" + csv_path.write_text("city,value\nSF,10\nNYC,20\n", encoding="utf-8") + + original_roots = main._ALLOWED_UPLOAD_ROOTS + main._ALLOWED_UPLOAD_ROOTS = () + + try: + with self.assertRaises(HTTPException): + main._handle_upload( + { + "datasetName": "Blocked Path", + "filePath": str(csv_path), + } + ) + finally: + main._ALLOWED_UPLOAD_ROOTS = original_roots + + +if __name__ == "__main__": + unittest.main() diff --git a/data_explorer_server_python/utils.py b/data_explorer_server_python/utils.py new file mode 100644 index 0000000..b91d2a6 --- /dev/null +++ b/data_explorer_server_python/utils.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from datetime import datetime +from typing import Any, List + +import 
+import numpy as np
+import pandas as pd
+
+
+def to_python_value(value: Any) -> Any:
+    """Convert pandas/numpy scalars into JSON-serializable Python primitives."""
+    if value is None:
+        return None
+
+    if isinstance(value, (str, bool, int, float)):
+        return value
+
+    if isinstance(value, datetime):
+        return value.isoformat()
+
+    if isinstance(value, (np.integer,)):
+        return int(value)
+
+    if isinstance(value, (np.floating,)):
+        return float(value)
+
+    if isinstance(value, (np.bool_,)):
+        return bool(value)
+
+    if isinstance(value, (pd.Timestamp, pd.Timedelta)):
+        return value.isoformat()
+
+    if pd.isna(value):
+        return None
+
+    return str(value)
+
+
+def series_sample(series: pd.Series, limit: int = 5) -> List[Any]:
+    values = series.dropna().head(limit).tolist()
+    return [to_python_value(v) for v in values]
+
+
+def dataframe_preview(
+    dataframe: pd.DataFrame, *, limit: int, offset: int = 0
+) -> List[dict]:
+    frame = dataframe.iloc[offset : offset + limit]
+    converted = frame.convert_dtypes()
+    records = converted.to_dict(orient="records")
+    return [{k: to_python_value(v) for k, v in row.items()} for row in records]
+
+
+def coerce_value_for_series(series: pd.Series, value: Any) -> Any:
+    if value is None or pd.isna(value):
+        return value
+
+    dtype = series.dtype
+    try:
+        if pd.api.types.is_numeric_dtype(dtype):
+            if isinstance(value, str) and value.strip() == "":
+                return None
+            return float(value)
+        if pd.api.types.is_bool_dtype(dtype):
+            if isinstance(value, str):
+                lowered = value.strip().lower()
+                if lowered in {"true", "1", "yes"}:
+                    return True
+                if lowered in {"false", "0", "no"}:
+                    return False
+            return bool(value)
+        if pd.api.types.is_datetime64_any_dtype(dtype):
+            return pd.to_datetime(value, errors="coerce")
+    except Exception:
+        return value
+    return value
+
+
+def ensure_column_exists(dataframe: pd.DataFrame, column: str) -> pd.Series:
+    if column not in dataframe.columns:
+        raise KeyError(f"Column '{column}' not found")
+    return dataframe[column]
+
+
+def infer_role(series: pd.Series) -> str:
+    dtype = series.dtype
+    if pd.api.types.is_bool_dtype(dtype):
+        return "boolean"
+    if pd.api.types.is_numeric_dtype(dtype):
+        return "numeric"
+    if pd.api.types.is_datetime64_any_dtype(dtype):
+        return "datetime"
+    if pd.api.types.is_categorical_dtype(dtype):
+        return "categorical"
+    if pd.api.types.is_string_dtype(dtype):
+        unique_ratio = series.nunique(dropna=True) / max(len(series), 1)
+        return "text" if unique_ratio > 0.6 else "categorical"
+    return "text"
+
+
+def top_value_counts(series: pd.Series, limit: int = 5) -> List[dict]:
+    counts = series.value_counts(dropna=True).head(limit)
+    total = counts.sum() or 1
+    return [
+        {
+            "value": to_python_value(index),
+            "count": int(count),
+            "percentage": float(count / total),
+        }
+        for index, count in counts.items()
+    ]
diff --git a/package.json b/package.json
index 3a92c3f..e2cf3c3 100644
--- a/package.json
+++ b/package.json
@@ -49,6 +49,7 @@
     "embla-carousel": "^8.0.0",
     "embla-carousel-react": "^8.0.0",
     "partial-json": "^0.1.7",
+    "recharts": "^2.13.3",
     "react": "^19.1.1",
     "react-datepicker": "^8.4.0",
     "react-datepicker.css": "link:react-datepicker/dist/react-datepicker.css",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ba08cb1..3d7a41b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -77,6 +77,9 @@ importers:
       react-router-dom:
         specifier: ^7.8.2
         version: 7.8.2(react-dom@19.1.1(react@19.1.1))(react@19.1.1)
+      recharts:
+        specifier: ^2.13.3
+        version: 2.15.4(react-dom@19.1.1(react@19.1.1))(react@19.1.1)
       three:
         specifier: ^0.179.1
version: 0.179.1 @@ -1368,6 +1371,9 @@ packages: supports-color: optional: true + decimal.js-light@2.5.1: + resolution: {integrity: sha512-qIMFpTMZmny+MMIitAB6D7iVPEorVw6YQRWkvarTkT4tBeSLLiHzcwj6q0MmYSFCiVpiqPJTJEYIrpcPzVEIvg==} + decimal.js@10.6.0: resolution: {integrity: sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==} @@ -1389,6 +1395,9 @@ packages: resolution: {integrity: sha512-3UDv+G9CsCKO1WKMGw9fwq/SWJYbI0c5Y7LU1AXYoDdbhE2AHQ6N6Nb34sG8Fj7T5APy8qXDCKuuIHd1BR0tVA==} engines: {node: '>=8'} + dom-helpers@5.2.1: + resolution: {integrity: sha512-nRCa7CK3VTrM2NmGkIy4cbK7IZlgBE/PYMn55rrXefr5xXDP0LdtfPnblFDoVdcAfslJ7or6iqAUnx0CCGIWQA==} + dompurify@3.2.6: resolution: {integrity: sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==} @@ -1436,6 +1445,9 @@ packages: resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==} engines: {node: '>=6'} + eventemitter3@4.0.7: + resolution: {integrity: sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==} + execa@5.1.1: resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==} engines: {node: '>=10'} @@ -1446,6 +1458,10 @@ packages: fast-deep-equal@3.1.3: resolution: {integrity: sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==} + fast-equals@5.3.2: + resolution: {integrity: sha512-6rxyATwPCkaFIL3JLqw8qXqMpIZ942pTX/tbQFkRsDGblS8tNGtlUauA/+mt6RUfqn/4MoEr+WDkYoIQbibWuQ==} + engines: {node: '>=6.0.0'} + fast-glob@3.3.3: resolution: {integrity: sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==} engines: {node: '>=8.6.0'} @@ -1745,6 +1761,10 @@ packages: lodash@4.17.21: resolution: {integrity: sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==} + loose-envify@1.4.0: + resolution: {integrity: sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==} + hasBin: true + lru-cache@5.1.1: resolution: {integrity: sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==} @@ -1888,6 +1908,10 @@ packages: resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==} engines: {node: '>=8'} + object-assign@4.1.1: + resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} + engines: {node: '>=0.10.0'} + on-headers@1.0.2: resolution: {integrity: sha512-pZAE+FJLoyITytdqK0U5s+FIpjN0JP3OzFi/u8Rx+EV5/W+JTWGXG8xFzevE7AjBfDqHv/8vL8qQsIhHnqRkrA==} engines: {node: '>= 0.8'} @@ -1966,6 +1990,9 @@ packages: promise-worker-transferable@1.0.4: resolution: {integrity: sha512-bN+0ehEnrXfxV2ZQvU2PetO0n4gqBD4ulq3MI1WOPLgr7/Mg9yRQkX5+0v1vagr74ZTsl7XtzlaYDo2EuCeYJw==} + prop-types@15.8.1: + resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==} + protocol-buffers-schema@3.6.0: resolution: {integrity: sha512-TdDRD+/QNdrCGCE7v8340QyuXd4kIWIgapsE2+n/SaGiSSbomYl4TjHlvIoCWRpE7wFt02EpB35VVA2ImcBVqw==} @@ -2017,6 +2044,9 @@ packages: react-is@16.13.1: resolution: {integrity: sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==} + react-is@18.3.1: + resolution: {integrity: 
sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==} + react-reconciler@0.31.0: resolution: {integrity: sha512-7Ob7Z+URmesIsIVRjnLoDGwBEG/tVitidU0nMsqX/eeJaLY89RISO/10ERe0MqmzuKUUB1rmY+h1itMbUHg9BQ==} engines: {node: '>=0.10.0'} @@ -2044,6 +2074,18 @@ packages: react-dom: optional: true + react-smooth@4.0.4: + resolution: {integrity: sha512-gnGKTpYwqL0Iii09gHobNolvX4Kiq4PKx6eWBCYYix+8cdw+cGo3do906l1NBPKkSWx1DghC1dlWG9L2uGd61Q==} + peerDependencies: + react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + react-dom: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + + react-transition-group@4.4.5: + resolution: {integrity: sha512-pZcd1MCJoiKiBR2NRxeCRg13uCXbydPnmB4EOeRrY7480qNWO8IIgQG6zlDkm6uRMsURXPuKq0GWtiM59a5Q6g==} + peerDependencies: + react: '>=16.6.0' + react-dom: '>=16.6.0' + react-use-measure@2.1.7: resolution: {integrity: sha512-KrvcAo13I/60HpwGO5jpW7E9DfusKyLPLvuHlUyP5zqnmAPhNc6qTRjUQrdTADl0lpPpDVU2/Gg51UlOGHXbdg==} peerDependencies: @@ -2057,6 +2099,16 @@ packages: resolution: {integrity: sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==} engines: {node: '>=0.10.0'} + recharts-scale@0.4.5: + resolution: {integrity: sha512-kivNFO+0OcUNu7jQquLXAxz1FIwZj8nrj+YkOKc5694NbjCvcT6aSZiIzNzd2Kul4o4rTto8QVR9lMNtxD4G1w==} + + recharts@2.15.4: + resolution: {integrity: sha512-UT/q6fwS3c1dHbXv2uFgYJ9BMFHu3fwnd7AYZaEQhXuYQ4hgsxLvsUXzGdKeZrW5xopzDCvuA2N41WJ88I7zIw==} + engines: {node: '>=14'} + peerDependencies: + react: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + react-dom: ^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + registry-auth-token@3.3.2: resolution: {integrity: sha512-JL39c60XlzCVgNrO+qq68FoNb56w/m7JYvGR2jT5iR1xBrUA3Mfx5Twk5rqTThPmQKMWydGmq8oFtDlxfrmxnQ==} @@ -2232,6 +2284,9 @@ packages: three@0.179.1: resolution: {integrity: sha512-5y/elSIQbrvKOISxpwXCR4sQqHtGiOI+MKLc3SsBdDXA2hz3Mdp3X59aUp8DyybMa34aeBwbFTpdoLJaUDEWSw==} + tiny-invariant@1.3.3: + resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} + tinyexec@1.0.1: resolution: {integrity: sha512-5uC6DDlmeqiOwCPmK9jMSdOuZTh8bU39Ys6yidB+UTt5hfZUPGAypSgFRiEp+jbi9qH40BLDvy85jIU88wKSqw==} @@ -2329,6 +2384,9 @@ packages: resolution: {integrity: sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==} engines: {node: '>= 0.8'} + victory-vendor@36.9.2: + resolution: {integrity: sha512-PnpQQMuxlwYdocC8fIJqVXvkeViHYzotI+NJrCuav0ZYFoq912ZHBk3mCeuj+5/VpodOjPe1z0Fk2ihgzlXqjQ==} + vite@7.1.1: resolution: {integrity: sha512-yJ+Mp7OyV+4S+afWo+QyoL9jFWD11QFH0i5i7JypnfTcA1rmgxCbiA8WwAICDEtZ1Z1hzrVhN8R8rGTqkTY8ZQ==} engines: {node: ^20.19.0 || >=22.12.0} @@ -3672,6 +3730,8 @@ snapshots: dependencies: ms: 2.1.3 + decimal.js-light@2.5.1: {} + decimal.js@10.6.0: {} deep-extend@0.6.0: {} @@ -3688,6 +3748,11 @@ snapshots: detect-libc@2.0.4: {} + dom-helpers@5.2.1: + dependencies: + '@babel/runtime': 7.28.2 + csstype: 3.1.3 + dompurify@3.2.6: optionalDependencies: '@types/trusted-types': 2.0.7 @@ -3752,6 +3817,8 @@ snapshots: escalade@3.2.0: {} + eventemitter3@4.0.7: {} + execa@5.1.1: dependencies: cross-spawn: 7.0.6 @@ -3768,6 +3835,8 @@ snapshots: fast-deep-equal@3.1.3: {} + fast-equals@5.3.2: {} + fast-glob@3.3.3: dependencies: '@nodelib/fs.stat': 2.0.5 @@ -4001,6 +4070,10 @@ snapshots: lodash@4.17.21: {} + loose-envify@1.4.0: + dependencies: + js-tokens: 4.0.0 + lru-cache@5.1.1: dependencies: yallist: 3.1.1 @@ -4167,6 +4240,8 @@ snapshots: dependencies: path-key: 3.1.1 + 
object-assign@4.1.1: {} + on-headers@1.0.2: {} onetime@5.1.2: @@ -4237,6 +4312,12 @@ snapshots: is-promise: 2.2.2 lie: 3.3.0 + prop-types@15.8.1: + dependencies: + loose-envify: 1.4.0 + object-assign: 4.1.1 + react-is: 16.13.1 + protocol-buffers-schema@3.6.0: {} punycode@2.3.1: {} @@ -4292,6 +4373,8 @@ snapshots: react-is@16.13.1: {} + react-is@18.3.1: {} + react-reconciler@0.31.0(react@19.1.1): dependencies: react: 19.1.1 @@ -4313,6 +4396,23 @@ snapshots: optionalDependencies: react-dom: 19.1.1(react@19.1.1) + react-smooth@4.0.4(react-dom@19.1.1(react@19.1.1))(react@19.1.1): + dependencies: + fast-equals: 5.3.2 + prop-types: 15.8.1 + react: 19.1.1 + react-dom: 19.1.1(react@19.1.1) + react-transition-group: 4.4.5(react-dom@19.1.1(react@19.1.1))(react@19.1.1) + + react-transition-group@4.4.5(react-dom@19.1.1(react@19.1.1))(react@19.1.1): + dependencies: + '@babel/runtime': 7.28.2 + dom-helpers: 5.2.1 + loose-envify: 1.4.0 + prop-types: 15.8.1 + react: 19.1.1 + react-dom: 19.1.1(react@19.1.1) + react-use-measure@2.1.7(react-dom@19.1.1(react@19.1.1))(react@19.1.1): dependencies: react: 19.1.1 @@ -4321,6 +4421,23 @@ snapshots: react@19.1.1: {} + recharts-scale@0.4.5: + dependencies: + decimal.js-light: 2.5.1 + + recharts@2.15.4(react-dom@19.1.1(react@19.1.1))(react@19.1.1): + dependencies: + clsx: 2.1.1 + eventemitter3: 4.0.7 + lodash: 4.17.21 + react: 19.1.1 + react-dom: 19.1.1(react@19.1.1) + react-is: 18.3.1 + react-smooth: 4.0.4(react-dom@19.1.1(react@19.1.1))(react@19.1.1) + recharts-scale: 0.4.5 + tiny-invariant: 1.3.3 + victory-vendor: 36.9.2 + registry-auth-token@3.3.2: dependencies: rc: 1.2.8 @@ -4517,6 +4634,8 @@ snapshots: three@0.179.1: {} + tiny-invariant@1.3.3: {} + tinyexec@1.0.1: {} tinyglobby@0.2.14: @@ -4602,6 +4721,23 @@ snapshots: vary@1.1.2: {} + victory-vendor@36.9.2: + dependencies: + '@types/d3-array': 3.2.1 + '@types/d3-ease': 3.0.2 + '@types/d3-interpolate': 3.0.4 + '@types/d3-scale': 4.0.9 + '@types/d3-shape': 3.1.7 + '@types/d3-time': 3.0.4 + '@types/d3-timer': 3.0.2 + d3-array: 3.2.4 + d3-ease: 3.0.1 + d3-interpolate: 3.0.1 + d3-scale: 4.0.2 + d3-shape: 3.2.0 + d3-time: 3.1.0 + d3-timer: 3.0.1 + vite@7.1.1(@types/node@24.3.0)(jiti@2.5.1)(lightningcss@1.30.1)(tsx@4.20.4): dependencies: esbuild: 0.25.8 diff --git a/src/data-explorer/App.tsx b/src/data-explorer/App.tsx new file mode 100644 index 0000000..224a99c --- /dev/null +++ b/src/data-explorer/App.tsx @@ -0,0 +1,1335 @@ +import { + ChangeEvent, + FormEvent, + useCallback, + useEffect, + useMemo, + useState, +} from "react"; +import clsx from "clsx"; +import { + Bar, + BarChart, + CartesianGrid, + Cell, + Legend, + ResponsiveContainer, + Scatter, + ScatterChart, + Tooltip, + XAxis, + YAxis, +} from "recharts"; +import { useMaxHeight } from "../use-max-height"; +import { useWidgetProps } from "../use-widget-props"; +import { useWidgetState } from "../use-widget-state"; +import { useOpenAiGlobal } from "../use-openai-global"; +import { callToolJson } from "./utils/callTool"; +import { + formatBytes, + formatNumber, + formatPercentage, + formatValue, +} from "./utils/format"; +import type { + ChartConfig, + ChartResponse, + ChartType, + DatasetColumnProfile, + DatasetSummary, + FilterDefinition, + OpenResponse, + PreviewResponse, + UploadChunkFinal, + UploadChunkResponse, + UploadDatasetResponse, + UploadInitResponse, + WidgetState, +} from "./types"; + +const DEFAULT_PREVIEW_LIMIT = 20; +const COLOR_PALETTE = ["#2563eb", "#f97316", "#16a34a", "#9333ea", "#facc15"]; +const DEFAULT_MAX_UPLOAD_BYTES = 10 * 1024 * 
1024; +const DEFAULT_CHARS_PER_CHUNK = 180_000; + +function normalizeChartConfig(config: ChartConfig): ChartConfig { + const base: ChartConfig = { + chartType: config.chartType, + x: config.x, + }; + + if (config.chartType === "bar") { + base.aggregation = config.aggregation ?? "count"; + if (config.y) { + base.y = config.y; + } + if (config.color) { + base.color = config.color; + } + return base; + } + + if (config.chartType === "scatter") { + if (config.y) { + base.y = config.y; + } + if (config.color) { + base.color = config.color; + } + return base; + } + + if (config.chartType === "histogram") { + if (config.binCount != null) { + base.binCount = config.binCount; + } + return base; + } + + return config; +} + +export function App(): JSX.Element { + const maxHeight = useMaxHeight() ?? undefined; + const displayMode = useOpenAiGlobal("displayMode"); + const defaultWidgetProps = useMemo( + () => ({ + datasets: [], + activeDatasetId: null, + supportsChunkUpload: false, + maxUploadBytes: DEFAULT_MAX_UPLOAD_BYTES, + }), + [] + ); + const widgetProps = useWidgetProps(defaultWidgetProps); + + const baseWidgetState: WidgetState = useMemo( + () => ({ + datasetId: widgetProps.activeDatasetId ?? null, + filters: [], + preview: { limit: DEFAULT_PREVIEW_LIMIT, offset: 0 }, + chartConfig: null, + }), + [widgetProps.activeDatasetId] + ); + + const [widgetState, setWidgetState] = useWidgetState( + () => baseWidgetState + ); + + const currentState = widgetState ?? baseWidgetState; + + const mergeWidgetState = useCallback( + ( + updater: + | Partial + | ((current: WidgetState) => WidgetState | Partial) + ) => { + setWidgetState((previous) => { + const baseline = previous ?? baseWidgetState; + if (typeof updater === "function") { + return updater(baseline) as WidgetState; + } + return { ...baseline, ...updater }; + }); + }, + [baseWidgetState, setWidgetState] + ); + + useEffect(() => { + if (!widgetState?.datasetId && baseWidgetState.datasetId) { + mergeWidgetState({ datasetId: baseWidgetState.datasetId }); + } + }, [widgetState?.datasetId, baseWidgetState.datasetId, mergeWidgetState]); + + const datasetsFromProps = widgetProps?.datasets ?? []; + const [datasets, setDatasets] = useState( + datasetsFromProps + ); + useEffect(() => { + setDatasets(datasetsFromProps); + }, [datasetsFromProps]); + + const currentDatasetId = currentState.datasetId; + const currentDataset = useMemo( + () => + currentDatasetId + ? datasets.find((item) => item.datasetId === currentDatasetId) ?? null + : null, + [datasets, currentDatasetId] + ); + + const [preview, setPreview] = useState(null); + const [chart, setChart] = useState(null); + const [loading, setLoading] = useState({ + upload: false, + preview: false, + chart: false, + }); + const [uploadStatus, setUploadStatus] = useState(null); + const [error, setError] = useState(null); + + const filters = currentState.filters ?? []; + const previewSettings = currentState.preview ?? { + limit: DEFAULT_PREVIEW_LIMIT, + offset: 0, + }; + + const defaultChartConfig = useMemo(() => { + const firstColumn = currentDataset?.profile.columns[0]; + return { + chartType: "bar" as ChartType, + x: firstColumn?.name ?? "", + aggregation: "count", + }; + }, [currentDataset]); + + const [chartConfigDraft, setChartConfigDraft] = useState( + currentState.chartConfig ?? defaultChartConfig + ); + + useEffect(() => { + setChartConfigDraft(currentState.chartConfig ?? 
defaultChartConfig); + }, [currentState.chartConfig, defaultChartConfig]); + + useEffect(() => { + if (!currentDatasetId) { + setPreview(null); + setChart(null); + return; + } + + let isCancelled = false; + setLoading((state) => ({ ...state, preview: true })); + + callToolJson("data-explorer.preview", { + datasetId: currentDatasetId, + filters, + limit: previewSettings.limit, + offset: previewSettings.offset, + }) + .then((response) => { + if (isCancelled) { + return; + } + setPreview(response); + setError(null); + }) + .catch((err) => { + if (isCancelled) { + return; + } + setError( + err instanceof Error ? err.message : "Failed to load preview data." + ); + }) + .finally(() => { + if (!isCancelled) { + setLoading((state) => ({ ...state, preview: false })); + } + }); + + return () => { + isCancelled = true; + }; + }, [ + currentDatasetId, + filters, + previewSettings.limit, + previewSettings.offset, + ]); + + const maxUploadBytes = widgetProps?.maxUploadBytes ?? DEFAULT_MAX_UPLOAD_BYTES; + const supportsChunkUpload = widgetProps?.supportsChunkUpload ?? false; + const chunkThreshold = useMemo(() => { + return Math.max(512 * 1024, Math.floor(maxUploadBytes * 0.6)); + }, [maxUploadBytes]); + + const handleUpload = useCallback( + async (file: File, datasetName: string) => { + setLoading((state) => ({ ...state, upload: true })); + setUploadStatus(null); + try { + const friendlyName = datasetName.trim() || file.name; + const useChunkedUpload = + supportsChunkUpload && file.size > chunkThreshold; + + let response: UploadDatasetResponse | UploadChunkFinal; + + if (useChunkedUpload) { + response = await uploadFileInChunks({ + file, + datasetName: friendlyName, + maxUploadBytes, + setStatus: setUploadStatus, + }); + } else { + const text = await file.text(); + response = await callToolJson( + "data-explorer.upload", + { + datasetName: friendlyName, + csvText: text, + filename: file.name, + } + ); + } + + setDatasets((previous) => { + const filtered = previous.filter( + (item) => item.datasetId !== response.dataset.datasetId + ); + return [response.dataset, ...filtered]; + }); + + mergeWidgetState({ + datasetId: response.dataset.datasetId, + filters: [], + preview: { limit: DEFAULT_PREVIEW_LIMIT, offset: 0 }, + chartConfig: null, + }); + + setPreview({ + datasetId: response.dataset.datasetId, + totalRows: response.dataset.rowCount, + rows: response.preview, + columns: response.columns, + appliedFilters: [], + }); + setChart(null); + setError(null); + } catch (err) { + setError( + err instanceof Error ? err.message : "Failed to upload dataset." 
+ ); + } finally { + setUploadStatus(null); + setLoading((state) => ({ ...state, upload: false })); + } + }, + [chunkThreshold, maxUploadBytes, mergeWidgetState, supportsChunkUpload] + ); + + const handleSelectDataset = useCallback( + (datasetId: string | null) => { + mergeWidgetState((state) => ({ + ...state, + datasetId, + preview: { ...state.preview, offset: 0 }, + })); + setChart(null); + }, + [mergeWidgetState] + ); + + const handleFiltersChange = useCallback( + (nextFilters: FilterDefinition[]) => { + mergeWidgetState((state) => ({ + ...state, + filters: nextFilters, + preview: { ...state.preview, offset: 0 }, + })); + setChart(null); + }, + [mergeWidgetState] + ); + + const handlePreviewPage = useCallback( + (delta: number) => { + mergeWidgetState((state) => { + const nextOffset = Math.max( + 0, + state.preview.offset + delta * state.preview.limit + ); + return { + ...state, + preview: { ...state.preview, offset: nextOffset }, + }; + }); + }, + [mergeWidgetState] + ); + + const handleSetLimit = useCallback( + (limit: number) => { + mergeWidgetState((state) => ({ + ...state, + preview: { limit, offset: 0 }, + })); + }, + [mergeWidgetState] + ); + + const handleBuildChart = useCallback( + async (config: ChartConfig) => { + if (!currentDatasetId) { + return; + } + const normalizedConfig = normalizeChartConfig(config); + setLoading((state) => ({ ...state, chart: true })); + try { + const response = await callToolJson( + "data-explorer.chart", + { + datasetId: currentDatasetId, + config: normalizedConfig, + filters, + } + ); + setChart(response); + mergeWidgetState((state) => ({ + ...state, + chartConfig: normalizedConfig, + })); + setError(null); + } catch (err) { + setError(err instanceof Error ? err.message : "Chart request failed."); + } finally { + setLoading((state) => ({ ...state, chart: false })); + } + }, + [currentDatasetId, filters, mergeWidgetState] + ); + + const isFullscreen = displayMode === "fullscreen"; + + return ( +
+
+

Data Explorer

+

+ Upload a CSV dataset, inspect column profiles, filter rows, and build + simple charts inline. +

+ {error && ( +
+ {error} +
+ )} +
+ +
+ + +
+ + +
+
+
+ ); +} + +type UploadSectionProps = { + uploading: boolean; + onUpload: (file: File, datasetName: string) => Promise; + statusMessage: string | null; +}; + +function UploadSection({ uploading, onUpload, statusMessage }: UploadSectionProps) { + const [selectedFile, setSelectedFile] = useState(null); + const [datasetName, setDatasetName] = useState(""); + + const handleFileChange = useCallback( + (event: ChangeEvent) => { + const file = event.target.files?.[0] ?? null; + setSelectedFile(file); + if (file) { + const nameWithoutExt = file.name.replace(/\.[^/.]+$/, ""); + setDatasetName(nameWithoutExt); + } + }, + [] + ); + + const handleSubmit = useCallback( + async (event: FormEvent) => { + event.preventDefault(); + if (!selectedFile) { + return; + } + await onUpload(selectedFile, datasetName); + setSelectedFile(null); + setDatasetName(""); + const fileInput = event.currentTarget.elements.namedItem( + "file" + ) as HTMLInputElement | null; + if (fileInput) { + fileInput.value = ""; + } + }, + [datasetName, onUpload, selectedFile] + ); + + return ( +
+

Upload CSV

+
+ + + setDatasetName(event.target.value)} + placeholder="Friendly dataset name" + disabled={uploading} + className="rounded-md border border-neutral-200 px-2 py-1 text-sm focus:border-neutral-400 focus:outline-none focus:ring-1 focus:ring-neutral-400" + /> + + {statusMessage ? ( +

{statusMessage}

+ ) : null} +
+
+ ); +} + +type DatasetSelectorProps = { + datasets: DatasetSummary[]; + activeId: string | null; + onSelect: (datasetId: string | null) => void; +}; + +function DatasetSelector({ + datasets, + activeId, + onSelect, +}: DatasetSelectorProps) { + if (!datasets.length) { + return ( +
+ Upload a dataset to begin exploring. +
+ ); + } + + return ( +
+ + +
+ ); +} + +type ProfileSummaryProps = { + dataset: DatasetSummary | null; +}; + +function ProfileSummary({ dataset }: ProfileSummaryProps) { + if (!dataset) { + return null; + } + + return ( +
+
+

+ {dataset.datasetName} +

+ {dataset.filename && ( +

{dataset.filename}

+ )} +
+ + + + +
+
+
+

+ Columns +

+
    + {dataset.profile.columns.map((column) => ( +
  • +
    + + {column.name} + + {column.role} +
    +
    + + Missing {formatPercentage(column.missingProportion)} ( + {formatNumber(column.missingCount)}) + + Distinct {formatNumber(column.distinctCount)} +
    + {column.stats && ( +
    + {Object.entries(column.stats).map(([key, value]) => ( + + {key}:{" "} + {typeof value === "number" + ? formatNumber(value) + : formatValue(value)} + + ))} +
    + )} +
  • + ))} +
+
+
+ ); +} + +type SummaryStatProps = { + label: string; + value: string; +}; + +function SummaryStat({ label, value }: SummaryStatProps) { + return ( +
+ + {label} + + {value} +
+ ); +} + +type FilterBuilderProps = { + columns: DatasetColumnProfile[]; + filters: FilterDefinition[]; + onFiltersChange: (filters: FilterDefinition[]) => void; +}; + +function FilterBuilder({ + columns, + filters, + onFiltersChange, +}: FilterBuilderProps) { + const [draftColumn, setDraftColumn] = useState(""); + const [draftType, setDraftType] = useState<"equals" | "range">("equals"); + const [draftValue, setDraftValue] = useState(""); + const [draftMin, setDraftMin] = useState(""); + const [draftMax, setDraftMax] = useState(""); + + useEffect(() => { + if (!draftColumn && columns.length) { + setDraftColumn(columns[0].name); + } + }, [columns, draftColumn]); + + const handleAddFilter = useCallback(() => { + if (!draftColumn) { + return; + } + const column = columns.find((item) => item.name === draftColumn); + if (!column) { + return; + } + + const nextFilters: FilterDefinition[] = [...filters]; + + if (draftType === "equals") { + if (!draftValue.trim()) { + return; + } + nextFilters.push({ + type: "equals", + column: draftColumn, + value: coerceDraftValue(column, draftValue), + }); + } else { + const minValue = coerceRangeBoundary(column, draftMin); + const maxValue = coerceRangeBoundary(column, draftMax); + if (minValue == null && maxValue == null) { + return; + } + nextFilters.push({ + type: "range", + column: draftColumn, + min: minValue, + max: maxValue, + }); + } + + onFiltersChange(nextFilters); + setDraftValue(""); + setDraftMin(""); + setDraftMax(""); + }, [columns, draftColumn, draftMax, draftMin, draftType, draftValue, filters, onFiltersChange]); + + const handleRemove = useCallback( + (index: number) => { + onFiltersChange(filters.filter((_, idx) => idx !== index)); + }, + [filters, onFiltersChange] + ); + + if (!columns.length) { + return null; + } + + return ( +
+

Filters

+
+ {filters.length === 0 ? ( +

No filters applied.

+ ) : ( +
    + {filters.map((filter, index) => ( +
  • + + {filter.column}{" "} + {filter.type === "equals" + ? `= ${formatValue(filter.value)}` + : `between ${filter.min ?? "…"} and ${filter.max ?? "…"}`} + + +
  • + ))} +
+ )} + +
+
+ + +
+ {draftType === "equals" ? ( + setDraftValue(event.target.value)} + placeholder="Value" + className="rounded-md border border-neutral-200 px-2 py-1 text-xs focus:border-neutral-400 focus:outline-none focus:ring-1 focus:ring-neutral-400" + /> + ) : ( +
+ setDraftMin(event.target.value)} + placeholder="Min" + className="w-full rounded-md border border-neutral-200 px-2 py-1 text-xs focus:border-neutral-400 focus:outline-none focus:ring-1 focus:ring-neutral-400" + /> + setDraftMax(event.target.value)} + placeholder="Max" + className="w-full rounded-md border border-neutral-200 px-2 py-1 text-xs focus:border-neutral-400 focus:outline-none focus:ring-1 focus:ring-neutral-400" + /> +
+ )} + +
+
+
+ ); +} + +function coerceDraftValue( + column: DatasetColumnProfile, + raw: string +): string | number | boolean | null { + const trimmed = raw.trim(); + if (!trimmed.length) { + return null; + } + if (column.role === "boolean") { + const normalized = trimmed.toLowerCase(); + return ["true", "1", "yes"].includes(normalized); + } + if (column.role === "numeric") { + const numeric = Number(trimmed); + return Number.isNaN(numeric) ? trimmed : numeric; + } + return trimmed; +} + +function coerceRangeBoundary( + column: DatasetColumnProfile, + raw: string +): number | string | null { + const trimmed = raw.trim(); + if (!trimmed.length) { + return null; + } + if (column.role === "numeric") { + const numeric = Number(trimmed); + return Number.isNaN(numeric) ? null : numeric; + } + return trimmed; +} + +type ChunkUploadParams = { + file: File; + datasetName: string; + maxUploadBytes: number; + setStatus: (status: string | null) => void; +}; + +async function uploadFileInChunks({ + file, + datasetName, + maxUploadBytes, + setStatus, +}: ChunkUploadParams): Promise { + setStatus("Initializing upload…"); + const initResponse = await callToolJson( + "data-explorer.uploadInit", + { + datasetName, + filename: file.name, + hasHeader: true, + } + ); + + const fullText = await file.text(); + const chunkSize = Math.max( + 32_000, + Math.min(DEFAULT_CHARS_PER_CHUNK, Math.floor(maxUploadBytes / 8)) + ); + const totalChunks = Math.max(1, Math.ceil(fullText.length / chunkSize)); + let offset = 0; + let chunkIndex = 0; + let lastResponse: UploadChunkResponse | null = null; + + while (offset < fullText.length) { + const chunkText = fullText.slice(offset, offset + chunkSize); + const isFinal = offset + chunkSize >= fullText.length; + setStatus( + isFinal + ? "Finalizing upload…" + : `Uploading chunk ${chunkIndex + 1} of ${totalChunks}…` + ); + + const chunkResponse = await callToolJson( + "data-explorer.uploadChunk", + { + uploadId: initResponse.uploadId, + chunkText, + isFinal, + chunkIndex, + } + ); + + lastResponse = chunkResponse; + offset += chunkSize; + chunkIndex += 1; + } + + if (!lastResponse || lastResponse.isFinalized !== true) { + throw new Error("Upload session did not finalize as expected."); + } + + return lastResponse; +} + +type TablePreviewProps = { + preview: PreviewResponse | null; + loading: boolean; + settings: { limit: number; offset: number }; + onChangePage: (delta: number) => void; + onChangeLimit: (limit: number) => void; +}; + +function TablePreview({ + preview, + loading, + settings, + onChangePage, + onChangeLimit, +}: TablePreviewProps) { + const totalRows = preview?.totalRows ?? 0; + const currentPage = + totalRows === 0 ? 1 : Math.floor(settings.offset / settings.limit) + 1; + const totalPages = + totalRows === 0 ? 1 : Math.ceil(totalRows / settings.limit); + + const columns = preview?.columns ?? []; + + return ( +
+
+
+ Table preview + + {loading ? "Loading…" : `${formatNumber(totalRows)} rows`} + +
+
+ + +
+ + + + Page {currentPage} of {totalPages} + +
+
+
+
+ + + + {columns.map((column) => ( + + ))} + + + + {preview?.rows?.map((row, rowIndex) => ( + + {columns.map((column) => ( + + ))} + + ))} + {!preview && !loading && ( + + + + )} + {loading && ( + + + + )} + +
+ {column} +
+ {formatValue(row[column])} +
+ Select a dataset to load preview data. +
+ Loading rows… +
+
+
+ ); +} + +type ChartBuilderProps = { + dataset: DatasetSummary | null; + draftConfig: ChartConfig; + setDraftConfig: (config: ChartConfig) => void; + chart: ChartResponse | null; + loading: boolean; + onRun: (config: ChartConfig) => void; +}; + +function ChartBuilder({ + dataset, + draftConfig, + setDraftConfig, + chart, + loading, + onRun, +}: ChartBuilderProps) { + const columns = dataset?.profile.columns ?? []; + + const handleChange = ( + key: K, + value: ChartConfig[K] + ) => { + setDraftConfig({ ...draftConfig, [key]: value }); + }; + + const canRun = Boolean(dataset) && Boolean(draftConfig.x); + + return ( +
+
+
+

+ Chart builder +

+ +
+ +
+ handleChange("x", value)} + options={columns.map((column) => ({ + value: column.name, + label: `${column.name} (${column.role})`, + }))} + /> + {draftConfig.chartType !== "histogram" && ( + + handleChange("y", value ? value : undefined) + } + options={[ + { value: "", label: "None" }, + ...columns + .filter((column) => column.role === "numeric") + .map((column) => ({ + value: column.name, + label: `${column.name} (${column.role})`, + })), + ]} + /> + )} + + handleChange("color", value ? value : undefined) + } + options={[ + { value: "", label: "None" }, + ...columns.map((column) => ({ + value: column.name, + label: `${column.name} (${column.role})`, + })), + ]} + /> + {draftConfig.chartType === "bar" && ( + + handleChange("aggregation", value as ChartConfig["aggregation"]) + } + options={[ + { value: "count", label: "Count" }, + { value: "sum", label: "Sum" }, + { value: "avg", label: "Average" }, + ]} + /> + )} + {draftConfig.chartType === "histogram" && ( + + handleChange("binCount", Number(value) || 10) + } + options={["10", "20", "30"].map((count) => ({ + value: count, + label: count, + }))} + /> + )} +
+ + +
+ +
+ {loading && ( +
+ Building chart… +
+ )} + {!loading && chart && } + {!loading && !chart && ( +
+ Configure chart inputs and run to see a visualization. +
+ )} +
+
+ ); +} + +type SelectFieldProps = { + label: string; + value: string; + onChange: (value: string) => void; + options: Array<{ value: string; label: string }>; +}; + +function SelectField({ label, value, onChange, options }: SelectFieldProps) { + return ( + + ); +} + +type ChartCanvasProps = { + response: ChartResponse; +}; + +function ChartCanvas({ response }: ChartCanvasProps) { + if (response.chartType === "bar" && response.series) { + const data = response.series.map((item) => ({ + category: formatValue(item.category), + value: item.value ?? 0, + color: item.color ? String(item.color) : undefined, + })); + + return ( + + + + + + + + + {data.map((entry, index) => ( + + ))} + + + + ); + } + + if (response.chartType === "scatter" && response.points) { + const seriesMap = new Map(); + response.points.forEach((point) => { + const key = + point.color !== undefined && point.color !== null + ? String(point.color) + : "Series"; + if (!seriesMap.has(key)) { + seriesMap.set(key, []); + } + seriesMap.get(key)!.push(point); + }); + + const series = Array.from(seriesMap.entries()); + + return ( + + + + formatNumber(value as number)} + domain={['dataMin', 'dataMax']} + allowDuplicatedCategory={false} + /> + formatNumber(value as number)} + domain={['dataMin', 'dataMax']} + /> + + + {series.map(([key, data], index) => ( + + ))} + + + ); + } + + if (response.chartType === "histogram" && response.bins) { + const data = response.bins.map((bin) => ({ + range: `${formatNumber(bin.binStart)}-${formatNumber(bin.binEnd)}`, + count: bin.count, + })); + + return ( + + + + + + + + + + ); + } + + return ( +
+ No chart data. +
+ ); +} diff --git a/src/data-explorer/index.tsx b/src/data-explorer/index.tsx new file mode 100644 index 0000000..5c605da --- /dev/null +++ b/src/data-explorer/index.tsx @@ -0,0 +1,12 @@ +import { createRoot } from "react-dom/client"; +import { App } from "./App"; + +const rootElement = document.getElementById("data-explorer-root"); + +if (rootElement) { + const root = createRoot(rootElement); + root.render(); +} + +export { App }; +export default App; diff --git a/src/data-explorer/types.ts b/src/data-explorer/types.ts new file mode 100644 index 0000000..d280a2e --- /dev/null +++ b/src/data-explorer/types.ts @@ -0,0 +1,138 @@ +export type DataRole = "numeric" | "categorical" | "datetime" | "boolean" | "text"; + +export type DatasetColumnProfile = { + name: string; + role: DataRole; + dtype: string; + nonNullCount: number; + missingCount: number; + missingProportion: number; + distinctCount: number; + sampleValues: unknown[]; + stats?: Record | null; + topValues?: Array<{ + value: unknown; + count: number; + percentage: number; + }> | null; +}; + +export type DatasetProfile = { + rowCount: number; + columnCount: number; + columns: DatasetColumnProfile[]; + memoryUsageBytes: number; +}; + +export type DatasetSummary = { + datasetId: string; + datasetName: string; + rowCount: number; + columnCount: number; + createdAt: string; + filename?: string | null; + profile: DatasetProfile; +}; + +export type FilterEquals = { + type: "equals"; + column: string; + value: string | number | boolean | null; +}; + +export type FilterRange = { + type: "range"; + column: string; + min?: number | string | null; + max?: number | string | null; +}; + +export type FilterDefinition = FilterEquals | FilterRange; + +export type ChartType = "bar" | "scatter" | "histogram"; +export type ChartAggregation = "count" | "sum" | "avg"; + +export type ChartConfig = { + chartType: ChartType; + x: string; + y?: string | null; + color?: string | null; + binCount?: number | null; + aggregation?: ChartAggregation; +}; + +export type UploadDatasetResponse = { + dataset: DatasetSummary; + preview: Array>; + columns: string[]; +}; + +export type UploadInitResponse = { + uploadId: string; +}; + +export type UploadChunkIntermediate = { + uploadId: string; + receivedBytes: number; + isFinalized: false; +}; + +export type UploadChunkFinal = UploadDatasetResponse & { + uploadId: string; + receivedBytes: number; + isFinalized: true; +}; + +export type UploadChunkResponse = UploadChunkIntermediate | UploadChunkFinal; + +export type PreviewResponse = { + datasetId: string; + totalRows: number; + rows: Array>; + columns: string[]; + appliedFilters: FilterDefinition[]; +}; + +export type BarChartSeries = { + category: unknown; + value: number | null; + color?: unknown; +}; + +export type ScatterPoint = { + x: number; + y: number; + color?: unknown; +}; + +export type HistogramBin = { + binStart: number; + binEnd: number; + count: number; +}; + +export type ChartResponse = { + datasetId: string; + chartType: ChartType; + series?: BarChartSeries[]; + points?: ScatterPoint[]; + bins?: HistogramBin[]; + config: ChartConfig; +}; + +export type OpenResponse = { + datasets: DatasetSummary[]; + activeDatasetId: string | null; + supportsChunkUpload?: boolean; + maxUploadBytes?: number; +}; + +export type WidgetState = { + datasetId: string | null; + filters: FilterDefinition[]; + preview: { + limit: number; + offset: number; + }; + chartConfig: ChartConfig | null; +}; diff --git a/src/data-explorer/utils/callTool.ts 
b/src/data-explorer/utils/callTool.ts new file mode 100644 index 0000000..b921726 --- /dev/null +++ b/src/data-explorer/utils/callTool.ts @@ -0,0 +1,27 @@ +import type { CallToolResponse } from "../../types"; + +export async function callToolJson( + name: string, + args: Record +): Promise { + if (!window?.openai?.callTool) { + throw new Error("callTool API is unavailable in this environment."); + } + + const response: CallToolResponse = await window.openai.callTool(name, args); + const payload = response?.result ?? ""; + + if (!payload) { + throw new Error("Tool call returned an empty response."); + } + + try { + return JSON.parse(payload) as T; + } catch (error) { + throw new Error( + `Unable to parse tool response for ${name}: ${ + error instanceof Error ? error.message : String(error) + }` + ); + } +} diff --git a/src/data-explorer/utils/format.ts b/src/data-explorer/utils/format.ts new file mode 100644 index 0000000..c7c33e7 --- /dev/null +++ b/src/data-explorer/utils/format.ts @@ -0,0 +1,43 @@ +export function formatNumber(value: number | null | undefined): string { + if (value === null || value === undefined || Number.isNaN(value)) { + return "—"; + } + if (Math.abs(value) >= 1000) { + return value.toLocaleString(); + } + return value.toString(); +} + +export function formatBytes(bytes: number): string { + if (!Number.isFinite(bytes) || bytes <= 0) { + return "0 B"; + } + const units = ["B", "KB", "MB", "GB", "TB"]; + let index = 0; + let value = bytes; + while (value >= 1024 && index < units.length - 1) { + value /= 1024; + index += 1; + } + return `${value.toFixed(index === 0 ? 0 : 1)} ${units[index]}`; +} + +export function formatPercentage(value: number | null | undefined): string { + if (value === null || value === undefined) { + return "—"; + } + return `${(value * 100).toFixed(1)}%`; +} + +export function formatValue(value: unknown): string { + if (value === null || value === undefined) { + return "—"; + } + if (typeof value === "number") { + return formatNumber(value); + } + if (value instanceof Date) { + return value.toISOString(); + } + return String(value); +}
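
Taken together, the files in this diff form a simple upload → profile → preview → chart loop: the widget reads CSV text in the browser, sends it through `data-explorer.upload` (or the chunked `uploadInit`/`uploadChunk` variants), and then issues `data-explorer.preview` and `data-explorer.chart` calls with the active filters. The sketch below is illustrative only and not part of the commit; it assumes the Apps SDK bridge (`window.openai.callTool`) is available and that `callToolJson` is generic over the parsed response type, as it is used from `App.tsx`.

```ts
// Illustrative sketch, not part of the commit: drives the Data Explorer tools
// end to end using the helpers defined in src/data-explorer.
import { callToolJson } from "./utils/callTool";
import { formatNumber, formatPercentage } from "./utils/format";
import type { PreviewResponse, UploadDatasetResponse } from "./types";

export async function demoUploadAndPreview(): Promise<void> {
  // Inline CSV upload; the server profiles the dataset and returns a summary.
  const upload = await callToolJson<UploadDatasetResponse>("data-explorer.upload", {
    datasetName: "Sample Cities",
    csvText: "city,value\nSF,10\nNYC,20\nSF,12\nLA,8\n",
    filename: "cities.csv",
  });

  console.log(
    `Loaded ${formatNumber(upload.dataset.rowCount)} rows across ` +
      `${formatNumber(upload.dataset.columnCount)} columns`
  );
  for (const column of upload.dataset.profile.columns) {
    // Column roles ("numeric", "categorical", ...) come from infer_role on the server.
    console.log(
      `${column.name} (${column.role}): missing ${formatPercentage(column.missingProportion)}`
    );
  }

  // Filtered preview of the first page of rows, mirroring the widget's table view.
  const preview = await callToolJson<PreviewResponse>("data-explorer.preview", {
    datasetId: upload.dataset.datasetId,
    filters: [{ type: "equals", column: "city", value: "SF" }],
    limit: 20,
    offset: 0,
  });
  console.table(preview.rows);
}
```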