diff --git a/.codeqa.yml b/.codeqa.yml index e6d5d467..26f39beb 100644 --- a/.codeqa.yml +++ b/.codeqa.yml @@ -4,4 +4,32 @@ # Patterns here are merged with any --ignore-paths passed on the command line. ignore_paths: - - priv/combined_metrics/samples/** + - priv/** + - tools/** + - scripts/** + - docs/** + - plans/** + - test/** + - devenv* + - direnv* + - README.md + - action.yml + +# Impact weights for overall score calculation. +# Combined metric categories default to 1 (can be overridden here). +impact: + complexity: 5 + file_structure: 4 + function_design: 4 + code_smells: 3 + naming_conventions: 2 + error_handling: 2 + consistency: 2 + documentation: 1 + testing: 1 + # combined categories override example: + # variable_naming: 2 + +combined_top: 5 # worst offender files per combined-metric behavior + +cosine_significance_threshold: 0.25 # threshold for cosine similarity calculation in behavior categories diff --git a/.dialyzer_ignore.exs b/.dialyzer_ignore.exs new file mode 100644 index 00000000..9722072f --- /dev/null +++ b/.dialyzer_ignore.exs @@ -0,0 +1,14 @@ +[ + # Dialyzer specializes analyze/2 for the codebase call-site where include_pairs + # is always true, making the false branch appear unreachable. Both branches are + # valid and reachable at runtime from the file-level and codebase callers. + {"lib/codeqa/metrics/file/near_duplicate_blocks.ex", :pattern_match}, + # Mix module type information is not available in the PLT; these are valid + # Mix.Task callbacks and standard Mix module calls. + {"lib/mix/tasks/codeqa/sample_report.ex", :callback_info_missing}, + {"lib/mix/tasks/codeqa/signal_debug.ex", :callback_info_missing}, + {"lib/mix/tasks/codeqa/sample_report.ex", :unknown_function}, + {"lib/mix/tasks/codeqa/signal_debug.ex", :unknown_function}, + # CodeQA.Engine.Registry.t/0 is defined via a macro; type is available at runtime. 
+ {"lib/codeqa/analysis/file_metrics_server.ex", :unknown_type} +] diff --git a/.github/workflows/bootstrap-labels.yml b/.github/workflows/bootstrap-labels.yml index a8653357..52c644a1 100644 --- a/.github/workflows/bootstrap-labels.yml +++ b/.github/workflows/bootstrap-labels.yml @@ -10,7 +10,7 @@ jobs: bootstrap: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Bootstrap labels uses: actions/github-script@v7 diff --git a/.github/workflows/compare.yml b/.github/workflows/compare.yml index fa13ef0c..5a672ad6 100644 --- a/.github/workflows/compare.yml +++ b/.github/workflows/compare.yml @@ -12,10 +12,19 @@ jobs: compare: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Cache Mix deps and build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-1.19-27.3-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-1.19-27.3- + - name: Get fork point id: fork-point run: | diff --git a/.github/workflows/dialyzer.yml b/.github/workflows/dialyzer.yml index 06743982..dfaca601 100644 --- a/.github/workflows/dialyzer.yml +++ b/.github/workflows/dialyzer.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Checkout PR - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Elixir uses: erlef/setup-beam@v1 @@ -45,9 +45,9 @@ jobs: uses: actions/cache@v4 with: path: _build - key: build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} + key: build-${{ env.MIX_ENV }}-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock') }} restore-keys: | - build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + build-${{ env.MIX_ENV }}-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- - name: Cache PLT uses: actions/cache@v4 diff --git a/.github/workflows/health-report.yml b/.github/workflows/health-report.yml index 55e38f09..2e0b8966 100644 --- 
a/.github/workflows/health-report.yml +++ b/.github/workflows/health-report.yml @@ -12,7 +12,17 @@ jobs: health-report: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 + + - name: Cache Mix deps and build + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-1.19-27.3-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-1.19-27.3- + - uses: ./ with: command: health-report diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d1ebf666..13300bda 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -17,15 +17,33 @@ jobs: build: runs-on: ubuntu-latest + env: + ELIXIR_VERSION: "1.19" + OTP_VERSION: "27.3" + steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Elixir uses: erlef/setup-beam@v1 with: - otp-version: "27.3" - elixir-version: "1.19" + otp-version: ${{ env.OTP_VERSION }} + elixir-version: ${{ env.ELIXIR_VERSION }} + + - name: Cache deps + uses: actions/cache@v4 + with: + path: deps + key: deps-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: deps-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- + + - name: Cache build + uses: actions/cache@v4 + with: + path: _build + key: build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: build-${{ env.ELIXIR_VERSION }}-${{ env.OTP_VERSION }}- - name: Build escript run: | diff --git a/.github/workflows/sync-behavior-coverage.yml b/.github/workflows/sync-behavior-coverage.yml new file mode 100644 index 00000000..dc1cc4f2 --- /dev/null +++ b/.github/workflows/sync-behavior-coverage.yml @@ -0,0 +1,65 @@ +name: Sync Behavior Coverage + +on: + pull_request: + branches: [main] + +permissions: + contents: write + +jobs: + sync: + runs-on: ubuntu-latest + if: github.event.pull_request.head.repo.full_name == github.repository + 
+ steps: + - name: Checkout PR branch + uses: actions/checkout@v6 + with: + ref: ${{ github.head_ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Elixir + uses: erlef/setup-beam@v1 + with: + otp-version: "27.3" + elixir-version: "1.19" + + - name: Cache deps + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ runner.os }}-mix-dev-otp27.3-elixir1.19-${{ hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-dev-otp27.3-elixir1.19- + + - name: Install dependencies + run: mix deps.get + + - name: Compile + run: mix compile --warnings-as-errors + + - name: Regenerate language coverage + run: mix codeqa.sample_report --apply-languages + + - name: Regenerate scalar vectors + run: mix codeqa.sample_report --apply-scalars + + - name: Check for YAML drift + id: diff + run: | + if git diff --quiet -- priv/combined_metrics/*.yml; then + echo "changed=false" >> "$GITHUB_OUTPUT" + else + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + + - name: Commit and push updated YAMLs + if: steps.diff.outputs.changed == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add priv/combined_metrics/*.yml + git commit -m "chore(combined-metrics): sync language coverage and scalar vectors [skip ci]" + git push diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d560a175..04ba9b3c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -10,24 +10,27 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Elixir + id: beam uses: erlef/setup-beam@v1 with: otp-version: "27.3" elixir-version: "1.19" - name: Cache deps + id: mix-cache uses: actions/cache@v4 with: path: | deps _build - key: ${{ runner.os }}-mix-${{ hashFiles('mix.lock') }} - restore-keys: ${{ runner.os }}-mix- + key: ${{ runner.os }}-mix-${{ steps.beam.outputs.otp-version }}-${{ steps.beam.outputs.elixir-version }}-${{ 
hashFiles('mix.lock', 'mix.exs') }} + restore-keys: ${{ runner.os }}-mix-${{ steps.beam.outputs.otp-version }}-${{ steps.beam.outputs.elixir-version }}- - name: Install dependencies + if: steps.mix-cache.outputs.cache-hit != 'true' run: mix deps.get - name: Compile diff --git a/.github/workflows/validate-issue-links.yml b/.github/workflows/validate-issue-links.yml index e366437e..5960df43 100644 --- a/.github/workflows/validate-issue-links.yml +++ b/.github/workflows/validate-issue-links.yml @@ -12,7 +12,7 @@ jobs: validate: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Validate issue template links run: | diff --git a/.gitignore b/.gitignore index ad2603a6..93f865dd 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,10 @@ devenv.lock # Git worktrees .worktrees/ +docs/plans/ +docs/superpowers/ +plans/ +scripts/*.exs + +# Claude Code +.claude/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..d008842b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: +- repo: local + hooks: + - id: mix-precommit + name: Mix precommit + entry: devenv shell precommit + language: system + pass_filenames: false diff --git a/README.md b/README.md index 259ee497..1eab2d4b 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,14 @@ Works with Python, Ruby, JavaScript, TypeScript, Elixir, C#, Java, C++, Go, Rust - [CLI Reference](#cli-reference) - [analyze](#analyze) - [health-report](#health-report) + - [diagnose](#diagnose) - [compare](#compare) - [history](#history) - [correlate](#correlate) - - [stopwords](#stopwords) - [Metrics Reference](#metrics-reference) - [Raw Metrics](#raw-metrics) - [Health Report Categories](#health-report-categories) - - [Behavior Checks](#behavior-checks) + - [Behavior Categories](#behavior-categories) - [Output Formats](#output-formats) - [Grading](#grading) @@ -76,7 +76,7 @@ jobs: health-report: runs-on: ubuntu-latest steps: - - uses: 
actions/checkout@v4 + - uses: actions/checkout@v6 - uses: num42/codeqa-action@v1 with: command: health-report @@ -95,7 +95,7 @@ jobs: compare: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 0 - name: Get fork point @@ -112,7 +112,7 @@ jobs: | Input | Required | Default | Description | |-------|----------|---------|-------------| -| `command` | yes | — | CLI command to run: `health-report`, `compare`, or `analyze` | +| `command` | yes | — | CLI command to run: `health-report`, `compare`, `analyze`, `history`, `correlate`, or `diagnose` | | `path` | no | `.` | Directory to analyze | | `comment` | no | `false` | Post results as a sticky PR comment | | `fail-grade` | no | — | Fail the action if overall grade is below this (e.g. `C`) | @@ -153,22 +153,83 @@ ignore_paths: ```yaml categories: - - name: Naming - weight: 1.5 + Naming: + name: Naming metrics: - name: vowel_density - good: 0.4 - thresholds: [0.35, 0.3, 0.25] + weight: 1.5 + good: "high" + thresholds: + a: 0.42 + b: 0.38 + c: 0.32 + d: 0.25 ``` +Category-level keys: `name` (display name), `metrics` (list of metric overrides), `top` (worst-offender count override). + +Metric-level keys: `name` (metric key), `weight` (relative weight within the category), `good` (`"high"` or `"low"` — whether higher or lower values are better), `source` (metric path), `thresholds` (map of letter-grade cutoffs: `a`, `b`, `c`, `d`). + ### Grade scale override ```yaml grade_scale: - - [90, "A"] - - [80, "B"] - - [70, "C"] - - [0, "F"] + - min: 90 + grade: "A" + - min: 80 + grade: "B" + - min: 70 + grade: "C" + - min: 0 + grade: "F" +``` + +### impact + +Impact weights used when computing the overall score. The 9 keys below are the built-in defaults; any category not listed falls back to `1`. These weights apply to both primary and behavior categories. 
+ +```yaml +impact: + complexity: 5 + file_structure: 4 + function_design: 4 + code_smells: 3 + naming_conventions: 2 + error_handling: 2 + consistency: 2 + documentation: 1 + testing: 1 + # override any category key: + # variable_naming: 2 +``` + +### combined_top + +Controls how many worst-offender files are shown per behavior category in `health-report` (default: `2`). + +```yaml +combined_top: 3 +``` + +### near_duplicate_blocks + +Configures codebase-level near-duplicate block detection (used by `analyze`). + +```yaml +near_duplicate_blocks: + max_pairs_per_bucket: 50 +``` + +| Key | Description | +|-----|-------------| +| `max_pairs_per_bucket` | Maximum duplicate pairs reported per similarity bucket (default: unlimited) | + +### cosine_significance_threshold + +Minimum cosine similarity required for a behavior category match to be considered significant. Matches below this threshold are treated as noise and excluded from scoring. Default: `0.15`. + +```yaml +cosine_significance_threshold: 0.25 ``` ## CLI Reference @@ -228,6 +289,31 @@ Produces a graded quality report grouped into behavior categories with worst-off ./codeqa health-report --detail full --top 10 --format github ./lib ``` +### diagnose + +Identifies likely code quality issues by scoring behavior profiles using cosine similarity. Useful for understanding *why* a codebase scores poorly without running a full health report. + +```sh +./codeqa diagnose --path [OPTIONS] +``` + +`--path` is **required**. Note: unlike `health-report`, the path is passed as a named flag (`--path`), not a positional argument. 
+ +| Option | Description | +|--------|-------------| +| `--path PATH` | **(Required)** File or directory to analyze | +| `--mode MODE` | `aggregate` (default) or `per-file` | +| `--top N` | Number of top issues to show (default: `15`) | +| `--format FORMAT` | Output format: `plain` or `json` (default: `plain`) | +| `--combined-top N` | Worst-offender files per behavior in per-file mode (default: `2`) | + +**Example:** + +```sh +./codeqa diagnose --path ./lib --mode aggregate --top 10 +./codeqa diagnose --path ./lib --mode per-file --format json +``` + ### compare Compares code quality metrics between two git refs. Designed for PR workflows. @@ -246,6 +332,16 @@ Compares code quality metrics between two git refs. Designed for PR workflows. | `--output MODE` | Output mode: `auto`, `summary`, or `changes` (default: `auto`) | | `--changes-only` | Only analyze files changed between refs | | `--all-files` | Analyze all source files at both refs (default) | +| `--workers N` | Parallel worker count | +| `--progress` | Show per-file progress | +| `--cache` | Cache computed metrics to disk | +| `--cache-dir PATH` | Directory for cached metrics (default: `.codeqa_cache`) | +| `--timeout MS` | Per-file timeout in milliseconds (default: `5000`) | +| `--show-ncd` | Include NCD similarity matrix | +| `--ncd-top N` | Top similar pairs per file | +| `--ncd-paths PATHS` | Comma-separated paths to compare for NCD | +| `--show-files` | Include per-file metrics in output | +| `--show-file-paths PATHS` | Comma-separated list of specific file paths to include | | `--ignore-paths GLOBS` | Comma-separated glob patterns to exclude | **Example:** @@ -269,6 +365,16 @@ Tracks codebase metrics across multiple commits, writing per-commit JSON snapsho | `--output-dir PATH` | **(Required)** Directory to write JSON snapshots | | `--commits N` | Number of recent commits to analyze | | `--commit-list SHAS` | Comma-separated list of explicit commit SHAs | +| `--workers N` | Parallel worker count | +| 
`--progress` | Show per-file progress | +| `--cache` | Cache computed metrics to disk | +| `--cache-dir PATH` | Directory for cached metrics (default: `.codeqa_cache`) | +| `--timeout MS` | Per-file timeout in milliseconds (default: `5000`) | +| `--show-ncd` | Include NCD similarity matrix | +| `--ncd-top N` | Top similar pairs per file | +| `--ncd-paths PATHS` | Comma-separated paths to compare for NCD | +| `--show-files` | Include per-file metrics in output | +| `--show-file-paths PATHS` | Comma-separated list of specific file paths to include | | `--ignore-paths GLOBS` | Comma-separated glob patterns to exclude | ### correlate @@ -282,27 +388,12 @@ Finds metric correlations across history snapshots produced by `history`. Run `h | Option | Description | |--------|-------------| | `--top N` | Number of top correlations to show (default: `20`) | -| `--hide-exact` | Hide perfect 1.0 correlations | -| `--all-groups` | Show all metric groups | +| `--hide-exact` | Hide perfect 1.0 and -1.0 correlations | +| `--all-groups` | Include correlations between metrics in the same group | | `--min FLOAT` | Minimum correlation threshold | | `--max FLOAT` | Maximum correlation threshold | | `--combined-only` | Show only combined-metric correlations | -| `--max-steps N` | Limit history steps used | - -### stopwords - -Extracts codebase-specific vocabulary stopwords and fingerprints. Use the output to reduce noise from project-specific boilerplate tokens in subsequent metric analysis. 
- -```sh -./codeqa stopwords [OPTIONS] -``` - -| Option | Description | -|--------|-------------| -| `--workers N` | Parallel worker count | -| `--stopwords-threshold FLOAT` | Minimum frequency ratio (default: `0.01`) | -| `--progress` | Show per-file progress | -| `--ignore-paths GLOBS` | Comma-separated glob patterns to exclude | +| `--max-steps N` | Maximum number of correlation pairs to evaluate | ## Metrics Reference @@ -329,10 +420,17 @@ All metrics are computed per file and aggregated at the codebase level. | **Magic number density** | Ratio of numeric literals that appear to be unnamed constants | | **Function metrics** | Function count, average and maximum function line count, average and maximum parameter count | | **Cross-file similarity** | `cross_file_density`: overall codebase redundancy via combined compression ratio. `ncd_pairs` (opt-in via `--show-ncd`): Normalized Compression Distance between similar file pairs using winnowing fingerprints | +| **Near-duplicate blocks** | Codebase-level detection of near- and exact-duplicate code blocks using token-based similarity. Reports duplicate pairs grouped by bucket, with source locations. Configurable via `near_duplicate_blocks:` in `.codeqa.yml`. | +| **Block impact & refactoring potentials** | Per-file node tree enriched with leave-one-out impact scores and refactoring potentials. Added to each file entry as `"nodes"` in `analyze` JSON output. Surfaces the highest-impact blocks to refactor. | ### Health Report Categories -The `health-report` command grades your codebase against 6 primary categories. Each category aggregates raw metrics using configurable weights and thresholds. 
+The `health-report` command evaluates your codebase using two complementary scoring models: + +- **6 primary categories** — graded using configurable thresholds against raw metrics (Readability, Complexity, Structure, Duplication, Naming, Magic Numbers) +- **12 behavior categories** — graded using cosine similarity against behavior profiles (see [Behavior Categories](#behavior-categories)) + +The overall score is a weighted average of all 18 categories. Primary category weights are set via `weight:` in `.codeqa.yml`; behavior category weights are configured via [`impact:`](#impact). | Category | What it measures | |----------|-----------------| @@ -343,11 +441,21 @@ The `health-report` command grades your codebase against 6 primary categories. E | **Naming** | Casing entropy, identifier length variance, avg sub-words per identifier | | **Magic Numbers** | Magic number density | +**Cosine scoring breakpoints** (used for behavior categories): + +| Cosine similarity | Score | Approx. grade | +|-------------------|-------|---------------| +| ≥ 0.5 | 90–100 | A | +| ≥ 0.2 | 70–90 | B–A- | +| ≥ 0.0 | 50–70 | C–B- | +| ≥ −0.3 | 30–50 | D–C- | +| ≥ −1.0 | 0–30 | F–D- | + > Category definitions and thresholds are configurable via `.codeqa.yml`. -### Behavior Checks +### Behavior Categories -In addition to the 6 graded categories, `health-report` evaluates additional behavior check categories using a separate multiplicative scoring model. These appear in the report as "Top Issues" diagnostics. +In addition to the 6 primary categories, `health-report` grades 12 behavior categories using cosine similarity against behavior profiles. These contribute to the overall score alongside the primary categories. 
| Category | Checks | |----------|--------| @@ -364,13 +472,15 @@ In addition to the 6 graded categories, `health-report` evaluates additional beh | **Dependencies** | Import and dependency patterns | | **Error Handling** | Error handling completeness | +> These categories are graded in the `health-report` output using cosine similarity scoring and contribute to the overall score. + ## Output Formats | Format | Commands | Description | |--------|----------|-------------| -| `json` | `analyze`, `compare` | Full metrics structure, suitable for tooling | -| `markdown` | `compare`, `health-report` | GitHub-flavored markdown tables | -| `plain` | `health-report` | Human-readable terminal output (Markdown) | +| `json` | `analyze`, `compare`, `diagnose` | Full metrics structure, suitable for tooling | +| `markdown` | `compare` | GitHub-flavored markdown tables | +| `plain` | `health-report`, `diagnose` | Human-readable terminal output | | `github` | `health-report`, `compare` | Markdown optimized for GitHub PR comments | ## Grading @@ -397,6 +507,8 @@ In addition to the 6 graded categories, `health-report` evaluates additional beh | E- | ≥ 6 | | F | < 6 | +The overall score is a weighted average across all categories. Primary category weights use the `weight:` field inside each category definition in `.codeqa.yml`. Behavior category weights are configured via `impact:` (defaults range from 1–5; categories not listed fall back to `1`). See [Configuration](#configuration) for examples. + The `fail-grade` action input causes a non-zero exit when the overall grade falls below the specified threshold. 
## Contributing & Issues diff --git a/action.yml b/action.yml index 6be60789..ebee062b 100644 --- a/action.yml +++ b/action.yml @@ -93,15 +93,10 @@ runs: INPUT_VERSION: ${{ inputs.version }} INPUT_BUILD: ${{ inputs.build }} GITHUB_ACTION_PATH: ${{ github.action_path }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_TOKEN: ${{ github.token }} run: ${{ github.action_path }}/scripts/run.sh - - name: Post PR comment - if: inputs.comment == 'true' && github.event_name == 'pull_request' - uses: marocchino/sticky-pull-request-comment@v2 - with: - header: codeqa-${{ inputs.command }} - path: ${{ steps.run.outputs.report-file }} - - name: Check grade threshold if: inputs.fail-grade != '' && inputs.command == 'health-report' shell: bash diff --git a/devenv.yaml b/devenv.yaml index 6bf1e6c1..b45546e6 100644 --- a/devenv.yaml +++ b/devenv.yaml @@ -1,3 +1,8 @@ inputs: nixpkgs: url: github:cachix/devenv-nixpkgs/rolling + git-hooks: + url: github:cachix/git-hooks.nix + inputs: + nixpkgs: + follows: nixpkgs diff --git a/lib/codeqa/analysis/behavior_config_server.ex b/lib/codeqa/analysis/behavior_config_server.ex new file mode 100644 index 00000000..04cc9bab --- /dev/null +++ b/lib/codeqa/analysis/behavior_config_server.ex @@ -0,0 +1,119 @@ +defmodule CodeQA.Analysis.BehaviorConfigServer do + @moduledoc """ + Per-run GenServer that loads all YAML behavior configs once and serves them + from an anonymous ETS table. + + Eliminates repeated disk reads in `SampleRunner.diagnose_aggregate/2` by + loading `priv/combined_metrics/*.yml` on startup and keeping data in memory + for the duration of the analysis run. + + ETS layout: `{category, behavior} => behavior_data` + where `behavior_data` is the raw YAML map for that behavior. 
+ """ + + use GenServer + + @yaml_dir "priv/combined_metrics" + + # --- Public API --- + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts) + end + + @doc "Returns the ETS table id. Callers may read directly from it." + @spec get_tid(pid()) :: :ets.tid() + def get_tid(pid), do: GenServer.call(pid, :get_tid) + + @doc """ + Returns all behaviors grouped by category. + + %{"function_design" => [{"no_boolean_parameter", behavior_data}, ...], ...} + """ + @spec get_all_behaviors(pid()) :: %{String.t() => [{String.t(), map()}]} + def get_all_behaviors(pid) do + tid = get_tid(pid) + + tid + |> :ets.tab2list() + |> Enum.reduce(%{}, fn {{cat, beh}, data}, acc -> + Map.update(acc, cat, [{beh, data}], &[{beh, data} | &1]) + end) + end + + @doc "Returns the scalar weight map for a given category + behavior." + @spec get_scalars(pid(), String.t(), String.t()) :: %{{String.t(), String.t()} => float()} + def get_scalars(pid, category, behavior) do + tid = get_tid(pid) + + case :ets.lookup(tid, {category, behavior}) do + [{_, data}] -> scalars_from_behavior_data(data) + [] -> %{} + end + end + + @doc "Returns the `_log_baseline` value for a given category + behavior." 
+ @spec get_log_baseline(pid(), String.t(), String.t()) :: float() + def get_log_baseline(pid, category, behavior) do + tid = get_tid(pid) + + case :ets.lookup(tid, {category, behavior}) do + [{_, data}] -> Map.get(data, "_log_baseline", 0.0) / 1.0 + [] -> 0.0 + end + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts) do + tid = :ets.new(:behavior_config, [:set, :public, read_concurrency: true]) + load_configs(tid) + {:ok, %{tid: tid}} + end + + @impl true + def handle_call(:get_tid, _from, state) do + {:reply, state.tid, state} + end + + # --- Private helpers --- + + defp load_configs(tid) do + case File.ls(@yaml_dir) do + {:ok, files} -> + files + |> Enum.filter(&String.ends_with?(&1, ".yml")) + |> Enum.each(&load_yml_file(&1, tid)) + + {:error, _} -> + :ok + end + end + + defp load_yml_file(yml_file, tid) do + category = Path.basename(yml_file, ".yml") + yaml_path = Path.join(@yaml_dir, yml_file) + {:ok, data} = YamlElixir.read_from_file(yaml_path) + + data + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.each(fn {behavior, behavior_data} -> + :ets.insert(tid, {{category, behavior}, behavior_data}) + end) + end + + @doc false + def scalars_from_behavior_data(behavior_data) do + behavior_data + |> Enum.flat_map(fn + {group, keys} when is_map(keys) -> + Enum.map(keys, fn {key, scalar} -> {{group, key}, scalar / 1.0} end) + + _ -> + [] + end) + |> Map.new() + end +end diff --git a/lib/codeqa/analysis/file_context_server.ex b/lib/codeqa/analysis/file_context_server.ex new file mode 100644 index 00000000..987595f9 --- /dev/null +++ b/lib/codeqa/analysis/file_context_server.ex @@ -0,0 +1,87 @@ +defmodule CodeQA.Analysis.FileContextServer do + @moduledoc """ + Per-run GenServer that memoizes `Pipeline.build_file_context/2` by + `{MD5(content), language_name}`. + + Cache key includes the resolved language name because different languages + produce different keyword/operator sets, yielding different identifiers from + the same content. 
+ + ETS layout: `{md5_binary, language_name} => FileContext.t()` + + On a cache miss, the calling process builds the context directly and inserts + it into the shared ETS table — no GenServer mailbox round-trip for the + computation itself. + """ + + use GenServer + + alias CodeQA.Engine.{FileContext, Pipeline} + alias CodeQA.Language + alias CodeQA.Languages.Unknown + + # --- Public API --- + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts) + end + + @doc "Returns the ETS table id. Callers may read directly from it." + @spec get_tid(pid()) :: :ets.tid() + def get_tid(pid), do: GenServer.call(pid, :get_tid) + + @doc """ + Returns a cached (or freshly built) `FileContext` for `content`. + + The language is resolved from `opts` (`:language` or `:path`); defaults to + `Unknown`. + """ + @spec get(pid(), String.t(), keyword()) :: FileContext.t() + def get(pid, content, opts \\ []) do + tid = get_tid(pid) + language_name = resolve_language_name(opts) + key = {md5(content), language_name} + + case :ets.lookup(tid, key) do + [{_, ctx}] -> + ctx + + [] -> + ctx = Pipeline.build_file_context(content, opts) + :ets.insert(tid, {key, ctx}) + ctx + end + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts) do + tid = :ets.new(:file_context, [:set, :public, read_concurrency: true]) + {:ok, %{tid: tid}} + end + + @impl true + def handle_call(:get_tid, _from, state) do + {:reply, state.tid, state} + end + + # --- Private helpers --- + + defp md5(content), do: :crypto.hash(:md5, content) + + defp resolve_language_name(opts) do + cond do + lang = Keyword.get(opts, :language) -> + mod = Language.find(lang) || Unknown + mod.name() + + path = Keyword.get(opts, :path) -> + Language.detect(path).name() + + true -> + Unknown.name() + end + end +end diff --git a/lib/codeqa/analysis/file_metrics_server.ex b/lib/codeqa/analysis/file_metrics_server.ex new file mode 100644 index 00000000..579a63d9 
--- /dev/null +++ b/lib/codeqa/analysis/file_metrics_server.ex @@ -0,0 +1,107 @@ +defmodule CodeQA.Analysis.FileMetricsServer do + @moduledoc """ + Per-run GenServer that caches `Registry.run_file_metrics/2` results. + + Pre-populated from `pipeline_result` before block analysis starts so baseline + metrics are served directly from ETS without recomputation. + + ETS layout: + - `{:path, path}` => metrics map (baseline for existing files) + - `{:hash, md5_binary}` => metrics map (computed on demand for reconstructed content) + """ + + use GenServer + + alias CodeQA.Engine.Pipeline + alias CodeQA.Engine.Registry + + # --- Public API --- + + @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts \\ []) do + GenServer.start_link(__MODULE__, opts) + end + + @doc "Returns the ETS table id. Callers may read directly from it." + @spec get_tid(pid()) :: :ets.tid() + def get_tid(pid), do: GenServer.call(pid, :get_tid) + + @doc """ + Bulk-inserts all baseline metrics from `pipeline_result` and cross-indexes by + content hash for each path present in `files_map`. + + Call once after starting the supervisor, before beginning block analysis. + """ + @spec populate(pid(), map(), map()) :: :ok + def populate(pid, pipeline_result, files_map) do + tid = get_tid(pid) + files_data = Map.get(pipeline_result, "files", %{}) + + Enum.each(files_data, fn {path, file_data} -> + metrics = Map.get(file_data, "metrics", %{}) + :ets.insert(tid, {{:path, path}, metrics}) + end) + + Enum.each(files_map, fn {path, content} -> + hash = md5(content) + + case :ets.lookup(tid, {:path, path}) do + [{_, metrics}] -> :ets.insert(tid, {{:hash, hash}, metrics}) + [] -> :ok + end + end) + + :ok + end + + @doc "Returns pre-populated baseline metrics for `path`, or `nil` if not found." 
+ @spec get_by_path(pid(), String.t()) :: map() | nil + def get_by_path(pid, path) do + tid = get_tid(pid) + + case :ets.lookup(tid, {:path, path}) do + [{_, metrics}] -> metrics + [] -> nil + end + end + + @doc """ + Returns metrics for `content`, looked up in ETS under `{:hash, md5(content)}`. + + On a miss, builds the file context with `opts`, runs the metrics in the calling process, and caches them. + NOTE(review): `opts` is not part of the cache key — confirm callers never pass the same content with different `opts`. + """ + @spec get_for_content(pid(), Registry.t(), String.t(), keyword()) :: map() + def get_for_content(pid, registry, content, opts \\ []) do + tid = get_tid(pid) + hash = md5(content) + + case :ets.lookup(tid, {:hash, hash}) do + [{_, metrics}] -> + metrics + + [] -> + ctx = Pipeline.build_file_context(content, opts) + metrics = Registry.run_file_metrics(registry, ctx) + :ets.insert(tid, {{:hash, hash}, metrics}) + metrics + end + end + + # --- GenServer callbacks --- + + @impl true + def init(_opts) do + tid = :ets.new(:file_metrics, [:set, :public, read_concurrency: true]) + {:ok, %{tid: tid}} + end + + @impl true + def handle_call(:get_tid, _from, state) do + {:reply, state.tid, state} + end + + # --- Private helpers --- + + defp md5(content), do: :crypto.hash(:md5, content) +end diff --git a/lib/codeqa/analysis/run_context.ex b/lib/codeqa/analysis/run_context.ex new file mode 100644 index 00000000..e0e9d526 --- /dev/null +++ b/lib/codeqa/analysis/run_context.ex @@ -0,0 +1,15 @@ +defmodule CodeQA.Analysis.RunContext do + @moduledoc """ + Holds PIDs for the per-run GenServers started under `RunSupervisor`. + + Passed through the analysis call chain so all callers can access + cached state without named process registration. 
+ """ + + defstruct [:behavior_config_pid, :file_context_pid] + + @type t :: %__MODULE__{ + behavior_config_pid: pid(), + file_context_pid: pid() + } +end diff --git a/lib/codeqa/analysis/run_supervisor.ex b/lib/codeqa/analysis/run_supervisor.ex new file mode 100644 index 00000000..ab6bb10f --- /dev/null +++ b/lib/codeqa/analysis/run_supervisor.ex @@ -0,0 +1,52 @@ +defmodule CodeQA.Analysis.RunSupervisor do + @moduledoc """ + One-shot supervisor for the per-analysis-run GenServers. + + Started at the top of `Analyzer.with_run_context/2` and stopped (via + `Supervisor.stop/1`) in an `after` block when the run completes. + + Servers are not registered by name, preventing collisions when multiple + analysis runs share the same BEAM node (e.g. parallel tests). + """ + + use Supervisor + + alias CodeQA.Analysis.{BehaviorConfigServer, FileContextServer, RunContext} + + @spec start_link(keyword()) :: Supervisor.on_start() + def start_link(opts \\ []) do + Supervisor.start_link(__MODULE__, opts) + end + + @doc """ + Queries child PIDs from `sup` and returns a `RunContext` struct. + + Call once after `start_link/1` succeeds, before beginning analysis. 
+ """ + @spec run_context(pid()) :: RunContext.t() + def run_context(sup) do + children = Supervisor.which_children(sup) + + %RunContext{ + behavior_config_pid: find_pid(children, BehaviorConfigServer), + file_context_pid: find_pid(children, FileContextServer) + } + end + + @impl true + def init(_opts) do + children = [ + {BehaviorConfigServer, []}, + {FileContextServer, []} + ] + + Supervisor.init(children, strategy: :one_for_one) + end + + defp find_pid(children, module) do + {_id, pid, _type, _modules} = + Enum.find(children, fn {id, _pid, _type, _modules} -> id == module end) + + pid + end +end diff --git a/lib/codeqa/analyzer.ex b/lib/codeqa/analyzer.ex deleted file mode 100644 index ddcb6ab1..00000000 --- a/lib/codeqa/analyzer.ex +++ /dev/null @@ -1,130 +0,0 @@ -defmodule CodeQA.Analyzer do - @moduledoc "Orchestrates metric computation across files." - - alias CodeQA.Registry - alias CodeQA.Metrics - - def build_registry do - Registry.new() - |> Registry.register_file_metric(Metrics.Entropy) - |> Registry.register_file_metric(Metrics.Compression) - |> Registry.register_file_metric(Metrics.Zipf) - |> Registry.register_file_metric(Metrics.Heaps) - |> Registry.register_file_metric(Metrics.Vocabulary) - |> Registry.register_file_metric(Metrics.Ngram) - |> Registry.register_file_metric(Metrics.Halstead) - |> Registry.register_file_metric(Metrics.Readability) - |> Registry.register_file_metric(Metrics.CasingEntropy) - |> Registry.register_file_metric(Metrics.IdentifierLengthVariance) - |> Registry.register_file_metric(Metrics.Indentation) - |> Registry.register_file_metric(Metrics.Branching) - |> Registry.register_file_metric(Metrics.FunctionMetrics) - |> Registry.register_file_metric(Metrics.MagicNumberDensity) - |> Registry.register_file_metric(Metrics.SymbolDensity) - |> Registry.register_file_metric(Metrics.VowelDensity) - |> Registry.register_codebase_metric(Metrics.Similarity) - end - - def analyze_codebase(files, opts \\ []) do - registry = build_registry() 
- - opts = - if Keyword.get(opts, :experimental_stopwords, false) do - has_progress = Keyword.get(opts, :on_progress) - - if has_progress, - do: IO.puts(:stderr, " Analyzing Stopwords (Tokens and Fingerprints)...") - - word_extractor = fn content -> - Regex.scan(~r/\b[a-zA-Z_]\w*\b/u, content) |> List.flatten() - end - - word_stopwords = - CodeQA.Telemetry.time(:stopwords_words, fn -> - CodeQA.Stopwords.find_stopwords(files, word_extractor, opts) - end) - - fp_extractor = fn content -> - CodeQA.Metrics.TokenNormalizer.normalize(content) |> CodeQA.Metrics.Winnowing.kgrams(5) - end - - fp_stopwords = - CodeQA.Telemetry.time(:stopwords_fingerprints, fn -> - CodeQA.Stopwords.find_stopwords(files, fp_extractor, opts) - end) - - if has_progress do - IO.puts( - :stderr, - " Found #{MapSet.size(word_stopwords)} common word stopwords and #{MapSet.size(fp_stopwords)} common fingerprint stopwords." - ) - end - - opts - |> Keyword.put(:word_stopwords, word_stopwords) - |> Keyword.put(:fp_stopwords, fp_stopwords) - else - opts - end - - file_results = CodeQA.Parallel.analyze_files(files, opts) - codebase_metrics = Registry.run_codebase_metrics(registry, files, opts) - aggregate = aggregate_file_metrics(file_results) - - %{ - "files" => file_results, - "codebase" => %{ - "aggregate" => aggregate, - "similarity" => Map.get(codebase_metrics, "similarity", %{}) - } - } - end - - defp metric_data_to_triples({metric_name, metric_data}) do - metric_data - |> Enum.filter(fn {_k, v} -> is_number(v) end) - |> Enum.map(fn {key, value} -> {metric_name, key, value / 1} end) - end - - defp aggregate_file_metrics(file_results) do - file_results - |> Map.values() - |> Enum.flat_map(fn file_data -> - file_data - |> Map.get("metrics", %{}) - |> Enum.flat_map(&metric_data_to_triples/1) - end) - |> Enum.group_by(fn {metric, key, _val} -> {metric, key} end, fn {_, _, val} -> val end) - |> Enum.reduce(%{}, fn {{metric, key}, values}, acc -> - stats = compute_stats(values) - metric_agg = Map.get(acc, 
metric, %{}) - - updated = - Map.merge(metric_agg, %{ - "mean_#{key}" => stats.mean, - "std_#{key}" => stats.std, - "min_#{key}" => stats.min, - "max_#{key}" => stats.max - }) - - Map.put(acc, metric, updated) - end) - end - - defp compute_stats([]), do: %{mean: 0.0, std: 0.0, min: 0.0, max: 0.0} - - defp compute_stats(values) do - n = length(values) - mean = Enum.sum(values) / n - sum_squares = Enum.reduce(values, 0.0, fn v, acc -> acc + (v - mean) ** 2 end) - variance = sum_squares / n - std = :math.sqrt(variance) - - %{ - mean: Float.round(mean * 1.0, 4), - std: Float.round(std * 1.0, 4), - min: Float.round(Enum.min(values) * 1.0, 4), - max: Float.round(Enum.max(values) * 1.0, 4) - } - end -end diff --git a/lib/codeqa/ast/classification/node_classifier.ex b/lib/codeqa/ast/classification/node_classifier.ex new file mode 100644 index 00000000..7a71e584 --- /dev/null +++ b/lib/codeqa/ast/classification/node_classifier.ex @@ -0,0 +1,142 @@ +defmodule CodeQA.AST.Classification.NodeClassifier do + @moduledoc """ + Classifies a Node into a typed struct by running classification signals + over its tokens and weighing their votes. + + ## How it works + + Six classification signals scan the node's token stream in parallel via + `SignalStream`. Each signal emits weighted votes (e.g. `{:function_vote, 3}`) + when it detects a pattern indicating a node type. The classifier sums weights + per type and picks the winner. Ties and no-votes fall back to `:code`. + + ## Signals and votes + + | Signal | Vote key | Patterns detected | + |---|---|---| + | `DocSignal` | `:doc_vote` | `` token anywhere | + | `AttributeSignal` | `:attribute_vote` | `@name` at indent 0 | + | `FunctionSignal` | `:function_vote` | `def`, `func`, `fn`, etc. at indent 0 | + | `ModuleSignal` | `:module_vote` | `defmodule`, `class`, `module`, etc. at indent 0 | + | `ImportSignal` | `:import_vote` | `import`, `use`, `alias`, etc. at indent 0 | + | `TestSignal` | `:test_vote` | `test`, `describe`, `it`, etc. 
at indent 0 | + + ## Weights + + Weight 3 = first keyword seen (strong match); weight 1 = keyword later in + block (weak match, e.g. after a leading comment). `DocSignal` always emits + weight 3 and wins when a `` token is present, since triple-quoted strings + are unambiguous. + + ## Type-specific fields + + `FunctionNode.name/arity/visibility`, `ModuleNode.name/kind`, etc. all default + to `nil`. Population of those fields is left to a future enrichment pass. + """ + + alias CodeQA.AST.Enrichment.Node + + alias CodeQA.AST.Nodes.{ + AttributeNode, + CodeNode, + DocNode, + FunctionNode, + ImportNode, + ModuleNode, + TestNode + } + + alias CodeQA.AST.Parsing.SignalStream + + alias CodeQA.AST.Signals.Classification.{ + AttributeSignal, + DocSignal, + FunctionSignal, + ImportSignal, + ModuleSignal, + TestSignal + } + + @classification_signals [ + %DocSignal{}, + %AttributeSignal{}, + %FunctionSignal{}, + %ModuleSignal{}, + %ImportSignal{}, + %TestSignal{} + ] + + @type_modules %{ + doc: DocNode, + attribute: AttributeNode, + function: FunctionNode, + module: ModuleNode, + import: ImportNode, + test: TestNode, + code: CodeNode + } + + @doc """ + Classify a Node into the most specific typed struct. + + Runs classification signals, weighs votes, and delegates to the winning + struct's `cast/1` to build the result. Type-specific fields default to nil. + """ + @spec classify(Node.t(), module()) :: term() + def classify(%Node{} = node, lang_mod), do: classify(node, lang_mod, nil) + + @doc """ + Classify a Node, optionally seeded with `parent_context` tokens that come + immediately before the node in its parent block. + + Used for sub-blocks the bracket/keyword splitter has carved out of a parent: + e.g. the `{Bar, Baz}` of a multi-line `alias Foo.{Bar, Baz}`. Without the + context, the sub-block lacks the `alias` keyword and falls back to `:code`. + Prepending the parent's last line of tokens gives the existing classification + signals enough to vote correctly. 
+ """ + @spec classify(Node.t(), module(), [CodeQA.AST.Lexing.Token.t()] | nil) :: term() + def classify(%Node{} = node, lang_mod, parent_context) do + tokens = prepend_context(node.tokens, parent_context) + type = vote(tokens, lang_mod) + @type_modules[type].cast(node) + end + + defp prepend_context(tokens, nil), do: tokens + defp prepend_context(tokens, []), do: tokens + defp prepend_context(tokens, ctx) when is_list(ctx), do: ctx ++ tokens + + defp vote(tokens, lang_mod) do + tokens + |> run_signals(lang_mod) + |> tally() + |> winner() + end + + defp run_signals(tokens, lang_mod) do + SignalStream.run(tokens, @classification_signals, lang_mod) + |> List.flatten() + |> Enum.filter(fn {_src, group, _name, _val} -> group == :classification end) + end + + defp tally(emissions) do + Enum.reduce(emissions, %{}, fn {_src, _grp, name, weight}, acc -> + Map.update(acc, name, weight, &(&1 + weight)) + end) + end + + defp winner(votes) when map_size(votes) == 0, do: :code + + defp winner(votes) do + {vote_name, _weight} = Enum.max_by(votes, fn {_, w} -> w end) + vote_to_type(vote_name) + end + + defp vote_to_type(:doc_vote), do: :doc + defp vote_to_type(:attribute_vote), do: :attribute + defp vote_to_type(:function_vote), do: :function + defp vote_to_type(:module_vote), do: :module + defp vote_to_type(:import_vote), do: :import + defp vote_to_type(:test_vote), do: :test + defp vote_to_type(_), do: :code +end diff --git a/lib/codeqa/ast/classification/node_protocol.ex b/lib/codeqa/ast/classification/node_protocol.ex new file mode 100644 index 00000000..fa4943df --- /dev/null +++ b/lib/codeqa/ast/classification/node_protocol.ex @@ -0,0 +1,29 @@ +defprotocol CodeQA.AST.Classification.NodeProtocol do + @moduledoc """ + Common interface for all typed AST node structs. + + All node struct types (CodeNode, DocNode, FunctionNode, etc.) implement this + protocol, allowing downstream code to work with any node type uniformly. 
+ """ + + @spec tokens(t()) :: [term()] + def tokens(node) + + @spec flat_tokens(t()) :: [term()] + def flat_tokens(node) + + @spec line_count(t()) :: non_neg_integer() + def line_count(node) + + @spec children(t()) :: [term()] + def children(node) + + @spec start_line(t()) :: non_neg_integer() | nil + def start_line(node) + + @spec end_line(t()) :: non_neg_integer() | nil + def end_line(node) + + @spec label(t()) :: term() | nil + def label(node) +end diff --git a/lib/codeqa/ast/classification/node_type_detector.ex b/lib/codeqa/ast/classification/node_type_detector.ex new file mode 100644 index 00000000..50383713 --- /dev/null +++ b/lib/codeqa/ast/classification/node_type_detector.ex @@ -0,0 +1,20 @@ +defmodule CodeQA.AST.Classification.NodeTypeDetector do + @moduledoc """ + Classifies a list of raw `Node` structs (from `Parser`) into typed structs. + + Each node is classified by `NodeClassifier`, which runs classification signals + over the node's tokens and picks the highest-voted type. See `NodeClassifier` + for the full list of signals and their weights. + """ + + alias CodeQA.AST.Classification.NodeClassifier + alias CodeQA.AST.Enrichment.Node + + @doc """ + Classify each node in the list into the most specific typed struct. + """ + @spec detect_types([Node.t()], module()) :: [term()] + def detect_types(blocks, lang_mod) do + Enum.map(blocks, &NodeClassifier.classify(&1, lang_mod)) + end +end diff --git a/lib/codeqa/ast/classification/typed_node_kind.ex b/lib/codeqa/ast/classification/typed_node_kind.ex new file mode 100644 index 00000000..4993ee1d --- /dev/null +++ b/lib/codeqa/ast/classification/typed_node_kind.ex @@ -0,0 +1,24 @@ +defmodule CodeQA.AST.Classification.TypedNodeKind do + @moduledoc "Maps a typed node struct from `NodeClassifier` to its kind atom." 
+ + alias CodeQA.AST.Nodes.{ + AttributeNode, + CodeNode, + DocNode, + FunctionNode, + ImportNode, + ModuleNode, + TestNode + } + + @type kind :: :doc | :attribute | :function | :module | :import | :test | :code + + @spec of(struct()) :: kind() + def of(%DocNode{}), do: :doc + def of(%AttributeNode{}), do: :attribute + def of(%FunctionNode{}), do: :function + def of(%ModuleNode{}), do: :module + def of(%ImportNode{}), do: :import + def of(%TestNode{}), do: :test + def of(%CodeNode{}), do: :code +end diff --git a/lib/codeqa/ast/enrichment/compound_node.ex b/lib/codeqa/ast/enrichment/compound_node.ex new file mode 100644 index 00000000..88a594c2 --- /dev/null +++ b/lib/codeqa/ast/enrichment/compound_node.ex @@ -0,0 +1,41 @@ +defmodule CodeQA.AST.Enrichment.CompoundNode do + @moduledoc """ + Groups semantically related typed nodes together. + + A compound node represents a complete "unit" in source code — combining + documentation, type annotations, and implementation: + + - `docs` — [DocNode.t()] (triple-quoted docstrings) + - `typespecs` — [AttributeNode.t()] (@spec, @type, etc.) + - `code` — [Node.t()] with type :code (implementation clauses) + + Boundaries span all constituent nodes in source order (docs → typespecs → + code), with leading/trailing whitespace tokens stripped. Column values are + read from the `col` field of the relevant Token structs — Node has no col + fields. + + A bare code node with no preceding docs/typespecs is still wrapped in a + CompoundNode (with empty `docs` and `typespecs`). 
+ """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Nodes.AttributeNode + + defstruct docs: [], + typespecs: [], + code: [], + start_line: nil, + start_col: nil, + end_line: nil, + end_col: nil + + @type t :: %__MODULE__{ + docs: [Node.t()], + typespecs: [AttributeNode.t()], + code: [Node.t()], + start_line: non_neg_integer() | nil, + start_col: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + end_col: non_neg_integer() | nil + } +end diff --git a/lib/codeqa/ast/enrichment/compound_node_builder.ex b/lib/codeqa/ast/enrichment/compound_node_builder.ex new file mode 100644 index 00000000..27c61659 --- /dev/null +++ b/lib/codeqa/ast/enrichment/compound_node_builder.ex @@ -0,0 +1,157 @@ +defmodule CodeQA.AST.Enrichment.CompoundNodeBuilder do + @moduledoc """ + Groups typed nodes into CompoundNode structs. + + A new compound starts when: + 1. A :doc or :typespec node appears after at least one :code node + 2. The trailing whitespace of the previous node contains 2+ tokens + + All consecutive :code nodes with no boundary between them accumulate + into the same compound's `code` list. + + Sub-blocks of :code nodes that have type :doc or :typespec are + promoted to the compound's `docs`/`typespecs` lists. + """ + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Enrichment.CompoundNode + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken} + alias CodeQA.AST.Nodes.{AttributeNode, DocNode} + + @doc """ + Groups a list of typed nodes into CompoundNode structs. + """ + @spec build([Node.t()]) :: [CompoundNode.t()] + def build([]), do: [] + + def build(blocks) do + # Accumulator: {current_compound, prev_trailing_ws, finalized_compounds} + # prev_trailing_ws carries the trailing / tokens stripped from the + # PREVIOUS node. 
Blank-line boundaries are detected on the previous node's + # trailing whitespace — BlankLineRule places blank-line tokens at the + # END of the node that precedes the split, not at the start of the new one. + {current, _, compounds} = + Enum.reduce(blocks, {empty_compound(), [], []}, fn block, + {current, prev_trailing_ws, acc} -> + {content_tokens, trailing_ws} = split_trailing_whitespace(block.tokens) + clean_block = %{block | tokens: content_tokens} + # Check the PREVIOUS node's trailing whitespace for blank-line boundary + blank_boundary = blank_line_boundary?(prev_trailing_ws) + + cond do + # Rule 1: doc/typespec after code → flush and start new compound + (is_struct(block, DocNode) or is_struct(block, AttributeNode)) and current.code != [] -> + {start_compound(clean_block), trailing_ws, [finalize(current) | acc]} + + # Rule 2: blank-line boundary on previous node → flush and start fresh + blank_boundary and not empty_compound?(current) -> + {start_compound(clean_block), trailing_ws, [finalize(current) | acc]} + + # No boundary — accumulate into current + true -> + {add_block(current, clean_block), trailing_ws, acc} + end + end) + + compounds + |> then(fn acc -> + if empty_compound?(current), do: acc, else: [finalize(current) | acc] + end) + |> Enum.reverse() + end + + defp empty_compound, do: %CompoundNode{} + + defp empty_compound?(%CompoundNode{docs: [], typespecs: [], code: []}), do: true + defp empty_compound?(_), do: false + + defp add_block(%CompoundNode{} = compound, block) when is_struct(block, DocNode) do + %CompoundNode{compound | docs: compound.docs ++ [block]} + end + + defp add_block(%CompoundNode{} = compound, block) when is_struct(block, AttributeNode) do + %CompoundNode{compound | typespecs: compound.typespecs ++ [block]} + end + + defp add_block(%CompoundNode{} = compound, block) do + {promoted_docs, promoted_specs, clean_children} = promote_sub_blocks(block.children) + clean_block = %{block | children: clean_children} + + %CompoundNode{ + 
compound + | code: compound.code ++ [clean_block], + docs: compound.docs ++ promoted_docs, + typespecs: compound.typespecs ++ promoted_specs + } + end + + defp start_compound(new_block) do + add_block(empty_compound(), new_block) + end + + # Separates children by type — :doc/:typespec go up to the compound level. + defp promote_sub_blocks(children) do + Enum.reduce(children, {[], [], []}, fn sub, {docs, specs, code} -> + case sub.type do + :doc -> {docs ++ [sub], specs, code} + :typespec -> {docs, specs ++ [sub], code} + _ -> {docs, specs, code ++ [sub]} + end + end) + end + + # Strips trailing / tokens from a node's token list. + # Returns {content_tokens, trailing_ws_tokens}. + defp split_trailing_whitespace(tokens) do + last_content_idx = + tokens + |> Enum.with_index() + |> Enum.reverse() + |> Enum.find_index(fn {t, _} -> + not is_map(t) or t.kind not in [WhitespaceToken.kind(), NewlineToken.kind()] + end) + + case last_content_idx do + nil -> + {[], tokens} + + rev_idx -> + content_len = length(tokens) - rev_idx + {Enum.slice(tokens, 0, content_len), Enum.slice(tokens, content_len..-1//1)} + end + end + + # A blank-line boundary exists when the trailing whitespace contains 3+ tokens + # (i.e. 2+ blank lines). A single blank line (2 NLs: end-of-line + blank line) is + # common within a compound (e.g. between function clauses) and does not split. + defp blank_line_boundary?(trailing_ws) do + Enum.count(trailing_ws, &(&1.kind == NewlineToken.kind())) >= 3 + end + + # Computes boundaries from all constituent nodes in source order: + # docs → typespecs → code. Reads col directly from Token structs. 
+ defp finalize(%CompoundNode{} = compound) do + all_blocks = compound.docs ++ compound.typespecs ++ compound.code + all_tokens = Enum.flat_map(all_blocks, &NodeProtocol.flat_tokens/1) + + first_token = + Enum.find( + all_tokens, + &(is_map(&1) and &1.kind not in [WhitespaceToken.kind(), NewlineToken.kind()]) + ) + + last_token = + all_tokens + |> Enum.reverse() + |> Enum.find(&(is_map(&1) and &1.kind not in [WhitespaceToken.kind(), NewlineToken.kind()])) + + %CompoundNode{ + compound + | start_line: first_token && first_token.line, + start_col: first_token && first_token.col, + end_line: last_token && last_token.line, + end_col: last_token && last_token.col + } + end +end diff --git a/lib/codeqa/ast/enrichment/node.ex b/lib/codeqa/ast/enrichment/node.ex new file mode 100644 index 00000000..65e4b23d --- /dev/null +++ b/lib/codeqa/ast/enrichment/node.ex @@ -0,0 +1,70 @@ +defmodule CodeQA.AST.Enrichment.Node do + @moduledoc """ + A detected code node with optional nested sub-blocks. + + ## Fields + + - `tokens` — aggregated code content: for leaf nodes, the original token stream; + for non-leaf nodes, the flat concatenation of all children's `tokens`. + Use this for content comparison and metrics. + - `line_count` — number of source lines spanned by this node: `end_line - start_line + 1` + when both are available, else `1`. + - `children` — nested `Node.t()` structs detected by enclosure rules + (`BracketRule`, `ColonIndentationRule`). + - `label` — arbitrary term attached by the caller. Set to `"path:start_line"` + by `NearDuplicateBlocks.analyze/2` for human-readable pair reporting. + - `start_line` — 1-based line number of the first token in this node, populated by + `Parser` from `List.first(tokens).line`. + - `end_line` — 1-based line number of the last token in this node, populated by + `Parser` from `List.last(tokens).line`. + + `start_line` and `end_line` may be `nil` for synthetic nodes created in tests + without line metadata. 
+ """ + + @enforce_keys [:tokens, :line_count, :children] + defstruct [ + :tokens, + :line_count, + :children, + :label, + :start_line, + :end_line, + type: :code + ] + + @type t :: %__MODULE__{ + tokens: [CodeQA.AST.Lexing.Token.t()], + line_count: non_neg_integer(), + children: [term()], + label: term() | nil, + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + type: :code | :doc | :typespec + } + + @spec children_count(t()) :: non_neg_integer() + def children_count(%__MODULE__{children: ch}), do: length(ch) + + # Keep old name as deprecated alias during transition + @spec sub_block_count(t()) :: non_neg_integer() + def sub_block_count(%__MODULE__{children: ch}), do: length(ch) + + @spec token_count(t()) :: non_neg_integer() + def token_count(%__MODULE__{tokens: tokens}), do: length(tokens) +end + +defimpl CodeQA.AST.Classification.NodeProtocol, for: CodeQA.AST.Enrichment.Node do + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &CodeQA.AST.Classification.NodeProtocol.flat_tokens/1) + end +end diff --git a/lib/codeqa/ast/enrichment/node_analyzer.ex b/lib/codeqa/ast/enrichment/node_analyzer.ex new file mode 100644 index 00000000..2f6221cc --- /dev/null +++ b/lib/codeqa/ast/enrichment/node_analyzer.ex @@ -0,0 +1,65 @@ +defmodule CodeQA.AST.Enrichment.NodeAnalyzer do + @moduledoc """ + Extracts locally bound variable names from a token list. + + Used by the domain tagger to subtract local bindings from the domain signal — + a variable bound within a node (e.g. `user = Repo.get!(id)`) is not a domain + reference and should not appear in the node's domain fingerprint. 
+ """ + + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.Token + + @doc """ + Returns a MapSet of lowercase identifier names that are locally bound + within the given token list. + + Detected patterns: + - ` "="` — simple assignment (guards against `==`, `=>`, `=~`, `!=`, `<=`, `>=`) + - ` "<-"` — with/for binding (all `` tokens on the LHS of `<-`) + + Function parameters are NOT extracted here (see `param_variables/1`). + """ + @spec bound_variables([Token.t()]) :: MapSet.t(String.t()) + def bound_variables(tokens) do + MapSet.union( + assignment_bindings(tokens), + arrow_bindings(tokens) + ) + end + + # Collect `` immediately before `=` + defp assignment_bindings(tokens) do + tokens + |> Enum.chunk_every(2, 1, :discard) + |> Enum.flat_map(fn + [%Token{kind: "", content: name}, %Token{kind: "="}] -> + [String.downcase(name)] + + _ -> + [] + end) + |> MapSet.new() + end + + # Collect all `` tokens on the LHS of `<-` (within the same line). + # Resets the accumulator on `` so RHS tokens from prior lines don't leak. + defp arrow_bindings(tokens) do + tokens + |> Enum.reduce({[], MapSet.new()}, fn + %Token{kind: "<-"}, {lhs_ids, acc} -> + new_bindings = lhs_ids |> Enum.map(&String.downcase/1) |> MapSet.new() + {[], MapSet.union(acc, new_bindings)} + + %NewlineToken{}, {_, acc} -> + {[], acc} + + %Token{kind: "", content: name}, {lhs_ids, acc} -> + {[name | lhs_ids], acc} + + _, {lhs_ids, acc} -> + {lhs_ids, acc} + end) + |> elem(1) + end +end diff --git a/lib/codeqa/ast/lexing/newline_token.ex b/lib/codeqa/ast/lexing/newline_token.ex new file mode 100644 index 00000000..2ccb7129 --- /dev/null +++ b/lib/codeqa/ast/lexing/newline_token.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.AST.Lexing.NewlineToken do + @moduledoc """ + A newline token emitted by `TokenNormalizer.normalize_structural/1`. + + Represents a `\\n` line boundary between two source lines. + + ## Fields + + - `kind` — always `""`. + - `content` — the original newline character (`"\\n"`). 
+ - `line` — 1-based line number of the line that ends here. + - `col` — 0-based byte offset of the newline within that line. + """ + + @kind "" + + defstruct [:content, :line, :col, kind: @kind] + + @doc "Returns the normalized kind string for newline tokens." + @spec kind() :: String.t() + def kind, do: @kind + + @type t :: %__MODULE__{ + kind: String.t(), + content: String.t(), + line: non_neg_integer() | nil, + col: non_neg_integer() | nil + } +end diff --git a/lib/codeqa/ast/lexing/string_token.ex b/lib/codeqa/ast/lexing/string_token.ex new file mode 100644 index 00000000..04fc2963 --- /dev/null +++ b/lib/codeqa/ast/lexing/string_token.ex @@ -0,0 +1,49 @@ +defmodule CodeQA.AST.Lexing.StringToken do + @moduledoc """ + A string token emitted by `TokenNormalizer` for all string literals, + including triple-quoted heredocs. + + ## Fields + + - `kind` — `""` for single-line strings, `""` for + triple-quoted heredoc delimiters. + - `content` — original source text (the full quoted literal or delimiter). + - `line`, `col` — source location. + - `interpolations` — list of interpolation expressions (`nil` for plain strings). + - `multiline` — `true` for triple-quoted (`\"\"\"` / `'''`) tokens. + - `quotes` — `:double`, `:single`, or `:backtick`. + """ + + @kind "" + @doc_kind "" + + defstruct [ + :content, + :line, + :col, + kind: @kind, + interpolations: nil, + multiline: false, + quotes: :double + ] + + @doc "Returns the normalized kind string for single-line string tokens." + @spec kind() :: String.t() + def kind, do: @kind + + @doc "Returns the normalized kind string for triple-quoted doc string tokens." 
+ @spec doc_kind() :: String.t() + def doc_kind, do: @doc_kind + + @type quotes :: :double | :single | :backtick + + @type t :: %__MODULE__{ + content: String.t(), + line: non_neg_integer() | nil, + col: non_neg_integer() | nil, + kind: String.t(), + interpolations: [String.t()] | nil, + multiline: boolean(), + quotes: quotes() + } +end diff --git a/lib/codeqa/ast/lexing/token.ex b/lib/codeqa/ast/lexing/token.ex new file mode 100644 index 00000000..f705f7e6 --- /dev/null +++ b/lib/codeqa/ast/lexing/token.ex @@ -0,0 +1,45 @@ +defmodule CodeQA.AST.Lexing.Token do + @moduledoc """ + A single token emitted by `TokenNormalizer.normalize_structural/1`. + + ## Fields + + - `value` — normalized form used for structural comparison: ``, ``, + ``, ``, ``, or the literal character(s) for + punctuation and operators. + - `content` — original source text before normalization. Identical to `value` + for punctuation/structural tokens; differs for identifiers, + strings, and numbers. Enables source reconstruction and is the + correct field to check when matching declaration keywords. + - `line` — 1-based line number in the source file. + - `col` — 0-based byte offset from the start of the line. + + String literals are emitted as `StringToken` structs, not `Token`, so that + the `interpolations` field does not pollute the common token shape. + + ## Design notes (from tree-sitter, ctags, lizard) + + - **value vs content split** — mirrors tree-sitter's distinction between a + node's `type` (structural kind) and its `text` (original source). `value` + is the kind used for pattern matching and comparison; `content` is the + original text used for reporting and reconstruction. + - **Normalization lives in value, not content** — `content` is never modified. + This means two tokens with different `content` but the same `value` (e.g. + `"foo"` and `"bar"` both normalizing to ``) are structurally equivalent + for duplicate detection but distinguishable for reporting. 
+ - **Line + col for precise location** — ctags records line numbers; tree-sitter + records byte ranges. We store both line (for human-readable reporting) and + col (for IDE navigation and sub-block start/end precision). + - **No enforcement on line/col** — synthetic tokens created in tests may omit + line/col. Consumers that need location data should guard for nil. + """ + + defstruct [:kind, :content, :line, :col] + + @type t :: %__MODULE__{ + kind: String.t(), + content: String.t(), + line: non_neg_integer() | nil, + col: non_neg_integer() | nil + } +end diff --git a/lib/codeqa/ast/lexing/token_normalizer.ex b/lib/codeqa/ast/lexing/token_normalizer.ex new file mode 100644 index 00000000..5cabba3a --- /dev/null +++ b/lib/codeqa/ast/lexing/token_normalizer.ex @@ -0,0 +1,263 @@ +defmodule CodeQA.AST.Lexing.TokenNormalizer do + @moduledoc """ + Abstracts raw source code into language-agnostic structural tokens. + + See [lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis). + """ + + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.StringToken + alias CodeQA.AST.Lexing.Token + alias CodeQA.AST.Lexing.WhitespaceToken + + @doc """ + Normalizes source code into language-agnostic structural tokens, preserving + newlines as `` and leading whitespace as `` tokens (one per + 2-space / 1-tab indentation unit). + + Returns `[Token.t()]` where each token carries its normalized `value`, + original source `content`, 1-based `line` number, and 0-based `col` offset. + Used for structural block detection. 
+ """ + @spec normalize_structural(String.t()) :: [Token.t()] + def normalize_structural(code) do + code = String.replace(code, ~r/[^\x00-\x7F]/, " ") + lines = String.split(code, "\n") + last_idx = length(lines) - 1 + + lines + |> Enum.with_index() + |> Enum.flat_map(fn {line, idx} -> + tokens_with_newline(line, idx, last_idx) + end) + end + + defp tokens_with_newline(line, idx, last_idx) do + line_num = idx + 1 + {tokens, last_token} = tokenize_line(line, line_num) + + if idx < last_idx do + # last_token is tracked during scanning — O(1) vs List.last/1 which is O(N). + nl_col = + case last_token do + nil -> 0 + t -> t.col + String.length(t.content) + end + + tokens ++ [%NewlineToken{content: "\n", line: line_num, col: nl_col}] + else + tokens + end + end + + # Returns {tokens, last_token} where last_token is the final token on the line + # (or nil for an empty line), allowing normalize_structural to compute nl_col + # in O(1) without calling List.last/1. + defp tokenize_line(line, line_num) do + indent_chars = + line + |> String.graphemes() + |> Enum.take_while(&(&1 in [" ", "\t"])) + + indent_units = + indent_chars + |> Enum.reduce(0, fn + "\t", acc -> acc + 2 + " ", acc -> acc + 1 + end) + |> div(2) + + indent_col_width = length(indent_chars) + + ws_tokens = + for i <- 1..indent_units//1 do + %WhitespaceToken{content: " ", line: line_num, col: (i - 1) * 2} + end + + content = String.slice(line, indent_col_width..-1//1) + {content_tokens, last_content} = scan_content(content, line_num, indent_col_width) + + # Last token on the line: prefer the last content token; fall back to the + # last WS token (only possible when the content portion is empty). + last_token = last_content || List.last(ws_tokens) + + {ws_tokens ++ content_tokens, last_token} + end + + # Multi-char operators matched longest-first so that e.g. `===` beats `==`. 
# Tagged `:literal` so `next_token` uses the matched text as both kind and content
# (the other rules carry a fixed kind string and keep the matched text in `content`).
@operator_regex ~r/^(?:===|!==|<=>|==|!=|<=|>=|\|>|<>|<-|->|=>|=~|!~|&&|\|\||\?\?|\?\.|:=|::|\.\.\.|\.\.|--|\+\+|\*\*|\/\/|\+=|-=|\*=|\/=|%=)/

# --- Individual rule atoms so dispatch groups can reference them directly ---
#
# Each rule is `{tag, regex}`. `:skip` and `:literal` are handled specially by
# `next_token`; any other tag becomes the token kind and is handed to `postprocess`.
#
# NOTE(review): every string tag below is the empty string in this copy, which
# means the `postprocess/2` clauses cannot tell the rules apart — confirm the
# intended kind names against the repository before merging.
@skip_rule {:skip, ~r/^\s+/}
@operator_rule {:literal, @operator_regex}
@trip_quotes_rule {"", ~r/^"""|^'''/}

# Double-quoted string containing at least one `#{...}`. A bare `#` that does
# not open an interpolation is accepted via `#(?!\{)`.
@str_interp_rule {"", ~r/^"(?=[^"]*#\{)(?:[^"\\#]|\\.|#(?!\{)|#\{[^}]*\})*"/}

# Double-quoted string containing at least one `${...}`. A bare `$` that does
# not open an interpolation is accepted via `\$(?!\{)`. (This alternative was
# previously written `\\$(?!\{)` — a backslash followed by the end-of-subject
# anchor — so strings such as "cost $5 ${x}" never matched this rule.)
@str_dollar_interp_rule {"",
                         ~r/^"(?=[^"]*\$\{)(?:[^"\\$]|\\.|\$(?!\{)|\$\{[^}]*\})*"/}

# Double-quoted string containing at least one `\(`.
@str_swift_interp_rule {"", ~r/^"(?=[^"]*\\\()(?:[^"\\]|\\.)*"/}

# Plain double- or single-quoted string with backslash escapes.
@str_rule {"", ~r/^"(?:[^"\\]|\\.)*"|^'(?:[^'\\]|\\.)*'/}

# Backtick string containing at least one `${...}`; same bare-`$` handling as
# the double-quoted variant above.
@backtick_interp_rule {"",
                       ~r/^`(?=[^`]*\$\{)(?:[^`\\$]|\\.|\$(?!\{)|\$\{[^}]*\})*`/}
@backtick_str_rule {"", ~r/^`(?:[^`\\]|\\.)*`/}
@num_rule {"", ~r/^\d+(?:\.\d+)?/}
@id_rule {"", ~r/^[a-zA-Z_]\w*/}

# Dispatch rule subsets by first character so the common cases (identifiers,
# numbers, whitespace, operators) skip irrelevant regex attempts entirely.
# Order matters inside each list: the first matching rule wins.
@double_quote_rules [
  @trip_quotes_rule,
  @str_interp_rule,
  @str_dollar_interp_rule,
  @str_swift_interp_rule,
  @str_rule
]
@single_quote_rules [@trip_quotes_rule, @str_rule]
@backtick_rules [@backtick_interp_rule, @backtick_str_rule]

# Returns the rule subset for the given first byte (ASCII codepoint).
+ defp dispatch_rules(?"), do: @double_quote_rules + defp dispatch_rules(?'), do: @single_quote_rules + defp dispatch_rules(?`), do: @backtick_rules + defp dispatch_rules(c) when c >= ?0 and c <= ?9, do: [@num_rule] + + defp dispatch_rules(c) + when (c >= ?a and c <= ?z) or (c >= ?A and c <= ?Z) or c == ?_, + do: [@id_rule] + + defp dispatch_rules(c) + when c in [?=, ?!, ?<, ?>, ?|, ?&, ??, ?:, ?., ?-, ?+, ?*, ?/, ?%], + do: [@operator_rule] + + defp dispatch_rules(c) when c <= 32, do: [@skip_rule] + + # Unknown first char — no rule applies; caller falls through to single-char token. + defp dispatch_rules(_), do: [] + + # Returns {tokens, last_token_or_nil} — last_token is tracked during scanning + # so callers get O(1) access to the final token without List.last/1. + defp scan_content(text, line_num, col_offset) do + {reversed, last} = do_scan(text, line_num, col_offset, [], nil) + {Enum.reverse(reversed), last} + end + + defp do_scan("", _line, _col, acc, last), do: {acc, last} + + defp do_scan(<> = text, line, col, acc, last) do + case next_token(first, text, line, col) do + {:skip, rest, advance} -> do_scan(rest, line, col + advance, acc, last) + {token, rest, advance} -> do_scan(rest, line, col + advance, [token | acc], token) + end + end + + # next_token/4: dispatches on the first byte to select only candidate rules, + # avoiding regex attempts for rules whose first-char pattern can't possibly match. 
defp next_token(first, text, line, col) do
  # Try only the rules registered for this first byte; the first regex that
  # matches wins and yields {rule_tag, matched_text}.
  first
  |> dispatch_rules()
  |> Enum.find_value(fn {tag, regex} ->
    case Regex.run(regex, text) do
      [lexeme | _] -> {tag, lexeme}
      nil -> nil
    end
  end)
  |> build_scan_step(text, line, col)
end

# Turns a rule match (or `nil` when nothing matched) into the
# `{token | :skip, remaining_text, columns_consumed}` triple used by the scanner.
defp build_scan_step(nil, text, line, col) do
  # No rule matched — emit the first character as a literal single-char token.
  char = String.first(text)
  {%Token{kind: char, content: char, line: line, col: col}, String.slice(text, 1..-1//1), 1}
end

defp build_scan_step({tag, lexeme}, text, line, col) do
  width = String.length(lexeme)
  remaining = String.slice(text, width..-1//1)

  case tag do
    :skip ->
      {:skip, remaining, width}

    :literal ->
      {%Token{kind: lexeme, content: lexeme, line: line, col: col}, remaining, width}

    kind ->
      token = postprocess(kind, %Token{kind: kind, content: lexeme, line: line, col: col})
      {token, remaining, width}
  end
end

# Extract #{...} interpolation expressions into `interpolations` and strip
# them from `content` so downstream consumers see only the static string parts.
# Nested braces (e.g. #{foo(%{a: 1})}) are left as-is in content — the
# lookahead in the scan rule ensures a match only when simple interpolations
# are present.
+ defp postprocess("", token), + do: extract_interpolations(token, ~r/#\{([^}]*)\}/, ~r/#\{[^}]*\}/, quotes: :double) + + defp postprocess("", token), + do: extract_interpolations(token, ~r/\$\{([^}]*)\}/, ~r/\$\{[^}]*\}/, quotes: :double) + + defp postprocess("", token), + do: extract_interpolations(token, ~r/\\\(([^)]*)\)/, ~r/\\\([^)]*\)/, quotes: :double) + + defp postprocess("", token), + do: extract_interpolations(token, ~r/\$\{([^}]*)\}/, ~r/\$\{[^}]*\}/, quotes: :backtick) + + defp postprocess("", %Token{content: ~s(""")} = token), + do: %StringToken{ + kind: StringToken.doc_kind(), + content: token.content, + line: token.line, + col: token.col, + multiline: true, + quotes: :double + } + + defp postprocess("", token), + do: %StringToken{ + kind: StringToken.doc_kind(), + content: token.content, + line: token.line, + col: token.col, + multiline: true, + quotes: :single + } + + defp postprocess("", token), + do: %StringToken{ + kind: StringToken.kind(), + content: token.content, + line: token.line, + col: token.col, + quotes: :backtick + } + + defp postprocess("", token) do + quotes = if String.starts_with?(token.content, "\""), do: :double, else: :single + + %StringToken{ + kind: StringToken.kind(), + content: token.content, + line: token.line, + col: token.col, + quotes: quotes + } + end + + defp postprocess(_value, token), do: token + + defp extract_interpolations(token, capture_regex, strip_regex, opts) do + quotes = Keyword.get(opts, :quotes, :double) + + interpolations = + Regex.scan(capture_regex, token.content, capture: :all_but_first) + |> Enum.map(fn [expr] -> String.trim(expr) end) + + %StringToken{ + content: String.replace(token.content, strip_regex, ""), + line: token.line, + col: token.col, + interpolations: interpolations, + quotes: quotes + } + end +end diff --git a/lib/codeqa/ast/lexing/token_protocol.ex b/lib/codeqa/ast/lexing/token_protocol.ex new file mode 100644 index 00000000..e38458d7 --- /dev/null +++ 
b/lib/codeqa/ast/lexing/token_protocol.ex @@ -0,0 +1,59 @@ +defprotocol CodeQA.AST.Lexing.TokenProtocol do + @moduledoc """ + Protocol for token structs emitted by `TokenNormalizer`. + + Both `Token` and `StringToken` implement this protocol, so code that + processes token streams does not need to branch on the concrete struct type. + + ## Functions + + - `kind/1` — normalized structural kind (``, ``, ``, …) + - `content/1` — original source text before normalization + - `line/1` — 1-based line number in the source file (`nil` for synthetic tokens) + - `col/1` — 0-based byte offset from the start of the line (`nil` for synthetic tokens) + """ + + @doc "Returns the normalized structural kind of the token." + @spec kind(t) :: String.t() + def kind(token) + + @doc "Returns the original source text of the token." + @spec content(t) :: String.t() + def content(token) + + @doc "Returns the 1-based line number of the token, or `nil` for synthetic tokens." + @spec line(t) :: non_neg_integer() | nil + def line(token) + + @doc "Returns the 0-based column offset of the token, or `nil` for synthetic tokens." 
+ @spec col(t) :: non_neg_integer() | nil + def col(token) +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.Token do + def kind(%CodeQA.AST.Lexing.Token{kind: k}), do: k + def content(%CodeQA.AST.Lexing.Token{content: c}), do: c + def line(%CodeQA.AST.Lexing.Token{line: l}), do: l + def col(%CodeQA.AST.Lexing.Token{col: c}), do: c +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.StringToken do + def kind(%CodeQA.AST.Lexing.StringToken{kind: k}), do: k + def content(%CodeQA.AST.Lexing.StringToken{content: c}), do: c + def line(%CodeQA.AST.Lexing.StringToken{line: l}), do: l + def col(%CodeQA.AST.Lexing.StringToken{col: c}), do: c +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.NewlineToken do + def kind(%CodeQA.AST.Lexing.NewlineToken{kind: k}), do: k + def content(%CodeQA.AST.Lexing.NewlineToken{content: c}), do: c + def line(%CodeQA.AST.Lexing.NewlineToken{line: l}), do: l + def col(%CodeQA.AST.Lexing.NewlineToken{col: c}), do: c +end + +defimpl CodeQA.AST.Lexing.TokenProtocol, for: CodeQA.AST.Lexing.WhitespaceToken do + def kind(%CodeQA.AST.Lexing.WhitespaceToken{kind: k}), do: k + def content(%CodeQA.AST.Lexing.WhitespaceToken{content: c}), do: c + def line(%CodeQA.AST.Lexing.WhitespaceToken{line: l}), do: l + def col(%CodeQA.AST.Lexing.WhitespaceToken{col: c}), do: c +end diff --git a/lib/codeqa/ast/lexing/whitespace_token.ex b/lib/codeqa/ast/lexing/whitespace_token.ex new file mode 100644 index 00000000..cb230827 --- /dev/null +++ b/lib/codeqa/ast/lexing/whitespace_token.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.AST.Lexing.WhitespaceToken do + @moduledoc """ + A whitespace/indentation token emitted by `TokenNormalizer.normalize_structural/1`. + + Represents one indentation unit (2 spaces or 1 tab) at the start of a line. + + ## Fields + + - `kind` — always `""`. + - `content` — the original source text for this indentation unit (`" "`). + - `line` — 1-based line number in the source file. 
+ - `col` — 0-based byte offset from the start of the line. + """ + + @kind "" + + defstruct [:content, :line, :col, kind: @kind] + + @doc "Returns the normalized kind string for whitespace tokens." + @spec kind() :: String.t() + def kind, do: @kind + + @type t :: %__MODULE__{ + kind: String.t(), + content: String.t(), + line: non_neg_integer() | nil, + col: non_neg_integer() | nil + } +end diff --git a/lib/codeqa/ast/nodes/attribute_node.ex b/lib/codeqa/ast/nodes/attribute_node.ex new file mode 100644 index 00000000..7dd106cb --- /dev/null +++ b/lib/codeqa/ast/nodes/attribute_node.ex @@ -0,0 +1,67 @@ +defmodule CodeQA.AST.Nodes.AttributeNode do + @moduledoc """ + AST node for fields, constants, decorators, annotations, and typespecs. + Subsumes the previous :typespec node type (kind: :typespec). + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken} + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :name, :kind] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + name: String.t() | nil, + kind: :field | :constant | :decorator | :annotation | :typespec | nil + } + + @typespec_attrs MapSet.new(~w[spec type typep opaque callback macrocallback]) + + @doc "Build an AttributeNode from a raw %Node{}, detecting :typespec kind from tokens." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label, + kind: detect_kind(node.tokens) + } + end + + defp detect_kind(tokens) do + tokens + |> Enum.drop_while(&(&1.kind in [WhitespaceToken.kind(), NewlineToken.kind()])) + |> case do + [%{kind: "@"}, %{kind: "", content: name} | _] -> + if MapSet.member?(@typespec_attrs, name), do: :typespec, else: nil + + _ -> + nil + end + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/code_node.ex b/lib/codeqa/ast/nodes/code_node.ex new file mode 100644 index 00000000..b7dfd9db --- /dev/null +++ b/lib/codeqa/ast/nodes/code_node.ex @@ -0,0 +1,46 @@ +defmodule CodeQA.AST.Nodes.CodeNode do + @moduledoc "Catch-all AST node for unclassified code blocks." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil + } + + @doc "Build a CodeNode from a raw %Node{}, copying all base fields." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/doc_node.ex b/lib/codeqa/ast/nodes/doc_node.ex new file mode 100644 index 00000000..5e011ca3 --- /dev/null +++ b/lib/codeqa/ast/nodes/doc_node.ex @@ -0,0 +1,46 @@ +defmodule CodeQA.AST.Nodes.DocNode do + @moduledoc "AST node for documentation strings and comment blocks." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil + } + + @doc "Build a DocNode from a raw %Node{}, copying all base fields." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/function_node.ex b/lib/codeqa/ast/nodes/function_node.ex new file mode 100644 index 00000000..48c6a5d4 --- /dev/null +++ b/lib/codeqa/ast/nodes/function_node.ex @@ -0,0 +1,59 @@ +defmodule CodeQA.AST.Nodes.FunctionNode do + @moduledoc "AST node for function, method, or callable definitions." + + alias CodeQA.AST.Enrichment.Node + + defstruct [ + :tokens, + :line_count, + :children, + :start_line, + :end_line, + :label, + :name, + :arity, + :visibility + ] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + name: String.t() | nil, + arity: non_neg_integer() | nil, + visibility: :public | :private | nil + } + + @doc "Build a FunctionNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/import_node.ex b/lib/codeqa/ast/nodes/import_node.ex new file mode 100644 index 00000000..3730370a --- /dev/null +++ b/lib/codeqa/ast/nodes/import_node.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.AST.Nodes.ImportNode do + @moduledoc "AST node for import, require, use, alias, or include statements." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :target] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + target: String.t() | nil + } + + @doc "Build an ImportNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/module_node.ex b/lib/codeqa/ast/nodes/module_node.ex new file mode 100644 index 00000000..c8d50723 --- /dev/null +++ b/lib/codeqa/ast/nodes/module_node.ex @@ -0,0 +1,48 @@ +defmodule CodeQA.AST.Nodes.ModuleNode do + @moduledoc "AST node for module, class, namespace, or struct definitions." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :name, :kind] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + name: String.t() | nil, + kind: :class | :module | :namespace | :struct | nil + } + + @doc "Build a ModuleNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/nodes/test_node.ex b/lib/codeqa/ast/nodes/test_node.ex new file mode 100644 index 00000000..b3460cf4 --- /dev/null +++ b/lib/codeqa/ast/nodes/test_node.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.AST.Nodes.TestNode do + @moduledoc "AST node for test cases, describe blocks, and it blocks." + + alias CodeQA.AST.Enrichment.Node + + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label, :description] + + @type t :: %__MODULE__{ + tokens: [term()], + line_count: non_neg_integer(), + children: [term()], + start_line: non_neg_integer() | nil, + end_line: non_neg_integer() | nil, + label: term() | nil, + description: String.t() | nil + } + + @doc "Build a TestNode from a raw %Node{}, copying all base fields. Type-specific fields default to nil." 
+ @spec cast(Node.t()) :: t() + def cast(%Node{} = node) do + %__MODULE__{ + tokens: node.tokens, + line_count: node.line_count, + children: node.children, + start_line: node.start_line, + end_line: node.end_line, + label: node.label + } + end + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end diff --git a/lib/codeqa/ast/parsing/parser.ex b/lib/codeqa/ast/parsing/parser.ex new file mode 100644 index 00000000..2615bb9a --- /dev/null +++ b/lib/codeqa/ast/parsing/parser.ex @@ -0,0 +1,234 @@ +defmodule CodeQA.AST.Parsing.Parser do + @moduledoc """ + Recursively parses a token stream into a nested node tree. + + Top-level nodes are found by splitting on blank lines and declaration keywords. + Each node is then recursively subdivided using enclosure rules (brackets, + colon-indentation) until no further subdivision is possible — forming an + arbitrarily-deep tree rather than a fixed two-level hierarchy. + + ## Recursive parsing algorithm + + `parse_block/3` is the recursive core: + + 1. Immediately create a `Node` spanning the whole token stream. + 2. Apply enclosure rules to find sub-candidate streams. + 3. **Idempotency check** — reject any enclosure that spans the entire stream + (e.g. `BracketRule` re-emitting its own input). This is the termination + condition: the node is a leaf when no strictly-smaller sub-candidates exist. + 4. Recursively call `parse_block/3` on each sub-candidate to produce children. + 5. Return the node with its children attached as `children`. 
+ + ## Design notes (from tree-sitter, ctags, lizard) + + - **Recursive hierarchy** — replaces the old two-level (top + one level of sub-blocks) + model with an N-level tree. Each call to `parse_block/3` mirrors tree-sitter's + recursive descent: emit the node, then recurse into its contents. + - **Language detection by extension** — `language_from_path/1` follows ctags' + convention of inferring language from file extension. + - **Rule extensibility** — enclosure rules are selected per language via + `sub_block_rules/1`. Rules are composable and order-independent. + - **Error recovery** — mismatched brackets and malformed indentation are silently + skipped by individual rules. The parser emits partial nodes rather than failing, + consistent with tree-sitter's error-recovery philosophy. + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken} + alias CodeQA.AST.Parsing.SignalStream + + alias CodeQA.AST.Signals.Structural.{ + BlankLineSignal, + BracketSignal, + ColonIndentSignal, + KeywordSignal, + TripleQuoteSignal + } + + alias CodeQA.Language + + @spec detect_blocks([CodeQA.AST.Lexing.Token.t()], module()) :: [Node.t()] + def detect_blocks([], _lang_mod), do: [] + + def detect_blocks(tokens, lang_mod) do + all_emissions = + SignalStream.run( + tokens, + [%TripleQuoteSignal{}, %BlankLineSignal{}, %KeywordSignal{}], + lang_mod + ) + |> List.flatten() + + triple_splits = + for {_, :split, :triple_split, v} <- all_emissions, do: v + + protected_ranges = compute_protected_ranges(triple_splits) + + split_points = + for( + {_, :split, name, v} <- all_emissions, + name in [:blank_split, :keyword_split], + not inside_protected?(v, protected_ranges), + do: v + ) + |> Enum.concat(triple_splits) + |> Enum.uniq() + |> Enum.sort() + + tokens + |> split_at(split_points) + |> Enum.reject(fn s -> Enum.empty?(s) or whitespace_only?(s) end) + |> merge_same_line_slices() + |> Enum.map(&parse_block(&1, lang_mod)) + end + + @spec 
language_module_for_path(String.t()) :: module() + def language_module_for_path(path), do: Language.detect(path) + + @spec language_from_path(String.t()) :: atom() + def language_from_path(path), + do: path |> Language.detect() |> then(& &1.name()) |> String.to_atom() + + # Recursively parse a token stream into a Node with nested children. + # Immediately creates a node spanning the whole stream, then attempts to + # subdivide it. Terminates when no strictly-smaller sub-candidates are found. + defp parse_block(tokens, lang_mod) do + start_line = block_start_line(tokens) + end_line = block_end_line(tokens) + line_count = if start_line && end_line, do: end_line - start_line + 1, else: 1 + + block = %Node{ + tokens: tokens, + line_count: line_count, + children: [], + start_line: start_line, + end_line: end_line + } + + case find_sub_candidates(tokens, lang_mod) do + [] -> + block + + candidates -> + children = Enum.map(candidates, &parse_block(&1, lang_mod)) + %{block | children: children} + end + end + + # Collect enclosure regions from rules. + # + # If the token stream is itself a bracket pair (e.g. the stream IS `(foo, bar)`), + # we unwrap the outer brackets before running rules. Without this, BracketRule + # would only find the whole stream as a single enclosure — filtered by the + # idempotency check — and recursion would stop prematurely at every bracket level. + # Unwrapping lets us see the *inner* structure and keeps the tree growing deeper. + # + # Idempotency check: after unwrapping, reject any enclosure that still spans the + # entire search window (0..n-1), which would produce an infinite loop. 
# Returns the token sub-streams (enclosures) that should become children of
# the node spanning `tokens`. An empty list means the node is a leaf.
defp find_sub_candidates(tokens, lang_mod) do
  {search_tokens, _} = maybe_unwrap_bracket(tokens)
  n = length(search_tokens)

  enclosure_signals =
    if lang_mod.uses_colon_indent?() do
      [%BracketSignal{}, %ColonIndentSignal{}]
    else
      [%BracketSignal{}]
    end

  search_tokens
  |> SignalStream.run(enclosure_signals, lang_mod)
  |> List.flatten()
  |> Enum.filter(fn {_, group, _, _} -> group == :enclosure end)
  |> Enum.map(fn {_, _, _, {s, e}} -> {s, e} end)
  |> Enum.uniq()
  |> Enum.sort()
  # Idempotency check: an enclosure covering the whole window would recurse forever.
  |> Enum.reject(fn {s, e} -> s == 0 and e == n - 1 end)
  # Explicit step so an inverted span yields [] (dropped below) instead of a
  # reversed-range slice; matches the `//1` ranges used by split_at/2.
  |> Enum.map(fn {s, e} -> Enum.slice(search_tokens, s..e//1) end)
  |> Enum.reject(&whitespace_only?/1)
end

@open_brackets MapSet.new(["(", "[", "{"])
@matching_close %{"(" => ")", "[" => "]", "{" => "}"}

# If the stream is a single balanced bracket pair — i.e. the bracket opened by
# the first token is closed by the *last* token — return `{inner_tokens, 1}`.
# Otherwise return `{tokens, 0}` unchanged.
#
# Comparing only the first and last kinds is not enough: `(a) + (b)` starts
# with "(" and ends with ")" but unwrapping it would produce the unbalanced
# stream `a) + (b`. The depth scan below rejects that case.
defp maybe_unwrap_bracket([first | rest] = tokens) do
  close = Map.get(@matching_close, first.kind)

  if MapSet.member?(@open_brackets, first.kind) and
       closes_only_at_end?(rest, first.kind, close, 1) do
    {Enum.drop(rest, -1), 1}
  else
    {tokens, 0}
  end
end

defp maybe_unwrap_bracket([]), do: {[], 0}

# True when the bracket opened before `tokens` (depth starts at 1) returns to
# depth 0 exactly at the final token. Only brackets of the same type as the
# opener are counted.
defp closes_only_at_end?([], _open, _close, _depth), do: false

defp closes_only_at_end?([%{kind: kind}], _open, close, depth),
  do: kind == close and depth == 1

defp closes_only_at_end?([%{kind: kind} | rest], open, close, depth) do
  cond do
    kind == open -> closes_only_at_end?(rest, open, close, depth + 1)
    # The outer bracket closed before the end of the stream.
    kind == close and depth == 1 -> false
    kind == close -> closes_only_at_end?(rest, open, close, depth - 1)
    true -> closes_only_at_end?(rest, open, close, depth)
  end
end

# Pairs consecutive triple-quote split indices into protected interior ranges.
# Uses chunk_every with :discard to safely handle odd counts (malformed input).
defp compute_protected_ranges(split_indices) do
  split_indices
  |> Enum.chunk_every(2, 2, :discard)
  |> Enum.map(fn [a, b] -> {a + 1, b - 1} end)
end

# True when `idx` falls inside any inclusive `{lo, hi}` range.
defp inside_protected?(idx, ranges) do
  Enum.any?(ranges, fn {lo, hi} -> idx >= lo and idx <= hi end)
end

# When TripleQuoteSignal splits `@doc """` mid-line, the tokens before the
# triple-quote land in one slice and the heredoc in the next — both on the same
# starting line. Merge adjacent slices that share a line boundary so `@doc """..."""`
# becomes a single token stream fed to parse_block rather than two separate nodes.
+ defp merge_same_line_slices([]), do: [] + defp merge_same_line_slices([single]), do: [single] + + defp merge_same_line_slices([slice_a, slice_b | rest]) do + last_line_a = + slice_a + |> Enum.reverse() + |> Enum.find(&(&1.kind not in [WhitespaceToken.kind(), NewlineToken.kind()])) + |> then(&(&1 && &1.line)) + + first_line_b = + slice_b + |> Enum.find(&(&1.kind not in [WhitespaceToken.kind(), NewlineToken.kind()])) + |> then(&(&1 && &1.line)) + + if last_line_a && first_line_b && last_line_a == first_line_b do + merge_same_line_slices([slice_a ++ slice_b | rest]) + else + [slice_a | merge_same_line_slices([slice_b | rest])] + end + end + + defp split_at(tokens, []), do: [tokens] + + defp split_at(tokens, split_points) do + boundaries = [0 | split_points] ++ [length(tokens)] + + boundaries + |> Enum.chunk_every(2, 1, :discard) + |> Enum.map(fn [start, stop] -> Enum.slice(tokens, start..(stop - 1)//1) end) + end + + defp whitespace_only?(tokens) do + Enum.all?(tokens, &(&1.kind in [WhitespaceToken.kind(), NewlineToken.kind()])) + end + + defp block_start_line([%{line: line} | _]), do: line + defp block_start_line([]), do: nil + + defp block_end_line([]), do: nil + + defp block_end_line(tokens) do + tokens + |> Enum.reverse() + |> Enum.find(&(&1.kind not in [WhitespaceToken.kind(), NewlineToken.kind()])) + |> case do + nil -> tokens |> List.last() |> Map.get(:line) + token -> token.line + end + end +end diff --git a/lib/codeqa/ast/parsing/signal.ex b/lib/codeqa/ast/parsing/signal.ex new file mode 100644 index 00000000..dc2d19ec --- /dev/null +++ b/lib/codeqa/ast/parsing/signal.ex @@ -0,0 +1,44 @@ +defprotocol CodeQA.AST.Parsing.Signal do + @moduledoc """ + Protocol for token-stream signal emitters. + + A signal is a stateful detector that receives one token at a time and emits + zero or more named values. All signals run independently over the same token + stream — each gets its own full pass, carrying its own state. 
+ + ## Protocol functions + + - `source/1` — the implementing module; used for debugging emission traces + - `group/1` — atom grouping this signal's emissions (e.g. `:split`, `:enclosure`) + - `init/2` — returns initial state; called once before the token stream starts + - `emit/3` — called per token; returns `{MapSet.t({name, value}), new_state}` + + ## State + + State is owned externally (in `SignalStream`) as a positionally-aligned list. + The signal defines the shape; the orchestrator threads it through unchanged. + + ## No-op emission + + To emit nothing for a token, return `{MapSet.new(), state}`. + """ + + @doc "The module that implements this signal — for debugging traces." + @spec source(t) :: module() + def source(signal) + + @doc "Group atom for all emissions from this signal (e.g. :split, :enclosure)." + @spec group(t) :: atom() + def group(signal) + + @doc "Returns the initial state for this signal." + @spec init(t, module()) :: term() + def init(signal, lang_mod) + + @doc """ + Called once per token. Returns a MapSet of `{name, value}` emission pairs + and the updated state. + """ + @spec emit(t, token :: term(), state :: term()) :: {MapSet.t(), term()} + def emit(signal, token, state) +end diff --git a/lib/codeqa/ast/parsing/signal_registry.ex b/lib/codeqa/ast/parsing/signal_registry.ex new file mode 100644 index 00000000..0f4a7521 --- /dev/null +++ b/lib/codeqa/ast/parsing/signal_registry.ex @@ -0,0 +1,94 @@ +defmodule CodeQA.AST.Parsing.SignalRegistry do + @moduledoc """ + Registry for structural and classification signals. + + Use `default/0` for the standard signal set. Compose custom registries + with `register_structural/2` and `register_classification/2` for + language-specific or analysis-specific configurations. 
+ """ + + alias CodeQA.AST.Signals.Structural.{ + AccessModifierSignal, + AssignmentFunctionSignal, + BlankLineSignal, + BracketSignal, + BranchSplitSignal, + ColonIndentSignal, + CommentDividerSignal, + DecoratorSignal, + DedentToZeroSignal, + DocCommentLeadSignal, + KeywordSignal, + SQLBlockSignal, + TripleQuoteSignal + } + + alias CodeQA.AST.Signals.Classification.{ + AttributeSignal, + CommentDensitySignal, + ConfigSignal, + DataSignal, + DocSignal, + FunctionSignal, + ImportSignal, + ModuleSignal, + TestSignal, + TypeSignal + } + + defstruct structural: [], classification: [] + + @type t :: %__MODULE__{ + structural: [term()], + classification: [term()] + } + + @spec new() :: t() + def new, do: %__MODULE__{} + + @spec register_structural(t(), term()) :: t() + def register_structural(%__MODULE__{} = r, signal), + do: %{r | structural: r.structural ++ [signal]} + + @spec register_classification(t(), term()) :: t() + def register_classification(%__MODULE__{} = r, signal), + do: %{r | classification: r.classification ++ [signal]} + + @spec default() :: t() + def default do + new() + |> register_structural(%TripleQuoteSignal{}) + |> register_structural(%BlankLineSignal{}) + |> register_structural(%KeywordSignal{}) + |> register_structural(%AccessModifierSignal{}) + |> register_structural(%DecoratorSignal{}) + |> register_structural(%CommentDividerSignal{}) + |> register_structural(%DocCommentLeadSignal{}) + |> register_structural(%AssignmentFunctionSignal{}) + |> register_structural(%DedentToZeroSignal{}) + |> register_structural(%BranchSplitSignal{}) + |> register_structural(%BracketSignal{}) + |> register_classification(%DocSignal{}) + |> register_classification(%TestSignal{}) + |> register_classification(%FunctionSignal{}) + |> register_classification(%ModuleSignal{}) + |> register_classification(%ImportSignal{}) + |> register_classification(%AttributeSignal{}) + |> register_classification(%TypeSignal{}) + |> register_classification(%ConfigSignal{}) + |> 
register_classification(%DataSignal{}) + |> register_classification(%CommentDensitySignal{}) + end + + @spec python() :: t() + def python do + r = default() + %{r | structural: r.structural ++ [%ColonIndentSignal{}]} + end + + @spec sql() :: t() + def sql do + r = default() + %{r | structural: r.structural ++ [%SQLBlockSignal{}]} + end +end diff --git a/lib/codeqa/ast/parsing/signal_stream.ex b/lib/codeqa/ast/parsing/signal_stream.ex new file mode 100644 index 00000000..8b6f4519 --- /dev/null +++ b/lib/codeqa/ast/parsing/signal_stream.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.AST.Parsing.SignalStream do + @moduledoc """ + Runs a list of `Signal` implementations over a token stream. + + Each signal receives its own full pass over the token stream and accumulates + its own state. Signals are independent — no shared state, no cross-signal + coordination. + + ## Return value + + Returns a list of emission lists, one per signal, in the same order as the + input signal list. Each emission is a 4-tuple: + + {source, group, name, value} + + ## Usage + + SignalStream.run(tokens, [%BlankLineSignal{}, %KeywordSignal{}], []) + # => [[{BlankLineSignal, :split, :blank_split, 5}, ...], [...]] + """ + + alias CodeQA.AST.Parsing.Signal + + @spec run([term()], [term()], module()) :: [list()] + def run(tokens, signals, lang_mod) do + prevs = [nil | tokens] + nexts = Enum.drop(tokens, 1) ++ [nil] + triples = Enum.zip_with([prevs, tokens, nexts], fn [p, c, n] -> {p, c, n} end) + + Enum.map(signals, fn signal -> + init_state = Signal.init(signal, lang_mod) + source = Signal.source(signal) + group = Signal.group(signal) + + {_final_state, emissions} = + Enum.reduce_while(triples, {init_state, []}, fn triple, {state, acc} -> + emit_step(signal, triple, state, acc, source, group) + end) + + Enum.reverse(emissions) + end) + end + + defp emit_step(signal, triple, state, acc, source, group) do + {emitted, new_state} = Signal.emit(signal, triple, state) + + new_acc = + emitted + |> Enum.map(fn 
{name, value} -> {source, group, name, value} end) + |> Enum.reduce(acc, fn e, a -> [e | a] end) + + if new_state == :halt do + {:halt, {new_state, new_acc}} + else + {:cont, {new_state, new_acc}} + end + end +end diff --git a/lib/codeqa/ast/signals/classification/attribute_signal.ex b/lib/codeqa/ast/signals/classification/attribute_signal.ex new file mode 100644 index 00000000..aaaa6403 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/attribute_signal.ex @@ -0,0 +1,68 @@ +defmodule CodeQA.AST.Signals.Classification.AttributeSignal do + @moduledoc """ + Classification signal — votes `:attribute` when an `@identifier` pattern + appears at indent 0. + + Weights: + - 3 for Elixir typespec attributes (@spec, @type, @typep, @opaque, @callback, @macrocallback) + - 2 for all other @name attributes + + Skips @doc and @moduledoc — those nodes contain tokens and are handled by DocSignal. + Emits at most one vote per token stream. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + @typespec_attrs MapSet.new(~w[spec type typep opaque callback macrocallback]) + @skip_attrs MapSet.new(~w[doc moduledoc]) + + def source(_), do: CodeQA.AST.Signals.Classification.AttributeSignal + def group(_), do: :classification + + def init(_, _lang_mod), + do: %{at_line_start: true, indent: 0, saw_at: false, voted: false} + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit(_, {_prev, token, _next}, %{at_line_start: als, indent: ind, saw_at: saw_at} = state) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0, saw_at: false}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + "@" when ind == 0 -> + {MapSet.new(), %{state | saw_at: true, at_line_start: false}} + + "" when saw_at -> + emit_attribute(token.content, state) + + _ -> + 
{MapSet.new(), %{state | saw_at: false, at_line_start: false}} + end + end + + defp emit_attribute(name, state) do + base_state = %{state | saw_at: false, at_line_start: false, voted: true} + + cond do + MapSet.member?(@skip_attrs, name) -> + # @doc/@moduledoc: let DocSignal handle via tokens + {MapSet.new(), base_state} + + MapSet.member?(@typespec_attrs, name) -> + {MapSet.new([{:attribute_vote, 3}]), base_state} + + true -> + {MapSet.new([{:attribute_vote, 2}]), base_state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/comment_density_signal.ex b/lib/codeqa/ast/signals/classification/comment_density_signal.ex new file mode 100644 index 00000000..ceb4c23a --- /dev/null +++ b/lib/codeqa/ast/signals/classification/comment_density_signal.ex @@ -0,0 +1,64 @@ +defmodule CodeQA.AST.Signals.Classification.CommentDensitySignal do + @moduledoc """ + Classification signal — votes `:comment` when more than 60% of non-blank + lines begin with a comment prefix. + + Requires `comment_prefixes: [String.t()]` in opts (from the language + module). Returns no vote if no prefixes are configured. + + Emits at the end of the stream. 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + def source(_), do: CodeQA.AST.Signals.Classification.CommentDensitySignal + def group(_), do: :classification + + def init(_, lang_mod) do + prefixes = MapSet.new(lang_mod.comment_prefixes()) + %{prefixes: prefixes, at_line_start: true, comment_lines: 0, total_lines: 0} + end + + def emit(_, {_prev, token, next}, state) do + %{prefixes: prefixes, at_line_start: als} = state + + state = + case token.kind do + @nl -> + %{state | at_line_start: true} + + @ws -> + state + + _ when als -> + is_comment = MapSet.member?(prefixes, token.content) + + %{ + state + | at_line_start: false, + total_lines: state.total_lines + 1, + comment_lines: state.comment_lines + if(is_comment, do: 1, else: 0) + } + + _ -> + %{state | at_line_start: false} + end + + maybe_emit_vote(next, prefixes, state) + end + + defp maybe_emit_vote(nil, prefixes, state) + when map_size(prefixes) > 0 and state.total_lines > 0 do + if state.comment_lines / state.total_lines > 0.6 do + {MapSet.new([{:comment_vote, 2}]), :halt} + else + {MapSet.new(), state} + end + end + + defp maybe_emit_vote(_next, _prefixes, state), do: {MapSet.new(), state} + end +end diff --git a/lib/codeqa/ast/signals/classification/config_signal.ex b/lib/codeqa/ast/signals/classification/config_signal.ex new file mode 100644 index 00000000..43b58728 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/config_signal.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.AST.Signals.Classification.ConfigSignal do + @moduledoc """ + Classification signal — votes `:config` when a configuration keyword + appears at indent 0 and bracket depth 0. + + Matches `config` (Elixir Mix.Config), `configure`, `settings`, `options`, + `defaults`. Emits at most one vote. 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + @config_keywords MapSet.new(["config", "configure", "settings", "options", "defaults"]) + def source(_), do: CodeQA.AST.Signals.Classification.ConfigSignal + def group(_), do: :classification + + def init(_, _lang_mod), + do: %{at_line_start: true, indent: 0, bracket_depth: 0, is_first: true} + + def emit(_, {_prev, token, _next}, state) do + %{at_line_start: als, indent: ind, bracket_depth: bd, is_first: first} = state + + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + v when v in ["(", "[", "{"] -> + {MapSet.new(), %{state | bracket_depth: bd + 1, at_line_start: false, is_first: false}} + + v when v in [")", "]", "}"] -> + _ = v + + {MapSet.new(), + %{state | bracket_depth: max(0, bd - 1), at_line_start: false, is_first: false}} + + _ -> + emit_content_token(token, state, ind, bd, first) + end + end + + defp emit_content_token(token, state, ind, bd, first) do + if ind == 0 and bd == 0 and MapSet.member?(@config_keywords, token.content) do + weight = if first, do: 3, else: 1 + {MapSet.new([{:config_vote, weight}]), :halt} + else + {MapSet.new(), %{state | at_line_start: false, is_first: false}} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/data_signal.ex b/lib/codeqa/ast/signals/classification/data_signal.ex new file mode 100644 index 00000000..1d6aa773 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/data_signal.ex @@ -0,0 +1,67 @@ +defmodule CodeQA.AST.Signals.Classification.DataSignal do + @moduledoc """ + Classification signal — votes `:data` when a token stream consists primarily + of literal values (``, ``) with no control-flow keywords. + + Emits at the end of the stream (when `next == nil`). 
Votes only when + literal ratio > 0.6 and no control-flow keywords were seen. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @str CodeQA.AST.Lexing.StringToken.kind() + @control_flow MapSet.new([ + "if", + "else", + "elsif", + "elif", + "unless", + "for", + "while", + "do", + "case", + "when", + "cond", + "switch", + "loop", + "until" + ]) + def source(_), do: CodeQA.AST.Signals.Classification.DataSignal + def group(_), do: :classification + + def init(_, _lang_mod), + do: %{literal_count: 0, id_count: 0, has_control_flow: false} + + def emit(_, {_prev, token, next}, state) do + state = + case token.kind do + kind when kind in [@str, ""] -> + %{state | literal_count: state.literal_count + 1} + + "" -> + if MapSet.member?(@control_flow, token.content) do + %{state | has_control_flow: true, id_count: state.id_count + 1} + else + %{state | id_count: state.id_count + 1} + end + + _ -> + state + end + + if next == nil do + total = state.literal_count + state.id_count + + if total > 0 and not state.has_control_flow and + state.literal_count / total > 0.6 do + {MapSet.new([{:data_vote, 2}]), :halt} + else + {MapSet.new(), state} + end + else + {MapSet.new(), state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/doc_signal.ex b/lib/codeqa/ast/signals/classification/doc_signal.ex new file mode 100644 index 00000000..615cf55c --- /dev/null +++ b/lib/codeqa/ast/signals/classification/doc_signal.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.AST.Signals.Classification.DocSignal do + @moduledoc """ + Classification signal — votes `:doc` when a `` (triple-quoted string) token + is found anywhere in the node's token stream. + + Weight: 3 (unambiguous — triple-quoted strings are documentation). + Emits at most one vote per token stream. 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @doc_kind CodeQA.AST.Lexing.StringToken.doc_kind() + def source(_), do: CodeQA.AST.Signals.Classification.DocSignal + def group(_), do: :classification + + def init(_, _lang_mod), do: %{voted: false} + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit(_, {_prev, token, _next}, state) do + if token.kind == @doc_kind do + {MapSet.new([{:doc_vote, 3}]), %{state | voted: true}} + else + {MapSet.new(), state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/function_signal.ex b/lib/codeqa/ast/signals/classification/function_signal.ex new file mode 100644 index 00000000..62d3f487 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/function_signal.ex @@ -0,0 +1,76 @@ +defmodule CodeQA.AST.Signals.Classification.FunctionSignal do + @moduledoc """ + Classification signal — votes `:function` when a function definition keyword + appears at indent 0 and bracket depth 0. + + Weights: + - 3 when it is the first content token of the block (strong match) + - 1 when found later in the block (weak match, e.g. after a leading comment) + + Does NOT include module/class/namespace keywords (handled by ModuleSignal) or + test macros like `test`/`describe` (handled by TestSignal). + Emits at most one vote per token stream. 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + def source(_), do: CodeQA.AST.Signals.Classification.FunctionSignal + def group(_), do: :classification + + def init(_, lang_mod) do + %{ + at_line_start: true, + indent: 0, + bracket_depth: 0, + is_first: true, + voted: false, + keywords: CodeQA.Language.function_keywords(lang_mod) + } + end + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit( + _, + {_prev, token, _next}, + %{at_line_start: als, indent: ind, bracket_depth: bd, is_first: first} = state + ) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + v when v in ["(", "[", "{"] -> + {MapSet.new(), %{state | bracket_depth: bd + 1, is_first: false, at_line_start: false}} + + v when v in [")", "]", "}"] -> + _ = v + + {MapSet.new(), + %{state | bracket_depth: max(0, bd - 1), is_first: false, at_line_start: false}} + + _ -> + emit_content_token(token, state, ind, bd, first) + end + end + + defp emit_content_token(token, state, ind, bd, first) do + base_state = %{state | is_first: false, at_line_start: false} + + if ind == 0 and bd == 0 and MapSet.member?(state.keywords, token.content) do + weight = if first, do: 3, else: 1 + {MapSet.new([{:function_vote, weight}]), %{base_state | voted: true}} + else + {MapSet.new(), base_state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/import_signal.ex b/lib/codeqa/ast/signals/classification/import_signal.ex new file mode 100644 index 00000000..e27ed8a8 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/import_signal.ex @@ -0,0 +1,66 @@ +defmodule CodeQA.AST.Signals.Classification.ImportSignal do + @moduledoc """ + Classification signal — votes `:import` when an import/require/use/alias 
keyword + appears at indent 0. + + Weights: + - 3 when it is the first content token of the block (strong match) + - 1 when found later in the block + + Covers: Elixir (import, require, use, alias), Python (import, from), + JavaScript/Go (import, package), C# (using), Ruby/Lua (require, include). + Emits at most one vote per token stream. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + def source(_), do: CodeQA.AST.Signals.Classification.ImportSignal + def group(_), do: :classification + + def init(_, lang_mod) do + %{ + at_line_start: true, + indent: 0, + is_first: true, + voted: false, + keywords: CodeQA.Language.import_keywords(lang_mod) + } + end + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit( + _, + {_prev, token, _next}, + %{at_line_start: als, indent: ind, is_first: first} = state + ) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + _ -> + emit_content_token(token, state, ind, first) + end + end + + defp emit_content_token(token, state, ind, first) do + base_state = %{state | is_first: false, at_line_start: false} + + if ind == 0 and MapSet.member?(state.keywords, token.content) do + weight = if first, do: 3, else: 1 + {MapSet.new([{:import_vote, weight}]), %{base_state | voted: true}} + else + {MapSet.new(), base_state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/module_signal.ex b/lib/codeqa/ast/signals/classification/module_signal.ex new file mode 100644 index 00000000..4e9ca98e --- /dev/null +++ b/lib/codeqa/ast/signals/classification/module_signal.ex @@ -0,0 +1,75 @@ +defmodule CodeQA.AST.Signals.Classification.ModuleSignal do + @moduledoc """ + Classification signal — votes `:module` when a module/class/namespace 
definition + keyword appears at indent 0 and bracket depth 0. + + Weights: + - 3 when it is the first content token of the block (strong match) + - 1 when found later in the block + + Keyword set is disjoint from FunctionSignal and TestSignal to avoid conflicts. + Emits at most one vote per token stream. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + def source(_), do: CodeQA.AST.Signals.Classification.ModuleSignal + def group(_), do: :classification + + def init(_, lang_mod) do + %{ + at_line_start: true, + indent: 0, + bracket_depth: 0, + is_first: true, + voted: false, + keywords: CodeQA.Language.module_keywords(lang_mod) + } + end + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit( + _, + {_prev, token, _next}, + %{at_line_start: als, indent: ind, bracket_depth: bd, is_first: first} = state + ) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + v when v in ["(", "[", "{"] -> + {MapSet.new(), %{state | bracket_depth: bd + 1, is_first: false, at_line_start: false}} + + v when v in [")", "]", "}"] -> + _ = v + + {MapSet.new(), + %{state | bracket_depth: max(0, bd - 1), is_first: false, at_line_start: false}} + + _ -> + emit_content_token(token, state, ind, bd, first) + end + end + + defp emit_content_token(token, state, ind, bd, first) do + base_state = %{state | is_first: false, at_line_start: false} + + if ind == 0 and bd == 0 and MapSet.member?(state.keywords, token.content) do + weight = if first, do: 3, else: 1 + {MapSet.new([{:module_vote, weight}]), %{base_state | voted: true}} + else + {MapSet.new(), base_state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/test_signal.ex b/lib/codeqa/ast/signals/classification/test_signal.ex 
new file mode 100644 index 00000000..de6abe50 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/test_signal.ex @@ -0,0 +1,67 @@ +defmodule CodeQA.AST.Signals.Classification.TestSignal do + @moduledoc """ + Classification signal — votes `:test` when a test block keyword appears at + indent 0. + + Weights: + - 3 when it is the first content token of the block (strong match) + - 1 when found later in the block + + Covers: ExUnit (test, describe), RSpec/Jest/Mocha (it, context, describe), + Cucumber (scenario, given, feature). `test` takes priority over + FunctionSignal — Elixir test macros look like function calls but are test blocks. + Emits at most one vote per token stream. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + def source(_), do: CodeQA.AST.Signals.Classification.TestSignal + def group(_), do: :classification + + def init(_, lang_mod) do + %{ + at_line_start: true, + indent: 0, + is_first: true, + voted: false, + keywords: CodeQA.Language.test_keywords(lang_mod) + } + end + + def emit(_, _, %{voted: true} = state), do: {MapSet.new(), state} + + def emit( + _, + {_prev, token, _next}, + %{at_line_start: als, indent: ind, is_first: first} = state + ) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0}} + + @ws when als -> + {MapSet.new(), %{state | indent: ind + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + _ -> + emit_content_token(token, state, ind, first) + end + end + + defp emit_content_token(token, state, ind, first) do + base_state = %{state | is_first: false, at_line_start: false} + + if ind == 0 and MapSet.member?(state.keywords, token.content) do + weight = if first, do: 3, else: 1 + {MapSet.new([{:test_vote, weight}]), %{base_state | voted: true}} + else + {MapSet.new(), base_state} + end + end + end +end diff --git a/lib/codeqa/ast/signals/classification/type_signal.ex 
b/lib/codeqa/ast/signals/classification/type_signal.ex new file mode 100644 index 00000000..fc4440f5 --- /dev/null +++ b/lib/codeqa/ast/signals/classification/type_signal.ex @@ -0,0 +1,53 @@ +defmodule CodeQA.AST.Signals.Classification.TypeSignal do + @moduledoc """ + Classification signal — votes `:type` when an Elixir type definition + attribute (`@type`, `@typep`, `@opaque`) appears at indent 0. + + Emits at most one vote. Complements `AttributeSignal`, which handles + `@spec`, `@doc`, and other attributes. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @nl CodeQA.AST.Lexing.NewlineToken.kind() + @ws CodeQA.AST.Lexing.WhitespaceToken.kind() + @type_attrs MapSet.new(["type", "typep", "opaque"]) + def source(_), do: CodeQA.AST.Signals.Classification.TypeSignal + def group(_), do: :classification + + def init(_, _lang_mod), + do: %{at_line_start: true, indent: 0, saw_at: false, is_first: true} + + def emit(_, {_prev, token, _next}, state) do + case token.kind do + @nl -> + {MapSet.new(), %{state | at_line_start: true, indent: 0, saw_at: false}} + + @ws when state.at_line_start -> + {MapSet.new(), %{state | indent: state.indent + 1, at_line_start: true}} + + @ws -> + {MapSet.new(), state} + + "@" when state.indent == 0 -> + {MapSet.new(), %{state | saw_at: true, at_line_start: false}} + + _ when state.saw_at and state.indent == 0 -> + emit_after_at(token, state) + + _ -> + {MapSet.new(), %{state | saw_at: false, is_first: false, at_line_start: false}} + end + end + + defp emit_after_at(token, state) do + if MapSet.member?(@type_attrs, token.content) do + weight = if state.is_first, do: 3, else: 1 + {MapSet.new([{:type_vote, weight}]), :halt} + else + {MapSet.new(), %{state | saw_at: false, is_first: false, at_line_start: false}} + end + end + end +end diff --git a/lib/codeqa/ast/signals/structural/access_modifier_signal.ex b/lib/codeqa/ast/signals/structural/access_modifier_signal.ex new file mode 100644 index 00000000..43ed0687 --- /dev/null +++ 
b/lib/codeqa/ast/signals/structural/access_modifier_signal.ex @@ -0,0 +1,80 @@ +defmodule CodeQA.AST.Signals.Structural.AccessModifierSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:access_modifier_split` when an access modifier keyword appears at line + start with bracket_depth == 0. + + Unlike `KeywordSignal`, this does NOT require indentation level 0, so it + detects class members inside bracket enclosures (e.g. `public void foo()` inside + a `class Foo { ... }` body). + + When `opts[:language_module]` is set, uses that language's + `access_modifiers/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.AccessModifierSignal + def group(_), do: :split + + def init(_, lang_mod) do + modifiers = CodeQA.Language.access_modifiers(lang_mod) + %{idx: 0, bracket_depth: 0, at_line_start: true, seen_content: false, modifiers: modifiers} + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: bd + 1, + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: max(0, bd - 1), + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, seen_content: true, at_line_start: false} + + 
emissions = + if modifier_split?(state, token), + do: MapSet.new([{:access_modifier_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + defp modifier_split?( + %{seen_content: true, bracket_depth: 0, at_line_start: true, modifiers: m}, + %{content: c} + ), + do: MapSet.member?(m, c) + + defp modifier_split?(_, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/assignment_function_signal.ex b/lib/codeqa/ast/signals/structural/assignment_function_signal.ex new file mode 100644 index 00000000..a778d55b --- /dev/null +++ b/lib/codeqa/ast/signals/structural/assignment_function_signal.ex @@ -0,0 +1,135 @@ +defmodule CodeQA.AST.Signals.Structural.AssignmentFunctionSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:assignment_function_split` when a top-level assignment to a function + is detected at indent 0 and bracket depth 0. + + Covers patterns such as: + - `identifier = function(...) {}` + - `identifier = async function(...) {}` + - `identifier = (...) 
=> {}` + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.AssignmentFunctionSignal + def group(_), do: :split + + def init(_, _lang_mod) do + %{ + idx: 0, + indent: 0, + bracket_depth: 0, + at_line_start: true, + seen_content: false, + phase: :idle + } + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: 0, at_line_start: true, phase: :idle}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, indent: i, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: i + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd, phase: phase} = state) + when k in ["(", "[", "{"] do + new_bd = bd + 1 + new_phase = advance_phase_open(phase, k) + + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: new_bd, + at_line_start: false, + seen_content: true, + phase: new_phase + }} + end + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd, phase: phase} = state) + when k in [")", "]", "}"] do + new_bd = max(0, bd - 1) + new_phase = advance_phase_close(phase, k) + + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: new_bd, + at_line_start: false, + seen_content: true, + phase: new_phase + }} + end + + def emit( + _, + {_, token, _}, + %{ + idx: idx, + seen_content: sc, + indent: i, + bracket_depth: bd, + at_line_start: als, + phase: phase + } = state + ) do + {emissions, new_phase} = advance_phase(phase, token, idx, sc, i, bd, als) + + {emissions, + %{state | idx: idx + 1, at_line_start: false, seen_content: true, phase: new_phase}} + end + + defp advance_phase_open({:in_parens, id_idx, pd}, "("), do: {:in_parens, id_idx, pd + 1} + defp advance_phase_open({:in_parens, id_idx, pd}, _), do: {:in_parens, id_idx, pd} + defp 
advance_phase_open({:saw_eq, id_idx}, "("), do: {:in_parens, id_idx, 1} + defp advance_phase_open(_, _), do: :idle + + defp advance_phase_close({:in_parens, id_idx, 1}, ")"), do: {:saw_close_paren, id_idx} + + defp advance_phase_close({:in_parens, id_idx, pd}, ")") when pd > 1, + do: {:in_parens, id_idx, pd - 1} + + defp advance_phase_close({:in_parens, id_idx, pd}, _), do: {:in_parens, id_idx, pd} + defp advance_phase_close(_, _), do: :idle + + defp advance_phase(:idle, %{kind: ""}, idx, true, 0, 0, true), + do: {MapSet.new(), {:saw_id, idx}} + + defp advance_phase(:idle, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase({:saw_id, id_idx}, %{kind: "="}, _, _, _, _, _), + do: {MapSet.new(), {:saw_eq, id_idx}} + + defp advance_phase({:saw_id, _}, %{kind: ""}, idx, _, _, _, _), + do: {MapSet.new(), {:saw_id, idx}} + + defp advance_phase({:saw_id, id_idx}, %{kind: "."}, _, _, _, _, _), + do: {MapSet.new(), {:saw_id, id_idx}} + + defp advance_phase({:saw_id, _}, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase({:saw_eq, id_idx}, %{kind: "", content: "function"}, _, _, _, _, _), + do: {MapSet.new([{:assignment_function_split, id_idx}]), :idle} + + defp advance_phase({:saw_eq, id_idx}, %{kind: "", content: "async"}, _, _, _, _, _), + do: {MapSet.new(), {:saw_eq, id_idx}} + + defp advance_phase({:saw_eq, _}, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase({:saw_close_paren, id_idx}, %{kind: "=>"}, _, _, _, _, _), + do: {MapSet.new([{:assignment_function_split, id_idx}]), :idle} + + defp advance_phase({:saw_close_paren, _}, _, _, _, _, _, _), do: {MapSet.new(), :idle} + + defp advance_phase(_, _, _, _, _, _, _), do: {MapSet.new(), :idle} + end +end diff --git a/lib/codeqa/ast/signals/structural/blank_line_signal.ex b/lib/codeqa/ast/signals/structural/blank_line_signal.ex new file mode 100644 index 00000000..c484e1a1 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/blank_line_signal.ex @@ -0,0 +1,45 @@ +defmodule 
CodeQA.AST.Signals.Structural.BlankLineSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:blank_split` at the first substantive token after 2+ consecutive + blank lines that follow a known block-end token. + + When `opts[:language_module]` is set, uses that language's + `block_end_tokens/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.BlankLineSignal + def group(_), do: :split + + def init(_, lang_mod) do + tokens = CodeQA.Language.block_end_tokens(lang_mod) + %{idx: 0, nl_run: 0, seen_content: false, last_content: nil, block_end_tokens: tokens} + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx, nl_run: nl} = state), + do: {MapSet.new(), %{state | idx: idx + 1, nl_run: nl + 1}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, nl_run: 0, seen_content: true, last_content: token.content} + + emissions = + if blank_split?(state), do: MapSet.new([{:blank_split, idx}]), else: MapSet.new() + + {emissions, base} + end + + defp blank_split?(%{seen_content: true, nl_run: nl, block_end_tokens: t, last_content: lc}) + when nl >= 2, + do: MapSet.member?(t, lc) + + defp blank_split?(_), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/bracket_signal.ex b/lib/codeqa/ast/signals/structural/bracket_signal.ex new file mode 100644 index 00000000..201f66e9 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/bracket_signal.ex @@ -0,0 +1,51 @@ +defmodule CodeQA.AST.Signals.Structural.BracketSignal do + @moduledoc """ + Emits `:bracket_enclosure` for each outermost bracket pair `()`, `[]`, `{}`. + + Replaces `ParseRules.BracketRule`. 
State tracks: token index, bracket depth, + start index of current open bracket, and a stack of open bracket kinds for + mismatch detection. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @close %{")" => "(", "]" => "[", "}" => "{"} + + def source(_), do: CodeQA.AST.Signals.Structural.BracketSignal + def group(_), do: :enclosure + + def init(_, _lang_mod), do: %{idx: 0, depth: 0, start_idx: nil, stack: []} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, depth: 0, stack: stack} = state) + when k in ["(", "[", "{"], + do: {MapSet.new(), %{state | idx: idx + 1, depth: 1, start_idx: idx, stack: [k | stack]}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, depth: d, stack: stack} = state) + when k in ["(", "[", "{"], + do: {MapSet.new(), %{state | idx: idx + 1, depth: d + 1, stack: [k | stack]}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, depth: d, stack: [top | rest]} = state) + when k in [")", "]", "}"] do + base = %{state | idx: idx + 1} + + if @close[k] == top, + do: close_match(base, d, state.start_idx, idx, rest), + else: {MapSet.new(), base} + end + + def emit(_, {_, %{kind: k}, _}, %{idx: idx} = state) when k in [")", "]", "}"], + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + defp close_match(state, 1, start_idx, idx, rest), + do: + {MapSet.new([{:bracket_enclosure, {start_idx, idx}}]), + %{state | depth: 0, start_idx: nil, stack: rest}} + + defp close_match(state, d, _start_idx, _idx, rest), + do: {MapSet.new(), %{state | depth: d - 1, stack: rest}} + end +end diff --git a/lib/codeqa/ast/signals/structural/branch_split_signal.ex b/lib/codeqa/ast/signals/structural/branch_split_signal.ex new file mode 100644 index 00000000..1d6d2644 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/branch_split_signal.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.AST.Signals.Structural.BranchSplitSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias 
CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:branch_split` when a branch keyword appears at bracket depth 0 + and at least one token has been seen before it. + + Unlike `KeywordSignal`, there is no indentation constraint — branches inside + functions are intentionally split into sibling child blocks by the parser's + recursive phase. + + When `opts[:language_module]` is set, uses that language's + `branch_keywords/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.BranchSplitSignal + def group(_), do: :branch_split + + def init(_, lang_mod) do + keywords = CodeQA.Language.branch_keywords(lang_mod) + %{idx: 0, bracket_depth: 0, seen_content: false, keywords: keywords} + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: {MapSet.new(), %{state | idx: idx + 1, bracket_depth: bd + 1, seen_content: true}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{state | idx: idx + 1, bracket_depth: max(0, bd - 1), seen_content: true}} + + def emit(_, {_, token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, seen_content: true} + + emissions = + if branch_split?(state, token), do: MapSet.new([{:branch_split, idx}]), else: MapSet.new() + + {emissions, base} + end + + defp branch_split?(%{seen_content: true, bracket_depth: 0, keywords: kw}, %{content: c}), + do: MapSet.member?(kw, c) + + defp branch_split?(_, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/colon_indent_signal.ex b/lib/codeqa/ast/signals/structural/colon_indent_signal.ex new file mode 100644 index 00000000..9189b795 --- 
/dev/null +++ b/lib/codeqa/ast/signals/structural/colon_indent_signal.ex @@ -0,0 +1,83 @@ +defmodule CodeQA.AST.Signals.Structural.ColonIndentSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:colon_indent_enclosure` for colon-indented blocks (Python). + + Only active when `opts[:language_module]` returns true for `uses_colon_indent?/0`. Replaces + `ParseRules.ColonIndentationRule`. + + ## Limitation + + The original rule flushes open blocks at EOF via `close_all_open/1`. Since + `emit/3` has no end-of-stream callback, open blocks are instead flushed at + each `` token. This correctly handles single-statement blocks; multi-line + blocks are closed at the first newline (conservative). + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.ColonIndentSignal + def group(_), do: :enclosure + + def init(_, lang_mod) do + %{ + enabled: lang_mod.uses_colon_indent?(), + idx: 0, + ci: 0, + last_colon_indent: nil, + stack: [] + } + end + + def emit(_, _, %{enabled: false} = state), + do: {MapSet.new(), %{state | idx: state.idx + 1}} + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state) do + {emissions, _} = flush_stack(state.stack) + {emissions, %{state | idx: idx + 1, ci: 0, stack: []}} + end + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, ci: ci} = state), + do: {MapSet.new(), %{state | idx: idx + 1, ci: ci + 1}} + + def emit(_, {_, %{kind: ":"}, _}, %{idx: idx, ci: ci} = state), + do: {MapSet.new(), %{state | idx: idx + 1, last_colon_indent: ci}} + + def emit(_, {_, _, _}, %{idx: idx, ci: ci} = state) do + {dedent_emissions, remaining} = close_dedented(state.stack, ci) + new_stack = maybe_open_block(remaining, state.last_colon_indent, ci, idx) + + {dedent_emissions, + %{state | idx: idx + 1, last_colon_indent: nil, stack: update_top(new_stack, idx)}} + end + + defp close_dedented(stack, ci) do + {to_close, keep} = 
Enum.split_while(stack, fn e -> ci <= e.colon_indent end) + {build_emissions(to_close), keep} + end + + defp flush_stack(stack), do: {build_emissions(stack), []} + + defp maybe_open_block(stack, colon_indent, ci, idx) + when colon_indent != nil and ci > colon_indent, + do: [%{colon_indent: colon_indent, sub_start: idx, last_content_idx: idx} | stack] + + defp maybe_open_block(stack, _, _, _), do: stack + + defp build_emissions(entries) do + Enum.reduce(entries, MapSet.new(), fn + %{sub_start: s, last_content_idx: e}, acc when e != nil -> + MapSet.put(acc, {:colon_indent_enclosure, {s, e}}) + + _entry, acc -> + acc + end) + end + + defp update_top([], _idx), do: [] + defp update_top([top | rest], idx), do: [Map.put(top, :last_content_idx, idx) | rest] + end +end diff --git a/lib/codeqa/ast/signals/structural/comment_divider_signal.ex b/lib/codeqa/ast/signals/structural/comment_divider_signal.ex new file mode 100644 index 00000000..d01e5e83 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/comment_divider_signal.ex @@ -0,0 +1,76 @@ +defmodule CodeQA.AST.Signals.Structural.CommentDividerSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:comment_divider_split` when a line is a "visual divider" comment — + a comment prefix at line start followed immediately by repetitive non-word + punctuation characters. + + Used to detect section separators like `# ---`, `// ===`, `-- ---`. + No split is emitted for the first such line (seen_content must be true). + + When `opts[:language_module]` is set, uses that language's + `comment_prefixes/0` callback. 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.CommentDividerSignal + def group(_), do: :split + + def init(_, lang_mod) do + comment_prefixes = MapSet.new(lang_mod.comment_prefixes()) + divider_indicators = CodeQA.Language.divider_indicators(lang_mod) + + %{ + idx: 0, + at_line_start: true, + seen_content: false, + indent: 0, + comment_prefixes: comment_prefixes, + divider_indicators: divider_indicators + } + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true, indent: 0}} + + def emit( + _, + {_, %WhitespaceToken{}, _}, + %{idx: idx, at_line_start: true, indent: indent} = state + ), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true, indent: indent + 1}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, token, next}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, at_line_start: false, seen_content: true} + + emissions = + if divider_split?(state, token, next), + do: MapSet.new([{:comment_divider_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + defp divider_split?( + %{ + seen_content: true, + at_line_start: true, + indent: 0, + comment_prefixes: cp, + divider_indicators: di + }, + %{kind: k}, + next + ), + do: MapSet.member?(cp, k) and next != nil and MapSet.member?(di, next.kind) + + defp divider_split?(_, _, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/decorator_signal.ex b/lib/codeqa/ast/signals/structural/decorator_signal.ex new file mode 100644 index 00000000..0dc1f5be --- /dev/null +++ b/lib/codeqa/ast/signals/structural/decorator_signal.ex @@ -0,0 +1,81 @@ +defmodule CodeQA.AST.Signals.Structural.DecoratorSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:decorator_split` when a 
decorator/annotation marker appears at line + start with bracket_depth == 0. + + Detects two patterns: + - `@` at line start (Python, TypeScript, Java, Elixir decorators/annotations) + - `#[` at line start (Rust attribute syntax) + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.DecoratorSignal + def group(_), do: :split + + def init(_, _lang_mod), + do: %{idx: 0, bracket_depth: 0, at_line_start: true, seen_content: false} + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: bd + 1, + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: max(0, bd - 1), + seen_content: true, + at_line_start: false + }} + + def emit( + _, + {_, %{kind: "@"}, _}, + %{idx: idx, seen_content: true, bracket_depth: 0, at_line_start: true} = state + ), + do: + {MapSet.new([{:decorator_split, idx}]), + %{state | idx: idx + 1, seen_content: true, at_line_start: false}} + + def emit( + _, + {_, %{kind: "#"}, next}, + %{idx: idx, seen_content: true, bracket_depth: 0, at_line_start: true} = state + ) do + emissions = + if next != nil and next.kind == "[", + do: MapSet.new([{:decorator_split, idx}]), + else: MapSet.new() + + {emissions, %{state | idx: idx + 1, seen_content: true, at_line_start: false}} + end + + def emit(_, {_, _, _}, %{idx: idx} = 
state), + do: {MapSet.new(), %{state | idx: idx + 1, seen_content: true, at_line_start: false}} + end +end diff --git a/lib/codeqa/ast/signals/structural/dedent_to_zero_signal.ex b/lib/codeqa/ast/signals/structural/dedent_to_zero_signal.ex new file mode 100644 index 00000000..d644dad4 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/dedent_to_zero_signal.ex @@ -0,0 +1,87 @@ +defmodule CodeQA.AST.Signals.Structural.DedentToZeroSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:dedent_split` when code returns to indent level 0 after having been + at indent > 0 on the previous line. + + This is the primary split mechanism for Python and other indentation-significant + languages. The split fires at the first substantive token on a line that has no + leading ``, when the previous line did have leading ``. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.DedentToZeroSignal + def group(_), do: :split + + def init(_, _lang_mod) do + %{ + idx: 0, + at_line_start: true, + seen_content: false, + current_line_has_indent: false, + current_line_has_content: false, + prev_line_had_indent: false + } + end + + def emit( + _, + {_, %NewlineToken{}, _}, + %{ + idx: idx, + current_line_has_content: clhc, + current_line_has_indent: clhi, + prev_line_had_indent: plhi + } = state + ) do + new_plhi = if clhc, do: clhi, else: plhi + + {MapSet.new(), + %{ + state + | idx: idx + 1, + at_line_start: true, + prev_line_had_indent: new_plhi, + current_line_has_indent: false, + current_line_has_content: false + }} + end + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: + {MapSet.new(), + %{state | idx: idx + 1, current_line_has_indent: true, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, _, _}, %{idx: idx} = 
state) do + base = %{ + state + | idx: idx + 1, + at_line_start: false, + seen_content: true, + current_line_has_content: true + } + + emissions = + if dedent_split?(state), do: MapSet.new([{:dedent_split, idx}]), else: MapSet.new() + + {emissions, base} + end + + defp dedent_split?(%{ + at_line_start: true, + current_line_has_indent: false, + prev_line_had_indent: true, + seen_content: true + }), + do: true + + defp dedent_split?(_), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/doc_comment_lead_signal.ex b/lib/codeqa/ast/signals/structural/doc_comment_lead_signal.ex new file mode 100644 index 00000000..c5e5c4e3 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/doc_comment_lead_signal.ex @@ -0,0 +1,65 @@ +defmodule CodeQA.AST.Signals.Structural.DocCommentLeadSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:doc_comment_split` when a doc-comment opener appears at line start. + + Detects: + - `///` — Rust/C# XML doc comments: `//` token immediately followed by `/` + - `/**` — Java/JS JSDoc: `/` token at line start immediately followed by `*` + + No split is emitted for the first such line (seen_content must be true). 
+ """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.DocCommentLeadSignal + def group(_), do: :split + + def init(_, _lang_mod), do: %{idx: 0, at_line_start: true, seen_content: false} + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit( + _, + {_, %{kind: "//"}, next}, + %{idx: idx, at_line_start: true, seen_content: true} = state + ) do + base = %{state | idx: idx + 1, at_line_start: false} + + emissions = + if next != nil and next.kind == "/", + do: MapSet.new([{:doc_comment_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + def emit( + _, + {_, %{kind: "/"}, next}, + %{idx: idx, at_line_start: true, seen_content: true} = state + ) do + base = %{state | idx: idx + 1, at_line_start: false} + + emissions = + if next != nil and next.kind in ["*", "**"], + do: MapSet.new([{:doc_comment_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: false, seen_content: true}} + end +end diff --git a/lib/codeqa/ast/signals/structural/keyword_signal.ex b/lib/codeqa/ast/signals/structural/keyword_signal.ex new file mode 100644 index 00000000..c13d3cf9 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/keyword_signal.ex @@ -0,0 +1,83 @@ +defmodule CodeQA.AST.Signals.Structural.KeywordSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:keyword_split` when a declaration keyword appears at bracket depth 0 + and indentation level 0. 
+ + When `opts[:language_module]` is set, uses that language's + `declaration_keywords/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.KeywordSignal + def group(_), do: :split + + def init(_, lang_mod) do + keywords = CodeQA.Language.declaration_keywords(lang_mod) + + %{ + idx: 0, + bracket_depth: 0, + indent: 0, + at_line_start: true, + seen_content: false, + keywords: keywords + } + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: 0, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, indent: i, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, indent: i + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in ["(", "[", "{"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: bd + 1, + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, %{kind: k}, _}, %{idx: idx, bracket_depth: bd} = state) + when k in [")", "]", "}"], + do: + {MapSet.new(), + %{ + state + | idx: idx + 1, + bracket_depth: max(0, bd - 1), + seen_content: true, + at_line_start: false + }} + + def emit(_, {_, token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, seen_content: true, at_line_start: false} + + emissions = + if keyword_split?(state, token), + do: MapSet.new([{:keyword_split, idx}]), + else: MapSet.new() + + {emissions, base} + end + + defp keyword_split?(%{seen_content: true, bracket_depth: 0, indent: 0, keywords: kw}, %{ + content: c + }), + do: MapSet.member?(kw, c) + + defp keyword_split?(_, _), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/sql_block_signal.ex b/lib/codeqa/ast/signals/structural/sql_block_signal.ex new file mode 100644 index 
00000000..1e376f59 --- /dev/null +++ b/lib/codeqa/ast/signals/structural/sql_block_signal.ex @@ -0,0 +1,55 @@ +defmodule CodeQA.AST.Signals.Structural.SQLBlockSignal do + alias CodeQA.AST.Lexing.NewlineToken + alias CodeQA.AST.Lexing.WhitespaceToken + + @moduledoc """ + Emits `:sql_block_split` when a SQL DDL or DML statement keyword appears + at line start after prior content has been seen. + + Recognises uppercase and lowercase SQL statement starters: + DDL: CREATE, DROP, ALTER, TRUNCATE + DML: INSERT, UPDATE, DELETE, SELECT + Procedures/transactions: BEGIN, COMMIT, ROLLBACK, CALL, EXECUTE + + When `opts[:language_module]` is set, uses that language's + `statement_keywords/0` callback. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: CodeQA.AST.Signals.Structural.SQLBlockSignal + def group(_), do: :split + + def init(_, lang_mod) do + keywords = CodeQA.Language.statement_keywords(lang_mod) + %{idx: 0, at_line_start: true, seen_content: false, keywords: keywords} + end + + def emit(_, {_, %NewlineToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx, at_line_start: true} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: true}} + + def emit(_, {_, %WhitespaceToken{}, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + + def emit(_, {_, %{kind: ""} = token, _}, %{idx: idx} = state) do + base = %{state | idx: idx + 1, at_line_start: false, seen_content: true} + + emissions = + if sql_split?(state, token), do: MapSet.new([{:sql_block_split, idx}]), else: MapSet.new() + + {emissions, base} + end + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1, at_line_start: false, seen_content: true}} + + defp sql_split?(%{seen_content: true, at_line_start: true, keywords: kw}, %{content: c}), + do: MapSet.member?(kw, String.downcase(c)) + + defp sql_split?(_, 
_), do: false + end +end diff --git a/lib/codeqa/ast/signals/structural/triple_quote_signal.ex b/lib/codeqa/ast/signals/structural/triple_quote_signal.ex new file mode 100644 index 00000000..ac5808db --- /dev/null +++ b/lib/codeqa/ast/signals/structural/triple_quote_signal.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.AST.Signals.Structural.TripleQuoteSignal do + @moduledoc """ + Emits `:triple_split` at each `` token boundary. + + The first of each pair marks the opening of a heredoc; the second marks the + token after the closing delimiter. These split values are used by the Parser + to compute protected ranges, preventing other signals' splits from being + applied inside heredoc content. + + Replaces `ParseRules.TripleQuoteRule`. + """ + + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + @doc_kind CodeQA.AST.Lexing.StringToken.doc_kind() + def source(_), do: CodeQA.AST.Signals.Structural.TripleQuoteSignal + def group(_), do: :split + + def init(_, _lang_mod), do: %{idx: 0, inside: false} + + def emit(_, {_, %{kind: @doc_kind}, _}, %{idx: idx, inside: false} = state), + do: {MapSet.new([{:triple_split, idx}]), %{state | idx: idx + 1, inside: true}} + + def emit(_, {_, %{kind: @doc_kind}, _}, %{idx: idx, inside: true} = state), + do: {MapSet.new([{:triple_split, idx + 1}]), %{state | idx: idx + 1, inside: false}} + + def emit(_, {_, _, _}, %{idx: idx} = state), + do: {MapSet.new(), %{state | idx: idx + 1}} + end +end diff --git a/lib/codeqa/block_impact/codebase_impact.ex b/lib/codeqa/block_impact/codebase_impact.ex new file mode 100644 index 00000000..50fa5ba3 --- /dev/null +++ b/lib/codeqa/block_impact/codebase_impact.ex @@ -0,0 +1,22 @@ +defmodule CodeQA.BlockImpact.CodebaseImpact do + @moduledoc """ + Leave-one-out codebase aggregate: reconstruct file content without a target node, + replace the file in the files map, and re-run the codebase aggregate. 
+ """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.BlockImpact.FileImpact + alias CodeQA.Engine.Analyzer + + @doc """ + Returns the codebase aggregate after removing the target node from the given file. + """ + @spec compute(String.t(), String.t(), Node.t(), map()) :: map() + def compute(path, content, node, files_map) do + root_tokens = TokenNormalizer.normalize_structural(content) + reconstructed = FileImpact.reconstruct_without(root_tokens, node) + updated_files = Map.put(files_map, path, reconstructed) + Analyzer.analyze_codebase_aggregate(updated_files) + end +end diff --git a/lib/codeqa/block_impact/file_impact.ex b/lib/codeqa/block_impact/file_impact.ex new file mode 100644 index 00000000..10bd1f9f --- /dev/null +++ b/lib/codeqa/block_impact/file_impact.ex @@ -0,0 +1,46 @@ +defmodule CodeQA.BlockImpact.FileImpact do + @moduledoc """ + Leave-one-out file metrics: reconstruct file content without a target node's tokens + and return the re-run file metrics map. + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.Engine.Analyzer + + @min_tokens 10 + + @doc """ + Computes file metrics for the content with the target node's tokens removed. + + Returns `nil` if the node has fewer than `#{@min_tokens}` tokens. + Returns a raw `%{"group" => %{"key" => value}}` metrics map otherwise. 
+ """ + @spec compute(String.t(), Node.t()) :: map() | nil + def compute(_content, %Node{tokens: tokens}) when length(tokens) < @min_tokens, do: nil + + def compute(content, node) do + root_tokens = TokenNormalizer.normalize_structural(content) + reconstructed = reconstruct_without(root_tokens, node) + Analyzer.analyze_file("", reconstructed) + end + + @spec reconstruct_without([CodeQA.AST.Lexing.Token.t()], Node.t()) :: String.t() + def reconstruct_without(root_tokens, %Node{tokens: []}) do + Enum.map_join(root_tokens, "", & &1.content) + end + + def reconstruct_without(root_tokens, node) do + first = List.first(node.tokens) + + case Enum.find_index(root_tokens, fn t -> t.line == first.line and t.col == first.col end) do + nil -> + Enum.map_join(root_tokens, "", & &1.content) + + start_idx -> + end_idx = start_idx + length(node.tokens) + remaining = Enum.take(root_tokens, start_idx) ++ Enum.drop(root_tokens, end_idx) + Enum.map_join(remaining, "", & &1.content) + end + end +end diff --git a/lib/codeqa/block_impact/refactoring_potentials.ex b/lib/codeqa/block_impact/refactoring_potentials.ex new file mode 100644 index 00000000..4dcceb77 --- /dev/null +++ b/lib/codeqa/block_impact/refactoring_potentials.ex @@ -0,0 +1,145 @@ +defmodule CodeQA.BlockImpact.RefactoringPotentials do + @moduledoc """ + Computes named refactoring potentials for a code block using leave-one-out cosine deltas. + + Given baseline and without-node metrics at both file scope and codebase scope, + computes the cosine delta per behavior, merges the two scopes via max(), and + returns the top N behaviors sorted by delta descending. + + Positive delta = removing the block improved that behavior's cosine → the block + is a contributor to that anti-pattern. + """ + + alias CodeQA.CombinedMetrics.FileScorer + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.CombinedMetrics.Scorer + + @doc """ + Returns top N refactoring potentials for a code block. 
+ + ## Parameters + + - `baseline_file_cosines` — pre-computed cosines list from `SampleRunner.diagnose_aggregate/2` for the baseline file + - `without_file_metrics` — raw `%{"group" => %{"key" => val}}` with the node's tokens removed + - `baseline_codebase_cosines` — pre-computed cosines list for the full codebase baseline + - `without_codebase_agg` — `%{"group" => %{"mean_key" => val}}` with the node removed from the codebase + + ## Options + + - `:top` — number of potentials to return (default 3) + + ## Result shape + + [%{"category" => "function_design", "behavior" => "cyclomatic_complexity_under_10", "cosine_delta" => 0.41}] + """ + @spec compute([map()], map(), [map()], map(), keyword()) :: [map()] + def compute( + baseline_file_cosines, + without_file_metrics, + baseline_codebase_cosines, + without_codebase_agg, + opts \\ [] + ) do + top_n = Keyword.get(opts, :top, 3) + language = Keyword.get(opts, :language) + languages = Keyword.get(opts, :languages) + behavior_map = Keyword.get(opts, :behavior_map) + block_type = Keyword.get(opts, :block_type) + + file_delta = + compute_file_delta(baseline_file_cosines, without_file_metrics, language, behavior_map) + + codebase_delta = + compute_codebase_delta( + baseline_codebase_cosines, + without_codebase_agg, + languages, + behavior_map + ) + + all_keys = Enum.uniq(Map.keys(file_delta) ++ Map.keys(codebase_delta)) + + all_keys + |> Enum.reject(fn {category, behavior} -> + excluded?(category, behavior, block_type, behavior_map) + end) + |> Enum.map(fn {category, behavior} -> + file_d = Map.get(file_delta, {category, behavior}, 0.0) + codebase_d = Map.get(codebase_delta, {category, behavior}, 0.0) + merged = max(file_d, codebase_d) + {category, behavior, merged} + end) + |> Enum.sort_by(fn {_, _, delta} -> delta end, :desc) + |> Enum.take(top_n) + |> Enum.map(fn {category, behavior, delta} -> + %{ + "category" => category, + "behavior" => behavior, + "cosine_delta" => Float.round(delta / 1.0, 4) + } + end) + end + + 
defp compute_file_delta(baseline_cosines, without_metrics, language, behavior_map) do + without_agg = FileScorer.file_to_aggregate(without_metrics) + + without_cosines = + SampleRunner.diagnose_aggregate(without_agg, + top: 99_999, + language: language, + behavior_map: behavior_map + ) + + cosines_to_delta(baseline_cosines, without_cosines) + end + + defp compute_codebase_delta(baseline_cosines, without_agg, languages, behavior_map) do + without_cosines = + SampleRunner.diagnose_aggregate(without_agg, + top: 99_999, + languages: languages, + behavior_map: behavior_map + ) + + cosines_to_delta(baseline_cosines, without_cosines) + end + + defp cosines_to_delta(baseline_cosines, without_cosines) do + without_map = + Map.new(without_cosines, fn %{category: c, behavior: b, cosine: cos} -> {{c, b}, cos} end) + + Map.new(baseline_cosines, fn %{category: c, behavior: b, cosine: cos} -> + without_cos = Map.get(without_map, {c, b}, 0.0) + {{c, b}, without_cos - cos} + end) + end + + defp excluded?(_category, _behavior, nil, _behavior_map), do: false + + defp excluded?(category, behavior, block_type, behavior_map) do + Atom.to_string(block_type) in excludes_for(category, behavior, behavior_map) + end + + defp excludes_for(category, behavior, behavior_map) when is_map(behavior_map) do + with [_ | _] = behaviors <- Map.get(behavior_map, category, []), + {^behavior, data} <- Enum.find(behaviors, fn {b, _} -> b == behavior end), + list when is_list(list) <- Map.get(data, "_excludes_block_types") do + list + else + _ -> [] + end + end + + defp excludes_for(category, behavior, nil) do + yaml_path = "priv/combined_metrics/#{category}.yml" + + with %{} = yamls <- Scorer.all_yamls(), + %{} = data <- Map.get(yamls, yaml_path), + %{} = behavior_data <- Map.get(data, behavior), + list when is_list(list) <- Map.get(behavior_data, "_excludes_block_types") do + list + else + _ -> [] + end + end +end diff --git a/lib/codeqa/block_impact_analyzer.ex b/lib/codeqa/block_impact_analyzer.ex new 
file mode 100644 index 00000000..69da2fe7 --- /dev/null +++ b/lib/codeqa/block_impact_analyzer.ex @@ -0,0 +1,479 @@ +defmodule CodeQA.BlockImpactAnalyzer do + @moduledoc """ + Orchestrates block impact analysis across all files in a pipeline result. + + For each file, tokenizes its content, parses it into a node tree, and for each + node (recursively including children) computes refactoring potentials via + leave-one-out impact scoring at both file scope and codebase scope. + + The pipeline result is returned with a `"nodes"` key added to each file entry. + All other keys in the result are preserved unchanged. + + ## Telemetry + + Emits the following events (all durations in microseconds): + + - `[:codeqa, :block_impact, :analyze]` — full run + measurements: `%{duration: us}` + metadata: `%{file_count: n}` + + - `[:codeqa, :block_impact, :codebase_cosines]` — codebase baseline cosine computation + measurements: `%{duration: us}` + metadata: `%{behavior_count: n}` + + - `[:codeqa, :block_impact, :file]` — per-file node computation + measurements: `%{duration: us, tokenize_us: us, parse_us: us, file_cosines_us: us, node_count: n}` + metadata: `%{path: string}` + + - `[:codeqa, :block_impact, :node]` — per-node leave-one-out computation + measurements: `%{duration: us, reconstruct_us: us, analyze_file_us: us, aggregate_us: us, refactoring_us: us}` + metadata: `%{path: string, token_count: n}` + """ + + alias CodeQA.Analysis.BehaviorConfigServer + alias CodeQA.AST.Classification.{NodeClassifier, TypedNodeKind} + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.BlockImpact.{FileImpact, RefactoringPotentials} + alias CodeQA.CombinedMetrics.{FileScorer, SampleRunner} + alias CodeQA.Engine.Analyzer + alias CodeQA.Languages.Unknown + + @min_tokens 10 + + @doc """ + Analyzes all files in the pipeline result, adding `"nodes"` to each file entry. 
+ + ## Parameters + + - `pipeline_result` — direct return value of `Engine.Analyzer.analyze_codebase/2`, + containing `"files"` and `"codebase"` keys + - `files_map` — raw `%{path => content}` map used for file-scope leave-one-out + - `opts` — keyword options + + ## Options + + - `:nodes_top` — number of refactoring potentials per node (default 3) + - `:workers` — parallelism for `Task.async_stream` (default `System.schedulers_online()`) + - `:baseline_codebase_agg` — pre-computed codebase aggregate (skips redundant analysis) + """ + @spec analyze(map(), map(), keyword()) :: map() + def analyze(pipeline_result, files_map, opts \\ []) do + nodes_top = Keyword.get(opts, :nodes_top, 3) + workers = Keyword.get(opts, :workers, System.schedulers_online()) + + t0 = now() + + baseline_codebase_agg = + Keyword.get_lazy(opts, :baseline_codebase_agg, fn -> + Analyzer.analyze_codebase_aggregate(files_map) + end) + + cached_behaviors = + case Keyword.get(opts, :behavior_config_pid) do + nil -> nil + pid -> BehaviorConfigServer.get_all_behaviors(pid) + end + + project_langs = project_languages(files_map) + + filtered_behaviors = + if cached_behaviors && project_langs != [] do + filter_behaviors_by_languages(cached_behaviors, project_langs) + else + cached_behaviors + end + + {baseline_codebase_cosines, cosines_us} = + timed(fn -> + SampleRunner.diagnose_aggregate(baseline_codebase_agg, + top: 99_999, + languages: project_langs, + behavior_map: filtered_behaviors + ) + end) + + :telemetry.execute( + [:codeqa, :block_impact, :codebase_cosines], + %{duration: cosines_us}, + %{behavior_count: length(baseline_codebase_cosines)} + ) + + file_results = pipeline_result["files"] + + updated_files = + file_results + |> Task.async_stream( + fn {path, file_data} -> + content = Map.get(files_map, path, "") + baseline_file_metrics = Map.get(file_data, "metrics", %{}) + + {nodes, file_measurements} = + compute_nodes_timed( + path, + content, + baseline_file_metrics, + file_results, + 
baseline_codebase_cosines, + nodes_top, + filtered_behaviors + ) + + :telemetry.execute( + [:codeqa, :block_impact, :file], + file_measurements, + %{path: path} + ) + + {path, Map.put(file_data, "nodes", nodes)} + end, + max_concurrency: workers, + ordered: false, + timeout: :infinity + ) + |> Enum.reduce(%{}, fn {:ok, {path, data}}, acc -> Map.put(acc, path, data) end) + + :telemetry.execute( + [:codeqa, :block_impact, :analyze], + %{duration: now() - t0}, + %{file_count: map_size(file_results)} + ) + + Map.put(pipeline_result, "files", updated_files) + end + + defp compute_nodes_timed( + path, + content, + baseline_file_metrics, + file_results, + baseline_codebase_cosines, + nodes_top, + cached_behaviors + ) do + if content == "" do + {[], %{duration: 0, tokenize_us: 0, parse_us: 0, file_cosines_us: 0, node_count: 0}} + else + t0 = now() + + {root_tokens, tokenize_us} = timed(fn -> TokenNormalizer.normalize_structural(content) end) + {top_level_nodes, parse_us} = timed(fn -> Parser.detect_blocks(root_tokens, Unknown) end) + + baseline_file_agg = FileScorer.file_to_aggregate(baseline_file_metrics) + lang_mod = CodeQA.Language.detect(path) + language = lang_mod.name() + + {baseline_file_cosines, file_cosines_us} = + timed(fn -> + SampleRunner.diagnose_aggregate(baseline_file_agg, + top: 99_999, + language: language, + behavior_map: cached_behaviors + ) + end) + + inc_agg = build_incremental_agg(file_results) + old_file_triples = file_metrics_to_triples(baseline_file_metrics) + project_langs = project_languages(file_results) + + node_ctx = %{ + inc_agg: inc_agg, + old_file_triples: old_file_triples, + project_langs: project_langs, + cached_behaviors: cached_behaviors, + lang_mod: lang_mod, + baseline_file_metrics: baseline_file_metrics + } + + nodes = + top_level_nodes + |> Enum.map(fn node -> + serialize_node( + node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx + ) + end) + |> Enum.sort_by(fn n -> 
{n["start_line"], n["column_start"]} end) + + measurements = %{ + duration: now() - t0, + tokenize_us: tokenize_us, + parse_us: parse_us, + file_cosines_us: file_cosines_us, + node_count: length(top_level_nodes), + token_count: length(root_tokens), + bytes: byte_size(content) + } + + {nodes, measurements} + end + end + + defp serialize_node( + node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx, + parent_context \\ nil + ) do + block_type = + node + |> NodeClassifier.classify(node_ctx.lang_mod, parent_context) + |> TypedNodeKind.of() + + potentials = + if length(node.tokens) < @min_tokens do + [] + else + compute_potentials_timed( + node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx, + block_type + ) + end + + children = + node.children + |> Enum.map(fn child -> + child_context = parent_context_for(node.tokens, child) + + serialize_node( + child, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx, + child_context + ) + end) + |> Enum.sort_by(fn n -> {n["start_line"], n["column_start"]} end) + + first_token = List.first(node.tokens) + char_length = Enum.reduce(node.tokens, 0, fn t, acc -> acc + byte_size(t.content) end) + + %{ + "start_line" => node.start_line, + "end_line" => node.end_line, + "column_start" => (first_token && first_token.col) || 0, + "char_length" => char_length, + "type" => Atom.to_string(block_type), + "token_count" => length(node.tokens), + "refactoring_potentials" => potentials, + "children" => children + } + end + + # Returns the parent's tokens that come strictly before `child`'s first token, + # bounded to the same source line (everything since the last newline) and with + # leading whitespace stripped so the classification signals see the keyword at + # indent 0. 
Lets NodeClassifier see the keyword that drove the bracket-split + # (`alias`, `@name`, etc.) when classifying a sub-block. + defp parent_context_for(parent_tokens, child) do + case List.first(child.tokens) do + nil -> + [] + + child_first -> + nl_kind = CodeQA.AST.Lexing.NewlineToken.kind() + ws_kind = CodeQA.AST.Lexing.WhitespaceToken.kind() + + parent_tokens + |> Enum.take_while(fn t -> t != child_first end) + |> Enum.reverse() + |> Enum.take_while(fn t -> t.kind != nl_kind end) + |> Enum.reverse() + |> Enum.drop_while(fn t -> t.kind == ws_kind end) + end + end + + defp compute_potentials_timed( + %Node{} = node, + path, + root_tokens, + baseline_file_cosines, + baseline_codebase_cosines, + nodes_top, + language, + node_ctx, + block_type + ) do + t0 = now() + + {reconstructed, reconstruct_us} = + timed(fn -> FileImpact.reconstruct_without(root_tokens, node) end) + + block_content = Enum.map_join(node.tokens, "", & &1.content) + + {without_file_metrics, analyze_file_us} = + timed(fn -> + Analyzer.analyze_file_for_loo_partial( + path, + reconstructed, + node_ctx.baseline_file_metrics, + block_content + ) + end) + + {without_codebase_agg, aggregate_us} = + timed(fn -> + new_triples = file_metrics_to_triples(without_file_metrics) + + node_ctx.inc_agg + |> swap_file_in_agg(node_ctx.old_file_triples, new_triples) + |> incremental_agg_to_aggregate() + end) + + {potentials, refactoring_us} = + timed(fn -> + RefactoringPotentials.compute( + baseline_file_cosines, + without_file_metrics, + baseline_codebase_cosines, + without_codebase_agg, + top: nodes_top, + language: language, + languages: node_ctx.project_langs, + behavior_map: node_ctx.cached_behaviors, + block_type: block_type + ) + end) + + :telemetry.execute( + [:codeqa, :block_impact, :node], + %{ + duration: now() - t0, + reconstruct_us: reconstruct_us, + analyze_file_us: analyze_file_us, + aggregate_us: aggregate_us, + refactoring_us: refactoring_us + }, + %{path: path, token_count: length(node.tokens)} + ) + + 
potentials + end + + defp file_metrics_to_triples(metrics) when is_map(metrics) do + metrics + |> Enum.flat_map(fn + {metric_name, metric_data} when is_map(metric_data) -> + metric_data + |> Enum.filter(fn {_k, v} -> is_number(v) end) + |> Enum.map(fn {key, value} -> {metric_name, key, value / 1} end) + + _ -> + [] + end) + end + + defp build_incremental_agg(file_results) do + file_results + |> Map.values() + |> Enum.flat_map(fn file_data -> + file_data |> Map.get("metrics", %{}) |> file_metrics_to_triples() + end) + |> Enum.group_by(fn {metric, key, _val} -> {metric, key} end, fn {_, _, val} -> val end) + |> Map.new(fn {{metric, key}, values} -> + n = length(values) + sum = Enum.sum(values) + sum_sq = Enum.reduce(values, 0.0, fn v, acc -> acc + v * v end) + + {{metric, key}, + %{sum: sum, sum_sq: sum_sq, min: Enum.min(values), max: Enum.max(values), count: n}} + end) + end + + defp swap_file_in_agg(inc_agg, old_triples, new_triples) do + old_map = Map.new(old_triples, fn {metric, key, val} -> {{metric, key}, val} end) + new_map = Map.new(new_triples, fn {metric, key, val} -> {{metric, key}, val} end) + all_keys = Enum.uniq(Map.keys(old_map) ++ Map.keys(new_map)) + + Enum.reduce(all_keys, inc_agg, fn mk, acc -> + case Map.get(acc, mk) do + nil -> + acc + + state -> + old_val = Map.get(old_map, mk, 0.0) + new_val = Map.get(new_map, mk, 0.0) + + Map.put(acc, mk, %{ + sum: state.sum - old_val + new_val, + sum_sq: state.sum_sq - old_val * old_val + new_val * new_val, + min: min(state.min, new_val), + max: max(state.max, new_val), + count: state.count + }) + end + end) + end + + defp incremental_agg_to_aggregate(inc_agg) do + Enum.reduce(inc_agg, %{}, fn {{metric, key}, state}, acc -> + n = state.count + mean = if n > 0, do: state.sum / n, else: 0.0 + variance = if n > 0, do: max(state.sum_sq / n - mean * mean, 0.0), else: 0.0 + std = :math.sqrt(variance) + + metric_agg = Map.get(acc, metric, %{}) + + updated = + Map.merge(metric_agg, %{ + "mean_#{key}" => 
Float.round(mean * 1.0, 4), + "std_#{key}" => Float.round(std * 1.0, 4), + "min_#{key}" => Float.round(state.min * 1.0, 4), + "max_#{key}" => Float.round(state.max * 1.0, 4) + }) + + Map.put(acc, metric, updated) + end) + end + + defp filter_behaviors_by_languages(behaviors_map, project_langs) do + Map.new(behaviors_map, fn {category, behaviors} -> + filtered = + Enum.filter(behaviors, fn {_behavior, behavior_data} -> + behavior_langs = Map.get(behavior_data, "_languages", []) + behavior_langs == [] or Enum.any?(behavior_langs, &(&1 in project_langs)) + end) + + {category, filtered} + end) + end + + defp project_languages(path_keyed_map) do + path_keyed_map + |> Map.keys() + |> Enum.map(&CodeQA.Language.detect(&1).name()) + |> Enum.reject(&(&1 == "unknown")) + |> Enum.uniq() + end + + defp timed(fun) do + t = now() + result = fun.() + {result, now() - t} + end + + defp now, do: System.monotonic_time(:microsecond) +end diff --git a/lib/codeqa/cli.ex b/lib/codeqa/cli.ex index 210654d5..3e36d57c 100644 --- a/lib/codeqa/cli.ex +++ b/lib/codeqa/cli.ex @@ -3,27 +3,32 @@ defmodule CodeQA.CLI do @commands %{ "analyze" => CodeQA.CLI.Analyze, - "compare" => CodeQA.CLI.Compare, "history" => CodeQA.CLI.History, "correlate" => CodeQA.CLI.Correlate, - "stopwords" => CodeQA.CLI.Stopwords, - "health-report" => CodeQA.CLI.HealthReport + "health-report" => CodeQA.CLI.HealthReport, + "diagnose" => CodeQA.CLI.Diagnose } def main(args) do case args do - [cmd | rest] when is_map_key(@commands, cmd) -> @commands[cmd].run(rest) - _ -> print_usage() + [cmd | rest] when is_map_key(@commands, cmd) -> + output = @commands[cmd].run(rest) + unless output == "", do: IO.puts(output) + output + + _ -> + output = build_usage() + IO.puts(output) + output end end - defp print_usage do + defp build_usage do command_usages = @commands |> Enum.sort_by(fn {name, _} -> name end) - |> Enum.map(fn {_name, mod} -> mod.usage() end) - |> Enum.join("\n") + |> Enum.map_join("\n", fn {_name, mod} -> mod.usage() 
end) - IO.puts("Usage: codeqa [options]\n\n" <> command_usages) + "Usage: codeqa [options]\n\n" <> command_usages end end diff --git a/lib/codeqa/cli/analyze.ex b/lib/codeqa/cli/analyze.ex index 4473011e..9c1f8402 100644 --- a/lib/codeqa/cli/analyze.ex +++ b/lib/codeqa/cli/analyze.ex @@ -4,6 +4,9 @@ defmodule CodeQA.CLI.Analyze do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.Config + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector @version "0.1.0" @@ -32,19 +35,18 @@ defmodule CodeQA.CLI.Analyze do @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end def run(args) do {opts, [path], _} = - Options.parse(args, [output: :string], [o: :output]) - - if opts[:telemetry], do: CodeQA.Telemetry.setup() + Options.parse(args, [output: :string], o: :output) Options.validate_dir!(path) + Config.load(path) - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - files = CodeQA.Collector.collect_files(path, ignore_patterns: ignore_patterns) + files = + Collector.collect_files(path, Options.parse_ignore_paths(opts[:ignore_paths])) if map_size(files) == 0 do IO.puts(:stderr, "Warning: no source files found in '#{path}'") @@ -53,10 +55,11 @@ defmodule CodeQA.CLI.Analyze do print_progress(opts, files) - analyze_opts = Options.build_analyze_opts(opts) + analyze_opts = + Options.build_analyze_opts(opts) ++ Config.near_duplicate_blocks_opts() start_time = System.monotonic_time(:millisecond) - results = CodeQA.Analyzer.analyze_codebase(files, analyze_opts) + results = Analyzer.analyze_codebase(files, analyze_opts) end_time = System.monotonic_time(:millisecond) IO.puts(:stderr, "Analysis completed in #{end_time - start_time}ms") @@ -80,14 +83,13 @@ defmodule CodeQA.CLI.Analyze do case opts[:output] do nil -> - IO.puts(json) + json file -> File.write!(file, json) IO.puts(:stderr, "Report written to #{file}") + "" end - - if opts[:telemetry], 
do: CodeQA.Telemetry.print_report() end defp print_progress(opts, files) do diff --git a/lib/codeqa/cli/command.ex b/lib/codeqa/cli/command.ex index e2702a11..c6cd4a19 100644 --- a/lib/codeqa/cli/command.ex +++ b/lib/codeqa/cli/command.ex @@ -1,6 +1,6 @@ defmodule CodeQA.CLI.Command do @moduledoc "Behaviour for CLI commands." - @callback run([String.t()]) :: :ok + @callback run([String.t()]) :: String.t() @callback usage() :: String.t() end diff --git a/lib/codeqa/cli/compare.ex b/lib/codeqa/cli/compare.ex deleted file mode 100644 index b86bc32f..00000000 --- a/lib/codeqa/cli/compare.ex +++ /dev/null @@ -1,242 +0,0 @@ -defmodule CodeQA.CLI.Compare do - @moduledoc false - - @behaviour CodeQA.CLI.Command - - alias CodeQA.CLI.Options - - @version "0.1.0" - - @impl CodeQA.CLI.Command - def usage do - """ - Usage: codeqa compare [options] - - Compare code quality metrics between two git refs. - - Options: - --base-ref REF Base git ref to compare from (required) - --head-ref REF Head git ref to compare to (default: HEAD) - --changes-only Only analyze changed files - --all-files Analyze all source files (default) - --format FORMAT Output format: json, markdown, or github (default: json) - --output MODE Output mode: auto, summary, or changes (default: auto) - --progress Show per-file progress on stderr - -w, --workers N Number of parallel workers - --cache Enable caching file metrics - --cache-dir DIR Directory to store cache (default: .codeqa_cache) - -t, --timeout MS Timeout for similarity analysis (default: 5000) - --show-ncd Compute and show NCD similarity metric - --ncd-top N Number of top similar files to show per file - --ncd-paths PATHS Comma-separated list of paths to compute NCD for - --show-files Include individual file metrics in the output - --show-file-paths P Comma-separated list of paths to include in the output - --ignore-paths PATHS Comma-separated list of path patterns to ignore (supports wildcards, e.g. 
"test/*,docs/*") - """ - end - - @impl CodeQA.CLI.Command - def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) - end - - def run(args) do - {opts, [path], _} = - Options.parse(args, - [ - base_ref: :string, - head_ref: :string, - changes_only: :boolean, - all_files: :boolean, - format: :string, - output: :string - ], - [] - ) - - if opts[:telemetry], do: CodeQA.Telemetry.setup() - - base_ref = opts[:base_ref] || raise "Missing --base-ref" - head_ref = opts[:head_ref] || "HEAD" - changes_only = if opts[:changes_only], do: true, else: false - format = opts[:format] || "json" - output_mode = opts[:output] || "auto" - - Options.validate_dir!(path) - - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - opts = Keyword.put(opts, :ignore_patterns, ignore_patterns) - - {base_result, head_result, changes} = - run_comparison(path, base_ref, head_ref, changes_only, opts) - - comparison = - CodeQA.Comparator.compare_results(base_result, head_result, changes) - |> enrich_metadata(base_ref, head_ref, changes_only) - |> filter_files_for_output(opts, format) - - output_comparison(comparison, format, output_mode) - - if opts[:telemetry], do: CodeQA.Telemetry.print_report() - end - - defp run_comparison(path, base_ref, head_ref, changes_only, opts) do - ignore_patterns = opts[:ignore_patterns] || [] - changes = CodeQA.Git.changed_files(path, base_ref, head_ref) - changes = CodeQA.Collector.reject_ignored(changes, ignore_patterns, & &1.path) - - file_paths = - if changes_only do - IO.puts(:stderr, "Comparing #{length(changes)} changed files...") - Enum.map(changes, & &1.path) - else - IO.puts(:stderr, "Comparing all source files...") - nil - end - - empty = %{"files" => %{}, "codebase" => %{"aggregate" => %{}, "similarity" => %{}}} - - if changes_only and length(changes) == 0 do - IO.puts(:stderr, "No source files changed — nothing to compare.") - {empty, empty, []} - else - base_files = 
CodeQA.Git.collect_files_at_ref(path, base_ref, file_paths) - head_files = CodeQA.Git.collect_files_at_ref(path, head_ref, file_paths) - base_files = CodeQA.Collector.reject_ignored_map(base_files, ignore_patterns) - head_files = CodeQA.Collector.reject_ignored_map(head_files, ignore_patterns) - - if map_size(base_files) == 0 and map_size(head_files) == 0 do - IO.puts(:stderr, "Warning: no source files found at either ref") - exit({:shutdown, 1}) - end - - print_progress(opts, base_files, head_files) - - analyze_opts = Options.build_analyze_opts(opts) - - base_result = - if map_size(base_files) > 0, - do: CodeQA.Analyzer.analyze_codebase(base_files, analyze_opts), - else: empty - - head_result = - if map_size(head_files) > 0, - do: CodeQA.Analyzer.analyze_codebase(head_files, analyze_opts), - else: empty - - changes = if changes_only, do: changes, else: synthesize_changes(base_files, head_files) - - {base_result, head_result, changes} - end - end - - defp print_progress(opts, base_files, head_files) do - if opts[:progress] do - step_prefix = if opts[:show_ncd], do: "1/5 ", else: "1/1 " - - IO.puts( - :stderr, - " #{step_prefix}Analyzing base (#{map_size(base_files)} files) and head (#{map_size(head_files)} files)..." - ) - else - IO.puts( - :stderr, - "Analyzing base (#{map_size(base_files)} files) and head (#{map_size(head_files)} files)..." 
- ) - end - end - - defp enrich_metadata(comparison, base_ref, head_ref, changes_only) do - comparison - |> put_in(["metadata", "base_ref"], base_ref) - |> put_in(["metadata", "head_ref"], head_ref) - |> put_in(["metadata", "changes_only"], changes_only) - |> put_in(["metadata", "version"], @version) - |> put_in(["metadata", "timestamp"], DateTime.utc_now() |> DateTime.to_iso8601()) - end - - defp output_comparison(comparison, "markdown", output_mode) do - IO.puts(CodeQA.Formatter.format_markdown(comparison, output_mode)) - end - - defp output_comparison(comparison, "github", output_mode) do - IO.puts(CodeQA.Formatter.format_github(comparison, output_mode)) - end - - defp output_comparison(comparison, _format, output_mode) do - codebase_summary = CodeQA.Summarizer.summarize_codebase(comparison) - - file_summaries = - Map.new(Map.get(comparison, "files", %{}), fn {path, data} -> - {path, CodeQA.Summarizer.summarize_file(path, data)} - end) - - IO.puts( - Jason.encode!(build_json_output(comparison, codebase_summary, file_summaries, output_mode), - pretty: true - ) - ) - end - - defp build_json_output(comparison, codebase_summary, file_summaries, output_mode) do - result = %{"metadata" => comparison["metadata"]} - - result = - if output_mode in ["auto", "summary"] do - result - |> Map.put("summary", codebase_summary) - |> Map.put("codebase", comparison["codebase"]) - else - result - end - - if output_mode in ["auto", "changes"] and Map.has_key?(comparison, "files") do - files_with_summaries = - Map.new(comparison["files"], fn {path, data} -> - {path, Map.put(data, "summary", Map.get(file_summaries, path, %{}))} - end) - - Map.put(result, "files", files_with_summaries) - else - result - end - end - - defp synthesize_changes(base_files, head_files) do - all_paths = MapSet.union(MapSet.new(Map.keys(base_files)), MapSet.new(Map.keys(head_files))) - - all_paths - |> Enum.sort() - |> Enum.map(fn path -> - status = - cond do - Map.has_key?(base_files, path) and 
Map.has_key?(head_files, path) -> "modified" - Map.has_key?(head_files, path) -> "added" - true -> "deleted" - end - - %CodeQA.Git.ChangedFile{path: path, status: status} - end) - end - - defp filter_files_for_output(results, _opts, format) when format in ["github", "markdown"], - do: results - - defp filter_files_for_output(results, opts, _format) do - cond do - opts[:show_files] -> - results - - opts[:show_file_paths] -> - target_paths = String.split(opts[:show_file_paths], ",") |> MapSet.new() - - filtered = - Map.filter(results["files"], fn {path, _} -> MapSet.member?(target_paths, path) end) - - Map.put(results, "files", filtered) - - true -> - Map.delete(results, "files") - end - end -end diff --git a/lib/codeqa/cli/correlate.ex b/lib/codeqa/cli/correlate.ex index a3fd2f73..c38a2481 100644 --- a/lib/codeqa/cli/correlate.ex +++ b/lib/codeqa/cli/correlate.ex @@ -4,6 +4,7 @@ defmodule CodeQA.CLI.Correlate do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.CLI.UI @impl CodeQA.CLI.Command def usage do @@ -25,7 +26,7 @@ defmodule CodeQA.CLI.Correlate do @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end def run(args) do @@ -82,7 +83,7 @@ defmodule CodeQA.CLI.Correlate do sorted = Enum.sort_by(correlations, &abs(&1["correlation"]), :desc) top = Enum.take(sorted, top_n) - IO.puts(Jason.encode!(top, pretty: true)) + Jason.encode!(top, pretty: true) end defp extract_metric_series(path, files) do @@ -204,7 +205,16 @@ defmodule CodeQA.CLI.Correlate do pairs_stream |> Task.async_stream( - &correlate_pair(&1, counter, total_pairs, update_interval, total_start, series, category_map, opts), + &correlate_pair( + &1, + counter, + total_pairs, + update_interval, + total_start, + series, + category_map, + opts + ), max_concurrency: System.schedulers_online(), timeout: :infinity ) @@ -257,9 +267,7 @@ defmodule CodeQA.CLI.Correlate do eta_ms = round((total_pairs - current) * avg_time) output = - 
CodeQA.CLI.UI.progress_bar(current, total_pairs, - eta: CodeQA.CLI.UI.format_eta(eta_ms) - ) + UI.progress_bar(current, total_pairs, eta: UI.format_eta(eta_ms)) IO.write(:stderr, "\r" <> output) if current == total_pairs, do: IO.puts(:stderr, "") diff --git a/lib/codeqa/cli/diagnose.ex b/lib/codeqa/cli/diagnose.ex new file mode 100644 index 00000000..93c2e8d0 --- /dev/null +++ b/lib/codeqa/cli/diagnose.ex @@ -0,0 +1,71 @@ +defmodule CodeQA.CLI.Diagnose do + @moduledoc false + + @behaviour CodeQA.CLI.Command + + @impl CodeQA.CLI.Command + def usage do + """ + Usage: codeqa diagnose [options] + + Diagnose likely code quality issues using cosine similarity against behavior profiles. + + Options: + --path PATH File or directory path to analyze (required) + --mode MODE Output mode: aggregate (default) or per-file + --top N Number of top issues to display (default: 15) + --format FORMAT Output format: plain (default) or json + --combined-top N Number of worst offender files per behavior (default: 2) + """ + end + + @impl CodeQA.CLI.Command + def run(args) when args in [["--help"], ["-h"]] do + usage() + end + + def run(args) do + {opts, _, _} = + OptionParser.parse(args, + strict: [ + path: :string, + mode: :string, + top: :integer, + format: :string, + combined_top: :integer + ] + ) + + path = opts[:path] + + unless path do + IO.puts(:stderr, "Error: --path required") + exit({:shutdown, 1}) + end + + unless File.exists?(path) do + IO.puts(:stderr, "Error: '#{path}' does not exist") + exit({:shutdown, 1}) + end + + mode = + case opts[:mode] do + "per-file" -> :per_file + _ -> :aggregate + end + + format = + case opts[:format] do + "json" -> :json + _ -> :plain + end + + CodeQA.Diagnostics.run( + path: path, + mode: mode, + top: opts[:top] || 15, + format: format, + combined_top: opts[:combined_top] || 2 + ) + end +end diff --git a/lib/codeqa/cli/health_report.ex b/lib/codeqa/cli/health_report.ex index 8f39186f..5dc8e6b6 100644 --- a/lib/codeqa/cli/health_report.ex +++ 
b/lib/codeqa/cli/health_report.ex @@ -4,6 +4,11 @@ defmodule CodeQA.CLI.HealthReport do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.Config + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.Git + alias CodeQA.HealthReport @impl CodeQA.CLI.Command def usage do @@ -24,33 +29,44 @@ defmodule CodeQA.CLI.HealthReport do --cache-dir DIR Directory to store cache (default: .codeqa_cache) -t, --timeout MS Timeout for similarity analysis (default: 5000) --ignore-paths PATHS Comma-separated list of path patterns to ignore (supports wildcards, e.g. "test/*,docs/*") + --base-ref REF Base git ref for PR comparison (enables delta and block scoping) + --head-ref REF Head git ref (default: HEAD) + --comment Multi-part mode: writes numbered part files to TMPDIR for PR comments """ end @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end + @command_options [ + output: :string, + config: :string, + detail: :string, + top: :integer, + format: :string, + ignore_paths: :string, + base_ref: :string, + head_ref: :string, + telemetry: :boolean, + comment: :boolean + ] + def run(args) do - {opts, [path], _} = - Options.parse(args, - [ - output: :string, - config: :string, - detail: :string, - top: :integer, - format: :string - ], - [o: :output] - ) + {opts, [path], _} = Options.parse(args, @command_options, o: :output) + Options.validate_dir!(path) + extra_ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) - if opts[:telemetry], do: CodeQA.Telemetry.setup() + base_ref = opts[:base_ref] + head_ref = opts[:head_ref] || "HEAD" - Options.validate_dir!(path) + collect_t0 = System.monotonic_time(:microsecond) - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - files = CodeQA.Collector.collect_files(path, ignore_patterns: ignore_patterns) + files = + Collector.collect_files(path, extra_ignore_patterns) + + collect_us 
= System.monotonic_time(:microsecond) - collect_t0 if map_size(files) == 0 do IO.puts(:stderr, "Warning: no source files found in '#{path}'") @@ -59,14 +75,21 @@ defmodule CodeQA.CLI.HealthReport do IO.puts(:stderr, "Analyzing #{map_size(files)} files for health report...") - analyze_opts = Options.build_analyze_opts(opts) + telemetry_pid = if opts[:telemetry], do: attach_telemetry() + + analyze_opts = + Options.build_analyze_opts(opts) ++ + Config.near_duplicate_blocks_opts() ++ [compute_nodes: true] start_time = System.monotonic_time(:millisecond) - results = CodeQA.Analyzer.analyze_codebase(files, analyze_opts) + results = Analyzer.analyze_codebase(files, analyze_opts) end_time = System.monotonic_time(:millisecond) IO.puts(:stderr, "Analysis completed in #{end_time - start_time}ms") + if telemetry_pid, + do: record_phase(telemetry_pid, :analyze, (end_time - start_time) * 1_000) + total_bytes = results["files"] |> Map.values() |> Enum.map(& &1["bytes"]) |> Enum.sum() results = @@ -77,29 +100,108 @@ defmodule CodeQA.CLI.HealthReport do "total_bytes" => total_bytes }) + {base_results, changed_files, diff_line_ranges} = + if base_ref do + IO.puts(:stderr, "Collecting base snapshot at #{base_ref}...") + base_files = Git.collect_files_at_ref(path, base_ref) + changed = Git.changed_files(path, base_ref, head_ref) + + diff_ranges = + case Git.diff_line_ranges(path, base_ref, head_ref) do + {:ok, ranges} -> + ranges + + {:error, reason} -> + IO.puts(:stderr, "Warning: failed to parse diff line ranges: #{inspect(reason)}") + IO.puts(:stderr, "Block scoping disabled - showing all blocks in changed files") + %{} + end + + IO.puts(:stderr, "Analyzing base snapshot (#{map_size(base_files)} files)...") + base_res = Analyzer.analyze_codebase(base_files, analyze_opts) + + {base_res, changed, diff_ranges} + else + {nil, [], %{}} + end + detail = parse_detail(opts[:detail]) format = parse_format(opts[:format]) top_n = opts[:top] || 5 + report_gen_t0 = 
System.monotonic_time(:microsecond) + report = - CodeQA.HealthReport.generate(results, + HealthReport.generate(results, config: opts[:config], detail: detail, - top: top_n + top: top_n, + base_results: base_results, + changed_files: changed_files, + diff_line_ranges: diff_line_ranges ) - markdown = CodeQA.HealthReport.to_markdown(report, detail, format) + report_gen_us = System.monotonic_time(:microsecond) - report_gen_t0 - case opts[:output] do - nil -> - IO.puts(markdown) + if telemetry_pid do + record_phase(telemetry_pid, :collect, collect_us) + record_phase(telemetry_pid, :report_gen, report_gen_us) + end + + output = + if opts[:comment] do + write_comment_parts(report, detail) + else + render_t0 = System.monotonic_time(:microsecond) + markdown = HealthReport.to_markdown(report, detail, format) + render_us = System.monotonic_time(:microsecond) - render_t0 + if telemetry_pid, do: record_phase(telemetry_pid, :render, render_us) + + case opts[:output] do + nil -> + markdown + + file -> + File.write!(file, markdown) + IO.puts(:stderr, "Health report written to #{file}") + "" + end + end + + if telemetry_pid, do: print_telemetry(telemetry_pid) + + output + end - file -> - File.write!(file, markdown) - IO.puts(:stderr, "Health report written to #{file}") + defp write_comment_parts(report, detail) do + tmpdir = System.get_env("TMPDIR", "/tmp") + parts = HealthReport.Formatter.render_parts(report, detail: detail) + + # Write each part to a numbered file + Enum.with_index(parts, 1) + |> Enum.each(fn {content, n} -> + path = Path.join(tmpdir, "codeqa-part-#{n}.md") + File.write!(path, content) + IO.puts(:stderr, "Part #{n} written to #{path} (#{byte_size(content)} bytes)") + end) + + # Ensure at least 3 parts exist for stale cleanup + actual_count = length(parts) + padded_count = max(actual_count, 3) + + for n <- (actual_count + 1)..padded_count//1 do + path = Path.join(tmpdir, "codeqa-part-#{n}.md") + placeholder = "> _No content for this section._\n\n" + 
File.write!(path, placeholder) + IO.puts(:stderr, "Part #{n} (placeholder) written to #{path}") end - if opts[:telemetry], do: CodeQA.Telemetry.print_report() + # Write part count for run.sh to read + count_path = Path.join(tmpdir, "codeqa-part-count.txt") + File.write!(count_path, to_string(padded_count)) + IO.puts(:stderr, "Part count (#{padded_count}) written to #{count_path}") + + "" end defp parse_detail(nil), do: :default @@ -120,4 +222,329 @@ defmodule CodeQA.CLI.HealthReport do IO.puts(:stderr, "Warning: unknown format '#{other}', using 'plain'") :plain end + + # --------------------------------------------------------------------------- + # Pipeline telemetry (block-impact + stage + per-metric + CLI phases) + # --------------------------------------------------------------------------- + + @telemetry_handler "codeqa-telemetry-reporter" + + defp attach_telemetry do + {:ok, pid} = + Agent.start_link(fn -> + %{ + nodes: [], + files: [], + codebase_cosines_us: 0, + stages: %{}, + file_metrics: %{}, + codebase_metrics: %{}, + phases: %{}, + loo_breakdown: %{}, + loo_breakdown_calls: 0, + cosine_breakdown: %{}, + cosine_breakdown_calls: 0 + } + end) + + :telemetry.attach_many( + @telemetry_handler, + [ + [:codeqa, :block_impact, :codebase_cosines], + [:codeqa, :block_impact, :file], + [:codeqa, :block_impact, :node], + [:codeqa, :stage], + [:codeqa, :file_metric], + [:codeqa, :codebase_metric], + [:codeqa, :loo_breakdown], + [:codeqa, :cosine_breakdown] + ], + &handle_event(&1, &2, &3, &4), + pid + ) + + pid + end + + defp record_phase(pid, name, duration_us) do + Agent.update(pid, fn state -> + Map.update!(state, :phases, &Map.put(&1, name, duration_us)) + end) + end + + defp handle_event( + [:codeqa, :block_impact, :codebase_cosines], + measurements, + _metadata, + pid + ) do + Agent.update(pid, &Map.put(&1, :codebase_cosines_us, measurements.duration)) + end + + defp handle_event([:codeqa, :block_impact, :file], measurements, metadata, pid) do + 
Agent.update(pid, fn state -> + Map.update!(state, :files, &[{metadata.path, measurements} | &1]) + end) + end + + defp handle_event([:codeqa, :block_impact, :node], measurements, metadata, pid) do + Agent.update(pid, fn state -> + Map.update!(state, :nodes, &[{metadata.path, measurements} | &1]) + end) + end + + defp handle_event([:codeqa, :stage], measurements, metadata, pid) do + Agent.update(pid, fn state -> + Map.update!(state, :stages, fn stages -> + Map.put(stages, metadata.stage, measurements.duration) + end) + end) + end + + defp handle_event([:codeqa, :file_metric], measurements, metadata, pid) do + Agent.update(pid, fn state -> + Map.update!(state, :file_metrics, fn fm -> + Map.update(fm, metadata.metric, {1, measurements.duration}, fn {n, sum} -> + {n + 1, sum + measurements.duration} + end) + end) + end) + end + + defp handle_event([:codeqa, :codebase_metric], measurements, metadata, pid) do + Agent.update(pid, fn state -> + Map.update!(state, :codebase_metrics, &Map.put(&1, metadata.metric, measurements.duration)) + end) + end + + defp handle_event([:codeqa, :loo_breakdown], measurements, _metadata, pid) do + Agent.update(pid, fn state -> + merged = + Enum.reduce(measurements, state.loo_breakdown, fn {k, v}, acc -> + Map.update(acc, k, v, &(&1 + v)) + end) + + state + |> Map.put(:loo_breakdown, merged) + |> Map.update!(:loo_breakdown_calls, &(&1 + 1)) + end) + end + + defp handle_event([:codeqa, :cosine_breakdown], measurements, _metadata, pid) do + Agent.update(pid, fn state -> + merged = + Enum.reduce(measurements, state.cosine_breakdown, fn {k, v}, acc -> + Map.update(acc, k, v, &(&1 + v)) + end) + + state + |> Map.put(:cosine_breakdown, merged) + |> Map.update!(:cosine_breakdown_calls, &(&1 + 1)) + end) + end + + defp print_telemetry(pid) do + state = Agent.get(pid, & &1) + Agent.stop(pid) + :telemetry.detach(@telemetry_handler) + + nodes = state.nodes + files = state.files + + total_nodes = length(nodes) + total_files = length(files) + + 
node_totals = Enum.map(nodes, fn {_, m} -> m end) + file_totals = Enum.map(files, fn {_, m} -> m end) + + IO.puts(:stderr, """ + + ── CLI Phases ────────────────────────────────────────── + #{format_phases(state.phases)} + + ── Top-Level Stages (inside Analyzer.analyze_codebase) ─ + #{format_stages(state.stages)} + + ── Codebase Metrics (run once over all files) ────────── + #{format_codebase_metrics(state.codebase_metrics)} + + ── File Metrics (summed over all files; #{total_files} files) ── + #{format_file_metrics(state.file_metrics, total_files)} + + ── Block Impact Telemetry ────────────────────────────── + Codebase cosines: #{us(state.codebase_cosines_us)} + Files processed: #{total_files} + Nodes processed: #{total_nodes} + + Per-file breakdown (avg across #{total_files} files): + tokenize: #{avg_us(file_totals, :tokenize_us)} + parse blocks: #{avg_us(file_totals, :parse_us)} + file cosines: #{avg_us(file_totals, :file_cosines_us)} + total/file: #{avg_us(file_totals, :duration)} + + Per-node breakdown (avg across #{total_nodes} nodes): + reconstruct: #{avg_us(node_totals, :reconstruct_us)} + analyze_file: #{avg_us(node_totals, :analyze_file_us)} + aggregate: #{avg_us(node_totals, :aggregate_us)} + refactoring cosine: #{avg_us(node_totals, :refactoring_us)} + total/node: #{avg_us(node_totals, :duration)} + + Top 5 slowest files (total node time): + #{top_slow_files(files, nodes)} + + ── LOO breakdown (per analyze_file_for_loo_partial call) ─ + Calls: #{state.loo_breakdown_calls} + #{format_breakdown_avg(state.loo_breakdown, state.loo_breakdown_calls)} + + ── Cosine breakdown (per diagnose_aggregate call) ────── + Calls: #{state.cosine_breakdown_calls} + #{format_breakdown_avg(state.cosine_breakdown, state.cosine_breakdown_calls)} + + ── File-size scaling (block_impact: total node time) ── + #{format_scaling(files, nodes)} + ──────────────────────────────────────────────────────── + """) + end + + defp format_breakdown_avg(breakdown, calls) when 
map_size(breakdown) == 0 or calls == 0, + do: " (no data)" + + defp format_breakdown_avg(breakdown, calls) do + breakdown + |> Enum.sort_by(fn {_, v} -> -v end) + |> Enum.take(25) + |> Enum.map_join("\n", fn {key, total_us} -> + avg = div(total_us, calls) + pct = total_us * 100 / Enum.sum(Map.values(breakdown)) + + " #{String.pad_trailing(to_string(key), 32)} total #{us(total_us)} avg/call #{us(avg)} (#{Float.round(pct, 1)}%)" + end) + end + + defp format_scaling(files, nodes) do + nodes_by_path = Enum.group_by(nodes, fn {p, _} -> p end, fn {_, m} -> m end) + + rows = + files + |> Enum.map(fn {path, fm} -> + node_durations = nodes_by_path |> Map.get(path, []) |> Enum.map(& &1.duration) + total_node_us = Enum.sum(node_durations) + + %{ + path: path, + bytes: Map.get(fm, :bytes, 0), + tokens: Map.get(fm, :token_count, 0), + nodes: Map.get(fm, :node_count, 0), + file_us: fm.duration, + total_node_us: total_node_us + } + end) + + bins = [ + {"<2KB ", fn r -> r.bytes < 2_000 end}, + {"2-8KB", fn r -> r.bytes >= 2_000 and r.bytes < 8_000 end}, + {"8-32KB", fn r -> r.bytes >= 8_000 and r.bytes < 32_000 end}, + {">32KB", fn r -> r.bytes >= 32_000 end} + ] + + bin_rows = + bins + |> Enum.map(fn {label, pred} -> + bucket = Enum.filter(rows, pred) + n = length(bucket) + + if n == 0 do + " #{label} (none)" + else + avg_bytes = div(Enum.sum(Enum.map(bucket, & &1.bytes)), n) + avg_tokens = div(Enum.sum(Enum.map(bucket, & &1.tokens)), n) + avg_nodes = div(Enum.sum(Enum.map(bucket, & &1.nodes)), n) + avg_node_us = div(Enum.sum(Enum.map(bucket, & &1.total_node_us)), n) + tokens_per_node_us = if avg_nodes > 0, do: div(avg_node_us, avg_nodes), else: 0 + + " #{label} files=#{n} avg bytes=#{avg_bytes} tokens=#{avg_tokens} nodes=#{avg_nodes} total_node=#{us(avg_node_us)} per_node=#{us(tokens_per_node_us)}" + end + end) + + Enum.join(bin_rows, "\n") + end + + defp format_phases(phases) when map_size(phases) == 0, do: " (no phases recorded)" + + defp format_phases(phases) do + [:collect, 
:analyze, :report_gen, :render] + |> Enum.filter(&Map.has_key?(phases, &1)) + |> Enum.map_join("\n", fn name -> + " #{String.pad_trailing(Atom.to_string(name), 12)} #{us(phases[name])}" + end) + end + + defp format_stages(stages) when map_size(stages) == 0, do: " (no stages recorded)" + + defp format_stages(stages) do + stages + |> Enum.sort_by(fn {_, dur} -> -dur end) + |> Enum.map_join("\n", fn {name, dur} -> + " #{String.pad_trailing(Atom.to_string(name), 20)} #{us(dur)}" + end) + end + + defp format_codebase_metrics(m) when map_size(m) == 0, do: " (none recorded)" + + defp format_codebase_metrics(m) do + m + |> Enum.sort_by(fn {_, dur} -> -dur end) + |> Enum.map_join("\n", fn {name, dur} -> + " #{String.pad_trailing(to_string(name), 32)} #{us(dur)}" + end) + end + + defp format_file_metrics(m, _file_count) when map_size(m) == 0, do: " (none recorded)" + + defp format_file_metrics(m, file_count) do + fc = max(file_count, 1) + + m + |> Enum.map(fn {name, {n, sum}} -> + avg = if n > 0, do: div(sum, n), else: 0 + {name, sum, avg, n} + end) + |> Enum.sort_by(fn {_, sum, _, _} -> -sum end) + |> Enum.map_join("\n", fn {name, sum, avg, n} -> + " #{String.pad_trailing(to_string(name), 32)} total #{us(sum)} avg/file #{us(div(sum, fc))} (#{n} calls, avg/call #{us(avg)})" + end) + end + + defp top_slow_files(files, nodes) do + node_time_by_file = + nodes + |> Enum.group_by(fn {path, _} -> path end, fn {_, m} -> m.duration end) + |> Map.new(fn {path, durations} -> {path, Enum.sum(durations)} end) + + files + |> Enum.map(fn {path, fm} -> + node_time = Map.get(node_time_by_file, path, 0) + {path, fm.node_count, node_time} + end) + |> Enum.sort_by(fn {_, _, t} -> -t end) + |> Enum.take(5) + |> Enum.map_join("\n", fn {path, node_count, node_time} -> + " #{path} (#{node_count} nodes, #{us(node_time)} node time)" + end) + end + + defp avg_us([], _key), do: "n/a" + + defp avg_us(measurements, key) do + total = Enum.sum(Enum.map(measurements, &Map.get(&1, key, 0))) + us(div(total, 
length(measurements))) + end + + defp us(microseconds) when microseconds >= 1_000_000, + do: "#{Float.round(microseconds / 1_000_000, 2)}s" + + defp us(microseconds) when microseconds >= 1_000, + do: "#{Float.round(microseconds / 1_000, 1)}ms" + + defp us(microseconds), do: "#{microseconds}µs" end diff --git a/lib/codeqa/cli/history.ex b/lib/codeqa/cli/history.ex index 4c73acee..ca40669c 100644 --- a/lib/codeqa/cli/history.ex +++ b/lib/codeqa/cli/history.ex @@ -4,6 +4,11 @@ defmodule CodeQA.CLI.History do @behaviour CodeQA.CLI.Command alias CodeQA.CLI.Options + alias CodeQA.CLI.Progress + alias CodeQA.Config + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.Git @version "0.1.0" @@ -34,18 +39,20 @@ defmodule CodeQA.CLI.History do @impl CodeQA.CLI.Command def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) + usage() end def run(args) do {opts, [path], _} = - Options.parse(args, + Options.parse( + args, [ commits: :integer, commit_list: :string, output_dir: :string ], - [n: :commits, o: :output_dir] + n: :commits, + o: :output_dir ) output_dir = opts[:output_dir] || raise "Missing --output-dir" @@ -56,14 +63,19 @@ defmodule CodeQA.CLI.History do commits = resolve_commits(opts, path) IO.puts(:stderr, "Found #{length(commits)} commits to analyze.") - analyze_opts = Options.build_analyze_opts(opts) - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) + Config.load(path) + + analyze_opts = + Options.build_analyze_opts(opts) ++ Config.near_duplicate_blocks_opts() + + ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) commits |> Enum.with_index(1) |> Enum.each(&analyze_commit(&1, path, output_dir, analyze_opts, ignore_patterns, opts)) IO.puts(:stderr, "Done writing history to #{output_dir}") + "" end defp resolve_commits(opts, path) do @@ -90,14 +102,13 @@ defmodule CodeQA.CLI.History do current_opts = if opts[:progress], do: [ - {:on_progress, - fn c, t, p, 
_tt -> CodeQA.CLI.Progress.callback(c, t, p, start_time_progress) end} + {:on_progress, fn c, t, p, _tt -> Progress.callback(c, t, p, start_time_progress) end} | analyze_opts ], else: analyze_opts - files = CodeQA.Git.collect_files_at_ref(path, commit) - files = CodeQA.Collector.reject_ignored_map(files, ignore_patterns) + files = Git.collect_files_at_ref(path, commit) + files = Collector.reject_ignored_map(files, ignore_patterns) if map_size(files) == 0 do IO.puts(:stderr, "Warning: no source files found at commit #{commit}") @@ -108,7 +119,7 @@ defmodule CodeQA.CLI.History do defp write_commit_result(commit, path, output_dir, files, analyze_opts) do start_time = System.monotonic_time(:millisecond) - results = CodeQA.Analyzer.analyze_codebase(files, analyze_opts) + results = Analyzer.analyze_codebase(files, analyze_opts) end_time = System.monotonic_time(:millisecond) IO.puts(:stderr, " Analysis completed in #{end_time - start_time}ms") diff --git a/lib/codeqa/cli/options.ex b/lib/codeqa/cli/options.ex index c735d56a..199a95df 100644 --- a/lib/codeqa/cli/options.ex +++ b/lib/codeqa/cli/options.ex @@ -1,6 +1,8 @@ defmodule CodeQA.CLI.Options do @moduledoc false + alias CodeQA.CLI.Progress + @common_strict [ workers: :integer, cache: :boolean, @@ -10,13 +12,11 @@ defmodule CodeQA.CLI.Options do ncd_top: :integer, ncd_paths: :string, combinations: :boolean, - telemetry: :boolean, - experimental_stopwords: :boolean, - stopwords_threshold: :float, show_files: :boolean, show_file_paths: :string, ignore_paths: :string, - progress: :boolean + progress: :boolean, + nodes_top: :integer ] @common_aliases [w: :workers, t: :timeout] @@ -27,7 +27,7 @@ defmodule CodeQA.CLI.Options do @spec common_aliases() :: keyword() def common_aliases, do: @common_aliases - @spec parse(list(String.t()), keyword()) :: {keyword(), list(String.t()), list()} + @spec parse(list(String.t()), keyword(), keyword()) :: {keyword(), list(String.t()), list()} def parse(args, extra_strict \\ [], 
extra_aliases \\ []) do OptionParser.parse(args, strict: Keyword.merge(@common_strict, extra_strict), @@ -54,22 +54,6 @@ defmodule CodeQA.CLI.Options do |> Enum.map(&String.trim/1) end - @spec load_config_ignore_paths(String.t()) :: [String.t()] - def load_config_ignore_paths(path) do - config_file = Path.join(path, ".codeqa.yml") - - case File.read(config_file) do - {:ok, contents} -> - case YamlElixir.read_from_string(contents) do - {:ok, %{"ignore_paths" => patterns}} when is_list(patterns) -> patterns - _ -> [] - end - - {:error, _} -> - [] - end - end - @spec build_analyze_opts(keyword()) :: keyword() def build_analyze_opts(opts) do start_time_progress = System.monotonic_time(:millisecond) @@ -79,17 +63,14 @@ defmodule CodeQA.CLI.Options do :show_ncd, :ncd_top, :combinations, - :telemetry, - :experimental_stopwords, - :stopwords_threshold + :nodes_top ] base = [{:timeout, opts[:timeout] || 5000}] |> maybe_add( opts[:progress], - {:on_progress, - fn c, t, p, _tt -> CodeQA.CLI.Progress.callback(c, t, p, start_time_progress) end} + {:on_progress, fn c, t, p, _tt -> Progress.callback(c, t, p, start_time_progress) end} ) |> maybe_add(opts[:cache], {:cache_dir, opts[:cache_dir] || ".codeqa_cache"}) |> maybe_add( diff --git a/lib/codeqa/cli/progress.ex b/lib/codeqa/cli/progress.ex index 6ffdd14d..aa09b05f 100644 --- a/lib/codeqa/cli/progress.ex +++ b/lib/codeqa/cli/progress.ex @@ -1,6 +1,8 @@ defmodule CodeQA.CLI.Progress do @moduledoc false + alias CodeQA.CLI.UI + @spec callback(integer(), integer(), String.t(), integer()) :: :ok def callback(completed, total, path, start_time) do now = System.monotonic_time(:millisecond) @@ -11,8 +13,8 @@ defmodule CodeQA.CLI.Progress do label = if String.length(path) > 30, do: "..." 
<> String.slice(path, -27..-1), else: path output = - CodeQA.CLI.UI.progress_bar(completed, total, - eta: CodeQA.CLI.UI.format_eta(eta_ms), + UI.progress_bar(completed, total, + eta: UI.format_eta(eta_ms), label: label ) diff --git a/lib/codeqa/cli/stopwords.ex b/lib/codeqa/cli/stopwords.ex deleted file mode 100644 index f79027b5..00000000 --- a/lib/codeqa/cli/stopwords.ex +++ /dev/null @@ -1,97 +0,0 @@ -defmodule CodeQA.CLI.Stopwords do - @moduledoc false - - @behaviour CodeQA.CLI.Command - - alias CodeQA.CLI.Options - - @impl CodeQA.CLI.Command - def usage do - """ - Usage: codeqa stopwords [options] - - Print codebase-specific stopwords based on frequency analysis. - - Options: - --stopwords-threshold FLOAT Frequency threshold for stopword detection - --progress Show per-file progress on stderr - -w, --workers N Number of parallel workers - --ignore-paths PATHS Comma-separated list of path patterns to ignore (supports wildcards, e.g. "test/*,docs/*") - """ - end - - @impl CodeQA.CLI.Command - def run(args) when args in [["--help"], ["-h"]] do - IO.puts(usage()) - end - - def run(args) do - {opts, [path], _} = - OptionParser.parse(args, - strict: [ - workers: :integer, - stopwords_threshold: :float, - progress: :boolean, - ignore_paths: :string - ], - aliases: [w: :workers] - ) - - Options.validate_dir!(path) - - ignore_patterns = Options.parse_ignore_paths(opts[:ignore_paths]) ++ Options.load_config_ignore_paths(path) - files = CodeQA.Collector.collect_files(path, ignore_patterns: ignore_patterns) - - if map_size(files) == 0 do - IO.puts(:stderr, "Warning: no source files found in '#{path}'") - exit({:shutdown, 1}) - end - - IO.puts(:stderr, "Extracting stopwords for #{map_size(files)} files...") - start_time = System.monotonic_time(:millisecond) - - word_stopwords = find_word_stopwords(files, opts) - fp_stopwords = find_fingerprint_stopwords(files, opts) - - end_time = System.monotonic_time(:millisecond) - - IO.puts(:stderr, "\nAnalysis completed in #{end_time 
- start_time}ms") - print_word_stopwords(word_stopwords) - IO.puts(:stderr, "\n--- Fingerprint Stopwords (#{MapSet.size(fp_stopwords)}) ---") - IO.puts(:stderr, "Found #{MapSet.size(fp_stopwords)} structural k-gram hashes.") - end - - defp find_word_stopwords(files, opts) do - word_extractor = fn content -> - Regex.scan(~r/\b[a-zA-Z_]\w*\b/u, content) |> List.flatten() - end - - CodeQA.Stopwords.find_stopwords( - files, - word_extractor, - Keyword.put(opts, :progress_label, "Words") - ) - end - - defp find_fingerprint_stopwords(files, opts) do - fp_extractor = fn content -> - CodeQA.Metrics.TokenNormalizer.normalize(content) |> CodeQA.Metrics.Winnowing.kgrams(5) - end - - CodeQA.Stopwords.find_stopwords( - files, - fp_extractor, - Keyword.put(opts, :progress_label, "Fingerprints") - ) - end - - defp print_word_stopwords(word_stopwords) do - IO.puts(:stderr, "\n--- Word Stopwords (#{MapSet.size(word_stopwords)}) ---") - - word_stopwords - |> MapSet.to_list() - |> Enum.sort() - |> Enum.chunk_every(10) - |> Enum.each(fn chunk -> IO.puts(Enum.join(chunk, ", ")) end) - end -end diff --git a/lib/codeqa/collector.ex b/lib/codeqa/collector.ex deleted file mode 100644 index 02e6f349..00000000 --- a/lib/codeqa/collector.ex +++ /dev/null @@ -1,99 +0,0 @@ -defmodule CodeQA.Collector do - @moduledoc false - - @source_extensions MapSet.new(~w[ - .py .js .ts .jsx .tsx .java .rs .go .c .cpp .h .hpp .rb .ex .exs - .swift .kt .scala .sh .css .scss .html .vue .svelte .zig .lua .pl - .pm .r .jl .cs .fs .ml .hs .erl .clj .dart - ]) - - @skip_dirs MapSet.new(~w[ - .git .hg .svn node_modules __pycache__ _build dist build vendor - .tox .venv venv target .mypy_cache .pytest_cache deps .elixir_ls - .next coverage - ]) - - @spec collect_files(String.t(), keyword()) :: %{String.t() => String.t()} - def collect_files(root, opts \\ []) do - root_path = Path.expand(root) - ignore_patterns = Keyword.get(opts, :ignore_patterns, []) - - unless File.dir?(root_path) do - raise File.Error, reason: 
:enoent, path: root, action: "find directory" - end - - root_path - |> walk_directory() - |> Map.new(fn path -> - rel = Path.relative_to(path, root_path) - {rel, File.read!(path)} - end) - |> reject_ignored_map(ignore_patterns) - end - - def source_extensions, do: @source_extensions - - @doc false - def ignored?(path, patterns) do - Enum.any?(patterns, fn pattern -> - match_pattern?(path, pattern) - end) - end - - @doc false - def reject_ignored_map(files_map, []), do: files_map - - def reject_ignored_map(files_map, patterns) do - Map.reject(files_map, fn {path, _} -> ignored?(path, patterns) end) - end - - @doc false - def reject_ignored(list, [], _key_fn), do: list - - def reject_ignored(list, patterns, key_fn) do - Enum.reject(list, fn item -> ignored?(key_fn.(item), patterns) end) - end - - defp match_pattern?(path, pattern) do - # Convert glob pattern to regex: - # - ** matches any number of directories - # - * matches anything except / - # - ? matches a single character except / - regex_str = - pattern - |> String.replace(".", "\\.") - |> String.replace("**", "\0GLOBSTAR\0") - |> String.replace("*", "[^/]*") - |> String.replace("?", "[^/]") - |> String.replace("\0GLOBSTAR\0", ".*") - - case Regex.compile("^#{regex_str}$") do - {:ok, regex} -> Regex.match?(regex, path) - _ -> false - end - end - - defp walk_directory(dir) do - dir - |> File.ls!() - |> Enum.flat_map(fn entry -> - full_path = Path.join(dir, entry) - - cond do - File.dir?(full_path) and not skip_dir?(entry) -> - walk_directory(full_path) - - File.regular?(full_path) and source_file?(entry) -> - [full_path] - - true -> - [] - end - end) - end - - defp skip_dir?(name), do: MapSet.member?(@skip_dirs, name) or String.starts_with?(name, ".") - - defp source_file?(name), - do: MapSet.member?(@source_extensions, Path.extname(name) |> String.downcase()) -end diff --git a/lib/codeqa/combined_metrics/category.ex b/lib/codeqa/combined_metrics/category.ex new file mode 100644 index 00000000..def09ad1 --- 
/dev/null +++ b/lib/codeqa/combined_metrics/category.ex @@ -0,0 +1,40 @@ +defmodule CodeQA.CombinedMetrics.Category do + @moduledoc """ + Macro helper for defining combined-metric category modules. + + Each category module (e.g. `VariableNaming`, `Documentation`) calls + `use CodeQA.CombinedMetrics.Category, yaml_path: "priv/..."`. + + This injects: + - `@callback score(metrics :: map()) :: float()` — making the caller a behaviour + - `compute_score/2` — delegates to `Scorer` with the baked-in yaml path + + ## Example + + defmodule CodeQA.CombinedMetrics.VariableNaming do + use CodeQA.CombinedMetrics.Category, + yaml_path: "priv/combined_metrics/variable_naming.yml" + end + + Leaf modules then declare `@behaviour CodeQA.CombinedMetrics.VariableNaming` + and call `VariableNaming.compute_score("key", metrics)`. + """ + + defmacro __using__(yaml_path: yaml_path) do + quote do + alias CodeQA.CombinedMetrics.Scorer + + @callback score(metrics :: map()) :: float() + + @doc """ + Computes the score for `metric_name` using scalars from this category's YAML file. + + Delegates to `CodeQA.CombinedMetrics.Scorer.compute_score/3`. + """ + @spec compute_score(String.t(), map()) :: float() + def compute_score(metric_name, metrics) do + Scorer.compute_score(unquote(yaml_path), metric_name, metrics) + end + end + end +end diff --git a/lib/codeqa/combined_metrics/code_smells.ex b/lib/codeqa/combined_metrics/code_smells.ex new file mode 100644 index 00000000..13586ba5 --- /dev/null +++ b/lib/codeqa/combined_metrics/code_smells.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.CodeSmells do + @moduledoc """ + Behaviour and submodule registry for code smell detection metrics. + + Scalar weights are defined in `priv/combined_metrics/code_smells.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/code_smells.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.CodeSmells, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.CodeSmells + @moduledoc doc + @behaviour CodeSmells + @score_key key + @impl true + def score(metrics), + do: CodeSmells.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/consistency.ex b/lib/codeqa/combined_metrics/consistency.ex new file mode 100644 index 00000000..1c4af0c0 --- /dev/null +++ b/lib/codeqa/combined_metrics/consistency.ex @@ -0,0 +1,30 @@ +defmodule CodeQA.CombinedMetrics.Consistency do + @moduledoc """ + Behaviour and submodule registry for codebase consistency metrics. + + Covers naming style uniformity, structural patterns, and cross-file coherence. + Scalar weights are defined in `priv/combined_metrics/consistency.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/consistency.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.Consistency, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.Consistency + @moduledoc doc + @behaviour Consistency + @score_key key + @impl true + def score(metrics), + do: Consistency.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/cosine_vector.ex b/lib/codeqa/combined_metrics/cosine_vector.ex new file mode 100644 index 00000000..36bbe23f --- /dev/null +++ b/lib/codeqa/combined_metrics/cosine_vector.ex @@ -0,0 +1,90 @@ +defmodule CodeQA.CombinedMetrics.CosineVector do + @moduledoc """ + Computes cosine similarity between a behavior's scalar weight vector and a + log-metric vector derived from an aggregate. + + Pure math — no I/O, no YAML loading. Intended for internal use by `SampleRunner`. + """ + + alias CodeQA.CombinedMetrics.Scorer + + @doc """ + Builds the cosine result entry for a single behavior against the given aggregate. + + Returns a one-element list `[result_map]` on success or `[]` when the behavior + has no non-zero scalars (no sample data) and should be excluded. + + ## Options + + * `:log_metrics` - precomputed log-metric map `%{group => %{key => log_val}}`. + When present, values are looked up directly instead of being recomputed via + `:math.log/1`. Falls back to inline computation when absent or when a key is + not found in the map. 
+ """ + @spec compute(String.t(), String.t(), map(), map(), String.t(), keyword()) :: [map()] + def compute(yaml_path, behavior, behavior_data, aggregate, category, opts \\ []) do + scalars = Scorer.scalars_for(yaml_path, behavior) + + if map_size(scalars) == 0 do + [] + else + build_result(yaml_path, behavior, behavior_data, aggregate, category, scalars, opts) + end + end + + # --- Internal helpers --- + + defp build_result(yaml_path, behavior, behavior_data, aggregate, category, scalars, opts) do + log_baseline = Map.get(behavior_data, "_log_baseline", 0.0) / 1.0 + log_metrics = Keyword.get(opts, :log_metrics) + + {dot, norm_s_sq, norm_v_sq, contributions} = + Enum.reduce(scalars, {0.0, 0.0, 0.0, []}, fn {{group, key}, scalar}, + {d, ns, nv, contribs} -> + log_m = lookup_log_metric(log_metrics, aggregate, group, key) + contrib = scalar * log_m + + {d + contrib, ns + scalar * scalar, nv + log_m * log_m, + [{:"#{group}.#{key}", contrib} | contribs]} + end) + + cos_sim = + if norm_s_sq > 0 and norm_v_sq > 0, + do: dot / (:math.sqrt(norm_s_sq) * :math.sqrt(norm_v_sq)), + else: 0.0 + + raw_score = Scorer.compute_score(yaml_path, behavior, aggregate) + calibrated = :math.log(max(raw_score, 1.0e-300)) - log_baseline + + top_metrics = + contributions + |> Enum.sort_by(fn {_, c} -> c end) + |> Enum.take(5) + |> Enum.map(fn {metric, contribution} -> + %{metric: to_string(metric), contribution: Float.round(contribution, 4)} + end) + + [ + %{ + category: category, + behavior: behavior, + cosine: Float.round(cos_sim, 4), + score: Float.round(calibrated, 4), + top_metrics: top_metrics + } + ] + end + + # Returns a precomputed log value when available, otherwise computes inline. + # Both paths apply the same max(val, 1.0e-300) floor guard to ensure identical + # results regardless of whether log_metrics was precomputed or not. 
+ defp lookup_log_metric(nil, aggregate, group, key), + do: :math.log(max(Scorer.get(aggregate, group, key) / 1.0, 1.0e-300)) + + defp lookup_log_metric(log_metrics, aggregate, group, key) do + case get_in(log_metrics, [group, key]) do + nil -> :math.log(max(Scorer.get(aggregate, group, key) / 1.0, 1.0e-300)) + log_val -> log_val + end + end +end diff --git a/lib/codeqa/combined_metrics/dependencies.ex b/lib/codeqa/combined_metrics/dependencies.ex new file mode 100644 index 00000000..f0b25aa9 --- /dev/null +++ b/lib/codeqa/combined_metrics/dependencies.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.Dependencies do + @moduledoc """ + Behaviour and submodule registry for dependency and coupling quality metrics. + + Scalar weights are defined in `priv/combined_metrics/dependencies.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. + """ + + @yaml_path "priv/combined_metrics/dependencies.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.Dependencies, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.Dependencies + @moduledoc doc + @behaviour Dependencies + @score_key key + @impl true + def score(metrics), + do: Dependencies.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/documentation.ex b/lib/codeqa/combined_metrics/documentation.ex new file mode 100644 index 00000000..94f8a95f --- /dev/null +++ b/lib/codeqa/combined_metrics/documentation.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.Documentation do + @moduledoc """ + Behaviour and submodule registry for documentation quality metrics. + + Scalar weights are defined in `priv/combined_metrics/documentation.yml`. 
+ See `CodeQA.CombinedMetrics.Category` for the scoring model. + """ + + @yaml_path "priv/combined_metrics/documentation.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.Documentation, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.Documentation + @moduledoc doc + @behaviour Documentation + @score_key key + @impl true + def score(metrics), + do: Documentation.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/error_handling.ex b/lib/codeqa/combined_metrics/error_handling.ex new file mode 100644 index 00000000..9039ef61 --- /dev/null +++ b/lib/codeqa/combined_metrics/error_handling.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.ErrorHandling do + @moduledoc """ + Behaviour and submodule registry for error handling quality metrics. + + Scalar weights are defined in `priv/combined_metrics/error_handling.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/error_handling.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.ErrorHandling, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.ErrorHandling + @moduledoc doc + @behaviour ErrorHandling + @score_key key + @impl true + def score(metrics), + do: ErrorHandling.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/file_scorer.ex b/lib/codeqa/combined_metrics/file_scorer.ex new file mode 100644 index 00000000..e7479b08 --- /dev/null +++ b/lib/codeqa/combined_metrics/file_scorer.ex @@ -0,0 +1,109 @@ +defmodule CodeQA.CombinedMetrics.FileScorer do + @moduledoc """ + Scores individual files against combined metric behaviors. + + Converts per-file raw metric maps to aggregate-compatible format and + identifies which behaviors each file most likely exhibits. + """ + + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Config + alias CodeQA.HealthReport.Grader + alias CodeQA.Language + + @doc """ + Converts a single file's raw metric map to aggregate format. + + Wraps each key in each group with the `mean_` prefix so the resulting + map is compatible with `SampleRunner.diagnose_aggregate/2`. + + ## Example + + iex> CodeQA.CombinedMetrics.FileScorer.file_to_aggregate(%{"halstead" => %{"tokens" => 42.0}}) + %{"halstead" => %{"mean_tokens" => 42.0}} + """ + @spec file_to_aggregate(map()) :: map() + def file_to_aggregate(metrics) do + Map.new(metrics, fn {group, keys} -> + prefixed_keys = Map.new(keys, fn {key, value} -> {"mean_" <> key, value} end) + {group, prefixed_keys} + end) + end + + @doc """ + Identifies the worst files per combined metric behavior. 
+ + For each file in `files_map`, converts its metrics to aggregate format and + runs `SampleRunner.diagnose_aggregate/2`. The results are collected per + behavior and sorted ascending by cosine similarity (most negative = worst first), + then truncated to `combined_top` entries. + + ## Options + + * `:combined_top` - number of worst files to keep per behavior (default: 2) + + ## Result shape + + %{ + "function_design.no_boolean_parameter" => [ + %{file: "lib/foo/bar.ex", cosine: -0.71}, + %{file: "lib/foo/baz.ex", cosine: -0.44} + ], + ... + } + """ + @spec worst_files_per_behavior(map(), keyword()) :: + %{ + String.t() => [ + %{file: String.t(), cosine: float(), top_metrics: list(), top_nodes: list()} + ] + } + def worst_files_per_behavior(files_map, opts \\ []) do + # NOTE: cosine similarity is computed at file level; a line-level mapping would require computing a separate + # cosine score for each AST node by projecting that node's metric vector against the behavior's + # feature-weight vector. This is not currently possible because serialized nodes do not carry their own + # metric values. + top_n = Keyword.get(opts, :combined_top, 2) + + files_map + |> Enum.reject(fn {_path, file_data} -> + file_data |> Map.get("metrics", %{}) |> map_size() == 0 + end) + |> Enum.reduce(%{}, fn {path, file_data}, acc -> + accumulate_file_behaviors(path, file_data, acc) + end) + |> Map.new(fn {key, entries} -> + threshold = Config.cosine_significance_threshold() + + sorted = + entries + |> Enum.filter(fn e -> e.cosine <= -threshold end) + |> Enum.sort_by(& &1.cosine) + |> Enum.take(top_n) + + {key, sorted} + end) + end + + # Diagnoses a single file's metrics and merges per-behavior entries into the accumulator. 
+ defp accumulate_file_behaviors(path, file_data, acc) do + top_nodes = Grader.top_3_nodes(Map.get(file_data, "nodes")) + language = Language.detect(path).name() + + file_data + |> Map.get("metrics", %{}) + |> file_to_aggregate() + |> SampleRunner.diagnose_aggregate(top: 99_999, language: language) + |> Enum.reduce(acc, fn %{ + category: category, + behavior: behavior, + cosine: cosine, + top_metrics: top_metrics + }, + inner_acc -> + key = "#{category}.#{behavior}" + entry = %{file: path, cosine: cosine, top_metrics: top_metrics, top_nodes: top_nodes} + Map.update(inner_acc, key, [entry], &[entry | &1]) + end) + end +end diff --git a/lib/codeqa/combined_metrics/file_structure.ex b/lib/codeqa/combined_metrics/file_structure.ex new file mode 100644 index 00000000..aa6f153a --- /dev/null +++ b/lib/codeqa/combined_metrics/file_structure.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.FileStructure do + @moduledoc """ + Behaviour and submodule registry for file structure quality metrics. + + Scalar weights are defined in `priv/combined_metrics/file_structure.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/file_structure.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.FileStructure, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.FileStructure + @moduledoc doc + @behaviour FileStructure + @score_key key + @impl true + def score(metrics), + do: FileStructure.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/function_design.ex b/lib/codeqa/combined_metrics/function_design.ex new file mode 100644 index 00000000..3eab5f78 --- /dev/null +++ b/lib/codeqa/combined_metrics/function_design.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.FunctionDesign do + @moduledoc """ + Behaviour and submodule registry for function design quality metrics. + + Scalar weights are defined in `priv/combined_metrics/function_design.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+ """ + + @yaml_path "priv/combined_metrics/function_design.yml" + + use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path + + @behaviors @yaml_path + |> YamlElixir.read_from_file!() + |> Enum.filter(fn {_k, v} -> is_map(v) end) + |> Enum.map(fn {key, groups} -> {key, Map.get(groups, "_doc")} end) + + for {key, doc} <- @behaviors do + defmodule Module.concat(CodeQA.CombinedMetrics.FunctionDesign, Macro.camelize(key)) do + alias CodeQA.CombinedMetrics.FunctionDesign + @moduledoc doc + @behaviour FunctionDesign + @score_key key + @impl true + def score(metrics), + do: FunctionDesign.compute_score(@score_key, metrics) + end + end +end diff --git a/lib/codeqa/combined_metrics/naming_conventions.ex b/lib/codeqa/combined_metrics/naming_conventions.ex new file mode 100644 index 00000000..eafb5dcb --- /dev/null +++ b/lib/codeqa/combined_metrics/naming_conventions.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.CombinedMetrics.NamingConventions do + @moduledoc """ + Behaviour and submodule registry for broader naming convention metrics. + + Covers class, file, and function naming patterns not captured by + `VariableNaming`. Scalar weights are defined in + `priv/combined_metrics/naming_conventions.yml`. + See `CodeQA.CombinedMetrics.Category` for the scoring model. 
+  """
+
+  @yaml_path "priv/combined_metrics/naming_conventions.yml"
+
+  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path
+
+  # `{behavior_key, doc}` pairs for every map-valued top-level entry of the
+  # category YAML; evaluated while this module compiles.
+  @behaviors @yaml_path
+             |> YamlElixir.read_from_file!()
+             |> Enum.flat_map(fn
+               {key, %{} = groups} -> [{key, Map.get(groups, "_doc")}]
+               {_key, _other} -> []
+             end)
+
+  # Define one `NamingConventions.<CamelizedKey>` submodule per behavior; each
+  # one forwards scoring to `NamingConventions.compute_score/2` with its own key.
+  for {key, doc} <- @behaviors do
+    defmodule Module.concat(CodeQA.CombinedMetrics.NamingConventions, Macro.camelize(key)) do
+      alias CodeQA.CombinedMetrics.NamingConventions
+      @moduledoc doc
+      @behaviour NamingConventions
+      @score_key key
+      @impl true
+      def score(metrics) do
+        NamingConventions.compute_score(@score_key, metrics)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/combined_metrics/sample_runner.ex b/lib/codeqa/combined_metrics/sample_runner.ex
new file mode 100644
index 00000000..318d007a
--- /dev/null
+++ b/lib/codeqa/combined_metrics/sample_runner.ex
@@ -0,0 +1,508 @@
+defmodule CodeQA.CombinedMetrics.SampleRunner do
+  @moduledoc """
+  Discovers sample directories, analyzes them, and scores each behavior formula.
+
+  Returns structured results suitable for rendering a separation table, enabling
+  manual scalar tuning of combined metric formulas.
+  """
+
+  alias CodeQA.CombinedMetrics.{CosineVector, ScalarApplier, Scorer}
+  alias CodeQA.Engine.{Analyzer, Collector}
+
+  @samples_root "priv/combined_metrics/samples"
+
+  # ---------------------------------------------------------------------------
+  # Public API
+  # ---------------------------------------------------------------------------
+
+  @doc """
+  Runs all behaviors found in sample directories, optionally filtered by category.
+
+  ## Options
+
+    * `:category` - restrict to one category (e.g. `"variable_naming"`)
+    * `:verbose` - when `true`, populates `:metric_detail` in each result
+
+  ## Result shape
+
+      %{
+        category: "variable_naming",
+        behavior: "name_is_generic",
+        bad_score: 0.074,
+        good_score: 0.550,
+        ratio: 7.43,
+        direction_ok: true,
+        metric_detail: [...]
# empty unless verbose: true
+      }
+  """
+  @spec run(keyword()) :: [map()]
+  def run(opts \\ []) do
+    wanted_category = opts[:category]
+
+    # Select first, score afterwards: only behaviors in the requested category
+    # (or all, when none is requested) that have both sample dirs are kept.
+    selected =
+      for {category, behavior} = pair <- list_behaviors(@samples_root),
+          wanted_category == nil or category == wanted_category,
+          has_both_dirs?(category, behavior),
+          do: pair
+
+    Enum.map(selected, fn {category, behavior} ->
+      score_behavior(category, behavior, opts)
+    end)
+  end
+
+  @doc """
+  Builds a per-behavior metric correlation report for scalar tuning.
+
+  For each behavior with sample data, computes all `mean_*` metric values for
+  both good and bad samples, then suggests normalized scalars in [-2, 2] using
+  the log-linear method:
+
+      log_diff = log(good_val) - log(bad_val)
+      suggested_scalar = 2.0 * log_diff / max(|all log_diffs| for this behavior)
+
+  The strongest signal for each behavior maps to ±2.0; all others scale
+  proportionally. This lets you paste the suggested scalars into the YAML as a
+  starting point and refine from there.
+
+  ## Result shape (keyed by "category.behavior")
+
+      %{
+        "variable_naming.name_is_generic" => %{
+          "identifier_length_variance.mean_variance" => %{
+            bad: 5.131, good: 25.109,
+            log_bad: 1.635, log_good: 3.224,
+            ratio: 4.895,
+            suggested_scalar: 2.0
+          },
+          ...
+        }
+      }
+  """
+  @spec build_metric_report(keyword()) :: map()
+  def build_metric_report(opts \\ []) do
+    wanted_category = opts[:category]
+
+    selected =
+      for {category, behavior} = pair <- list_behaviors(@samples_root),
+          wanted_category == nil or category == wanted_category,
+          has_both_dirs?(category, behavior),
+          do: pair
+
+    # One "category.behavior" entry per selected behavior.
+    Map.new(selected, fn {category, behavior} ->
+      {"#{category}.#{behavior}", behavior_metric_table(category, behavior)}
+    end)
+  end
+
+  @doc """
+  Scores all combined metric behaviors against the given codebase aggregate map.
+
+  Reads all YAML config files from `priv/combined_metrics/` and returns one entry
+  per YAML category, each containing the scores for all behaviors within it.
+  Behaviors are sorted ascending by score so the lowest-scoring (worst) appear first.
+
+  ## Result shape
+
+      [
+        %{
+          category: "variable_naming",
+          name: "Variable Naming",
+          behaviors: [
+            %{behavior: "name_is_generic", score: 3.45},
+            ...
+          ]
+        },
+        ...
+      ]
+  """
+  @spec score_aggregate(map(), keyword()) :: [map()]
+  def score_aggregate(aggregate, opts \\ []) do
+    project_languages = Keyword.get(opts, :languages)
+
+    # Categories are visited in YAML-path order.
+    for {yaml_path, data} <- Enum.sort_by(Scorer.all_yamls(), &elem(&1, 0)) do
+      category = String.trim_trailing(Path.basename(yaml_path), ".yml")
+
+      scored =
+        for {behavior, behavior_data} <- data,
+            is_map(behavior_data),
+            behavior_language_applies?(
+              Map.get(behavior_data, "_languages", []),
+              nil,
+              project_languages
+            ) do
+          baseline = Map.get(behavior_data, "_log_baseline", 0.0) / 1.0
+          raw = Scorer.compute_score(yaml_path, behavior, aggregate)
+          # Clamp the raw score so the logarithm is defined for a zero score.
+          log_score = :math.log(max(raw, 1.0e-300)) - baseline
+          %{behavior: behavior, score: Float.round(log_score, 4)}
+        end
+
+      %{
+        category: category,
+        name: humanize(category),
+        behaviors: Enum.sort_by(scored, & &1.score)
+      }
+    end
+  end
+
+  @doc """
+  Identifies the most likely code quality issues in an aggregate by cosine similarity.
+
+  For each behavior, computes the cosine similarity between its scalar weight vector
+  `s` and the file's log-metric vector `v`:
+
+      cos_sim = (s · v) / (|s| × |v|)
+
+  A negative cosine means the file's metric profile anti-aligns with what good code
+  looks like for that behavior — i.e. the file likely exhibits that anti-pattern.
+
+  Results are sorted by cosine similarity ascending (most negative = most likely
+  issue). Behaviors with no non-zero scalars (no sample data) are excluded.
+
+  ## Options
+
+    * `:top` - number of results to return (default 15)
+    * `:language` - single language string for per-file filtering; when set, only
+      behaviors whose `_languages` list includes this language are scored
+    * `:languages` - list of language strings for project-level filtering; when set, only
+      behaviors whose `_languages` list overlaps with this list are scored
+    * `:behavior_map` - optional enumerable of `{category, behaviors}` pairs, where
+      `behaviors` enumerates `{behavior, behavior_data}` pairs; when given it is
+      diagnosed instead of the compiled YAML data from `Scorer.all_yamls/0`
+
+  Behaviors whose `_languages` entry is missing or empty are never filtered out.
+  When both `:language` and `:languages` are given, a behavior is scored if it
+  matches either of them.
+
+  ## Result shape
+
+      %{
+        category: "function_design",
+        behavior: "no_boolean_parameter",
+        cosine: -0.83,
+        score: -13.54,
+        top_metrics: [%{metric: "branching.mean_branching_density", contribution: -4.1}, ...]
+      }
+  """
+  @spec diagnose_aggregate(map(), keyword()) :: [map()]
+  def diagnose_aggregate(aggregate, opts \\ []) do
+    top_n = Keyword.get(opts, :top, 15)
+    language = Keyword.get(opts, :language)
+    languages = Keyword.get(opts, :languages)
+    behavior_map = Keyword.get(opts, :behavior_map)
+
+    {pre_us, log_metrics} = :timer.tc(fn -> precompute_log_metrics(aggregate) end)
+    cosine_opts = [log_metrics: log_metrics]
+
+    # Timing breakdown is accumulated in the process dictionary while the
+    # per-behavior helpers run, then emitted once as a telemetry event.
+    Process.put(:codeqa_cosine_breakdown, %{precompute_log_us: pre_us})
+
+    try do
+      candidates =
+        if behavior_map do
+          behavior_map
+          |> Enum.sort_by(fn {category, _} -> category end)
+          |> Enum.flat_map(
+            &diagnose_from_behavior_map_entry(&1, aggregate, language, languages, cosine_opts)
+          )
+        else
+          Scorer.all_yamls()
+          |> Enum.sort_by(fn {path, _} -> path end)
+          |> Enum.flat_map(&diagnose_from_yaml(&1, aggregate, language, languages, cosine_opts))
+        end
+
+      {sort_us, result} =
+        :timer.tc(fn ->
+          candidates
+          |> Enum.sort_by(& &1.cosine)
+          |> Enum.take(top_n)
+        end)
+
+      breakdown =
+        Process.get(:codeqa_cosine_breakdown, %{})
+        |> Map.put(:sort_take_us, sort_us)
+
+      Process.delete(:codeqa_cosine_breakdown)
+      :telemetry.execute([:codeqa, :cosine_breakdown], breakdown, %{})
+      result
+    after
+      # Safety net: if diagnosis raised, do not leave the timing map behind in
+      # the caller's process dictionary. Deleting an absent key is a no-op.
+      Process.delete(:codeqa_cosine_breakdown)
+    end
+  end
+
+  @doc """
+  Applies suggested scalars from sample analysis back to the YAML config files.
+
+  For each behavior that has sample data, rewrites its scalar entries using the
+  log-linear suggestion method. Metrics whose ratio falls in the deadzone are
+  excluded. All non-deadzoned metrics are written, even if they were not
+  previously present in the YAML.
+
+  Behaviors without sample data are left unchanged.
+
+  Returns a list of per-category stats maps.
+  """
+  @spec apply_scalars(keyword()) :: [map()]
+  def apply_scalars(opts \\ []) do
+    # Build the report from the samples, then hand it to the YAML writer.
+    opts
+    |> build_metric_report()
+    |> ScalarApplier.apply_scalars(opts)
+  end
+
+  @doc """
+  Updates only the `_languages` field in YAML config files based on sample data.
+
+  Scans `bad/` and `good/` sample directories for each behavior, detects languages
+  from file extensions via `CodeQA.Language.detect/1`, and writes the intersection
+  as `_languages` to the YAML. Behaviors without sample data are left without a
+  `_languages` key (treated as applying to all languages at scoring time).
+  All existing scalars and baselines are preserved.
+
+  Returns a list of `%{category: String.t(), behaviors_with_languages: non_neg_integer()}`.
+  """
+  @spec apply_languages(keyword()) :: [map()]
+  def apply_languages(opts \\ []) do
+    ScalarApplier.apply_languages(opts)
+  end
+
+  # ---------------------------------------------------------------------------
+  # Sample discovery
+  # ---------------------------------------------------------------------------
+
+  # Lists every `{category, behavior}` pair found one and two levels below
+  # `samples_root`. Entries of `samples_root` that are not directories (stray
+  # files such as a README) are skipped instead of crashing `File.ls!/1`.
+  defp list_behaviors(samples_root) do
+    for category <- File.ls!(samples_root),
+        category_dir = Path.join(samples_root, category),
+        File.dir?(category_dir),
+        behavior <- File.ls!(category_dir) do
+      {category, behavior}
+    end
+  end
+
+  # True when the behavior has both a `bad` and a `good` sample directory.
+  defp has_both_dirs?(category, behavior) do
+    File.dir?(sample_path(category, behavior, "bad")) and
+      File.dir?(sample_path(category, behavior, "good"))
+  end
+
+  # Path of the `kind` ("bad" or "good") sample directory of a behavior.
+  defp sample_path(category, behavior, kind) do
+    Path.join([@samples_root, category, behavior, kind])
+  end
+
+  # Collects the files under `dir`, analyzes them as a codebase and returns the
+  # `["codebase", "aggregate"]` part of the result.
+  defp analyze(dir) do
+    dir
+    |> Collector.collect_files()
+    |> Analyzer.analyze_codebase()
+    |> get_in(["codebase", "aggregate"])
+  end
+
+  # ---------------------------------------------------------------------------
+  # Sample scoring
+  # ---------------------------------------------------------------------------
+
+  # Scores the bad and good samples of one behavior with the category's YAML
+  # scalars and reports how well the formula separates them.
+  defp score_behavior(category, behavior, opts) do
+    yaml_path = "priv/combined_metrics/#{category}.yml"
+    bad_agg = analyze(sample_path(category, behavior, "bad"))
+    good_agg = analyze(sample_path(category, behavior, "good"))
+
+    bad_score = Scorer.compute_score(yaml_path, behavior, bad_agg)
+    good_score = Scorer.compute_score(yaml_path, behavior, good_agg)
+    # Guard the division; a non-positive bad score reports a ratio of 0.0.
+    ratio = if bad_score > 0, do: good_score / bad_score, else: 0.0
+
+    base = %{
+      category: category,
+      behavior: behavior,
+      bad_score: bad_score,
+      good_score: good_score,
+      ratio: Float.round(ratio, 2),
+      direction_ok: good_score >= bad_score
+    }
+
+    # `:metric_detail` is always present; it is only populated when verbose.
+    if opts[:verbose] do
+      Map.put(base, :metric_detail, metric_detail(yaml_path, behavior, bad_agg, good_agg))
+    else
+      Map.put(base, :metric_detail, [])
+    end
+  end
+
+  defp metric_detail(yaml_path, behavior, bad_agg, good_agg) do
+    Scorer.scalars_for(yaml_path,
behavior)
+    |> Enum.map(fn {{group, key}, scalar} ->
+      bad = Scorer.get(bad_agg, group, key)
+      good = Scorer.get(good_agg, group, key)
+      ratio = if bad > 0, do: Float.round(good / bad, 2), else: 0.0
+      %{group: group, key: key, scalar: scalar, bad: bad, good: good, ratio: ratio}
+    end)
+    |> Enum.sort_by(&abs(&1.ratio - 1.0), :desc)
+  end
+
+  # ---------------------------------------------------------------------------
+  # Metric report (vector building)
+  # ---------------------------------------------------------------------------
+
+  # Builds the per-metric table for one behavior: raw and log values of the bad
+  # and good samples, their ratio, and a scalar suggestion scaled so that the
+  # largest |log difference| maps to ±2.0.
+  defp behavior_metric_table(category, behavior) do
+    bad_agg = analyze(sample_path(category, behavior, "bad"))
+    good_agg = analyze(sample_path(category, behavior, "good"))
+
+    rows =
+      for {group, key} <- Map.keys(Scorer.default_scalars()) do
+        bad = Scorer.get(bad_agg, group, key)
+        good = Scorer.get(good_agg, group, key)
+        log_bad = :math.log(bad)
+        log_good = :math.log(good)
+
+        %{
+          metric: "#{group}.#{key}",
+          bad: bad,
+          good: good,
+          log_bad: log_bad,
+          log_good: log_good,
+          ratio: good / bad,
+          log_diff: log_good - log_bad
+        }
+      end
+
+    # Largest |log_diff| over all metrics, floored so the division is safe.
+    scale =
+      rows
+      |> Enum.map(&abs(&1.log_diff))
+      |> Enum.max(fn -> 1.0 end)
+      |> max(1.0e-10)
+
+    Map.new(rows, fn row ->
+      {row.metric,
+       %{
+         bad: r4(row.bad),
+         good: r4(row.good),
+         log_bad: r4(row.log_bad),
+         log_good: r4(row.log_good),
+         ratio: r4(row.ratio),
+         suggested_scalar: Float.round(2.0 * row.log_diff / scale, 4)
+       }}
+    end)
+  end
+
+  # Rounds any number to four decimals, coercing integers to floats first.
+  defp r4(number) do
+    Float.round(number / 1.0, 4)
+  end
+
+  # ---------------------------------------------------------------------------
+  # Cosine diagnosis (delegates vector math to CosineVector)
+  # ---------------------------------------------------------------------------
+
+  # Builds a nested map of precomputed log values for all numeric entries in the
+  # aggregate: %{group =>
%{key => :math.log(max(val, 1.0e-300))}}.
+  # Called once per diagnose_aggregate/2 invocation so the inner reduce in
+  # CosineVector.build_result can do O(1) lookups instead of recomputing log
+  # for every (behavior, metric) pair.
+  defp precompute_log_metrics(aggregate) do
+    for {group, sub_map} <- aggregate, is_map(sub_map), into: %{} do
+      logs =
+        for {key, val} <- sub_map, is_number(val), into: %{} do
+          {key, :math.log(max(val / 1.0, 1.0e-300))}
+        end
+
+      {group, logs}
+    end
+  end
+
+  # Diagnoses every behavior of one `{category, behaviors}` entry of a
+  # caller-supplied behavior map.
+  defp diagnose_from_behavior_map_entry(
+         {category, behaviors},
+         aggregate,
+         language,
+         languages,
+         cosine_opts
+       ) do
+    yaml_path = "priv/combined_metrics/#{category}.yml"
+
+    for {behavior, behavior_data} <- behaviors,
+        hit <-
+          maybe_diagnose_behavior(
+            yaml_path,
+            behavior,
+            behavior_data,
+            aggregate,
+            category,
+            language,
+            languages,
+            cosine_opts
+          ),
+        do: hit
+  end
+
+  # Diagnoses every map-valued behavior of one compiled `{yaml_path, data}` entry.
+  defp diagnose_from_yaml({yaml_path, data}, aggregate, language, languages, cosine_opts) do
+    category = String.trim_trailing(Path.basename(yaml_path), ".yml")
+
+    for {behavior, behavior_data} <- data,
+        is_map(behavior_data),
+        hit <-
+          maybe_diagnose_behavior(
+            yaml_path,
+            behavior,
+            behavior_data,
+            aggregate,
+            category,
+            language,
+            languages,
+            cosine_opts
+          ),
+        do: hit
+  end
+
+  # Runs the cosine computation for one behavior when the language filter lets
+  # it through, recording how long it took; returns [] otherwise.
+  defp maybe_diagnose_behavior(
+         yaml_path,
+         behavior,
+         behavior_data,
+         aggregate,
+         category,
+         language,
+         languages,
+         cosine_opts
+       ) do
+    declared_langs = Map.get(behavior_data, "_languages", [])
+
+    if behavior_language_applies?(declared_langs, language, languages) do
+      {elapsed_us, hits} =
+        :timer.tc(CosineVector, :compute, [
+          yaml_path,
+          behavior,
+          behavior_data,
+          aggregate,
+          category,
+          cosine_opts
+        ])
+
+      track_behavior_us(behavior, elapsed_us)
+      hits
+    else
+      []
+    end
+  end
+
+  defp track_behavior_us(behavior, us) do
+    case Process.get(:codeqa_cosine_breakdown) do
+      nil ->
+        :ok
+
+      breakdown ->
+        # Add to any time already recorded under this behavior name.
+        Process.put(
+          :codeqa_cosine_breakdown,
+          Map.update(breakdown, behavior, us, &(&1 + us))
+        )
+    end
+  end
+
+  # ---------------------------------------------------------------------------
+  # Language filtering
+  # ---------------------------------------------------------------------------
+
+  # Arguments:
+  #   declared  - the "_languages" list from the YAML ([] = applies to all)
+  #   language  - single language string from the :language opt (nil = no filter)
+  #   languages - project language list from the :languages opt (nil = no filter)
+
+  # No filter requested at all.
+  defp behavior_language_applies?(_declared, nil, nil) do
+    true
+  end
+
+  # A behavior that declares no languages applies everywhere.
+  defp behavior_language_applies?([], _language, _languages) do
+    true
+  end
+
+  # An empty project language list filters nothing.
+  defp behavior_language_applies?(_declared, nil, []) do
+    true
+  end
+
+  # Per-file filter only.
+  defp behavior_language_applies?(declared, language, nil) when is_binary(language) do
+    language in declared
+  end
+
+  # Project-level filter only: any overlap is enough.
+  defp behavior_language_applies?(declared, nil, languages) when is_list(languages) do
+    Enum.any?(declared, fn lang -> lang in languages end)
+  end
+
+  # Both filters given: matching either one is enough.
+  defp behavior_language_applies?(declared, language, languages)
+       when is_binary(language) and is_list(languages) do
+    language in declared or Enum.any?(declared, fn lang -> lang in languages end)
+  end
+
+  # ---------------------------------------------------------------------------
+  # Misc
+  # ---------------------------------------------------------------------------
+
+  # "variable_naming" -> "Variable Naming"
+  defp humanize(slug),
+    do: Enum.map_join(String.split(slug, "_"), " ", &String.capitalize/1)
+end
diff --git a/lib/codeqa/combined_metrics/scalar_applier.ex b/lib/codeqa/combined_metrics/scalar_applier.ex
new file mode 100644
index 00000000..1c8ec4b7
--- /dev/null
+++ b/lib/codeqa/combined_metrics/scalar_applier.ex
@@ -0,0 +1,209 @@
+defmodule CodeQA.CombinedMetrics.ScalarApplier do
+  @moduledoc """
+  Writes suggested scalars and language metadata back to the
combined-metrics YAML
+  config files under `priv/combined_metrics/`.
+
+  Intended for internal use by `SampleRunner`. Two entry points:
+
+    * `apply_scalars/2` — rewrites scalar weights using log-linear suggestions
+    * `apply_languages/1` — rewrites `_languages` based on sample file extensions
+  """
+
+  alias CodeQA.CombinedMetrics.YamlFormatter
+
+  @samples_root "priv/combined_metrics/samples"
+  @yaml_dir "priv/combined_metrics"
+  # Good/bad ratios inside [@deadzone_low, @deadzone_high] count as "no signal".
+  @deadzone_low 0.995
+  @deadzone_high 1.005
+
+  @doc """
+  Applies suggested scalars from `report` (a `build_metric_report/1` result) to
+  the YAML files under `priv/combined_metrics/`.
+
+  Every selected `*.yml` file is read, updated and written back in place.
+
+  ## Options
+
+    * `:category` - only rewrite the YAML file of this category
+
+  Returns a list of per-category stats maps with `:category`, `:updated`,
+  `:deadzoned`, and `:skipped` keys.
+  """
+  @spec apply_scalars(map(), keyword()) :: [map()]
+  def apply_scalars(report, opts \\ []) do
+    filter_category = opts[:category]
+
+    @yaml_dir
+    |> File.ls!()
+    |> Enum.filter(fn yml_file ->
+      String.ends_with?(yml_file, ".yml") and
+        (filter_category == nil or String.trim_trailing(yml_file, ".yml") == filter_category)
+    end)
+    # Sorted so files are processed, and stats returned, in a stable order.
+    |> Enum.sort()
+    |> Enum.map(fn yml_file ->
+      category = String.trim_trailing(yml_file, ".yml")
+      yaml_path = Path.join(@yaml_dir, yml_file)
+      {:ok, existing} = YamlElixir.read_from_file(yaml_path)
+
+      {updated_yaml, stats} = apply_to_category(existing, category, report)
+      File.write!(yaml_path, YamlFormatter.format(updated_yaml))
+
+      Map.put(stats, :category, category)
+    end)
+  end
+
+  @doc """
+  Updates only the `_languages` field in YAML config files based on sample data.
+
+  Returns a list of `%{category: String.t(), behaviors_with_languages: non_neg_integer()}`.
+  """
+  @spec apply_languages(keyword()) :: [map()]
+  def apply_languages(opts \\ []) do
+    wanted_category = opts[:category]
+
+    yml_files =
+      @yaml_dir
+      |> File.ls!()
+      |> Enum.filter(&String.ends_with?(&1, ".yml"))
+      |> Enum.filter(
+        &(wanted_category == nil or String.trim_trailing(&1, ".yml") == wanted_category)
+      )
+      |> Enum.sort()
+
+    for yml_file <- yml_files do
+      category = String.trim_trailing(yml_file, ".yml")
+      yaml_path = Path.join(@yaml_dir, yml_file)
+      {:ok, existing} = YamlElixir.read_from_file(yaml_path)
+
+      # Only map-valued top-level entries are behaviors; each gets the
+      # languages detected from its sample directories, if any.
+      updated =
+        for {behavior, groups} <- existing, is_map(groups), into: %{} do
+          {behavior, maybe_put_languages(groups, languages_for_behavior(category, behavior))}
+        end
+
+      File.write!(yaml_path, YamlFormatter.format(updated))
+
+      with_languages =
+        Enum.count(updated, fn {_behavior, groups} -> Map.has_key?(groups, "_languages") end)
+
+      %{category: category, behaviors_with_languages: with_languages}
+    end
+  end
+
+  # ---------------------------------------------------------------------------
+  # Scalar application helpers
+  # ---------------------------------------------------------------------------
+
+  # Walks every behavior of one category YAML and returns `{new_yaml, stats}`.
+  # Behaviors missing from the report are kept (and counted as skipped); the
+  # others are rewritten by apply_metrics/6.
+  defp apply_to_category(existing, category, report) do
+    behaviors = Enum.filter(existing, fn {_key, value} -> is_map(value) end)
+    initial = {%{}, %{updated: 0, deadzoned: 0, skipped: 0}}
+
+    Enum.reduce(behaviors, initial, fn {behavior, current_groups}, {acc_yaml, stats} ->
+      doc = read_behavior_doc(category, behavior)
+
+      case report["#{category}.#{behavior}"] do
+        nil ->
+          kept = maybe_put_doc(current_groups, doc)
+          {Map.put(acc_yaml, behavior, kept), %{stats | skipped: stats.skipped + 1}}
+
+        metrics ->
+          apply_metrics(acc_yaml, stats, behavior, current_groups, metrics, doc)
+      end
+    end)
+  end
+
+  defp apply_metrics(acc_yaml, stats, behavior, current_groups, metrics, doc) do
+    {new_groups, log_baseline, n_updated, n_deadzoned} = groups_from_report(metrics)
+    # Fall back to current groups
if everything was deadzoned
+    # Meta keys ("_fix_hint", "_languages", "_excludes_block_types", ...) live
+    # next to the metric groups. Carry them over so that regenerating the
+    # scalars does not silently drop them from the YAML.
+    meta = for {"_" <> _ = meta_key, value} <- current_groups, into: %{}, do: {meta_key, value}
+
+    base_groups =
+      if map_size(new_groups) > 0, do: Map.merge(meta, new_groups), else: current_groups
+
+    # NOTE(review): when everything was deadzoned the old scalars are kept but
+    # "_log_baseline" is still overwritten with the freshly accumulated value
+    # (0.0 in that case) — confirm this is intended.
+    groups =
+      base_groups
+      |> Map.put("_log_baseline", Float.round(log_baseline, 6))
+      |> maybe_put_doc(doc)
+
+    {Map.put(acc_yaml, behavior, groups),
+     %{
+       stats
+       | updated: stats.updated + n_updated,
+         deadzoned: stats.deadzoned + n_deadzoned
+     }}
+  end
+
+  # Folds a behavior's report into `{groups, log_baseline, n_updated, n_deadzoned}`.
+  # Metric keys have the form "group.key"; deadzoned metrics are only counted.
+  defp groups_from_report(metrics) do
+    Enum.reduce(metrics, {%{}, 0.0, 0, 0}, fn {metric_key, data},
+                                              {groups, log_baseline, n_updated, n_deadzoned} ->
+      [group, key] = String.split(metric_key, ".", parts: 2)
+
+      if deadzone?(data.ratio) do
+        {groups, log_baseline, n_updated, n_deadzoned + 1}
+      else
+        accumulate_metric(groups, log_baseline, n_updated, n_deadzoned, group, key, data)
+      end
+    end)
+  end
+
+  # Stores the suggested scalar under `groups[group][key]` and adds the metric's
+  # contribution (scalar * log of the geometric mean of bad and good) to the
+  # running log baseline.
+  defp accumulate_metric(groups, log_baseline, n_updated, n_deadzoned, group, key, data) do
+    new_groups =
+      Map.update(
+        groups,
+        group,
+        %{key => data.suggested_scalar},
+        &Map.put(&1, key, data.suggested_scalar)
+      )
+
+    geo_mean = :math.sqrt(max(data.bad, 1.0e-10) * max(data.good, 1.0e-10))
+    new_baseline = log_baseline + data.suggested_scalar * :math.log(geo_mean)
+    {new_groups, new_baseline, n_updated + 1, n_deadzoned}
+  end
+
+  # A ratio this close to 1.0 is treated as carrying no signal.
+  defp deadzone?(ratio), do: ratio >= @deadzone_low and ratio <= @deadzone_high
+
+  # Returns the "doc" string from the behavior's sample `config.yml`, or nil
+  # when the file is missing, unparsable, or has no string "doc" entry.
+  defp read_behavior_doc(category, behavior) do
+    config_path = Path.join([@samples_root, category, behavior, "config.yml"])
+
+    with {:ok, content} <- File.read(config_path),
+         {:ok, %{"doc" => doc}} when is_binary(doc) <- YamlElixir.read_from_string(content) do
+      doc
+    else
+      _ -> nil
+    end
+  end
+
+  defp maybe_put_doc(groups, nil), do: groups
+  defp maybe_put_doc(groups, doc), do: Map.put(groups, "_doc", doc)
+
+  # ---------------------------------------------------------------------------
+  # Language detection helpers
+  # ---------------------------------------------------------------------------
+
+  defp dir_languages(dir) do
+    case File.ls(dir) do
+      {:ok,
files} ->
+        MapSet.new(files, fn file -> CodeQA.Language.detect(file).name() end)
+
+      _ ->
+        MapSet.new()
+    end
+  end
+
+  # Sorted list of the language names detected in both the `bad` and the `good`
+  # sample directory of a behavior, with "unknown" removed.
+  defp languages_for_behavior(category, behavior) do
+    [bad_langs, good_langs] =
+      for kind <- ["bad", "good"], do: dir_languages(sample_path(category, behavior, kind))
+
+    bad_langs
+    |> MapSet.intersection(good_langs)
+    |> MapSet.delete("unknown")
+    |> Enum.sort()
+  end
+
+  # Sets "_languages" only when at least one language was detected.
+  defp maybe_put_languages(groups, langs) do
+    if langs == [], do: groups, else: Map.put(groups, "_languages", langs)
+  end
+
+  defp sample_path(category, behavior, kind),
+    do: Path.join([@samples_root, category, behavior, kind])
+end
diff --git a/lib/codeqa/combined_metrics/scope_and_assignment.ex b/lib/codeqa/combined_metrics/scope_and_assignment.ex
new file mode 100644
index 00000000..0b3e616b
--- /dev/null
+++ b/lib/codeqa/combined_metrics/scope_and_assignment.ex
@@ -0,0 +1,29 @@
+defmodule CodeQA.CombinedMetrics.ScopeAndAssignment do
+  @moduledoc """
+  Behaviour and submodule registry for variable scope and assignment quality metrics.
+
+  Scalar weights are defined in `priv/combined_metrics/scope_and_assignment.yml`.
+  See `CodeQA.CombinedMetrics.Category` for the scoring model.
+  """
+
+  @yaml_path "priv/combined_metrics/scope_and_assignment.yml"
+
+  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path
+
+  # `{behavior_key, doc}` pairs for every map-valued top-level entry of the
+  # category YAML; evaluated while this module compiles.
+  @behaviors @yaml_path
+             |> YamlElixir.read_from_file!()
+             |> Enum.flat_map(fn
+               {key, %{} = groups} -> [{key, Map.get(groups, "_doc")}]
+               {_key, _other} -> []
+             end)
+
+  # Define one `ScopeAndAssignment.<CamelizedKey>` submodule per behavior; each
+  # one forwards scoring to `ScopeAndAssignment.compute_score/2` with its own key.
+  for {key, doc} <- @behaviors do
+    defmodule Module.concat(CodeQA.CombinedMetrics.ScopeAndAssignment, Macro.camelize(key)) do
+      alias CodeQA.CombinedMetrics.ScopeAndAssignment
+      @moduledoc doc
+      @behaviour ScopeAndAssignment
+      @score_key key
+      @impl true
+      def score(metrics) do
+        ScopeAndAssignment.compute_score(@score_key, metrics)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/combined_metrics/scorer.ex b/lib/codeqa/combined_metrics/scorer.ex
new file mode 100644
index 00000000..b1220aa9
--- /dev/null
+++ b/lib/codeqa/combined_metrics/scorer.ex
@@ -0,0 +1,106 @@
+defmodule CodeQA.CombinedMetrics.Scorer do
+  @moduledoc """
+  Pure computation engine for combined metric formulas.
+
+  Loads scalar weights from a YAML file and computes a score as a product of
+  metric powers:
+
+      score = metric_a ^ s_a * metric_b ^ s_b * ...
+
+  Scalars of 0.0 contribute nothing (x^0 = 1.0) and are the default for all
+  metric keys not listed in the YAML. Negative scalars penalise a metric
+  (higher raw value → lower score).
+  """
+
+  alias CodeQA.Engine.Analyzer
+
+  @doc """
+  Computes the score for `metric_name` using scalars from `yaml_path`.
+
+  `metrics` is the `codebase.aggregate` map returned by `codeqa analyze`.
+  """
+  @spec compute_score(String.t(), String.t(), map()) :: float()
+  def compute_score(yaml_path, metric_name, metrics) do
+    # YAML overrides win over the all-zero defaults.
+    weights = Map.merge(default_scalars(), scalars_for(yaml_path, metric_name))
+
+    Enum.reduce(weights, 1.0, fn {{group, key}, scalar}, product ->
+      product * pow(get(metrics, group, key), scalar)
+    end)
+  end
+
+  @doc "Returns the non-zero scalar overrides for `metric_name` from `yaml_path`."
+  @spec scalars_for(String.t(), String.t()) :: %{{String.t(), String.t()} => float()}
+  def scalars_for(yaml_path, metric_name) do
+    behavior_body = Map.get(yaml_data(yaml_path), metric_name, %{})
+
+    # Only map-valued entries are metric groups; everything else is skipped.
+    for {group, keys} <- behavior_body,
+        is_map(keys),
+        {key, scalar} <- keys,
+        into: %{},
+        do: {{group, key}, scalar / 1.0}
+  end
+
+  @doc "Returns the full default scalar map: all registered file metric keys mapped to 0.0."
+  @spec default_scalars() :: %{{String.t(), String.t()} => float()}
+  def default_scalars do
+    for mod <- Analyzer.build_registry().file_metrics,
+        key <- mod.keys(),
+        into: %{},
+        do: {{mod.name(), "mean_" <> key}, 0.0}
+  end
+
+  @doc "Safely fetches a nested metric value, returning 1.0 if missing or non-positive."
+  @spec get(map(), String.t(), String.t()) :: float()
+  def get(metrics, group, key) do
+    value = get_in(metrics, [group, key])
+
+    if is_number(value) and value > 0, do: value / 1.0, else: 1.0
+  end
+
+  @doc "Computes `base ^ scalar`, returning 1.0 for non-positive bases."
+  @spec pow(float(), float()) :: float()
+  def pow(base, scalar) do
+    if base > 0, do: :math.pow(base, scalar), else: 1.0
+  end
+
+  # All category YAML files are parsed once, while this module compiles; each
+  # one is registered as an external resource of the module.
+  @yaml_dir "priv/combined_metrics"
+  @yaml_paths Path.wildcard(Path.join(@yaml_dir, "*.yml"))
+  for path <- @yaml_paths, do: @external_resource(path)
+
+  @compiled_yamls (for path <- @yaml_paths, into: %{} do
+                     {:ok, data} = YamlElixir.read_from_file(path)
+                     {path, data}
+                   end)
+
+  @doc "Returns all compiled YAML data as `%{path => parsed_map}`."
+  @spec all_yamls() :: %{String.t() => map()}
+  def all_yamls do
+    @compiled_yamls
+  end
+
+  # Every non-meta (not "_"-prefixed) key found inside any behavior body.
+  @referenced_file_metric_names MapSet.new(
+                                  for {_path, behaviors} <- @compiled_yamls,
+                                      is_map(behaviors),
+                                      {_behavior, body} <- behaviors,
+                                      is_map(body),
+                                      {key, _val} <- body,
+                                      not String.starts_with?(key, "_"),
+                                      do: key
+                                )
+
+  @doc """
+  Returns the set of file-metric module names (e.g. "halstead", "ngram") that
+  are referenced by any behavior in any compiled YAML.
+
+  Computed at compile time from `@compiled_yamls`. Used by the LOO cache to
+  skip recompute of metrics whose values cannot influence any cosine.
+  """
+  @spec referenced_file_metric_names() :: MapSet.t()
+  def referenced_file_metric_names do
+    @referenced_file_metric_names
+  end
+
+  # Parsed YAML for `yaml_path`, or an empty map for an unknown path.
+  defp yaml_data(yaml_path) do
+    Map.get(@compiled_yamls, yaml_path, %{})
+  end
+end
diff --git a/lib/codeqa/combined_metrics/testing.ex b/lib/codeqa/combined_metrics/testing.ex
new file mode 100644
index 00000000..52b41e40
--- /dev/null
+++ b/lib/codeqa/combined_metrics/testing.ex
@@ -0,0 +1,29 @@
+defmodule CodeQA.CombinedMetrics.Testing do
+  @moduledoc """
+  Behaviour and submodule registry for test quality metrics.
+
+  Scalar weights are defined in `priv/combined_metrics/testing.yml`.
+  See `CodeQA.CombinedMetrics.Category` for the scoring model.
+  """
+
+  @yaml_path "priv/combined_metrics/testing.yml"
+
+  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path
+
+  # `{behavior_key, doc}` pairs for every map-valued top-level entry of the
+  # category YAML; evaluated while this module compiles.
+  @behaviors @yaml_path
+             |> YamlElixir.read_from_file!()
+             |> Enum.flat_map(fn
+               {key, %{} = groups} -> [{key, Map.get(groups, "_doc")}]
+               {_key, _other} -> []
+             end)
+
+  # Define one `Testing.<CamelizedKey>` submodule per behavior; each one
+  # forwards scoring to `Testing.compute_score/2` with its own key.
+  for {key, doc} <- @behaviors do
+    defmodule Module.concat(CodeQA.CombinedMetrics.Testing, Macro.camelize(key)) do
+      alias CodeQA.CombinedMetrics.Testing
+      @moduledoc doc
+      @behaviour Testing
+      @score_key key
+      @impl true
+      def score(metrics) do
+        Testing.compute_score(@score_key, metrics)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/combined_metrics/type_and_value.ex b/lib/codeqa/combined_metrics/type_and_value.ex
new file mode 100644
index 00000000..d461c60b
--- /dev/null
+++ b/lib/codeqa/combined_metrics/type_and_value.ex
@@ -0,0 +1,29 @@
+defmodule CodeQA.CombinedMetrics.TypeAndValue do
+  @moduledoc """
+  Behaviour and submodule registry for type safety and value assignment quality metrics.
+
+  Scalar weights are defined in `priv/combined_metrics/type_and_value.yml`.
+  See `CodeQA.CombinedMetrics.Category` for the scoring model.
+  """
+
+  @yaml_path "priv/combined_metrics/type_and_value.yml"
+
+  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path
+
+  # `{behavior_key, doc}` pairs for every map-valued top-level entry of the
+  # category YAML; evaluated while this module compiles.
+  @behaviors @yaml_path
+             |> YamlElixir.read_from_file!()
+             |> Enum.flat_map(fn
+               {key, %{} = groups} -> [{key, Map.get(groups, "_doc")}]
+               {_key, _other} -> []
+             end)
+
+  # Define one `TypeAndValue.<CamelizedKey>` submodule per behavior; each one
+  # forwards scoring to `TypeAndValue.compute_score/2` with its own key.
+  for {key, doc} <- @behaviors do
+    defmodule Module.concat(CodeQA.CombinedMetrics.TypeAndValue, Macro.camelize(key)) do
+      alias CodeQA.CombinedMetrics.TypeAndValue
+      @moduledoc doc
+      @behaviour TypeAndValue
+      @score_key key
+      @impl true
+      def score(metrics) do
+        TypeAndValue.compute_score(@score_key, metrics)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/combined_metrics/variable_naming.ex b/lib/codeqa/combined_metrics/variable_naming.ex
new file mode 100644
index 00000000..db49793e
--- /dev/null
+++ b/lib/codeqa/combined_metrics/variable_naming.ex
@@ -0,0 +1,29 @@
+defmodule CodeQA.CombinedMetrics.VariableNaming do
+  @moduledoc """
+  Behaviour and submodule registry for variable naming quality metrics.
+
+  Scalar weights are defined in `priv/combined_metrics/variable_naming.yml`.
+  See `CodeQA.CombinedMetrics.Category` for the scoring model.
+  """
+
+  @yaml_path "priv/combined_metrics/variable_naming.yml"
+
+  use CodeQA.CombinedMetrics.Category, yaml_path: @yaml_path
+
+  # `{behavior_key, doc}` pairs for every map-valued top-level entry of the
+  # category YAML; evaluated while this module compiles.
+  @behaviors @yaml_path
+             |> YamlElixir.read_from_file!()
+             |> Enum.flat_map(fn
+               {key, %{} = groups} -> [{key, Map.get(groups, "_doc")}]
+               {_key, _other} -> []
+             end)
+
+  # Define one `VariableNaming.<CamelizedKey>` submodule per behavior; each one
+  # forwards scoring to `VariableNaming.compute_score/2` with its own key.
+  for {key, doc} <- @behaviors do
+    defmodule Module.concat(CodeQA.CombinedMetrics.VariableNaming, Macro.camelize(key)) do
+      alias CodeQA.CombinedMetrics.VariableNaming
+      @moduledoc doc
+      @behaviour VariableNaming
+      @score_key key
+      @impl true
+      def score(metrics) do
+        VariableNaming.compute_score(@score_key, metrics)
+      end
+    end
+  end
+end
diff --git a/lib/codeqa/combined_metrics/yaml_formatter.ex b/lib/codeqa/combined_metrics/yaml_formatter.ex
new file mode 100644
index 00000000..8c76a668
--- /dev/null
+++ b/lib/codeqa/combined_metrics/yaml_formatter.ex
@@ -0,0 +1,84 @@
+defmodule CodeQA.CombinedMetrics.YamlFormatter do
+  @moduledoc """
+  Serialises a combined-metrics behavior map back to the hand-authored YAML format.
+
+  Called by `ScalarApplier` when it rewrites the files under
+  `priv/combined_metrics/*.yml`. The output follows these conventions:
+
+  - Behaviors sorted alphabetically
+  - Meta-keys (`_doc`, `_excludes_block_types`, `_fix_hint`, `_languages`,
+    `_log_baseline`) emitted before group sections
+  - Groups and keys within groups sorted alphabetically
+  - Floats written with four decimal places
+  """
+
+  @doc """
+  Serialises a `%{behavior => groups}` map to a YAML string.
+ """ + @spec format(map()) :: String.t() + def format(data) do + lines = + data + |> Enum.sort_by(fn {behavior, _} -> behavior end) + |> Enum.flat_map(fn {behavior, groups} -> behavior_lines(behavior, groups) end) + + Enum.join(lines, "\n") <> "\n" + end + + # --- Behavior-level serialisation --- + + defp behavior_lines(behavior, groups) do + doc_line = doc_line(Map.get(groups, "_doc")) + baseline_line = baseline_line(Map.get(groups, "_log_baseline")) + fix_hint_line = fix_hint_line(Map.get(groups, "_fix_hint")) + languages_line = languages_line(Map.get(groups, "_languages")) + + excludes_block_types_line = + excludes_block_types_line(Map.get(groups, "_excludes_block_types")) + + group_lines = group_lines(groups) + + ["#{behavior}:" | doc_line] ++ + excludes_block_types_line ++ + fix_hint_line ++ languages_line ++ baseline_line ++ group_lines ++ [""] + end + + defp doc_line(nil), do: [] + defp doc_line(doc), do: [" _doc: #{inspect(doc)}"] + + defp baseline_line(nil), do: [] + defp baseline_line(val), do: [" _log_baseline: #{fmt_scalar(val)}"] + + defp fix_hint_line(nil), do: [] + defp fix_hint_line(hint), do: [" _fix_hint: #{inspect(hint)}"] + + defp languages_line(nil), do: [] + defp languages_line([]), do: [] + defp languages_line(langs), do: [" _languages: [#{Enum.join(langs, ", ")}]"] + + defp excludes_block_types_line(nil), do: [] + defp excludes_block_types_line([]), do: [] + + defp excludes_block_types_line(types), + do: [" _excludes_block_types: [#{Enum.join(types, ", ")}]"] + + defp group_lines(groups) do + groups + |> Enum.filter(fn {k, v} -> + k not in ["_doc", "_log_baseline", "_fix_hint", "_languages", "_excludes_block_types"] and + is_map(v) + end) + |> Enum.sort_by(fn {group, _} -> group end) + |> Enum.flat_map(fn {group, keys} -> + key_lines = + keys + |> Enum.sort_by(fn {key, _} -> key end) + |> Enum.map(fn {key, scalar} -> " #{key}: #{fmt_scalar(scalar)}" end) + + [" #{group}:" | key_lines] + end) + end + + defp fmt_scalar(f) when is_float(f), 
do: :erlang.float_to_binary(f, decimals: 4) + defp fmt_scalar(n) when is_integer(n), do: "#{n}.0" +end diff --git a/lib/codeqa/comparator.ex b/lib/codeqa/comparator.ex deleted file mode 100644 index 4fbfa40e..00000000 --- a/lib/codeqa/comparator.ex +++ /dev/null @@ -1,109 +0,0 @@ -defmodule CodeQA.Comparator do - @moduledoc "Compare two analysis results and compute metric deltas." - - def compare_results(base_result, head_result, changes) do - base_files = Map.get(base_result, "files", %{}) - head_files = Map.get(head_result, "files", %{}) - - {file_comparisons, status_counts} = - changes - |> Enum.reduce({%{}, %{"added" => 0, "modified" => 0, "deleted" => 0}}, fn change, - {files, counts} -> - base_data = Map.get(base_files, change.path) - head_data = Map.get(head_files, change.path) - delta = compute_file_delta(base_data, head_data) - - file_entry = %{ - "status" => change.status, - "base" => base_data, - "head" => head_data, - "delta" => delta - } - - {Map.put(files, change.path, file_entry), Map.update!(counts, change.status, &(&1 + 1))} - end) - - base_agg = get_in(base_result, ["codebase", "aggregate"]) || %{} - head_agg = get_in(head_result, ["codebase", "aggregate"]) || %{} - agg_delta = compute_aggregate_delta(base_agg, head_agg) - - summary = build_summary(status_counts) - - %{ - "metadata" => %{ - "total_files_compared" => length(changes), - "summary" => summary - }, - "files" => file_comparisons, - "codebase" => %{ - "base" => %{"aggregate" => base_agg}, - "head" => %{"aggregate" => head_agg}, - "delta" => %{"aggregate" => agg_delta} - } - } - end - - defp compute_file_delta(nil, _head), do: nil - defp compute_file_delta(_base, nil), do: nil - - defp compute_file_delta(base_data, head_data) do - top_delta = - ["bytes", "lines"] - |> Enum.reduce(%{}, fn key, acc -> - case {Map.get(base_data, key), Map.get(head_data, key)} do - {b, h} when is_number(b) and is_number(h) -> Map.put(acc, key, h - b) - _ -> acc - end - end) - - base_metrics = 
Map.get(base_data, "metrics", %{}) - head_metrics = Map.get(head_data, "metrics", %{}) - - metrics_delta = - MapSet.new(Map.keys(base_metrics) ++ Map.keys(head_metrics)) - |> Enum.reduce(%{}, fn metric_name, acc -> - base_m = Map.get(base_metrics, metric_name, %{}) - head_m = Map.get(head_metrics, metric_name, %{}) - delta = compute_numeric_delta(base_m, head_m) - if delta == %{}, do: acc, else: Map.put(acc, metric_name, delta) - end) - - Map.put(top_delta, "metrics", metrics_delta) - end - - defp compute_aggregate_delta(base_agg, head_agg) do - MapSet.new(Map.keys(base_agg) ++ Map.keys(head_agg)) - |> Enum.reduce(%{}, fn metric_name, acc -> - base_m = Map.get(base_agg, metric_name, %{}) - head_m = Map.get(head_agg, metric_name, %{}) - delta = compute_numeric_delta(base_m, head_m) - if delta == %{}, do: acc, else: Map.put(acc, metric_name, delta) - end) - end - - defp compute_numeric_delta(base, head) do - MapSet.new(Map.keys(base) ++ Map.keys(head)) - |> Enum.reduce(%{}, fn key, acc -> - case {Map.get(base, key), Map.get(head, key)} do - {b, h} when is_number(b) and is_number(h) -> - Map.put(acc, key, Float.round((h - b) / 1, 4)) - - _ -> - acc - end - end) - end - - defp build_summary(counts) do - parts = - [ - {"added", counts["added"]}, - {"modified", counts["modified"]}, - {"deleted", counts["deleted"]} - ] - |> Enum.filter(fn {_, c} -> c > 0 end) - |> Enum.map(fn {status, count} -> "#{count} #{status}" end) - - if parts == [], do: "no changes", else: Enum.join(parts, ", ") - end -end diff --git a/lib/codeqa/config.ex b/lib/codeqa/config.ex new file mode 100644 index 00000000..5171eacb --- /dev/null +++ b/lib/codeqa/config.ex @@ -0,0 +1,99 @@ +defmodule CodeQA.Config do + @moduledoc "Loads and caches .codeqa.yml configuration via :persistent_term." 
+ + @key {__MODULE__, :config} + + @default_impact %{ + "complexity" => 5, + "file_structure" => 4, + "function_design" => 4, + "code_smells" => 3, + "naming_conventions" => 2, + "error_handling" => 2, + "consistency" => 2, + "documentation" => 1, + "testing" => 1 + } + + defstruct ignore_paths: [], + impact_map: @default_impact, + combined_top: 2, + cosine_significance_threshold: 0.15, + near_duplicate_blocks: [] + + @spec load(String.t()) :: :ok + def load(path) do + if :persistent_term.get(@key, nil) == nil do + config = parse(path) + :persistent_term.put(@key, config) + end + + :ok + end + + @spec reset() :: :ok + def reset do + :persistent_term.erase(@key) + :ok + end + + @spec ignore_paths() :: [String.t()] + def ignore_paths, do: fetch().ignore_paths + + @spec impact_map() :: %{String.t() => pos_integer()} + def impact_map, do: fetch().impact_map + + @spec combined_top() :: pos_integer() + def combined_top, do: fetch().combined_top + + @spec cosine_significance_threshold() :: float() + def cosine_significance_threshold, do: fetch().cosine_significance_threshold + + @spec near_duplicate_blocks_opts() :: keyword() + def near_duplicate_blocks_opts, do: fetch().near_duplicate_blocks + + defp fetch do + :persistent_term.get(@key, %__MODULE__{}) + end + + defp parse(path) do + config_file = Path.join(path, ".codeqa.yml") + + case File.read(config_file) do + {:ok, contents} -> + case YamlElixir.read_from_string(contents) do + {:ok, yaml} -> from_yaml(yaml) + _ -> %__MODULE__{} + end + + {:error, _} -> + %__MODULE__{} + end + end + + defp from_yaml(yaml) do + %__MODULE__{ + ignore_paths: parse_ignore_paths(yaml), + impact_map: parse_impact(yaml), + combined_top: Map.get(yaml, "combined_top", 2), + cosine_significance_threshold: Map.get(yaml, "cosine_significance_threshold", 0.15), + near_duplicate_blocks: parse_near_duplicate_blocks(yaml) + } + end + + defp parse_ignore_paths(%{"ignore_paths" => patterns}) when is_list(patterns), do: patterns + defp 
parse_ignore_paths(_), do: [] + + defp parse_impact(%{"impact" => overrides}) when is_map(overrides) do + string_overrides = Map.new(overrides, fn {k, v} -> {to_string(k), v} end) + Map.merge(@default_impact, string_overrides) + end + + defp parse_impact(_), do: @default_impact + + defp parse_near_duplicate_blocks(%{"near_duplicate_blocks" => %{"max_pairs_per_bucket" => n}}) + when is_integer(n), + do: [max_pairs_per_bucket: n] + + defp parse_near_duplicate_blocks(_), do: [] +end diff --git a/lib/codeqa/diagnostics.ex b/lib/codeqa/diagnostics.ex new file mode 100644 index 00000000..f2479e09 --- /dev/null +++ b/lib/codeqa/diagnostics.ex @@ -0,0 +1,171 @@ +defmodule CodeQA.Diagnostics do + @moduledoc """ + Diagnoses a codebase by identifying likely code quality issues using + cosine similarity against combined metric behavior profiles. + """ + + alias CodeQA.CombinedMetrics.FileScorer + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.HealthReport.Grader + + @doc """ + Runs diagnostics on the given path and returns results as a string. 
+ + ## Options + + * `:path` - file or directory path (required) + * `:mode` - `:aggregate` (default) or `:per_file` + * `:top` - number of top issues to display (default 15) + * `:format` - `:plain` or `:json` (default `:plain`) + * `:combined_top` - worst offender files per behavior (default 2) + """ + @spec run(keyword()) :: String.t() + def run(opts) do + path = opts[:path] + mode = opts[:mode] || :aggregate + top = opts[:top] || 15 + format = opts[:format] || :plain + + files = Collector.collect_files(path) + result = Analyzer.analyze_codebase(files, []) + + case mode do + :per_file -> run_per_file(result, top, format) + _ -> run_aggregate(result, top, format) + end + end + + defp run_aggregate(result, top, format) do + aggregate = get_in(result, ["codebase", "aggregate"]) + files = Map.get(result, "files", %{}) + project_langs = project_languages(files) + + issues_task = + Task.async(fn -> + SampleRunner.diagnose_aggregate(aggregate, top: top, languages: project_langs) + end) + + categories_task = + Task.async(fn -> SampleRunner.score_aggregate(aggregate, languages: project_langs) end) + + issues = Task.await(issues_task) + categories = Task.await(categories_task) + + case format do + :json -> + Jason.encode!(%{issues: issues, categories: categories}, pretty: true) + + _ -> + "## Diagnose: aggregate\n\n" <> + issues_table(issues) <> + "\n" <> + categories_text(categories) + end + end + + defp run_per_file(result, top, format) do + files = Map.get(result, "files", %{}) + + file_diagnoses = + Map.new(files, fn {file_path, file_data} -> + metrics = Map.get(file_data, "metrics", %{}) + file_agg = FileScorer.file_to_aggregate(metrics) + language = CodeQA.Language.detect(file_path).name() + diagnoses = SampleRunner.diagnose_aggregate(file_agg, top: top, language: language) + {file_path, diagnoses} + end) + + case format do + :json -> + files_json = + Enum.map(file_diagnoses, fn {file_path, diagnoses} -> + %{file: file_path, behaviors: Enum.map(diagnoses, 
&diagnosis_to_map/1)} + end) + + Jason.encode!(%{files: files_json}, pretty: true) + + _ -> + file_rows = + Enum.flat_map(file_diagnoses, fn {file_path, diagnoses} -> + diagnoses_to_rows(file_path, diagnoses) + end) + + "## Diagnose: per-file\n\n" <> per_file_table(file_rows, top) + end + end + + defp diagnosis_to_map(d) do + %{ + behavior: "#{d.category}.#{d.behavior}", + cosine: d.cosine, + score: Grader.score_cosine(d.cosine) + } + end + + defp diagnoses_to_rows(file_path, diagnoses) do + Enum.map(diagnoses, fn %{category: cat, behavior: beh, cosine: cosine, score: score} -> + {file_path, "#{cat}.#{beh}", cosine, score} + end) + end + + defp project_languages(files_map) do + files_map + |> Map.keys() + |> Enum.map(&CodeQA.Language.detect(&1).name()) + |> Enum.reject(&(&1 == "unknown")) + |> Enum.uniq() + end + + defp issues_table(issues) do + rows = + Enum.map(issues, fn %{category: cat, behavior: beh, cosine: cosine, score: score} -> + cosine_str = :erlang.float_to_binary(cosine / 1.0, decimals: 2) + score_str = :erlang.float_to_binary(score / 1.0, decimals: 2) + "| #{cat}.#{beh} | #{cosine_str} | #{score_str} |" + end) + + Enum.join( + ["| Behavior | Cosine | Score |", "|----------|--------|-------|"] ++ rows ++ [""], + "\n" + ) + end + + defp categories_text(categories) do + Enum.map_join(categories, "\n", fn %{name: name, behaviors: behaviors} -> + rows = + Enum.map(behaviors, fn %{behavior: beh, score: score} -> + score_str = :erlang.float_to_binary(score / 1.0, decimals: 2) + "| #{beh} | #{score_str} |" + end) + + Enum.join( + ["### #{name}", "| Behavior | Score |", "|----------|-------|"] ++ rows ++ [""], + "\n" + ) + end) + end + + defp per_file_table(rows, top) do + data_rows = + rows + |> Enum.group_by(fn {file_path, _, _, _} -> file_path end) + |> Enum.flat_map(fn {_file_path, file_rows} -> + file_rows + |> Enum.sort_by(fn {_, _, cosine, _} -> cosine end) + |> Enum.take(top) + end) + |> Enum.map(fn {file_path, behavior_key, cosine, _score} -> + 
cosine_str = :erlang.float_to_binary(cosine / 1.0, decimals: 2) + cosine_score = Grader.score_cosine(cosine) + "| #{file_path} | #{behavior_key} | #{cosine_str} | #{cosine_score} |" + end) + + Enum.join( + ["| File | Behavior | Cosine | Score |", "|------|----------|--------|-------|"] ++ + data_rows, + "\n" + ) + end +end diff --git a/lib/codeqa/engine/analyzer.ex b/lib/codeqa/engine/analyzer.ex new file mode 100644 index 00000000..2436581c --- /dev/null +++ b/lib/codeqa/engine/analyzer.ex @@ -0,0 +1,221 @@ +defmodule CodeQA.Engine.Analyzer do + @moduledoc "Orchestrates metric computation across files." + + alias CodeQA.Analysis.RunSupervisor + alias CodeQA.BlockImpactAnalyzer + alias CodeQA.Engine.Parallel + alias CodeQA.Engine.Pipeline + alias CodeQA.Engine.Registry + alias CodeQA.Metrics.Codebase, as: CodebaseMetrics + alias CodeQA.Metrics.File, as: Metrics + + @registry Registry.new() + |> Registry.register_file_metric(Metrics.Entropy) + |> Registry.register_file_metric(Metrics.Compression) + |> Registry.register_file_metric(Metrics.Zipf) + |> Registry.register_file_metric(Metrics.Heaps) + |> Registry.register_file_metric(Metrics.Vocabulary) + |> Registry.register_file_metric(Metrics.Ngram) + |> Registry.register_file_metric(Metrics.Halstead) + |> Registry.register_file_metric(Metrics.Readability) + |> Registry.register_file_metric(Metrics.CasingEntropy) + |> Registry.register_file_metric(Metrics.IdentifierLengthVariance) + |> Registry.register_file_metric(Metrics.Indentation) + |> Registry.register_file_metric(Metrics.Branching) + |> Registry.register_file_metric(Metrics.FunctionMetrics) + |> Registry.register_file_metric(Metrics.MagicNumberDensity) + |> Registry.register_file_metric(Metrics.SymbolDensity) + |> Registry.register_file_metric(Metrics.VowelDensity) + |> Registry.register_file_metric(Metrics.Brevity) + |> Registry.register_file_metric(Metrics.PunctuationDensity) + |> Registry.register_file_metric(Metrics.CommentStructure) + |> 
Registry.register_file_metric(Metrics.SeparatorCounts) + |> Registry.register_file_metric(Metrics.LinePatterns) + |> Registry.register_codebase_metric(CodebaseMetrics.Similarity) + |> Registry.register_file_metric(Metrics.NearDuplicateBlocksFile) + |> Registry.register_codebase_metric(CodebaseMetrics.NearDuplicateBlocksCodebase) + + def build_registry, do: @registry + + @spec analyze_file(String.t(), String.t()) :: map() + def analyze_file(_path, content) do + ctx = Pipeline.build_file_context(content) + Registry.run_file_metrics(@registry, ctx, []) + end + + @spec analyze_file_for_loo(String.t(), String.t()) :: map() + def analyze_file_for_loo(_path, content) do + ctx = Pipeline.build_file_context(content, skip_structural: true) + Registry.run_file_metrics(@registry, ctx, []) + end + + @doc """ + Like `analyze_file_for_loo/2` but only re-runs file metrics whose name is in + `Scorer.referenced_file_metric_names/0`. Metrics not referenced by any + behavior YAML inherit their value from `baseline_metrics`. Metrics that + implement the optional `analyze_loo/2` callback derive their LOO value from + the baseline + the removed block's content, skipping a full file re-analyze. 
+ """ + @spec analyze_file_for_loo_partial(String.t(), String.t(), map(), String.t()) :: map() + def analyze_file_for_loo_partial(_path, content, baseline_metrics, block_content \\ "") do + referenced = CodeQA.CombinedMetrics.Scorer.referenced_file_metric_names() + + {ctx_us, ctx} = + :timer.tc(fn -> Pipeline.build_file_context(content, skip_structural: true) end) + + {result, breakdown} = + Enum.reduce(baseline_metrics, {[], %{ctx: ctx_us}}, fn {name, baseline_value}, + {acc, breakdown} -> + if MapSet.member?(referenced, name) do + mod = registered_module_for(name) + + {us, value} = + if function_exported?(mod, :analyze_loo, 2) do + :timer.tc(fn -> mod.analyze_loo(baseline_value, block_content) end) + else + :timer.tc(fn -> mod.analyze(ctx) end) + end + + {[{name, value} | acc], Map.put(breakdown, name, us)} + else + {[{name, baseline_value} | acc], breakdown} + end + end) + + :telemetry.execute([:codeqa, :loo_breakdown], breakdown, %{}) + Map.new(result) + end + + defp registered_module_for(name) do + Enum.find(@registry.file_metrics, fn mod -> mod.name() == name end) || + raise "no registered file metric module for name #{inspect(name)}" + end + + @spec analyze_codebase_aggregate(map(), keyword()) :: map() + def analyze_codebase_aggregate(files_map, opts \\ []) do + with_run_context(opts, fn opts -> + file_results = Parallel.analyze_files(files_map, opts) + aggregate_file_metrics(file_results) + end) + end + + def analyze_codebase(files, opts \\ []) do + with_run_context(opts, &do_analyze_codebase(files, &1)) + end + + defp with_run_context(opts, fun) do + {:ok, sup} = RunSupervisor.start_link() + run_ctx = RunSupervisor.run_context(sup) + opts = Keyword.put(opts, :file_context_pid, run_ctx.file_context_pid) + opts = Keyword.put(opts, :behavior_config_pid, run_ctx.behavior_config_pid) + + try do + fun.(opts) + after + Supervisor.stop(sup) + end + end + + defp do_analyze_codebase(files, opts) do + registry = @registry + + file_results = + stage(:parallel_files, 
%{file_count: map_size(files)}, fn -> + Parallel.analyze_files(files, opts) + end) + + aggregate = stage(:aggregate, %{}, fn -> aggregate_file_metrics(file_results) end) + + if Keyword.get(opts, :compute_nodes, false) do + nodes_opts = + [baseline_codebase_agg: aggregate] ++ + Keyword.take(opts, [:nodes_top, :workers, :behavior_config_pid]) + + pipeline_result = %{ + "files" => file_results, + "codebase" => %{"aggregate" => aggregate} + } + + updated_pipeline_result = + stage(:block_impact, %{file_count: map_size(files)}, fn -> + BlockImpactAnalyzer.analyze(pipeline_result, files, nodes_opts) + end) + + codebase_metrics = + stage(:codebase_metrics, %{file_count: map_size(files)}, fn -> + Registry.run_codebase_metrics(registry, files, opts) + end) + + updated_codebase = + Map.merge(codebase_metrics, updated_pipeline_result["codebase"]) + + Map.put(updated_pipeline_result, "codebase", updated_codebase) + else + codebase_metrics = + stage(:codebase_metrics, %{file_count: map_size(files)}, fn -> + Registry.run_codebase_metrics(registry, files, opts) + end) + + %{ + "files" => file_results, + "codebase" => Map.put(codebase_metrics, "aggregate", aggregate) + } + end + end + + defp stage(name, metadata, fun) do + t0 = System.monotonic_time(:microsecond) + result = fun.() + duration = System.monotonic_time(:microsecond) - t0 + :telemetry.execute([:codeqa, :stage], %{duration: duration}, Map.put(metadata, :stage, name)) + result + end + + defp metric_data_to_triples({metric_name, metric_data}) do + metric_data + |> Enum.filter(fn {_k, v} -> is_number(v) end) + |> Enum.map(fn {key, value} -> {metric_name, key, value / 1} end) + end + + def aggregate_file_metrics(file_results) do + file_results + |> Map.values() + |> Enum.flat_map(fn file_data -> + file_data + |> Map.get("metrics", %{}) + |> Enum.flat_map(&metric_data_to_triples/1) + end) + |> Enum.group_by(fn {metric, key, _val} -> {metric, key} end, fn {_, _, val} -> val end) + |> Enum.reduce(%{}, fn {{metric, key}, 
values}, acc -> + stats = compute_stats(values) + metric_agg = Map.get(acc, metric, %{}) + + updated = + Map.merge(metric_agg, %{ + "mean_#{key}" => stats.mean, + "std_#{key}" => stats.std, + "min_#{key}" => stats.min, + "max_#{key}" => stats.max + }) + + Map.put(acc, metric, updated) + end) + end + + defp compute_stats([]), do: %{mean: 0.0, std: 0.0, min: 0.0, max: 0.0} + + defp compute_stats(values) do + n = length(values) + mean = Enum.sum(values) / n + sum_squares = Enum.reduce(values, 0.0, fn v, acc -> acc + (v - mean) ** 2 end) + variance = sum_squares / n + std = :math.sqrt(variance) + + %{ + mean: Float.round(mean * 1.0, 4), + std: Float.round(std * 1.0, 4), + min: Float.round(Enum.min(values) * 1.0, 4), + max: Float.round(Enum.max(values) * 1.0, 4) + } + end +end diff --git a/lib/codeqa/engine/collector.ex b/lib/codeqa/engine/collector.ex new file mode 100644 index 00000000..3d1b8b41 --- /dev/null +++ b/lib/codeqa/engine/collector.ex @@ -0,0 +1,112 @@ +defmodule CodeQA.Engine.Collector do + @moduledoc false + + @skip_dirs MapSet.new(~w[ + .git .hg .svn node_modules __pycache__ _build dist build vendor + .tox .venv venv target .mypy_cache .pytest_cache deps .elixir_ls + .next coverage + ]) + + @default_ignore_patterns ~w[**/*.md **/*.mdx] + + @spec source_extensions() :: MapSet.t() + def source_extensions do + CodeQA.Language.all() + |> Enum.flat_map(& &1.extensions()) + |> Enum.map(&".#{&1}") + |> MapSet.new() + end + + @spec collect_files(String.t(), [String.t()]) :: %{String.t() => String.t()} + def collect_files(root, extra_ignore_patterns \\ []) do + root_path = Path.expand(root) + CodeQA.Config.load(root_path) + patterns = all_ignore_patterns(extra_ignore_patterns) + extensions = source_extensions() + + unless File.dir?(root_path) do + raise File.Error, reason: :enoent, path: root, action: "find directory" + end + + files_map = + root_path + |> walk_directory(extensions) + |> Map.new(fn path -> + rel = Path.relative_to(path, root_path) + {rel, 
File.read!(path)} + end) + |> do_reject_ignored_map(patterns) + + gitignored = CodeQA.Git.gitignored_files(root_path, Map.keys(files_map)) + Map.reject(files_map, fn {path, _} -> MapSet.member?(gitignored, path) end) + end + + @doc false + def ignored?(path, patterns) do + Enum.any?(patterns, fn pattern -> + match_pattern?(path, pattern) + end) + end + + @doc false + def reject_ignored_map(files_map, extra_patterns \\ []) do + do_reject_ignored_map(files_map, all_ignore_patterns(extra_patterns)) + end + + @doc false + def reject_ignored(list, key_fn, extra_patterns \\ []) do + patterns = all_ignore_patterns(extra_patterns) + Enum.reject(list, fn item -> ignored?(key_fn.(item), patterns) end) + end + + defp all_ignore_patterns(extra), + do: extra ++ @default_ignore_patterns ++ CodeQA.Config.ignore_paths() + + defp do_reject_ignored_map(files_map, patterns) do + Map.reject(files_map, fn {path, _} -> ignored?(path, patterns) end) + end + + defp match_pattern?(path, pattern) do + # Convert glob pattern to regex: + # - ** matches any number of directories + # - * matches anything except / + # - ? 
matches a single character except / + regex_str = + pattern + |> String.replace(".", "\\.") + |> String.replace("**", "\0GLOBSTAR\0") + |> String.replace("*", "[^/]*") + |> String.replace("?", "[^/]") + |> String.replace("\0GLOBSTAR\0", ".*") + + case Regex.compile("^#{regex_str}$") do + {:ok, regex} -> Regex.match?(regex, path) + _ -> false + end + end + + defp walk_directory(dir, extensions) do + dir + |> File.ls!() + |> Enum.flat_map(fn entry -> + full_path = Path.join(dir, entry) + + cond do + File.dir?(full_path) and not skip_dir?(entry) -> + walk_directory(full_path, extensions) + + File.regular?(full_path) and source_file?(entry, extensions) and + not String.starts_with?(entry, ".") -> + [full_path] + + true -> + [] + end + end) + end + + defp skip_dir?(name), do: MapSet.member?(@skip_dirs, name) or String.starts_with?(name, ".") + + defp source_file?(name, extensions), + do: MapSet.member?(extensions, Path.extname(name) |> String.downcase()) +end diff --git a/lib/codeqa/engine/file_context.ex b/lib/codeqa/engine/file_context.ex new file mode 100644 index 00000000..6e1da6ba --- /dev/null +++ b/lib/codeqa/engine/file_context.ex @@ -0,0 +1,29 @@ +defmodule CodeQA.Engine.FileContext do + @moduledoc "Immutable pre-computed data shared across all file metrics." 
+ @enforce_keys [ + :content, + :tokens, + :token_counts, + :words, + :identifiers, + :lines, + :encoded, + :byte_count, + :line_count + ] + defstruct @enforce_keys ++ [:path, :blocks] + + @type t :: %__MODULE__{ + content: String.t(), + tokens: [CodeQA.Engine.Pipeline.Token.t()], + token_counts: map(), + words: list(), + identifiers: list(), + lines: list(), + encoded: String.t(), + byte_count: non_neg_integer(), + line_count: non_neg_integer(), + path: String.t() | nil, + blocks: [CodeQA.AST.Enrichment.Node.t()] | nil + } +end diff --git a/lib/codeqa/parallel.ex b/lib/codeqa/engine/parallel.ex similarity index 66% rename from lib/codeqa/parallel.ex rename to lib/codeqa/engine/parallel.ex index 0e2cc460..f5a8da15 100644 --- a/lib/codeqa/parallel.ex +++ b/lib/codeqa/engine/parallel.ex @@ -1,4 +1,8 @@ -defmodule CodeQA.Parallel do +defmodule CodeQA.Engine.Parallel do + alias CodeQA.Analysis.FileContextServer + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Registry + @moduledoc "Parallel file analysis using Flow (GenStage-based)." 
def analyze_files(files, opts \\ []) when is_map(files) do @@ -22,7 +26,7 @@ defmodule CodeQA.Parallel do |> Flow.map(fn {path, content} -> start_time = System.monotonic_time(:millisecond) - result = maybe_cached_analyze(content, cache_dir, opts) + result = maybe_cached_analyze(path, content, cache_dir, opts) end_time = System.monotonic_time(:millisecond) time_taken = end_time - start_time @@ -38,9 +42,10 @@ defmodule CodeQA.Parallel do |> Enum.into(%{}) end - defp maybe_cached_analyze(content, nil, opts), do: analyze_single_file(content, opts) + defp maybe_cached_analyze(path, content, nil, opts), + do: analyze_single_file(path, content, opts) - defp maybe_cached_analyze(content, cache_dir, opts) do + defp maybe_cached_analyze(path, content, cache_dir, opts) do hash = :crypto.hash(:sha256, content) |> Base.encode16(case: :lower) cache_file = Path.join(cache_dir, hash <> ".json") @@ -51,30 +56,25 @@ defmodule CodeQA.Parallel do data _ -> - data = analyze_single_file(content, opts) + data = analyze_single_file(path, content, opts) File.write!(cache_file, Jason.encode!(data)) data end _ -> - data = analyze_single_file(content, opts) + data = analyze_single_file(path, content, opts) File.write!(cache_file, Jason.encode!(data)) data end end - defp analyze_single_file(content, opts) do - registry = CodeQA.Analyzer.build_registry() - - ctx = - CodeQA.Telemetry.time(:pipeline_build_context, fn -> - CodeQA.Pipeline.build_file_context(content, opts) - end) + defp analyze_single_file(path, content, opts) do + registry = Analyzer.build_registry() + file_opts = Keyword.put(opts, :path, path) + pid = Keyword.fetch!(opts, :file_context_pid) - metrics = - CodeQA.Telemetry.time(:registry_run_metrics, fn -> - CodeQA.Registry.run_file_metrics(registry, ctx, opts) - end) + ctx = FileContextServer.get(pid, content, file_opts) + metrics = Registry.run_file_metrics(registry, ctx, opts) %{ "bytes" => ctx.byte_count, diff --git a/lib/codeqa/engine/pipeline.ex 
b/lib/codeqa/engine/pipeline.ex new file mode 100644 index 00000000..53e25b4f --- /dev/null +++ b/lib/codeqa/engine/pipeline.ex @@ -0,0 +1,114 @@ +defmodule CodeQA.Engine.Pipeline do + @moduledoc "Pre-computed shared context for file-level metrics." + + defmodule Token do + @moduledoc "A lexical token with its string content, kind tag, and 1-based source line." + defstruct [:content, :kind, :line] + + @type t :: %__MODULE__{ + content: String.t(), + kind: String.t(), + line: pos_integer() + } + end + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Engine.FileContext + alias CodeQA.Language + + @word_re ~r/\b[a-zA-Z_]\w*\b/u + + @spec build_file_context(String.t(), keyword()) :: FileContext.t() + def build_file_context(content, opts \\ []) when is_binary(content) do + tokens = tokenize(content) + token_counts = tokens |> Enum.map(& &1.content) |> Enum.frequencies() + + keywords = cached_keywords() + + words = + Regex.scan(@word_re, content) + |> List.flatten() + + identifiers = Enum.reject(words, &MapSet.member?(keywords, &1)) + lines = content |> String.split("\n") |> trim_trailing_empty() + encoded = content + + skip_structural = Keyword.get(opts, :skip_structural, false) + + {path, blocks} = + case Keyword.get(opts, :path) do + nil -> + {nil, nil} + + p when skip_structural -> + {p, nil} + + p -> + lang_mod = Language.detect(p) + structural_tokens = TokenNormalizer.normalize_structural(content) + {p, Parser.detect_blocks(structural_tokens, lang_mod)} + end + + %FileContext{ + content: content, + tokens: tokens, + token_counts: token_counts, + words: words, + identifiers: identifiers, + lines: lines, + encoded: encoded, + byte_count: byte_size(content), + line_count: length(lines), + path: path, + blocks: blocks + } + end + + # Matches identifiers, integer/float literals, and single non-whitespace chars. 
+ @token_re ~r/[a-zA-Z_]\w*|[0-9]+(?:\.[0-9]+)?|[^\s]/u + + defp tokenize(content) do + content + |> String.split("\n") + |> Enum.with_index(1) + |> Enum.flat_map(fn {line, line_num} -> + @token_re + |> Regex.scan(line) + |> List.flatten() + |> Enum.map(&%Token{content: &1, kind: classify(&1), line: line_num}) + end) + end + + defp classify(tok) do + cond do + Regex.match?(~r/^[a-zA-Z_]\w*$/, tok) -> "" + Regex.match?(~r/^[0-9]/, tok) -> "" + true -> "" + end + end + + # Caches the all-languages keyword MapSet across calls. Without the cache, + # MapSet.new(Language.all_keywords()) ran ~150ms per call (driven by the + # :application.get_key reflection in Language.all/0) — multiplied by every + # block-impact LOO call, this dominated the analyzer hot path. + defp cached_keywords do + case :persistent_term.get({__MODULE__, :keywords}, nil) do + nil -> + set = MapSet.new(Language.all_keywords()) + :persistent_term.put({__MODULE__, :keywords}, set) + set + + set -> + set + end + end + + defp trim_trailing_empty(lines) do + # Match Python's str.splitlines() behavior + case List.last(lines) do + "" -> List.delete_at(lines, -1) + _ -> lines + end + end +end diff --git a/lib/codeqa/registry.ex b/lib/codeqa/engine/registry.ex similarity index 59% rename from lib/codeqa/registry.ex rename to lib/codeqa/engine/registry.ex index 76dfe23b..135385ac 100644 --- a/lib/codeqa/registry.ex +++ b/lib/codeqa/engine/registry.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Registry do +defmodule CodeQA.Engine.Registry do @moduledoc "Metric registration and execution." 
defstruct file_metrics: [], codebase_metrics: [] @@ -16,15 +16,22 @@ defmodule CodeQA.Registry do def run_file_metrics(%__MODULE__{} = reg, ctx, opts \\ []) do base_metrics = Map.new(reg.file_metrics, fn mod -> - {mod.name(), - CodeQA.Telemetry.time(String.to_atom("metric_" <> mod.name()), fn -> mod.analyze(ctx) end)} + t0 = System.monotonic_time(:microsecond) + result = mod.analyze(ctx) + duration = System.monotonic_time(:microsecond) - t0 + + :telemetry.execute( + [:codeqa, :file_metric], + %{duration: duration}, + %{metric: mod.name()} + ) + + {mod.name(), result} end) if Keyword.get(opts, :combinations, false) do - CodeQA.Telemetry.time(:registry_combinations, fn -> - combinations = generate_combinations(flat_numeric_metrics(base_metrics), []) - Map.merge(base_metrics, Map.new(combinations)) - end) + combinations = generate_combinations(flat_numeric_metrics(base_metrics), []) + Map.merge(base_metrics, Map.new(combinations)) else base_metrics end @@ -60,6 +67,26 @@ defmodule CodeQA.Registry do end def run_codebase_metrics(%__MODULE__{} = reg, files, opts \\ []) do - Map.new(reg.codebase_metrics, fn mod -> {mod.name(), mod.analyze(files, opts)} end) + has_progress = Keyword.has_key?(opts, :on_progress) + total = length(reg.codebase_metrics) + + reg.codebase_metrics + |> Enum.with_index(1) + |> Map.new(fn {mod, idx} -> + if has_progress, + do: IO.puts(:stderr, "\nCODEBASE #{idx}/#{total}: #{mod.name()}...") + + t0 = System.monotonic_time(:microsecond) + result = mod.analyze(files, opts) + duration = System.monotonic_time(:microsecond) - t0 + + :telemetry.execute( + [:codeqa, :codebase_metric], + %{duration: duration}, + %{metric: mod.name()} + ) + + {mod.name(), result} + end) end end diff --git a/lib/codeqa/formatter.ex b/lib/codeqa/formatter.ex deleted file mode 100644 index 55ba6efe..00000000 --- a/lib/codeqa/formatter.ex +++ /dev/null @@ -1,344 +0,0 @@ -defmodule CodeQA.Formatter do - @moduledoc false - - @summary_metrics [ - {"entropy", "char_entropy", 
"Entropy"}, - {"halstead", "volume", "Halstead Vol."}, - {"halstead", "difficulty", "Difficulty"}, - {"readability", "flesch_adapted", "Readability"}, - {"compression", "redundancy", "Redundancy"} - ] - - @bar_width 20 - @filled "█" - @empty "░" - - def format_github(comparison, output_mode \\ "auto") do - metadata = comparison["metadata"] - files = comparison["files"] || %{} - codebase = comparison["codebase"] || %{} - - if metadata["total_files_compared"] == 0 do - "## Code Quality: PR Comparison\n\nNo file changes detected." - else - build_github_report(metadata, files, codebase, output_mode) - end - end - - defp build_github_report(metadata, files, codebase, output_mode) do - categories = CodeQA.HealthReport.Categories.defaults() - scale = CodeQA.HealthReport.Categories.default_grade_scale() - - base_agg = get_in(codebase, ["base", "aggregate"]) || %{} - head_agg = get_in(codebase, ["head", "aggregate"]) || %{} - - base_grades = CodeQA.HealthReport.Grader.grade_aggregate(categories, base_agg, scale) - head_grades = CodeQA.HealthReport.Grader.grade_aggregate(categories, head_agg, scale) - - paired = Enum.zip(base_grades, head_grades) - - lines = - [ - "## Code Quality: PR Comparison", - "", - "**#{metadata["total_files_compared"]} files compared** (#{metadata["summary"]})", - "" - ] ++ - mermaid_chart(head_grades) ++ - progress_bars(paired) ++ - [""] ++ - file_details(files, codebase, output_mode) ++ - aggregate_details(codebase) - - Enum.join(lines, "\n") - end - - defp mermaid_chart(head_grades) do - names = Enum.map(head_grades, fn g -> ~s("#{g.name}") end) |> Enum.join(", ") - scores = Enum.map(head_grades, fn g -> to_string(g.score) end) |> Enum.join(", ") - - [ - "```mermaid", - "%%{init: {'theme': 'neutral'}}%%", - "xychart-beta", - " title \"Code Health After PR\"", - " x-axis [#{names}]", - " y-axis \"Score\" 0 --> 100", - " bar [#{scores}]", - "```", - "" - ] - end - - defp progress_bars(paired) do - max_name_len = - Enum.reduce(paired, 0, fn {_base, 
head}, acc -> - max(acc, String.length(head.name)) - end) - - rows = - Enum.map(paired, fn {base, head} -> - name = String.pad_trailing(head.name, max_name_len) - base_bar = build_bar(base.score) - head_bar = build_bar(head.score) - emoji = grade_emoji(head.grade) - delta = head.score - base.score - delta_str = if delta >= 0, do: "+#{delta}", else: to_string(delta) - "#{name} #{base_bar} #{base.score} → #{head_bar} #{head.score} #{emoji} #{delta_str}" - end) - - ["```"] ++ rows ++ ["```"] - end - - defp file_details(files, codebase, _output_mode) do - codebase_summary = CodeQA.Summarizer.summarize_codebase(%{"files" => files, "codebase" => codebase}) - - file_summaries = - Map.new(files, fn {path, data} -> - {path, CodeQA.Summarizer.summarize_file(path, data)} - end) - - inner = - (format_file_table(files, file_summaries) ++ [""]) - |> Enum.join("\n") - - [ - "
", - "File changes — #{codebase_summary["gist"]}", - "", - inner, - "
", - "" - ] - end - - defp aggregate_details(codebase) do - inner = - format_aggregate_table(codebase, build_direction_map()) - |> Enum.join("\n") - - if inner == "" do - [] - else - [ - "
", - "Aggregate metrics", - "", - inner, - "", - "
", - "" - ] - end - end - - defp build_bar(score) do - filled = round(score / 100 * @bar_width) - filled = min(max(filled, 0), @bar_width) - empty = @bar_width - filled - String.duplicate(@filled, filled) <> String.duplicate(@empty, empty) - end - - defp grade_emoji(grade) do - cond do - grade in ["A", "A-"] -> "🟢" - grade in ["B+", "B", "B-"] -> "🟡" - grade in ["C+", "C", "C-"] -> "🟠" - true -> "🔴" - end - end - - def format_markdown(comparison, output_mode \\ "auto") do - metadata = comparison["metadata"] - files = comparison["files"] || %{} - codebase = comparison["codebase"] - - if metadata["total_files_compared"] == 0 do - "## Code Quality: PR Comparison\n\nNo file changes detected." - else - build_report(metadata, files, codebase, output_mode) - end - end - - defp build_report(metadata, files, codebase, output_mode) do - codebase_summary = - CodeQA.Summarizer.summarize_codebase(%{"files" => files, "codebase" => codebase}) - - lines = [ - "## Code Quality: PR Comparison", - "", - "**#{metadata["total_files_compared"]} files compared** (#{metadata["summary"]})", - "" - ] - - lines = - if output_mode in ["auto", "summary"] do - lines ++ ["> #{codebase_summary["gist"]}", ""] - else - lines - end - - lines = - if output_mode in ["auto", "changes"] do - file_summaries = - Map.new(files, fn {path, data} -> - {path, CodeQA.Summarizer.summarize_file(path, data)} - end) - - lines ++ format_file_table(files, file_summaries) ++ [""] - else - lines - end - - lines = - if output_mode in ["auto", "summary"] do - lines ++ format_aggregate_table(codebase) - else - lines - end - - Enum.join(lines, "\n") - end - - defp format_file_table(files, file_summaries) do - columns = detect_columns(files) - - if columns == [], - do: ["No metric data available."], - else: build_file_rows(files, file_summaries, columns) - end - - defp build_file_rows(files, file_summaries, columns) do - header = - "| File | Status | Summary | " <> - Enum.map_join(columns, " | ", fn {_, _, label} -> label 
end) <> " |" - - separator = - "|------|--------|---------|" <> Enum.map_join(columns, "", fn _ -> "--------|" end) - - rows = - files - |> Enum.sort_by(fn {path, _} -> path end) - |> Enum.map(fn {path, data} -> - gist = get_in(file_summaries, [path, "gist"]) || "" - cells = format_file_row(data, columns) - "| `#{path}` | #{data["status"]} | #{gist} | " <> Enum.join(cells, " | ") <> " |" - end) - - [header, separator | rows] - end - - defp format_file_row(data, columns) do - Enum.map(columns, fn {metric_name, key, _label} -> - case data["status"] do - "modified" -> format_modified_cell(data, metric_name, key) - "added" -> format_added_cell(data, metric_name, key) - "deleted" -> format_deleted_cell(data, metric_name, key) - _ -> "—" - end - end) - end - - defp format_modified_cell(data, metric_name, key) do - case get_in(data, ["delta", "metrics", metric_name, key]) do - nil -> "—" - val -> format_delta(val) - end - end - - defp format_added_cell(data, metric_name, key) do - case get_in(data, ["head", "metrics", metric_name, key]) do - nil -> "—" - val -> "*#{format_value(val)}*" - end - end - - defp format_deleted_cell(data, metric_name, key) do - case get_in(data, ["base", "metrics", metric_name, key]) do - nil -> "—" - val -> "~~#{format_value(val)}~~" - end - end - - defp format_aggregate_table(codebase, direction_map \\ %{}) do - base_agg = get_in(codebase, ["base", "aggregate"]) || %{} - head_agg = get_in(codebase, ["head", "aggregate"]) || %{} - delta_agg = get_in(codebase, ["delta", "aggregate"]) || %{} - - if base_agg == %{} and head_agg == %{}, - do: [], - else: build_aggregate_rows(base_agg, head_agg, delta_agg, direction_map) - end - - defp build_aggregate_rows(base_agg, head_agg, delta_agg, direction_map) do - header = [ - "### Aggregate Metrics", - "", - "| Metric | Base | Head | Delta |", - "|--------|------|------|-------|" - ] - - rows = - MapSet.new(Map.keys(base_agg) ++ Map.keys(head_agg)) - |> Enum.sort() - |> Enum.flat_map(fn metric_name -> - 
base_m = Map.get(base_agg, metric_name, %{}) - head_m = Map.get(head_agg, metric_name, %{}) - delta_m = Map.get(delta_agg, metric_name, %{}) - - MapSet.new(Map.keys(base_m) ++ Map.keys(head_m)) - |> Enum.sort() - |> Enum.map(fn key -> - direction = Map.get(direction_map, "#{metric_name}.#{key}") - delta_cell = format_delta_with_direction(delta_m[key], direction) - "| #{metric_name}.#{key} | #{format_value(base_m[key])} | #{format_value(head_m[key])} | #{delta_cell} |" - end) - end) - - header ++ rows - end - - defp build_direction_map do - CodeQA.HealthReport.Categories.defaults() - |> Enum.flat_map(fn cat -> - Enum.map(cat.metrics, fn m -> {"#{m.source}.mean_#{m.name}", m.good} end) - end) - |> Map.new() - end - - defp format_delta_with_direction(nil, _direction), do: "—" - - defp format_delta_with_direction(value, direction) do - formatted = format_delta(value) - emoji = delta_emoji(value, direction) - if emoji, do: "#{emoji} #{formatted}", else: formatted - end - - defp delta_emoji(_value, nil), do: nil - defp delta_emoji(value, :high) when value > 0, do: "🟢" - defp delta_emoji(value, :high) when value < 0, do: "🔴" - defp delta_emoji(value, :low) when value < 0, do: "🟢" - defp delta_emoji(value, :low) when value > 0, do: "🔴" - defp delta_emoji(_value, _direction), do: nil - - defp detect_columns(files) do - Enum.filter(@summary_metrics, fn {metric_name, key, _label} -> - Enum.any?(files, fn {_path, data} -> - source = data["head"] || data["base"] - source && get_in(source, ["metrics", metric_name, key]) != nil - end) - end) - end - - defp format_delta(nil), do: "—" - - defp format_delta(value) when value > 0, - do: "+#{:erlang.float_to_binary(value / 1, decimals: 2)}" - - defp format_delta(value) when value < 0, do: :erlang.float_to_binary(value / 1, decimals: 2) - defp format_delta(_), do: "0.00" - - defp format_value(nil), do: "—" - defp format_value(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) - defp format_value(value), do: 
to_string(value) -end diff --git a/lib/codeqa/git.ex b/lib/codeqa/git.ex index 78c0bdb8..44892058 100644 --- a/lib/codeqa/git.ex +++ b/lib/codeqa/git.ex @@ -8,8 +8,26 @@ defmodule CodeQA.Git do defstruct @enforce_keys end + alias CodeQA.Engine.Collector + @status_map %{"A" => "added", "M" => "modified", "D" => "deleted"} + @spec gitignored_files(String.t(), [String.t()]) :: MapSet.t() + def gitignored_files(_repo_path, []), do: MapSet.new() + + def gitignored_files(repo_path, paths) do + {output, _exit_code} = + System.cmd("git", ["check-ignore", "--no-index" | paths], + cd: repo_path, + stderr_to_stdout: false + ) + + output + |> String.trim() + |> String.split("\n", trim: true) + |> MapSet.new() + end + def changed_files(repo_path, base_ref, head_ref) do {output, 0} = System.cmd( @@ -25,6 +43,78 @@ defmodule CodeQA.Git do |> Enum.flat_map(&parse_change_line/1) end + @doc """ + Returns a map of file paths to lists of changed line ranges in the head version. + + Each range is a tuple `{start_line, end_line}` representing lines that were + added or modified in the diff between base_ref and head_ref. 
+ """ + @spec diff_line_ranges(String.t(), String.t(), String.t()) :: + {:ok, %{String.t() => [{pos_integer(), pos_integer()}]}} | {:error, term()} + def diff_line_ranges(repo_path, base_ref, head_ref) do + case System.cmd( + "git", + ["diff", "-U0", "#{base_ref}..#{head_ref}"], + cd: repo_path, + stderr_to_stdout: false + ) do + {output, 0} -> + {:ok, parse_diff_hunks(output)} + + {_output, code} -> + {:error, "git diff exited with code #{code}"} + end + end + + @typep parse_state :: {String.t() | nil, %{String.t() => [{pos_integer(), pos_integer()}]}} + + @spec parse_diff_hunks(String.t()) :: %{String.t() => [{pos_integer(), pos_integer()}]} + defp parse_diff_hunks(diff_output) do + diff_output + |> String.split("\n") + |> Enum.reduce({nil, %{}}, &parse_diff_line/2) + |> elem(1) + |> Map.new(fn {path, ranges} -> {path, Enum.reverse(ranges)} end) + end + + @spec parse_diff_line(String.t(), parse_state()) :: parse_state() + defp parse_diff_line("diff --git a/" <> rest, {_current_file, acc}) do + # Extract the "b/..." 
path from the diff header + case Regex.run(~r/ b\/(.+)$/, rest) do + [_, path] -> {path, acc} + nil -> {nil, acc} + end + end + + defp parse_diff_line("@@ " <> rest, {current_file, acc}) when is_binary(current_file) do + # Parse hunk header: @@ -old_start,old_count +new_start,new_count @@ + case Regex.run(~r/\+(\d+)(?:,(\d+))?/, rest) do + [_, start_str] -> + # Single line change (no count means 1 line) + start = String.to_integer(start_str) + updated = Map.update(acc, current_file, [{start, start}], &[{start, start} | &1]) + {current_file, updated} + + [_, start_str, count_str] -> + start = String.to_integer(start_str) + count = String.to_integer(count_str) + + if count == 0 do + # Deletion only, no new lines + {current_file, acc} + else + end_line = start + count - 1 + updated = Map.update(acc, current_file, [{start, end_line}], &[{start, end_line} | &1]) + {current_file, updated} + end + + nil -> + {current_file, acc} + end + end + + defp parse_diff_line(_line, state), do: state + def read_file_at_ref(repo_path, ref, path) do case System.cmd("git", ["show", "#{ref}:#{path}"], cd: repo_path, stderr_to_stdout: true) do {output, 0} -> output @@ -66,6 +156,6 @@ defmodule CodeQA.Git do defp source_file?(path) do ext = path |> Path.extname() |> String.downcase() - MapSet.member?(CodeQA.Collector.source_extensions(), ext) + MapSet.member?(Collector.source_extensions(), ext) end end diff --git a/lib/codeqa/health_report.ex b/lib/codeqa/health_report.ex index 982b4698..183b737a 100644 --- a/lib/codeqa/health_report.ex +++ b/lib/codeqa/health_report.ex @@ -1,46 +1,116 @@ defmodule CodeQA.HealthReport do @moduledoc "Orchestrates health report generation from analysis results." 
- alias CodeQA.HealthReport.{Config, Grader, Formatter} + alias CodeQA.CombinedMetrics.{FileScorer, SampleRunner} + alias CodeQA.HealthReport.{Config, Delta, Formatter, Grader, TopBlocks} @spec generate(map(), keyword()) :: map() def generate(analysis_results, opts \\ []) do config_path = Keyword.get(opts, :config) - detail = Keyword.get(opts, :detail, :default) - top_n = Keyword.get(opts, :top, 5) + base_results = Keyword.get(opts, :base_results) + changed_files = Keyword.get(opts, :changed_files, []) + diff_line_ranges = Keyword.get(opts, :diff_line_ranges, %{}) + + %{ + categories: categories, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top, + block_min_lines: block_min_lines, + block_max_lines: block_max_lines + } = + Config.load(config_path) - %{categories: categories, grade_scale: grade_scale} = Config.load(config_path) aggregate = get_in(analysis_results, ["codebase", "aggregate"]) || %{} files = Map.get(analysis_results, "files", %{}) + project_langs = project_languages(files) - category_grades = Grader.grade_aggregate(categories, aggregate, grade_scale) - - category_grades = - Enum.zip(categories, category_grades) - |> Enum.map(fn {cat_def, graded} -> + threshold_grades = + categories + |> Grader.grade_aggregate(aggregate, grade_scale) + |> Enum.zip(categories) + |> Enum.map(fn {graded, _cat_def} -> summary = build_category_summary(graded) - cat_top = Map.get(cat_def, :top, top_n) + graded + |> Map.put(:type, :threshold) + |> Map.merge(%{summary: summary, worst_offenders: []}) + end) + + worst_files_map = FileScorer.worst_files_per_behavior(files, combined_top: combined_top) + + all_cosines = + SampleRunner.diagnose_aggregate(aggregate, top: 99_999, languages: project_langs) - worst = - case detail do - :summary -> [] - :full -> Grader.worst_offenders(cat_def, files, map_size(files), grade_scale) - _default -> Grader.worst_offenders(cat_def, files, cat_top, grade_scale) - end + cosines_by_category = 
Enum.group_by(all_cosines, & &1.category) - Map.merge(graded, %{summary: summary, worst_offenders: worst}) + cosine_grades = + Grader.grade_cosine_categories(cosines_by_category, worst_files_map, grade_scale) + + all_categories = + (threshold_grades ++ cosine_grades) + |> Enum.map(fn cat -> + Map.put(cat, :impact, Map.get(impact_map, to_string(cat.key), 1)) end) - {overall_score, overall_grade} = Grader.overall_score(category_grades, grade_scale) + {overall_score, overall_grade} = Grader.overall_score(all_categories, grade_scale, impact_map) metadata = build_metadata(analysis_results) + top_issues = Enum.take(all_cosines, 10) + + codebase_cosine_lookup = + Map.new(all_cosines, fn i -> {{i.category, i.behavior}, i.cosine} end) + + block_opts = [ + block_min_lines: block_min_lines, + block_max_lines: block_max_lines, + diff_line_ranges: diff_line_ranges + ] + + top_blocks = + TopBlocks.build(analysis_results, changed_files, codebase_cosine_lookup, block_opts) + + worst_blocks_by_category = + TopBlocks.worst_per_category( + analysis_results, + changed_files, + codebase_cosine_lookup, + block_opts + ) + + grading_cfg = %{ + category_defs: categories, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top + } + + {codebase_delta, pr_summary} = + if base_results do + build_delta_and_summary( + base_results, + analysis_results, + overall_score, + overall_grade, + grading_cfg, + changed_files, + top_blocks + ) + else + {nil, nil} + end + %{ metadata: metadata, + pr_summary: pr_summary, overall_score: overall_score, overall_grade: overall_grade, - categories: category_grades + codebase_delta: codebase_delta, + categories: all_categories, + top_issues: top_issues, + top_blocks: top_blocks, + worst_blocks_by_category: worst_blocks_by_category } end @@ -49,6 +119,77 @@ defmodule CodeQA.HealthReport do Formatter.format_markdown(report, detail, format) end + defp build_delta_and_summary( + base_results, + head_results, + head_score, + head_grade, + %{ 
+ category_defs: category_defs, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top + }, + changed_files, + top_blocks + ) do + delta = Delta.compute(base_results, head_results) + + base_aggregate = get_in(base_results, ["codebase", "aggregate"]) || %{} + base_files = Map.get(base_results, "files", %{}) + base_project_langs = project_languages(base_files) + + base_threshold_grades = + category_defs + |> Grader.grade_aggregate(base_aggregate, grade_scale) + |> Enum.zip(category_defs) + |> Enum.map(fn {graded, _cat_def} -> + graded + |> Map.put(:type, :threshold) + |> Map.merge(%{summary: "", worst_offenders: []}) + end) + + base_worst_files_map = + FileScorer.worst_files_per_behavior(base_files, combined_top: combined_top) + + base_cosines_by_category = + SampleRunner.diagnose_aggregate(base_aggregate, top: 99_999, languages: base_project_langs) + |> Enum.group_by(& &1.category) + + base_cosine_grades = + Grader.grade_cosine_categories( + base_cosines_by_category, + base_worst_files_map, + grade_scale + ) + + base_all_categories = + (base_threshold_grades ++ base_cosine_grades) + |> Enum.map(fn cat -> + Map.put(cat, :impact, Map.get(impact_map, to_string(cat.key), 1)) + end) + + {base_score, base_grade} = Grader.overall_score(base_all_categories, grade_scale, impact_map) + + blocks_flagged = length(top_blocks) + files_added = Enum.count(changed_files, &(&1.status == "added")) + files_modified = Enum.count(changed_files, &(&1.status == "modified")) + + summary = %{ + base_score: base_score, + head_score: head_score, + score_delta: head_score - base_score, + base_grade: base_grade, + head_grade: head_grade, + blocks_flagged: blocks_flagged, + files_changed: length(changed_files), + files_added: files_added, + files_modified: files_modified + } + + {delta, summary} + end + defp build_metadata(analysis_results) do meta = Map.get(analysis_results, "metadata", %{}) @@ -59,6 +200,16 @@ defmodule CodeQA.HealthReport do } end + defp 
project_languages(files_map) do + files_map + |> Map.keys() + |> Enum.map(&CodeQA.Language.detect(&1).name()) + |> Enum.reject(&(&1 == "unknown")) + |> Enum.uniq() + end + + defp build_category_summary(%{type: :cosine}), do: "" + defp build_category_summary(graded) do low_scorers = graded.metric_scores diff --git a/lib/codeqa/health_report/behavior_labels.ex b/lib/codeqa/health_report/behavior_labels.ex new file mode 100644 index 00000000..3cd4f94b --- /dev/null +++ b/lib/codeqa/health_report/behavior_labels.ex @@ -0,0 +1,77 @@ +defmodule CodeQA.HealthReport.BehaviorLabels do + @moduledoc "Maps category/behavior pairs to human-readable labels and action items." + + alias CodeQA.CombinedMetrics.Scorer + + @labels %{ + {"function_design", "no_boolean_parameter"} => + {"Boolean parameter increases coupling", "Use separate functions or options map"}, + {"function_design", "boolean_function_has_question_mark"} => + {"Boolean function missing ? suffix", "Rename to use question mark convention"}, + {"function_design", "has_verb_in_name"} => + {"Function name lacks verb", "Use action verbs in function names"}, + {"function_design", "no_magic_numbers"} => + {"Magic numbers detected", "Extract constants with descriptive names"}, + {"function_design", "uses_ternary_expression"} => + {"Ternary expression overuse", "Use pattern matching or if/else"}, + {"code_smells", "cyclomatic_complexity_under_10"} => + {"High cyclomatic complexity", "Reduce branching or extract guard clauses"}, + {"code_smells", "no_deeply_nested_code"} => + {"Deeply nested code", "Extract helper functions to reduce nesting"}, + {"code_smells", "function_length_under_25"} => + {"Long function likely untestable", "Split into smaller functions"}, + {"code_smells", "no_duplicate_code"} => {"Duplicate logic detected", "Extract shared helper"}, + {"code_smells", "no_debug_print_statements"} => + {"Debug print left in code", "Remove `IO.puts`/`IO.inspect`/`console.log` or use a logger"}, + 
{"scope_and_assignment", "used_only_once"} => + {"Variable used only once", "Inline the expression unless the name aids readability"}, + {"consistency", "consistent_error_return_shape"} => + {"Mixed error-return shapes", + "Return errors in one shape (e.g. `{:error, reason}` everywhere)"}, + {"file_structure", "single_module_per_file"} => + {"Multiple modules in one file", "Split into separate files"}, + {"file_structure", "file_length_under_300"} => + {"File too long", "Split into focused modules"}, + {"dependencies", "no_circular_dependencies"} => + {"Circular dependency detected", "Reorganize module boundaries"}, + {"error_handling", "uses_tagged_tuples"} => + {"Missing tagged tuple returns", "Use {:ok, val} / {:error, reason} pattern"}, + {"naming_conventions", "filename_matches_module"} => + {"Filename doesn't match module", "Rename file to match module"}, + {"scope_and_assignment", "no_unused_variables"} => + {"Unused variables", "Remove or prefix with underscore"}, + {"testing", "test_file_exists"} => {"Missing test file", "Add tests for this module"}, + {"documentation", "has_moduledoc"} => {"Missing @moduledoc", "Add module documentation"} + } + + @spec label(String.t(), String.t()) :: String.t() + def label(category, behavior) do + case Map.get(@labels, {category, behavior}) do + {label, _action} -> label + nil -> humanize(behavior) + end + end + + @spec action(String.t(), String.t()) :: String.t() + def action(category, behavior) do + case Map.get(@labels, {category, behavior}) do + {_label, action} -> action + nil -> fix_hint_fallback(category, behavior) + end + end + + defp fix_hint_fallback(category, behavior) do + Scorer.all_yamls() + |> Enum.find_value(fn {yaml_path, data} -> + cat = yaml_path |> Path.basename() |> String.trim_trailing(".yml") + if cat == category, do: get_in(data, [behavior, "_fix_hint"]) + end) || "Review this code block" + end + + defp humanize(behavior) do + behavior + |> String.replace("_", " ") + |> String.split() + |> 
Enum.map_join(" ", &String.capitalize/1) + end +end diff --git a/lib/codeqa/health_report/categories.ex b/lib/codeqa/health_report/categories.ex index 69970beb..98b2e972 100644 --- a/lib/codeqa/health_report/categories.ex +++ b/lib/codeqa/health_report/categories.ex @@ -36,28 +36,36 @@ defmodule CodeQA.HealthReport.Categories do source: "readability", weight: 0.4, good: :high, - thresholds: %{a: 70, b: 50, c: 35, d: 20} + thresholds: %{a: 70, b: 50, c: 35, d: 20}, + fix_hint: + "Low readability score — simplify sentences, prefer short identifiers, avoid deeply nested expressions" }, %{ name: "fog_adapted", source: "readability", weight: 0.3, good: :low, - thresholds: %{a: 6, b: 10, c: 15, d: 22} + thresholds: %{a: 6, b: 10, c: 15, d: 22}, + fix_hint: + "High fog index — reduce complex multi-word identifiers and long compound expressions" }, %{ name: "avg_tokens_per_line", source: "readability", weight: 0.2, good: :low, - thresholds: %{a: 6, b: 10, c: 14, d: 20} + thresholds: %{a: 6, b: 10, c: 14, d: 20}, + fix_hint: + "Too many tokens per line — break long lines into multiple shorter statements" }, %{ name: "avg_line_length", source: "readability", weight: 0.1, good: :low, - thresholds: %{a: 40, b: 60, c: 80, d: 100} + thresholds: %{a: 40, b: 60, c: 80, d: 100}, + fix_hint: + "Lines too long — wrap at 80–120 characters and extract intermediate variables" } ] }, @@ -70,28 +78,35 @@ defmodule CodeQA.HealthReport.Categories do source: "halstead", weight: 0.35, good: :low, - thresholds: %{a: 10, b: 20, c: 35, d: 50} + thresholds: %{a: 10, b: 20, c: 35, d: 50}, + fix_hint: + "High operator/operand ratio — extract repeated sub-expressions into named variables" }, %{ name: "effort", source: "halstead", weight: 0.30, good: :low, - thresholds: %{a: 5000, b: 20000, c: 50000, d: 100_000} + thresholds: %{a: 5000, b: 20_000, c: 50_000, d: 100_000}, + fix_hint: + "High implementation effort — simplify logic by extracting helpers and reducing branching" }, %{ name: "volume", 
source: "halstead", weight: 0.20, good: :low, - thresholds: %{a: 300, b: 1000, c: 3000, d: 8000} + thresholds: %{a: 300, b: 1000, c: 3000, d: 8000}, + fix_hint: + "High token volume — extract helper functions to reduce the total operation count" }, %{ name: "estimated_bugs", source: "halstead", weight: 0.15, good: :low, - thresholds: %{a: 0.1, b: 0.5, c: 1.0, d: 3.0} + thresholds: %{a: 0.1, b: 0.5, c: 1.0, d: 3.0}, + fix_hint: "High defect estimate — reduce complexity; simpler code has fewer bugs" } ] }, @@ -104,56 +119,69 @@ defmodule CodeQA.HealthReport.Categories do source: "branching", weight: 0.25, good: :low, - thresholds: %{a: 0.08, b: 0.17, c: 0.30, d: 0.45} + thresholds: %{a: 0.08, b: 0.17, c: 0.30, d: 0.45}, + fix_hint: + "Too many branches per line — flatten conditionals using guard clauses or early returns" }, %{ name: "mean_depth", source: "indentation", weight: 0.2, good: :low, - thresholds: %{a: 3.5, b: 7, c: 10, d: 15} + thresholds: %{a: 3.5, b: 7, c: 10, d: 15}, + fix_hint: "High average nesting — extract inner blocks into helper functions" }, %{ name: "avg_function_lines", source: "function_metrics", weight: 0.2, good: :low, - thresholds: %{a: 8, b: 15, c: 30, d: 65} + thresholds: %{a: 8, b: 15, c: 30, d: 65}, + fix_hint: + "Functions too long on average — split into smaller single-purpose functions" }, %{ name: "max_depth", source: "indentation", weight: 0.1, good: :low, - thresholds: %{a: 8, b: 16, c: 25, d: 35} + thresholds: %{a: 8, b: 16, c: 25, d: 35}, + fix_hint: "Deep nesting — restructure using early returns or extract nested logic" }, %{ name: "max_function_lines", source: "function_metrics", weight: 0.1, good: :low, - thresholds: %{a: 20, b: 50, c: 100, d: 200} + thresholds: %{a: 20, b: 50, c: 100, d: 200}, + fix_hint: + "Largest function too long — decompose the longest function into focused helpers" }, %{ name: "variance", source: "indentation", weight: 0.1, good: :low, - thresholds: %{a: 7, b: 20, c: 40, d: 65} + thresholds: %{a: 7, 
b: 20, c: 40, d: 65}, + fix_hint: + "Inconsistent indentation depth — standardize nesting by flattening or restructuring" }, %{ name: "avg_param_count", source: "function_metrics", weight: 0.03, good: :low, - thresholds: %{a: 2, b: 3, c: 5, d: 7} + thresholds: %{a: 2, b: 3, c: 5, d: 7}, + fix_hint: "Too many parameters on average — group related params into a struct or map" }, %{ name: "max_param_count", source: "function_metrics", weight: 0.02, good: :low, - thresholds: %{a: 3, b: 5, c: 7, d: 10} + thresholds: %{a: 3, b: 5, c: 7, d: 10}, + fix_hint: + "Function has too many parameters — introduce a parameter object or options map" } ] }, @@ -166,21 +194,27 @@ defmodule CodeQA.HealthReport.Categories do source: "compression", weight: 0.5, good: :low, - thresholds: %{a: 0.3, b: 0.5, c: 0.65, d: 0.8} + thresholds: %{a: 0.3, b: 0.5, c: 0.65, d: 0.8}, + fix_hint: + "High redundancy — extract repeated patterns into shared helpers or abstractions" }, %{ name: "bigram_repetition_rate", source: "ngram", weight: 0.3, good: :low, - thresholds: %{a: 0.15, b: 0.30, c: 0.45, d: 0.60} + thresholds: %{a: 0.15, b: 0.30, c: 0.45, d: 0.60}, + fix_hint: + "Repeated two-token sequences — consolidate duplicated patterns into named functions" }, %{ name: "trigram_repetition_rate", source: "ngram", weight: 0.2, good: :low, - thresholds: %{a: 0.05, b: 0.15, c: 0.30, d: 0.45} + thresholds: %{a: 0.05, b: 0.15, c: 0.30, d: 0.45}, + fix_hint: + "Repeated three-token sequences — extract duplicated logic into reusable abstractions" } ] }, @@ -193,28 +227,34 @@ defmodule CodeQA.HealthReport.Categories do source: "casing_entropy", weight: 0.3, good: :low, - thresholds: %{a: 1.0, b: 1.5, c: 2.0, d: 2.3} + thresholds: %{a: 1.0, b: 1.5, c: 2.0, d: 2.3}, + fix_hint: + "Mixed casing styles — use a single consistent casing convention throughout the file" }, %{ name: "mean", source: "identifier_length_variance", weight: 0.25, good: :low, - thresholds: %{a: 12, b: 18, c: 25, d: 35} + thresholds: %{a: 12, 
b: 18, c: 25, d: 35}, + fix_hint: "Identifiers too long on average — prefer concise, intent-revealing names" }, %{ name: "variance", source: "identifier_length_variance", weight: 0.25, good: :low, - thresholds: %{a: 15, b: 30, c: 50, d: 80} + thresholds: %{a: 15, b: 30, c: 50, d: 80}, + fix_hint: "High identifier length variance — standardize name length conventions" }, %{ name: "avg_sub_words_per_id", source: "readability", weight: 0.2, good: :low, - thresholds: %{a: 3, b: 4, c: 5, d: 7} + thresholds: %{a: 3, b: 4, c: 5, d: 7}, + fix_hint: + "Identifiers have too many sub-words — simplify to 2–3 word names where possible" } ] }, @@ -227,7 +267,8 @@ defmodule CodeQA.HealthReport.Categories do source: "magic_number_density", weight: 1.0, good: :low, - thresholds: %{a: 0.02, b: 0.05, c: 0.10, d: 0.20} + thresholds: %{a: 0.02, b: 0.05, c: 0.10, d: 0.20}, + fix_hint: "Too many magic numbers — replace literal values with named constants" } ] } diff --git a/lib/codeqa/health_report/config.ex b/lib/codeqa/health_report/config.ex index 15bf125f..7c457b29 100644 --- a/lib/codeqa/health_report/config.ex +++ b/lib/codeqa/health_report/config.ex @@ -3,9 +3,24 @@ defmodule CodeQA.HealthReport.Config do alias CodeQA.HealthReport.Categories - @spec load(String.t() | nil) :: %{categories: [map()], grade_scale: [{number(), String.t()}]} - def load(nil), - do: %{categories: Categories.defaults(), grade_scale: Categories.default_grade_scale()} + @spec load(String.t() | nil) :: %{ + categories: [map()], + grade_scale: [{number(), String.t()}], + impact_map: %{String.t() => pos_integer()}, + combined_top: pos_integer(), + block_min_lines: pos_integer(), + block_max_lines: pos_integer() + } + def load(nil) do + %{ + categories: Categories.defaults(), + grade_scale: Categories.default_grade_scale(), + impact_map: CodeQA.Config.impact_map(), + combined_top: CodeQA.Config.combined_top(), + block_min_lines: 3, + block_max_lines: 20 + } + end def load(path) do yaml = 
YamlElixir.read_from_file!(path) @@ -30,8 +45,26 @@ defmodule CodeQA.HealthReport.Config do end) grade_scale = parse_grade_scale(Map.get(yaml, "grade_scale")) + impact_map = parse_impact(Map.get(yaml, "impact")) + combined_top = Map.get(yaml, "combined_top", 2) + block_min_lines = Map.get(yaml, "block_min_lines", 3) + block_max_lines = Map.get(yaml, "block_max_lines", 20) + + %{ + categories: categories, + grade_scale: grade_scale, + impact_map: impact_map, + combined_top: combined_top, + block_min_lines: block_min_lines, + block_max_lines: block_max_lines + } + end + + defp parse_impact(nil), do: CodeQA.Config.impact_map() - %{categories: categories, grade_scale: grade_scale} + defp parse_impact(overrides) when is_map(overrides) do + string_overrides = Map.new(overrides, fn {k, v} -> {to_string(k), v} end) + Map.merge(CodeQA.Config.impact_map(), string_overrides) end defp parse_grade_scale(nil), do: Categories.default_grade_scale() diff --git a/lib/codeqa/health_report/delta.ex b/lib/codeqa/health_report/delta.ex new file mode 100644 index 00000000..52b0085e --- /dev/null +++ b/lib/codeqa/health_report/delta.ex @@ -0,0 +1,42 @@ +defmodule CodeQA.HealthReport.Delta do + @moduledoc "Computes aggregate metric delta between two codebase analysis results." 
+ + @spec compute(map(), map()) :: %{ + base: %{aggregate: map()}, + head: %{aggregate: map()}, + delta: %{aggregate: map()} + } + def compute(base_results, head_results) do + base_agg = get_in(base_results, ["codebase", "aggregate"]) || %{} + head_agg = get_in(head_results, ["codebase", "aggregate"]) || %{} + + %{ + base: %{aggregate: base_agg}, + head: %{aggregate: head_agg}, + delta: %{aggregate: compute_aggregate_delta(base_agg, head_agg)} + } + end + + defp compute_aggregate_delta(base_agg, head_agg) do + MapSet.new(Map.keys(base_agg) ++ Map.keys(head_agg)) + |> Enum.reduce(%{}, fn metric_name, acc -> + base_m = Map.get(base_agg, metric_name, %{}) + head_m = Map.get(head_agg, metric_name, %{}) + delta = compute_numeric_delta(base_m, head_m) + if delta == %{}, do: acc, else: Map.put(acc, metric_name, delta) + end) + end + + defp compute_numeric_delta(base, head) do + MapSet.new(Map.keys(base) ++ Map.keys(head)) + |> Enum.reduce(%{}, fn key, acc -> + case {Map.get(base, key), Map.get(head, key)} do + {b, h} when is_number(b) and is_number(h) -> + Map.put(acc, key, Float.round((h - b) * 1.0, 4)) + + _ -> + acc + end + end) + end +end diff --git a/lib/codeqa/health_report/formatter.ex b/lib/codeqa/health_report/formatter.ex index df17d8d9..d166f145 100644 --- a/lib/codeqa/health_report/formatter.ex +++ b/lib/codeqa/health_report/formatter.ex @@ -8,4 +8,23 @@ defmodule CodeQA.HealthReport.Formatter do def format_markdown(report, detail, :plain, _opts), do: Plain.render(report, detail) def format_markdown(report, detail, :github, opts), do: Github.render(report, detail, opts) + + @doc """ + Renders the report as multiple parts for GitHub PR comments. + Returns a flat list of strings: [part_1, part_2, part_3, ...]. + + Part 1: Header, summary, PR summary, delta, chart, progress bars + Part 2: Top issues, category detail sections + Part 3+: Blocks section, sliced at 60,000 chars per part + + Each part ends with a sentinel comment for sticky comment identification. 
+ """ + @spec render_parts(map(), keyword()) :: [String.t()] + def render_parts(report, opts \\ []) do + part_1 = Github.render_part_1(report, opts) + part_2 = Github.render_part_2(report, opts) + parts_3 = Github.render_parts_3(report, opts) + + [part_1, part_2 | parts_3] + end end diff --git a/lib/codeqa/health_report/formatter/github.ex b/lib/codeqa/health_report/formatter/github.ex index 72bb9ee8..5bf9f7f2 100644 --- a/lib/codeqa/health_report/formatter/github.ex +++ b/lib/codeqa/health_report/formatter/github.ex @@ -8,18 +8,120 @@ defmodule CodeQA.HealthReport.Formatter.Github do @spec render(map(), atom(), keyword()) :: String.t() def render(report, detail, opts \\ []) do chart? = Keyword.get(opts, :chart, true) + display_categories = merge_cosine_categories(report.categories) + worst_blocks = Map.get(report, :worst_blocks_by_category, %{}) [ + pr_summary_section(Map.get(report, :pr_summary)), header(report), - if(chart?, do: mermaid_chart(report.categories), else: []), - progress_bars(report.categories), - category_sections(report.categories, detail), + cosine_legend(), + delta_section(Map.get(report, :codebase_delta)), + if(chart?, do: mermaid_chart(display_categories), else: []), + progress_bars(display_categories), + top_issues_section(Map.get(report, :top_issues, []), detail), + blocks_section(Map.get(report, :top_blocks, [])), + category_sections(display_categories, detail, worst_blocks), footer() ] |> List.flatten() |> Enum.join("\n") end + @doc """ + Renders Part 1: header, summary table, PR summary, delta, mermaid chart, progress bars. + Each part ends with a sentinel HTML comment for sticky comment identification. + """ + @spec render_part_1(map(), keyword()) :: String.t() + def render_part_1(report, opts \\ []) do + chart? 
= Keyword.get(opts, :chart, true) + display_categories = merge_cosine_categories(report.categories) + + [ + pr_summary_section(Map.get(report, :pr_summary)), + header(report), + cosine_legend(), + delta_section(Map.get(report, :codebase_delta)), + if(chart?, do: mermaid_chart(display_categories), else: []), + progress_bars(display_categories), + sentinel(1) + ] + |> List.flatten() + |> Enum.join("\n") + end + + @doc """ + Renders Part 2: top issues + all category detail sections. + """ + @spec render_part_2(map(), keyword()) :: String.t() + def render_part_2(report, opts \\ []) do + detail = Keyword.get(opts, :detail, :default) + display_categories = merge_cosine_categories(report.categories) + worst_blocks = Map.get(report, :worst_blocks_by_category, %{}) + + [ + top_issues_section(Map.get(report, :top_issues, []), detail), + category_sections(display_categories, detail, worst_blocks), + sentinel(2) + ] + |> List.flatten() + |> Enum.join("\n") + end + + @doc """ + Renders Part 3: blocks section (top 10 blocks with code). + Returns a list with a single part since blocks are now limited to top 10. 
+ """ + @spec render_parts_3(map(), keyword()) :: [String.t()] + def render_parts_3(report, _opts \\ []) do + top_blocks = Map.get(report, :top_blocks, []) + blocks_content = blocks_section(top_blocks) |> List.flatten() |> Enum.join("\n") + [blocks_content <> "\n\n" <> sentinel_str(3)] + end + + defp sentinel(n), do: [sentinel_str(n)] + + defp sentinel_str(n), do: "" + + defp merge_cosine_categories(categories) do + {cosine, threshold} = Enum.split_with(categories, &(&1.type == :cosine)) + + case cosine do + [] -> + threshold + + _ -> + total_impact = Enum.sum(Enum.map(cosine, & &1.impact)) + + combined_score = + round(Enum.sum(Enum.map(cosine, &(&1.score * &1.impact))) / max(total_impact, 1)) + + combined = %{ + type: :cosine_group, + key: "combined_metrics", + name: "Combined Metrics", + score: combined_score, + grade: grade_letter_from_score(combined_score), + categories: cosine + } + + threshold ++ [combined] + end + end + + defp grade_letter_from_score(score) when score >= 97, do: "A+" + defp grade_letter_from_score(score) when score >= 93, do: "A" + defp grade_letter_from_score(score) when score >= 90, do: "A-" + defp grade_letter_from_score(score) when score >= 87, do: "B+" + defp grade_letter_from_score(score) when score >= 83, do: "B" + defp grade_letter_from_score(score) when score >= 80, do: "B-" + defp grade_letter_from_score(score) when score >= 77, do: "C+" + defp grade_letter_from_score(score) when score >= 73, do: "C" + defp grade_letter_from_score(score) when score >= 70, do: "C-" + defp grade_letter_from_score(score) when score >= 67, do: "D+" + defp grade_letter_from_score(score) when score >= 63, do: "D" + defp grade_letter_from_score(score) when score >= 60, do: "D-" + defp grade_letter_from_score(_score), do: "F" + defp header(report) do emoji = grade_emoji(report.overall_grade) @@ -31,9 +133,16 @@ defmodule CodeQA.HealthReport.Formatter.Github do ] end + defp cosine_legend do + [ + "> *Combined metric scores use cosine similarity: +1 = metric 
profile perfectly matches healthy pattern for this behavior, 0 = no signal, −1 = anti-pattern detected. Mapped to 0–100 using breakpoints (approx: ≥0.5→A, ≥0.2→B, ≥0.0→C, ≥−0.3→D, <−0.3→F); actual letter grades use the full 15-step scale.*", + "" + ] + end + defp mermaid_chart(categories) do - names = Enum.map(categories, fn c -> ~s("#{c.name}") end) |> Enum.join(", ") - scores = Enum.map(categories, fn c -> to_string(c.score) end) |> Enum.join(", ") + names = Enum.map_join(categories, ", ", fn c -> ~s("#{c.name}") end) + scores = Enum.map_join(categories, ", ", fn c -> to_string(c.score) end) [ "```mermaid", @@ -74,35 +183,159 @@ defmodule CodeQA.HealthReport.Formatter.Github do String.duplicate(@filled, filled) <> String.duplicate(@empty, empty) end - defp category_sections(_categories, :summary), do: [] + defp category_sections(_categories, :summary, _worst_blocks), do: [] + + defp category_sections(categories, detail, worst_blocks) do + Enum.flat_map(categories, &render_category(&1, detail, worst_blocks)) + end + + defp render_category(%{type: :cosine_group} = group, detail, worst_blocks) do + emoji = grade_emoji(group.grade) + summary_line = "#{emoji} #{group.name} — #{group.grade} (#{group.score}/100)" + + inner = + cosine_group_content(group, detail, worst_blocks) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{summary_line}", + "", + inner, + "", + "
", + "" + ] + end + + defp render_category(%{type: :cosine} = cat, detail, worst_blocks) do + emoji = grade_emoji(cat.grade) + summary_line = "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)" + + inner = + cosine_section_content(cat, detail, worst_blocks) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{summary_line}", + "", + inner, + "", + "
", + "" + ] + end + + defp render_category(cat, detail, _worst_blocks) do + emoji = grade_emoji(cat.grade) + summary_line = "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)" + + inner = + section_content(cat, detail) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{summary_line}", + "", + inner, + "", + "
", + "" + ] + end + + defp cosine_group_content(group, detail, worst_blocks) do + rows = + Enum.map(group.categories, fn cat -> + emoji = grade_emoji(cat.grade) + "| #{cat.name} | #{cat.score} | #{emoji} #{cat.grade} |" + end) + + summary_table = [ + "| Category | Score | Grade |", + "|----------|-------|-------|" + | rows + ] + + sub_sections = + Enum.flat_map(group.categories, fn cat -> + emoji = grade_emoji(cat.grade) - defp category_sections(categories, detail) do - Enum.flat_map(categories, fn cat -> - emoji = grade_emoji(cat.grade) - summary_line = "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)" + inner = + cosine_section_content(cat, detail, worst_blocks) + |> List.flatten() + |> Enum.join("\n") + + [ + "
", + "#{emoji} #{cat.name} — #{cat.grade} (#{cat.score}/100)", + "", + inner, + "", + "
", + "" + ] + end) + + summary_table ++ [""] ++ sub_sections + end - inner = - section_content(cat, detail) - |> List.flatten() - |> Enum.join("\n") + defp cosine_section_content(cat, _detail, worst_blocks) do + n = length(cat.behaviors) + category_key = to_string(cat.key) + + behaviors_rows = + Enum.map(cat.behaviors, fn b -> + "| #{b.behavior} | #{format_num(b.cosine)} | #{b.score} | #{b.grade} |" + end) + + behaviors_table = [ + "> Cosine similarity scores for #{n} behaviors.", + "", + "| Behavior | Cosine | Score | Grade |", + "|----------|--------|-------|-------|" + | behaviors_rows + ] + + worst_block_section = + case Map.get(worst_blocks, category_key) do + nil -> [] + block -> render_worst_block(block) + end + + behaviors_table ++ [""] ++ worst_block_section + end + + defp render_worst_block(block) do + line_count = (block.end_line || block.start_line) - block.start_line + 1 + location = "#{block.path}:#{block.start_line}-#{block.end_line}" + + if line_count >= 1 and line_count <= 15 and block.source do + lang = block.language || "" [ - "
", - "#{summary_line}", - "", - inner, - "", - "
", + "> **Worst offender** (`#{location}`):", + "> ```#{lang}", + block.source |> String.split("\n") |> Enum.map(&"> #{&1}") |> Enum.join("\n"), + "> ```", "" ] - end) + else + [ + "> **Worst offender**: `#{location}` (#{line_count} lines)", + "" + ] + end end defp section_content(cat, _detail) do metric_summary = - cat.metric_scores - |> Enum.map(fn m -> "#{m.name}=#{format_num(m.value)}" end) - |> Enum.join(", ") + Enum.map_join(cat.metric_scores, ", ", fn m -> "#{m.name}=#{format_num(m.value)}" end) metrics_table = if cat.metric_scores != [] do @@ -124,42 +357,35 @@ defmodule CodeQA.HealthReport.Formatter.Github do "Codebase averages: #{metric_summary}", "" | metrics_table - ] ++ [""] ++ worst_offenders(cat) + ] ++ [""] end - defp worst_offenders(cat) do - offenders = Map.get(cat, :worst_offenders, []) + defp top_issues_section([], _detail), do: [] + defp top_issues_section(_issues, :summary), do: [] - if offenders == [] do - [] - else - averages = Map.new(cat.metric_scores, &{&1.name, &1.value}) - - rows = - Enum.map(offenders, fn f -> - issues = - f.metric_scores - |> Enum.map(fn m -> - avg = Map.get(averages, m.name) - avg_str = if avg, do: " (avg: #{format_num(avg)})", else: "" - "#{direction(m.good)}#{m.name}=#{format_num(m.value)}#{avg_str}" - end) - |> Enum.join("
") - - "| #{format_path(f.path)}
#{format_lines(f[:lines])} lines · #{format_size(f[:bytes])} | #{f.grade} (#{f.score}) | #{issues} |" - end) + defp top_issues_section(issues, _detail) do + rows = + Enum.map_join(issues, "\n", fn i -> + "| `#{i.category}.#{i.behavior}` | #{format_num(i.cosine)} | #{format_num(i.score)} |" + end) - [ - "**Worst Offenders**", - "", - "| File | Grade | Issues |", - "|------|-------|--------|" - | rows - ] - end + table = "| Behavior | Cosine | Score |\n|----------|--------|-------|\n#{rows}" + + [ + "
", + "🔍 Top Likely Issues (cosine similarity)", + "", + "> Most negative cosine = file's metric profile best matches this anti-pattern.", + "", + table, + "", + "
", + "" + ] end defp footer do + # Legacy footer for single-part render/3 (used by --output file mode) ["", ""] end @@ -179,29 +405,244 @@ defmodule CodeQA.HealthReport.Formatter.Github do defp extract_project_name(_), do: "unknown" - defp format_path(path) when byte_size(path) < 80, do: "`#{path}`" + defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) + defp format_num(value) when is_integer(value), do: to_string(value) + defp format_num(value), do: to_string(value) + + defp format_date(timestamp) when is_binary(timestamp), do: String.slice(timestamp, 0, 10) + defp format_date(_), do: "unknown" + + defp pr_summary_section(nil), do: [] + + defp pr_summary_section(summary) do + delta_str = + if summary.score_delta >= 0, + do: "+#{summary.score_delta}", + else: "#{summary.score_delta}" + + status_str = "#{summary.files_modified} modified, #{summary.files_added} added" + + [ + "> **Score:** #{summary.base_grade} → #{summary.head_grade} | **Δ** #{delta_str} pts | **#{summary.blocks_flagged}** blocks flagged across #{summary.files_changed} files | #{status_str}", + "" + ] + end + + defp delta_section(nil), do: [] + + defp delta_section(delta) do + base_agg = delta.base.aggregate + head_agg = delta.head.aggregate - defp format_path(path) do - case String.split(path, "/") do - [file] -> "`#{file}`" - parts -> Enum.join(Enum.drop(parts, -1), "/") <> "/
`#{List.last(parts)}`" + metrics = [ + {"Readability", "readability", "mean_flesch_adapted"}, + {"Complexity", "halstead", "mean_difficulty"}, + {"Duplication", "compression", "mean_redundancy"}, + {"Structure", "branching", "mean_branch_count"} + ] + + rows = Enum.flat_map(metrics, &format_metric_row(&1, base_agg, head_agg)) + + if rows == [] do + [] + else + [ + "## Metric Changes", + "", + "| Category | Base | Head | Δ |", + "|----------|------|------|---|" + | rows + ] ++ [""] end end - defp direction(:high), do: "↑ " - defp direction(_), do: "↓ " + defp format_metric_row({label, group, key}, base_agg, head_agg) do + base_val = get_in(base_agg, [group, key]) + head_val = get_in(head_agg, [group, key]) - defp format_lines(nil), do: "—" - defp format_lines(n), do: to_string(n) + if is_number(base_val) and is_number(head_val) do + diff = Float.round(head_val - base_val, 2) + diff_str = if diff >= 0, do: "+#{format_num(diff)}", else: "#{format_num(diff)}" + ["| #{label} | #{format_num(base_val)} | #{format_num(head_val)} | #{diff_str} |"] + else + [] + end + end - defp format_size(nil), do: "—" - defp format_size(bytes) when bytes < 1024, do: "#{bytes} B" - defp format_size(bytes), do: "#{Float.round(bytes / 1024, 1)} KB" + defp blocks_section([]) do + ["> 🟢 **No block-level issues detected**", ""] + end - defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) - defp format_num(value) when is_integer(value), do: to_string(value) - defp format_num(value), do: to_string(value) + defp blocks_section(top_blocks) do + alias CodeQA.HealthReport.BehaviorLabels - defp format_date(timestamp) when is_binary(timestamp), do: String.slice(timestamp, 0, 10) - defp format_date(_), do: "unknown" + severity_counts = count_severities(top_blocks) + worst = worst_severity(severity_counts) + {icon, verdict} = verdict_text(worst, severity_counts) + + {actionable, medium_blocks} = + Enum.split_with(top_blocks, fn b -> + top = List.first(b.potentials) 
+ top && top.severity in [:critical, :high] + end) + + verdict_box = [ + "> ### #{icon} #{verdict}", + "> #{severity_summary(severity_counts)}", + "" + ] + + action_table = + if actionable != [] do + rows = + Enum.map(actionable, fn block -> + top = List.first(block.potentials) + sev_icon = severity_icon(top.severity) + label = BehaviorLabels.label(top.category, top.behavior) + location = "`#{block.path}:#{block.start_line}-#{block.end_line || block.start_line}`" + action = BehaviorLabels.action(top.category, top.behavior) + "| #{sev_icon} #{label} | #{location} | #{action} |" + end) + + [ + "| What | Where | Action |", + "|------|-------|--------|" + | rows + ] ++ [""] + else + [] + end + + actionable_details = Enum.flat_map(actionable, &format_block_card/1) + + medium_section = + if medium_blocks != [] do + n = length(medium_blocks) + word = if n == 1, do: "block", else: "blocks" + inner = Enum.flat_map(medium_blocks, &format_block_card/1) |> Enum.join("\n") + + [ + "
", + "#{n} medium-severity #{word} (expand)", + "", + inner, + "", + "
", + "" + ] + else + [] + end + + verdict_box ++ action_table ++ actionable_details ++ medium_section + end + + defp count_severities(blocks) do + blocks + |> Enum.map(fn b -> (List.first(b.potentials) || %{severity: :medium}).severity end) + |> Enum.frequencies() + end + + defp worst_severity(counts) do + cond do + Map.get(counts, :critical, 0) > 0 -> :critical + Map.get(counts, :high, 0) > 0 -> :high + Map.get(counts, :medium, 0) > 0 -> :medium + true -> :none + end + end + + defp verdict_text(:critical, counts) do + n = Map.get(counts, :critical, 0) + {"🔴", "#{n} critical #{pl(n, "block")} — review required before merge"} + end + + defp verdict_text(:high, counts) do + n = Map.get(counts, :high, 0) + Map.get(counts, :critical, 0) + {"🟠", "#{n} #{pl(n, "block")} need attention before merge"} + end + + defp verdict_text(:medium, counts) do + n = Map.get(counts, :medium, 0) + {"🟡", "#{n} #{pl(n, "block")} with minor issues (safe to merge)"} + end + + defp verdict_text(:none, _), do: {"🟢", "No block-level issues detected"} + + defp pl(1, word), do: word + defp pl(_, word), do: word <> "s" + + defp severity_summary(counts) do + [:critical, :high, :medium] + |> Enum.map(fn sev -> {sev, Map.get(counts, sev, 0)} end) + |> Enum.reject(fn {_, n} -> n == 0 end) + |> Enum.map_join(" · ", fn {sev, n} -> "**#{n} #{sev}**" end) + end + + defp format_block_card(block) do + alias CodeQA.HealthReport.BehaviorLabels + + end_line = block.end_line || block.start_line + top_potential = List.first(block.potentials) + icon = severity_icon(top_potential.severity) + label = BehaviorLabels.label(top_potential.category, top_potential.behavior) + + summary_line = "#{icon} #{block.path}:#{block.start_line}-#{end_line} — #{label}" + + issues = format_block_issues(block.potentials) + code_block = format_code_block(block) + + [ + "
", + "#{summary_line}", + "", + "**Issues:**", + "" + | issues + ] ++ ["", code_block, "", "
", ""] + end + + defp format_block_issues(potentials) do + Enum.flat_map(potentials, fn p -> + icon = severity_icon(p.severity) + label = String.upcase(to_string(p.severity)) + delta_str = format_num(p.cosine_delta) + line = "- #{icon} **#{label}** `#{p.category}/#{p.behavior}` (Δ #{delta_str})" + fix = if p.fix_hint, do: [" > #{p.fix_hint}"], else: [] + [line | fix] + end) + end + + defp format_code_block(%{source: nil}), do: "_Source code not available_" + + defp format_code_block(%{source: source, language: lang, start_line: start_line}) do + lang_hint = code_fence_lang(lang) + # Add line number comments for context + lines = String.split(source, "\n") + + numbered_lines = + lines + |> Enum.with_index(start_line) + |> Enum.map(fn {line, num} -> "#{String.pad_leading(to_string(num), 4)} │ #{line}" end) + |> Enum.join("\n") + + "```#{lang_hint}\n#{numbered_lines}\n```" + end + + defp code_fence_lang("elixir"), do: "elixir" + defp code_fence_lang("ruby"), do: "ruby" + defp code_fence_lang("javascript"), do: "javascript" + defp code_fence_lang("typescript"), do: "typescript" + defp code_fence_lang("python"), do: "python" + defp code_fence_lang("swift"), do: "swift" + defp code_fence_lang("kotlin"), do: "kotlin" + defp code_fence_lang("java"), do: "java" + defp code_fence_lang("go"), do: "go" + defp code_fence_lang("rust"), do: "rust" + defp code_fence_lang(_), do: "" + + defp severity_icon(:critical), do: "🔴" + defp severity_icon(:high), do: "🟠" + defp severity_icon(:medium), do: "🟡" + defp severity_icon(_), do: "⚪" end diff --git a/lib/codeqa/health_report/formatter/plain.ex b/lib/codeqa/health_report/formatter/plain.ex index 8471aef5..517fc5f8 100644 --- a/lib/codeqa/health_report/formatter/plain.ex +++ b/lib/codeqa/health_report/formatter/plain.ex @@ -4,8 +4,13 @@ defmodule CodeQA.HealthReport.Formatter.Plain do @spec render(map(), atom()) :: String.t() def render(report, detail) do [ + pr_summary_section(Map.get(report, :pr_summary)), header(report), + 
cosine_legend(), + delta_section(Map.get(report, :codebase_delta)), overall_table(report), + top_issues_section(Map.get(report, :top_issues, []), detail), + blocks_section(Map.get(report, :top_blocks, [])), category_sections(report.categories, detail) ] |> List.flatten() @@ -23,16 +28,24 @@ defmodule CodeQA.HealthReport.Formatter.Plain do ] end + defp cosine_legend do + [ + "> *Combined metric scores use cosine similarity: +1 = metric profile perfectly matches healthy pattern for this behavior, 0 = no signal, −1 = anti-pattern detected. Mapped to 0–100 using breakpoints (approx: ≥0.5→A, ≥0.2→B, ≥0.0→C, ≥−0.3→D, <−0.3→F); actual letter grades use the full 15-step scale.*", + "" + ] + end + defp overall_table(report) do rows = Enum.map(report.categories, fn cat -> summary = Map.get(cat, :summary, "") - "| #{cat.name} | #{cat.grade} | #{cat.score} | #{summary} |" + impact = Map.get(cat, :impact, "") + "| #{cat.name} | #{cat.grade} | #{cat.score} | #{impact} | #{summary} |" end) [ - "| Category | Grade | Score | Summary |", - "|----------|-------|-------|---------|" + "| Category | Grade | Score | Impact | Summary |", + "|----------|-------|-------|--------|---------|" | rows ] ++ [""] end @@ -41,15 +54,45 @@ defmodule CodeQA.HealthReport.Formatter.Plain do defp category_sections(categories, detail) do Enum.flat_map(categories, fn cat -> - section_header(cat) ++ metric_detail(cat) ++ worst_offenders_section(cat, detail) + render_category(cat, detail) end) end + defp render_category(%{type: :cosine} = cat, _detail) do + cosine_section_header(cat) ++ cosine_behaviors_table(cat) + end + + defp render_category(cat, _detail) do + section_header(cat) ++ metric_detail(cat) + end + + defp cosine_section_header(cat) do + n = length(cat.behaviors) + + [ + "## #{cat.name} — #{cat.grade}", + "", + "> Cosine similarity scores for #{n} behaviors.", + "" + ] + end + + defp cosine_behaviors_table(cat) do + rows = + Enum.map(cat.behaviors, fn b -> + "| #{b.behavior} | 
#{format_num(b.cosine)} | #{b.score} | #{b.grade} |" + end) + + [ + "| Behavior | Cosine | Score | Grade |", + "|----------|--------|-------|-------|" + | rows + ] ++ [""] + end + defp section_header(cat) do metric_summary = - cat.metric_scores - |> Enum.map(fn m -> "#{m.name}=#{format_num(m.value)}" end) - |> Enum.join(", ") + Enum.map_join(cat.metric_scores, ", ", fn m -> "#{m.name}=#{format_num(m.value)}" end) [ "## #{cat.name} — #{cat.grade}", @@ -76,66 +119,209 @@ defmodule CodeQA.HealthReport.Formatter.Plain do end end - defp worst_offenders_section(_cat, :summary), do: [] + defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) + defp format_num(value) when is_integer(value), do: to_string(value) + defp format_num(value), do: to_string(value) + + defp format_date(timestamp) when is_binary(timestamp) do + timestamp |> String.slice(0, 10) + end + + defp format_date(_), do: "unknown" + + defp top_issues_section([], _detail), do: [] + defp top_issues_section(_issues, :summary), do: [] + + defp top_issues_section(issues, _detail) do + rows = + Enum.map(issues, fn i -> + "| #{i.category}.#{i.behavior} | #{format_num(i.cosine)} | #{format_num(i.score)} |" + end) + + [ + "## Top Likely Issues", + "", + "> Ranked by cosine similarity — most negative means the file's metric profile best matches this anti-pattern.", + "", + "| Behavior | Cosine | Score |", + "|----------|--------|-------|" + | rows + ] ++ [""] + end + + defp pr_summary_section(nil), do: [] - defp worst_offenders_section(cat, _detail) do - offenders = Map.get(cat, :worst_offenders, []) + defp pr_summary_section(summary) do + delta_str = + if summary.score_delta >= 0, + do: "+#{summary.score_delta}", + else: "#{summary.score_delta}" - if offenders == [] do + status_str = "#{summary.files_modified} modified, #{summary.files_added} added" + + [ + "> **Score:** #{summary.base_grade} → #{summary.head_grade} | **Δ** #{delta_str} pts | **#{summary.blocks_flagged}** blocks 
flagged across #{summary.files_changed} files | #{status_str}", + "" + ] + end + + defp delta_section(nil), do: [] + + defp delta_section(delta) do + base_agg = delta.base.aggregate + head_agg = delta.head.aggregate + + metrics = [ + {"Readability", "readability", "mean_flesch_adapted"}, + {"Complexity", "halstead", "mean_difficulty"}, + {"Duplication", "compression", "mean_redundancy"}, + {"Structure", "branching", "mean_branch_count"} + ] + + rows = Enum.flat_map(metrics, &format_metric_row(&1, base_agg, head_agg)) + + if rows == [] do [] else - averages = Map.new(cat.metric_scores, &{&1.name, &1.value}) - - rows = - Enum.map(offenders, fn f -> - issues = - f.metric_scores - |> Enum.map(fn m -> - avg = Map.get(averages, m.name) - avg_str = if avg, do: " (avg: #{format_num(avg)})", else: "" - "#{direction(m.good)}#{m.name}=#{format_num(m.value)}#{avg_str}" - end) - |> Enum.join("
") - - "| #{format_path(f.path)}
#{format_lines(f[:lines])} lines · #{format_size(f[:bytes])} | #{f.grade} | #{issues} |" - end) - [ - "### Worst Offenders", + "## Metric Changes", "", - "| File | Grade | Issues |", - "|------|-------|--------|" + "| Category | Base | Head | Δ |", + "|----------|------|------|---|" | rows ] ++ [""] end end - defp format_path(path) when byte_size(path) < 80, do: "`#{path}`" + defp format_metric_row({label, group, key}, base_agg, head_agg) do + base_val = get_in(base_agg, [group, key]) + head_val = get_in(head_agg, [group, key]) - defp format_path(path) do - case String.split(path, "/") do - [file] -> "`#{file}`" - parts -> Enum.join(Enum.drop(parts, -1), "/") <> "/
`#{List.last(parts)}`" + if is_number(base_val) and is_number(head_val) do + diff = Float.round(head_val - base_val, 2) + diff_str = if diff >= 0, do: "+#{format_num(diff)}", else: "#{format_num(diff)}" + ["| #{label} | #{format_num(base_val)} | #{format_num(head_val)} | #{diff_str} |"] + else + [] end end - defp direction(:high), do: "↑ " - defp direction(_), do: "↓ " + defp blocks_section([]), do: ["## Code Blocks: 🟢 No block-level issues detected", ""] - defp format_lines(nil), do: "—" - defp format_lines(n), do: to_string(n) + defp blocks_section(top_blocks) do + alias CodeQA.HealthReport.BehaviorLabels - defp format_size(nil), do: "—" - defp format_size(bytes) when bytes < 1024, do: "#{bytes} B" - defp format_size(bytes), do: "#{Float.round(bytes / 1024, 1)} KB" + severity_counts = count_severities(top_blocks) + worst = worst_severity(severity_counts) + {icon, verdict} = verdict_text(worst, severity_counts) - defp format_num(value) when is_float(value), do: :erlang.float_to_binary(value, decimals: 2) - defp format_num(value) when is_integer(value), do: to_string(value) - defp format_num(value), do: to_string(value) + {actionable, medium_blocks} = + Enum.split_with(top_blocks, fn b -> + top = List.first(b.potentials) + top && top.severity in [:critical, :high] + end) - defp format_date(timestamp) when is_binary(timestamp) do - timestamp |> String.slice(0, 10) + header = ["## Code Blocks: #{icon} #{verdict}", ""] + + action_table = + if actionable != [] do + rows = + Enum.map(actionable, fn block -> + top = List.first(block.potentials) + label = BehaviorLabels.label(top.category, top.behavior) + location = "#{block.path}:#{block.start_line}-#{block.end_line || block.start_line}" + action = BehaviorLabels.action(top.category, top.behavior) + "| #{label} | #{location} | #{action} |" + end) + + [ + "| What | Where | Action |", + "|------|-------|--------|" + | rows + ] ++ [""] + else + [] + end + + block_details = Enum.flat_map(actionable ++ medium_blocks, 
&format_block/1) + + header ++ action_table ++ block_details end - defp format_date(_), do: "unknown" + defp count_severities(blocks) do + blocks + |> Enum.map(fn b -> (List.first(b.potentials) || %{severity: :medium}).severity end) + |> Enum.frequencies() + end + + defp worst_severity(counts) do + cond do + Map.get(counts, :critical, 0) > 0 -> :critical + Map.get(counts, :high, 0) > 0 -> :high + Map.get(counts, :medium, 0) > 0 -> :medium + true -> :none + end + end + + defp verdict_text(:critical, counts) do + n = Map.get(counts, :critical, 0) + {"🔴", "#{n} critical #{pl(n, "block")} — review required before merge"} + end + + defp verdict_text(:high, counts) do + n = Map.get(counts, :high, 0) + Map.get(counts, :critical, 0) + {"🟠", "#{n} #{pl(n, "block")} need attention before merge"} + end + + defp verdict_text(:medium, counts) do + n = Map.get(counts, :medium, 0) + {"🟡", "#{n} #{pl(n, "block")} with minor issues (safe to merge)"} + end + + defp verdict_text(:none, _), do: {"🟢", "No block-level issues detected"} + + defp pl(1, word), do: word + defp pl(_, word), do: word <> "s" + + defp format_block(block) do + end_line = block.end_line || block.start_line + status_str = if block.status, do: " [#{block.status}]", else: "" + + header = + "### #{block.path}:#{block.start_line}-#{end_line}#{status_str}" + + subheader = + "#{block.type} · #{block.token_count} tokens" + + potential_lines = Enum.flat_map(block.potentials, &format_potential/1) + code_lines = format_code_block(block) + [header, subheader, "" | potential_lines] ++ ["" | code_lines] ++ [""] + end + + defp format_code_block(%{source: nil}), do: ["_Source code not available_"] + + defp format_code_block(%{source: source, start_line: start_line}) do + lines = String.split(source, "\n") + + numbered_lines = + lines + |> Enum.with_index(start_line) + |> Enum.map(fn {line, num} -> " #{String.pad_leading(to_string(num), 4)} │ #{line}" end) + + ["```" | numbered_lines] ++ ["```"] + end + + defp format_potential(p) 
do + icon = severity_icon(p.severity) + delta_str = format_num(p.cosine_delta) + label = String.upcase(to_string(p.severity)) + line = " #{icon} #{label} #{p.category} / #{p.behavior} (Δ #{delta_str})" + fix = if p.fix_hint, do: [" → #{p.fix_hint}"], else: [] + [line | fix] + end + + defp severity_icon(:critical), do: "🔴" + defp severity_icon(:high), do: "🟠" + defp severity_icon(:medium), do: "🟡" + defp severity_icon(_), do: "⚪" end diff --git a/lib/codeqa/health_report/grader.ex b/lib/codeqa/health_report/grader.ex index 864cad32..d671a0bf 100644 --- a/lib/codeqa/health_report/grader.ex +++ b/lib/codeqa/health_report/grader.ex @@ -1,6 +1,9 @@ defmodule CodeQA.HealthReport.Grader do @moduledoc "Scores metrics and assigns letter grades." + alias CodeQA.Config + alias CodeQA.HealthReport.Categories + @doc """ Score a single metric value (0-100) based on thresholds and direction. @@ -9,35 +12,60 @@ defmodule CodeQA.HealthReport.Grader do """ @spec score_metric(map(), number()) :: integer() def score_metric(%{good: :high, thresholds: t}, value) do - value |> score_high_is_good(t) |> clamp(0, 100) + score_by_direction(:high, value, t) |> clamp(0, 100) end def score_metric(%{good: _, thresholds: t}, value) do - value |> score_low_is_good(t) |> clamp(0, 100) + score_by_direction(:low, value, t) |> clamp(0, 100) end - # Lower values are better: below A = 100, A = 90, A-B = 70-90, etc. - defp score_low_is_good(val, t) do + @doc """ + Maps cosine similarity [-1, +1] to a score [0, 100] with linear interpolation + within each band. Result is clamped to [0, 100] and rounded to an integer. 
+ + | Cosine range | Score range | + |---------------|-------------| + | [0.5, 1.0] | [90, 100] | + | [0.2, 0.5) | [70, 90) | + | [0.0, 0.2) | [50, 70) | + | [-0.3, 0.0) | [30, 50) | + | [-1.0, -0.3) | [0, 30) | + """ + @spec score_cosine(float()) :: integer() + def score_cosine(cosine) do + cosine + |> cosine_to_score() + |> clamp(0, 100) + |> round() + end + + defp cosine_to_score(c) when c >= 0.5, do: interpolate_between(c, 0.5, 90, 1.0, 100) + defp cosine_to_score(c) when c >= 0.2, do: interpolate_between(c, 0.2, 70, 0.5, 90) + defp cosine_to_score(c) when c >= 0.0, do: interpolate_between(c, 0.0, 50, 0.2, 70) + defp cosine_to_score(c) when c >= -0.3, do: interpolate_between(c, -0.3, 30, 0.0, 50) + defp cosine_to_score(c), do: interpolate_between(c, -1.0, 0, -0.3, 30) + + # :low — lower values are better (t.a < t.b < t.c < t.d); below t.a = 100 + # :high — higher values are better (t.a > t.b > t.c > t.d); above t.a = 100 + defp score_by_direction(:low, val, t) do cond do val < t.a -> 100 val == t.a -> 90 val <= t.b -> interpolate_between(val, t.a, 90, t.b, 70) val <= t.c -> interpolate_between(val, t.b, 70, t.c, 50) val <= t.d -> interpolate_between(val, t.c, 50, t.d, 30) - true -> interpolate_below_d(val, t.d, 30) + true -> interpolate_beyond_d(val, t.d, 30) end end - # Higher values are better: above A = 100, A = 90, A-B = 70-90, etc. 
- # Thresholds are in descending order (a > b > c > d) - defp score_high_is_good(val, t) do + defp score_by_direction(:high, val, t) do cond do val > t.a -> 100 val == t.a -> 90 val >= t.b -> interpolate_between(val, t.a, 90, t.b, 70) val >= t.c -> interpolate_between(val, t.b, 70, t.c, 50) val >= t.d -> interpolate_between(val, t.c, 50, t.d, 30) - true -> interpolate_below_d_high(val, t.d, 30) + true -> interpolate_beyond_d(val, t.d, 30) end end @@ -52,27 +80,22 @@ defmodule CodeQA.HealthReport.Grader do end end - # Value beyond D threshold (low is good): score degrades below 30 - defp interpolate_below_d(_val, threshold_d, _score_at_d) when threshold_d == 0, do: 0 + # Score degrades below 30 when value is beyond the D threshold in either direction. + # abs(val - threshold_d) captures overshoot for :low and undershoot for :high uniformly. + defp interpolate_beyond_d(_val, 0, _score_at_d), do: 0 - defp interpolate_below_d(val, threshold_d, score_at_d) do - overshoot = (val - threshold_d) / threshold_d - round(Kernel.max(0, score_at_d - overshoot * score_at_d)) + defp interpolate_beyond_d(val, threshold_d, score_at_d) do + deviation = abs(val - threshold_d) / threshold_d + round(Kernel.max(0, score_at_d - deviation * score_at_d)) end - # Value below D threshold (high is good): score degrades below 30 - defp interpolate_below_d_high(_val, threshold_d, _score_at_d) when threshold_d == 0, do: 0 - - defp interpolate_below_d_high(val, threshold_d, score_at_d) do - undershoot = (threshold_d - val) / threshold_d - round(Kernel.max(0, score_at_d - undershoot * score_at_d)) + defp clamp(val, min_val, max_val) do + val |> Kernel.max(min_val) |> Kernel.min(max_val) end - defp clamp(val, min_val, max_val), do: val |> Kernel.max(min_val) |> Kernel.min(max_val) - @doc "Convert a numeric score (0-100) to a letter grade using the given scale." 
# Weighted mean of the `:score` values in `scored`, rounded to an integer.
#
# Each entry contributes `score * weight`. When the weights do not add up to a positive
# number (which includes the empty list) there is nothing to average and the result is 0.
defp weighted_category_score(scored) do
  weight_sum =
    for entry <- scored, reduce: 0.0 do
      acc -> acc + entry.weight
    end

  cond do
    weight_sum > 0 ->
      weighted_sum =
        for entry <- scored, reduce: 0.0 do
          acc -> acc + entry.score * entry.weight
        end

      round(weighted_sum / weight_sum)

    true ->
      0
  end
end
@doc """
Compute the overall score as a weighted average of category scores.

Each category's weight is looked up in `impact_map` under `to_string(category.key)`,
so both atom and string keys work. A category whose key is absent gets weight `1`.

The average is rounded to the nearest integer. With an empty `impact_map` every
category weighs `1`, which yields the plain arithmetic mean; note that the previous
`/2` implementation truncated that mean with `div/2`, whereas this one rounds it.

Returns `{0, "F"}` when there are no category grades or when the weights do not add
up to a positive number (for example every category is configured with impact `0`).
"""
@spec overall_score(
        categories :: [map()],
        grade_scale :: [{number(), String.t()}],
        impact_map :: %{String.t() => number()}
      ) :: {integer(), String.t()}
def overall_score(
      category_grades,
      scale \\ Categories.default_grade_scale(),
      impact_map \\ %{}
    ) do
  {weighted_sum, total_impact} =
    Enum.reduce(category_grades, {0, 0}, fn g, {ws, ti} ->
      impact = Map.get(impact_map, to_string(g.key), 1)
      {ws + g.score * impact, ti + impact}
    end)

  if total_impact > 0 do
    avg = round(weighted_sum / total_impact)
    {avg, grade_letter(avg, scale)}
  else
    # Nothing to average: either no categories or all weights are zero. Guarding here
    # avoids dividing by zero.
    {0, "F"}
  end
end
# Turns a snake_case category slug into a display name: the slug is split on "_",
# each part is passed through String.capitalize/1, and the parts are joined with a
# single space.
defp humanize_category(slug) do
  words = for part <- String.split(slug, "_"), do: String.capitalize(part)
  Enum.join(words, " ")
end
@doc """
Returns up to three nodes with the highest refactoring impact.

A node's impact is the sum of the numeric `"cosine_delta"` values found in its
`"refactoring_potentials"` list. Nodes without that list, or with an empty one, score
`0.0`; they remain eligible and are returned when fewer than three nodes score higher.
Potential entries that carry no numeric `"cosine_delta"` are ignored.

Only the nodes passed in are ranked; nested children are not traversed.

Returns `[]` when `nodes` is `nil` or empty.
"""
@spec top_3_nodes(list() | nil) :: list()
def top_3_nodes(nil), do: []
def top_3_nodes([]), do: []

def top_3_nodes(nodes) when is_list(nodes) do
  nodes
  |> Enum.sort_by(&node_impact_score/1, :desc)
  |> Enum.take(3)
end

# Sum of the numeric cosine deltas of a node. Entries with a missing or non-numeric
# "cosine_delta" are skipped so that one incomplete entry cannot raise while sorting.
defp node_impact_score(%{"refactoring_potentials" => potentials}) when is_list(potentials) do
  Enum.reduce(potentials, 0.0, fn
    %{"cosine_delta" => delta}, acc when is_number(delta) -> acc + delta
    _other, acc -> acc
  end)
end

defp node_impact_score(_), do: 0.0
@doc """
Builds a map from category to the single worst block for that category.

A block is a candidate for every category that appears among its potentials. Within a
category the block whose potential has the highest `cosine_delta` wins, and its source
code is attached before it is returned. Which blocks are considered at all (changed
files, line-count limits, diff overlap via `:diff_line_ranges`) is decided by the same
block collection step that `build/4` uses.
"""
@spec worst_per_category(map(), [struct()], map(), keyword()) :: %{String.t() => map()}
def worst_per_category(analysis_results, changed_files, codebase_cosine_lookup, opts \\ []) do
  root = get_in(analysis_results, ["metadata", "path"]) || "."
  blocks = collect_enriched_blocks(analysis_results, changed_files, codebase_cosine_lookup, opts)

  # One {category, {block, delta}} candidate per potential of every block.
  candidates =
    for block <- blocks, potential <- block.potentials do
      {potential.category, {block, potential.cosine_delta}}
    end

  candidates
  |> Enum.group_by(fn {category, _pair} -> category end, fn {_category, pair} -> pair end)
  |> Map.new(fn {category, pairs} ->
    {worst, _delta} = Enum.max_by(pairs, &elem(&1, 1))
    {category, add_source_code(worst, root)}
  end)
end
# True when the node spans between `min_lines` and `max_lines` lines, both inclusive.
# A missing "start_line" counts as line 1 and a missing "end_line" as the start line,
# so a node without any position information is treated as a one-line block.
@spec block_in_line_range?(map(), pos_integer(), pos_integer()) :: boolean()
defp block_in_line_range?(node, min_lines, max_lines) do
  first_line = node["start_line"] || 1
  last_line = node["end_line"] || first_line
  span = last_line - first_line + 1

  cond do
    span < min_lines -> false
    span > max_lines -> false
    true -> true
  end
end
# Adds `:source` and `:language` to an enriched block.
#
# `:source` is the text of lines start_line..end_line (1-based, inclusive) of the file
# at `base_path`/`block.path`, or nil when the file cannot be read. `:language` is the
# `name/0` of whatever `CodeQA.Language.detect/1` returns for `block.path`.
defp add_source_code(block, base_path) do
  # Same defaults as the line-range and diff-overlap filters in this module: a missing
  # start line means line 1, a missing end line means the start line. Clamping to 1
  # also keeps the slice index non-negative (a negative index would count from the end
  # of the file).
  start_line = max(block.start_line || 1, 1)
  end_line = block.end_line || start_line

  source =
    case File.read(Path.join(base_path, block.path)) do
      {:ok, content} ->
        content
        |> String.split("\n")
        |> Enum.slice((start_line - 1)..(end_line - 1)//1)
        |> Enum.join("\n")

      {:error, _reason} ->
        nil
    end

  lang = CodeQA.Language.detect(block.path).name()
  Map.merge(block, %{source: source, language: lang})
end
: ; @ # * + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class struct namespace template] + + @impl true + def branch_keywords, do: ~w[else catch case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private protected static virtual override inline] + + @impl true + def module_keywords, do: ~w[class struct namespace enum] +end diff --git a/lib/codeqa/languages/code/native/go.ex b/lib/codeqa/languages/code/native/go.ex new file mode 100644 index 00000000..b728aab4 --- /dev/null +++ b/lib/codeqa/languages/code/native/go.ex @@ -0,0 +1,51 @@ +defmodule CodeQA.Languages.Code.Native.Go do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "go" + + @impl true + def extensions, do: ~w[go] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for func type struct interface package import return var const + map chan go defer select switch case break continue default fallthrough + range make new append len cap close nil true false + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= := + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
defmodule CodeQA.Languages.Code.Native.Haskell do
  @moduledoc false
  use CodeQA.Language

  @impl true
  def name, do: "haskell"

  @impl true
  def extensions, do: ~w[hs lhs]

  @impl true
  def comment_prefixes, do: ~w[--]

  @impl true
  def block_comments, do: [{"{-", "-}"}]

  # Each keyword is listed exactly once (`do` used to appear twice).
  @impl true
  def keywords, do: ~w[
    if else then for do let in where module import data type newtype class
    instance deriving case of return True False Nothing Just
    infixl infixr infix qualified as hiding
  ]

  @impl true
  def operators, do: ~w[
    == /= <= >= + - * / ^ && || ! $ . <$> <*> >>= >> -> <- :: = | @ ~
  ]

  # The square brackets are appended from a second sigil because they cannot appear
  # unescaped inside a ~w[...] literal.
  @impl true
  def delimiters, do: ~w[
    ( ) { } , . : ; | @ -> <- ::
  ] ++ ~w( [ ] )

  @impl true
  def declaration_keywords, do: ~w[data type newtype class instance]

  @impl true
  def branch_keywords, do: ~w[else]

  @impl true
  def block_end_tokens, do: []

  @impl true
  def function_keywords, do: ~w[where let]

  @impl true
  def module_keywords, do: ~w[module class instance]

  @impl true
  def import_keywords, do: ~w[import]

  @impl true
  def test_keywords, do: ~w[test it describe prop]

  @impl true
  def uses_colon_indent?, do: true
end
: ; | @ -> + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[let type module class] + + @impl true + def branch_keywords, do: ~w[else with when] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def access_modifiers, do: ~w[mutable virtual] + + @impl true + def function_keywords, do: ~w[let fun] + + @impl true + def module_keywords, do: ~w[module struct functor class] + + @impl true + def import_keywords, do: ~w[open include] +end diff --git a/lib/codeqa/languages/code/native/rust.ex b/lib/codeqa/languages/code/native/rust.ex new file mode 100644 index 00000000..0616834d --- /dev/null +++ b/lib/codeqa/languages/code/native/rust.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Rust do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "rust" + + @impl true + def extensions, do: ~w[rs] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while loop fn struct enum trait impl use mod pub let mut const + static return match type where as in ref move async await dyn unsafe extern + crate self super true false + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= -> => :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[fn struct enum trait impl mod] + + @impl true + def branch_keywords, do: ~w[else match] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[pub] + + @impl true + def function_keywords, do: ~w[fn] + + @impl true + def module_keywords, do: ~w[impl trait struct enum] + + @impl true + def import_keywords, do: ~w[use extern] +end diff --git a/lib/codeqa/languages/code/native/swift.ex b/lib/codeqa/languages/code/native/swift.ex new file mode 100644 index 00000000..04225287 --- /dev/null +++ b/lib/codeqa/languages/code/native/swift.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Swift do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "swift" + + @impl true + def extensions, do: ~w[swift] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while repeat func class struct enum protocol extension import + return let var guard defer do try catch throw switch case break continue + default in as is init self super nil true false async await + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || ?? = += -= *= /= %= -> => + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[func class struct enum protocol extension] + + @impl true + def branch_keywords, do: ~w[else catch case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private internal fileprivate open] + + @impl true + def function_keywords, do: ~w[func] + + @impl true + def module_keywords, do: ~w[class struct protocol extension enum] + + @impl true + def import_keywords, do: ~w[import] +end diff --git a/lib/codeqa/languages/code/native/zig.ex b/lib/codeqa/languages/code/native/zig.ex new file mode 100644 index 00000000..f3e13f85 --- /dev/null +++ b/lib/codeqa/languages/code/native/zig.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Native.Zig do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "zig" + + @impl true + def extensions, do: ~w[zig] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + const var fn if else for while switch return pub try catch error defer errdefer + comptime inline struct enum union test break continue null undefined unreachable + async await suspend resume orelse anytype anyerror bool void noreturn type + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= %= orelse catch + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[fn struct enum union] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[pub inline comptime] + + @impl true + def function_keywords, do: ~w[fn] + + @impl true + def module_keywords, do: ~w[struct enum union] + + @impl true + def test_keywords, do: ~w[test] +end diff --git a/lib/codeqa/languages/code/scripting/julia.ex b/lib/codeqa/languages/code/scripting/julia.ex new file mode 100644 index 00000000..8f859d1d --- /dev/null +++ b/lib/codeqa/languages/code/scripting/julia.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Scripting.Julia do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "julia" + + @impl true + def extensions, do: ~w[jl] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [{"#=", "=#"}] + + @impl true + def keywords, do: ~w[ + if else elseif for while do end function return module import using export + struct mutable abstract type primitive begin let local global const try catch + finally throw macro quote true false nothing + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ^ << >> & | ~ && || = += -= *= /= ÷ → ← |> + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function struct macro module] + + @impl true + def branch_keywords, do: ~w[else elseif catch finally] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def function_keywords, do: ~w[function macro] + + @impl true + def module_keywords, do: ~w[module struct] + + @impl true + def import_keywords, do: ~w[import using] + + @impl true + def test_keywords, do: ~w[@test @testset] +end diff --git a/lib/codeqa/languages/code/scripting/lua.ex b/lib/codeqa/languages/code/scripting/lua.ex new file mode 100644 index 00000000..7ae8e9d3 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/lua.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.Languages.Code.Scripting.Lua do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "lua" + + @impl true + def extensions, do: ~w[lua] + + @impl true + def comment_prefixes, do: ~w[--] + + @impl true + def block_comments, do: [{"--[[", "]]"}] + + @impl true + def keywords, do: ~w[ + and break do else elseif end false for function goto if in local nil not or + repeat return then true until while + ] + + @impl true + def operators, do: ~w[ + == ~= <= >= + - * / % ^ # & | ~ << >> // .. = and or not + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function local] + + @impl true + def branch_keywords, do: ~w[else elseif] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def function_keywords, do: ~w[function] + + @impl true + def import_keywords, do: ~w[require] +end diff --git a/lib/codeqa/languages/code/scripting/perl.ex b/lib/codeqa/languages/code/scripting/perl.ex new file mode 100644 index 00000000..3155f1c3 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/perl.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Scripting.Perl do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "perl" + + @impl true + def extensions, do: ~w[pl pm t] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elsif unless for foreach while do until sub my our local use require + package return last next redo goto print say die warn eval and or not defined + undef true false + ] + + @impl true + def operators, do: ~w[ + == != <= >= eq ne lt gt le ge + - * / % ** . x = += -= *= /= .= && || ! ~ & | + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
defmodule CodeQA.Languages.Code.Scripting.PHP do
  @moduledoc false
  use CodeQA.Language

  @impl true
  def name, do: "php"

  @impl true
  def extensions, do: ~w[php phtml php3 php4 php5 php7 php8]

  @impl true
  def comment_prefixes, do: ~w[// #]

  @impl true
  def block_comments, do: [{"/*", "*/"}]

  @impl true
  def keywords, do: ~w[
    if else elseif for foreach while do function class interface trait namespace
    use return new echo print public private protected static abstract final
    try catch finally throw switch case break continue default include require
    include_once require_once extends implements null true false
  ]

  @impl true
  def operators, do: ~w[
    == === != !== <= >= + - * / % ** << >> & | ^ ~ && || ?? = += -= *= /= %= -> :: =>
  ]

  # The square brackets are appended from a second sigil because they cannot appear
  # unescaped inside a ~w[...] literal.
  @impl true
  def delimiters, do: ~w[
    ( ) { } , . : ; @ # $
  ] ++ ~w( [ ] )

  @impl true
  def declaration_keywords, do: ~w[function class interface trait namespace]

  @impl true
  def branch_keywords, do: ~w[else elseif catch finally case default]

  @impl true
  def block_end_tokens, do: ~w[} endif endfor endforeach endwhile endswitch]

  @impl true
  def access_modifiers, do: ~w[public private protected static abstract final]

  @impl true
  def function_keywords, do: ~w[function fn]

  @impl true
  def module_keywords, do: ~w[class interface trait namespace]

  # PHP pulls in other files with require/include (and their *_once forms) in addition
  # to `use`; they are listed here the same way the Ruby, Perl, Lua and R definitions
  # list their `require`-style statements.
  @impl true
  def import_keywords, do: ~w[use namespace require require_once include include_once]
end
: ; @ # + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def class async] + + @impl true + def branch_keywords, do: ~w[elif else except finally] + + @impl true + def block_end_tokens, do: [] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[def async] + + @impl true + def module_keywords, do: ~w[class] + + @impl true + def import_keywords, do: ~w[import from] + + @impl true + def uses_colon_indent?, do: true +end diff --git a/lib/codeqa/languages/code/scripting/r.ex b/lib/codeqa/languages/code/scripting/r.ex new file mode 100644 index 00000000..d735d2b1 --- /dev/null +++ b/lib/codeqa/languages/code/scripting/r.ex @@ -0,0 +1,49 @@ +defmodule CodeQA.Languages.Code.Scripting.R do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "r" + + @impl true + def extensions, do: ~w[r R Rmd rmd] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else for while repeat break next return function TRUE FALSE NULL NA Inf NaN + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / ^ %% %/% %in% <- -> = & | ! && || ~ : :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def function_keywords, do: ~w[function] + + @impl true + def import_keywords, do: ~w[library require source] + + @impl true + def test_keywords, do: ~w[test_that expect_equal expect_true describe it] +end diff --git a/lib/codeqa/languages/code/scripting/ruby.ex b/lib/codeqa/languages/code/scripting/ruby.ex new file mode 100644 index 00000000..d1e9761e --- /dev/null +++ b/lib/codeqa/languages/code/scripting/ruby.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.Languages.Code.Scripting.Ruby do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "ruby" + + @impl true + def extensions, do: ~w[rb rake gemspec] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elsif unless for while until def class module do end return begin + rescue ensure raise yield include extend require require_relative + attr_accessor attr_reader attr_writer then case when next break in + and or not true false nil self super + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ** << >> & | ^ ~ = += -= *= /= %= **= <=> === =~ + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . : ; @ | # ? 
+ ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def class module] + + @impl true + def branch_keywords, do: ~w[else elsif rescue ensure when] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[def] + + @impl true + def module_keywords, do: ~w[class module] + + @impl true + def import_keywords, do: ~w[require require_relative include] + + @impl true + def test_keywords, do: ~w[it describe context scenario feature given] +end diff --git a/lib/codeqa/languages/code/scripting/shell.ex b/lib/codeqa/languages/code/scripting/shell.ex new file mode 100644 index 00000000..710d28aa --- /dev/null +++ b/lib/codeqa/languages/code/scripting/shell.ex @@ -0,0 +1,47 @@ +defmodule CodeQA.Languages.Code.Scripting.Shell do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "shell" + + @impl true + def extensions, do: ~w[sh bash zsh fish] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else elif fi for while do done case esac function return then in until + select break continue exit local export readonly unset + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % && || | & > < >> << = += -= *= /= %= -eq -ne -lt -gt -le -ge + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . : ; @ # $ ! ? 
| + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function] + + @impl true + def branch_keywords, do: ~w[else elif case] + + @impl true + def block_end_tokens, do: ~w[fi done esac] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[function] +end diff --git a/lib/codeqa/languages/code/vm/clojure.ex b/lib/codeqa/languages/code/vm/clojure.ex new file mode 100644 index 00000000..5dd149be --- /dev/null +++ b/lib/codeqa/languages/code/vm/clojure.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Vm.Clojure do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "clojure" + + @impl true + def extensions, do: ~w[clj cljs cljc edn] + + @impl true + def comment_prefixes, do: ~w[;] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + def defn defmacro let fn if do when cond case for loop recur ns require use + import try catch finally throw quote defprotocol defrecord deftype reify + extend-type extend-protocol nil true false and or not + ] + + @impl true + def operators, do: ~w[ + = == not= < > <= >= + - * / mod rem quot and or not + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; # @ ^ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def defn defmacro defprotocol defrecord deftype] + + @impl true + def branch_keywords, do: ~w[else] + + @impl true + def block_end_tokens, do: ~w[)] + + @impl true + def function_keywords, do: ~w[defn fn] + + @impl true + def module_keywords, do: ~w[ns defprotocol defrecord] + + @impl true + def import_keywords, do: ~w[ns require use import] + + @impl true + def test_keywords, do: ~w[deftest is testing] +end diff --git a/lib/codeqa/languages/code/vm/csharp.ex b/lib/codeqa/languages/code/vm/csharp.ex new file mode 100644 index 00000000..85edce73 --- /dev/null +++ b/lib/codeqa/languages/code/vm/csharp.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Vm.CSharp do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "csharp" + + @impl true + def extensions, do: ~w[cs csx] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for foreach while do class interface struct enum namespace using + return var new this base public private protected internal static abstract + virtual override sealed async await try catch finally throw switch case + break continue default in out ref void true false null readonly const + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || ?? = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class interface struct enum namespace] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, + do: + ~w[public private protected internal static abstract virtual override sealed readonly const async] + + @impl true + def module_keywords, do: ~w[class interface struct enum namespace] + + @impl true + def import_keywords, do: ~w[using namespace] +end diff --git a/lib/codeqa/languages/code/vm/dart.ex b/lib/codeqa/languages/code/vm/dart.ex new file mode 100644 index 00000000..e821e226 --- /dev/null +++ b/lib/codeqa/languages/code/vm/dart.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.Languages.Code.Vm.Dart do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "dart" + + @impl true + def extensions, do: ~w[dart] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do switch case break continue return class extends implements + with new final const var void null true false import export part library + abstract static dynamic async await yield try catch finally throw rethrow + enum typedef mixin factory is as in + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % ~/ << >> & | ^ ~ && || ?? = += -= *= /= %= ??= -> => + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class enum typedef mixin] + + @impl true + def branch_keywords, do: ~w[else catch finally case] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[static final const abstract] + + @impl true + def function_keywords, do: ~w[void async] + + @impl true + def module_keywords, do: ~w[class enum mixin] + + @impl true + def import_keywords, do: ~w[import export] + + @impl true + def test_keywords, do: ~w[test group setUp tearDown expect] +end diff --git a/lib/codeqa/languages/code/vm/elixir.ex b/lib/codeqa/languages/code/vm/elixir.ex new file mode 100644 index 00000000..2eab0274 --- /dev/null +++ b/lib/codeqa/languages/code/vm/elixir.ex @@ -0,0 +1,59 @@ +defmodule CodeQA.Languages.Code.Vm.Elixir do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "elixir" + + @impl true + def extensions, do: ~w[ex exs] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else unless for do end def defp defmodule defmacro defmacrop defprotocol + defimpl defguard defdelegate defstruct case cond with when fn try rescue + catch raise receive in not and or true false nil + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % << >> & | ^ ~ && || |> <> <- -> = ! not and or in + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, + do: + ~w[def defp defmodule defmacro defmacrop defprotocol defimpl defdelegate defoverridable defguard] + + # Elixir's try/receive blocks use `after`; `ensure` is the Ruby spelling. + @impl true + def branch_keywords, do: ~w[else rescue catch after cond when case] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def access_modifiers, do: [] + + @impl true + def function_keywords, do: ~w[def defp defmacro defmacrop defdelegate defguard] + + @impl true + def module_keywords, do: ~w[defmodule defprotocol defimpl] + + @impl true + def import_keywords, do: ~w[import require use alias] + + @impl true + def test_keywords, do: ~w[test describe] +end diff --git a/lib/codeqa/languages/code/vm/erlang.ex b/lib/codeqa/languages/code/vm/erlang.ex new file mode 100644 index 00000000..c835dd63 --- /dev/null +++ b/lib/codeqa/languages/code/vm/erlang.ex @@ -0,0 +1,54 @@ +defmodule CodeQA.Languages.Code.Vm.Erlang do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "erlang" + + @impl true + def extensions, do: ~w[erl hrl] + + @impl true + def comment_prefixes, do: ~w[%] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if case when of begin end receive after fun try catch throw error exit + module export import define record true false ok undefined andalso orelse + not band bor bxor bnot bsl bsr div rem + ] + + @impl true + def operators, do: ~w[ + == /= =< >= =:= =/= + - * / ! <- -> :: | . , ; : + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; | -> + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[-module -record -define] + + @impl true + def branch_keywords, do: ~w[of after catch] + + @impl true + def block_end_tokens, do: ~w[end] + + @impl true + def function_keywords, do: ~w[fun] + + @impl true + def module_keywords, do: ~w[-module] + + @impl true + def import_keywords, do: ~w[-import -include] + + @impl true + def test_keywords, do: ~w[_test_ _test] +end diff --git a/lib/codeqa/languages/code/vm/fsharp.ex b/lib/codeqa/languages/code/vm/fsharp.ex new file mode 100644 index 00000000..9c7792f3 --- /dev/null +++ b/lib/codeqa/languages/code/vm/fsharp.ex @@ -0,0 +1,60 @@ +defmodule CodeQA.Languages.Code.Vm.Fsharp do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "fsharp" + + @impl true + def extensions, do: ~w[fs fsi fsx] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"(*", "*)"}] + + @impl true + def keywords, do: ~w[ + let rec if then else for while do match with type module open namespace val + mutable abstract member override new return yield async await try finally + raise true false null and or not in when downto to + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = |> <| >> << -> <- :: @ ? + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; | @ # -> + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[let type module] + + @impl true + def branch_keywords, do: ~w[else with] + + @impl true + def block_end_tokens, do: [] + + @impl true + def access_modifiers, do: ~w[public private protected internal static abstract override] + + @impl true + def function_keywords, do: ~w[let fun] + + @impl true + def module_keywords, do: ~w[module namespace type] + + @impl true + def import_keywords, do: ~w[open] + + @impl true + def test_keywords, do: ~w[testCase test testProperty] + + @impl true + def uses_colon_indent?, do: true +end diff --git a/lib/codeqa/languages/code/vm/java.ex b/lib/codeqa/languages/code/vm/java.ex new file mode 100644 index 00000000..fa018e0c --- /dev/null +++ b/lib/codeqa/languages/code/vm/java.ex @@ -0,0 +1,52 @@ +defmodule CodeQA.Languages.Code.Vm.Java do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "java" + + @impl true + def extensions, do: ~w[java] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do class interface extends implements import package + return new this super public private protected static abstract final + synchronized volatile try catch finally throw throws switch case break + continue default void true false null instanceof + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> >>> & | ^ ~ && || = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[class interface] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private protected static abstract final synchronized] + + @impl true + def module_keywords, do: ~w[class interface enum] + + @impl true + def import_keywords, do: ~w[import package] +end diff --git a/lib/codeqa/languages/code/vm/kotlin.ex b/lib/codeqa/languages/code/vm/kotlin.ex new file mode 100644 index 00000000..4c286c27 --- /dev/null +++ b/lib/codeqa/languages/code/vm/kotlin.ex @@ -0,0 +1,55 @@ +defmodule CodeQA.Languages.Code.Vm.Kotlin do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "kotlin" + + @impl true + def extensions, do: ~w[kt kts] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do fun class object interface data sealed abstract enum + companion import package return val var when is as in out by override open + final private protected public internal suspend inline reified crossinline + noinline try catch finally throw break continue null true false this super init + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % << >> & | ^ ~ && || ?: = += -= *= /= %= -> => :: + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # | + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[fun class object interface data sealed abstract enum] + + @impl true + def branch_keywords, do: ~w[else when catch finally] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[public private protected internal override open abstract final] + + @impl true + def function_keywords, do: ~w[fun] + + @impl true + def module_keywords, do: ~w[class interface object] + + @impl true + def import_keywords, do: ~w[import package] +end diff --git a/lib/codeqa/languages/code/vm/scala.ex b/lib/codeqa/languages/code/vm/scala.ex new file mode 100644 index 00000000..08ac7ab1 --- /dev/null +++ b/lib/codeqa/languages/code/vm/scala.ex @@ -0,0 +1,58 @@ +defmodule CodeQA.Languages.Code.Vm.Scala do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "scala" + + @impl true + def extensions, do: ~w[scala sc] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while do def class object trait extends with new return import + package val var type match case sealed abstract override final protected + private implicit lazy yield try catch finally throw true false null this super + ] + + @impl true + def operators, do: ~w[ + == != <= >= + - * / % << >> & | ^ ~ && || = += -= *= /= => <- <: >: : + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[def class object trait type] + + @impl true + def branch_keywords, do: ~w[else catch case finally] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, + do: ~w[public private protected override abstract final sealed implicit lazy] + + @impl true + def function_keywords, do: ~w[def] + + @impl true + def module_keywords, do: ~w[class object trait package] + + @impl true + def import_keywords, do: ~w[import package] + + @impl true + def test_keywords, do: ~w[test it describe should] +end diff --git a/lib/codeqa/languages/code/web/javascript.ex b/lib/codeqa/languages/code/web/javascript.ex new file mode 100644 index 00000000..87f48f59 --- /dev/null +++ b/lib/codeqa/languages/code/web/javascript.ex @@ -0,0 +1,57 @@ +defmodule CodeQA.Languages.Code.Web.JavaScript do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "javascript" + + @impl true + def extensions, do: ~w[js mjs cjs jsx vue svelte] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while function class return var let const import export from + new this typeof instanceof try catch finally throw switch case break + continue default delete in of async await yield true false null undefined + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % ** << >> >>> & | ^ ~ && || ?? = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function class async] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, do: ~w[export static] + + @impl true + def function_keywords, do: ~w[function async] + + @impl true + def module_keywords, do: ~w[class] + + @impl true + def import_keywords, do: ~w[import] + + @impl true + def test_keywords, do: ~w[test it describe context scenario feature given] +end diff --git a/lib/codeqa/languages/code/web/typescript.ex b/lib/codeqa/languages/code/web/typescript.ex new file mode 100644 index 00000000..b8a422af --- /dev/null +++ b/lib/codeqa/languages/code/web/typescript.ex @@ -0,0 +1,60 @@ +defmodule CodeQA.Languages.Code.Web.TypeScript do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "typescript" + + @impl true + def extensions, do: ~w[ts tsx] + + @impl true + def comment_prefixes, do: ~w[//] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + if else for while function class return var let const import export from + new this typeof instanceof try catch finally throw switch case break + continue default delete in of async await yield true false null undefined + type interface enum namespace declare abstract override readonly implements + extends satisfies as keyof typeof infer never unknown any void + ] + + @impl true + def operators, do: ~w[ + == === != !== <= >= + - * / % ** << >> >>> & | ^ ~ && || ?? = += -= *= /= %= + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . 
: ; @ # => < + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[function class async interface enum namespace type declare] + + @impl true + def branch_keywords, do: ~w[else catch finally case default] + + @impl true + def block_end_tokens, do: ~w[}] + + @impl true + def access_modifiers, + do: ~w[export public private protected static abstract override readonly sealed] + + @impl true + def function_keywords, do: ~w[function async] + + @impl true + def module_keywords, do: ~w[class interface enum namespace] + + @impl true + def import_keywords, do: ~w[import] + + @impl true + def test_keywords, do: ~w[test it describe context scenario feature given] +end diff --git a/lib/codeqa/languages/config/dockerfile.ex b/lib/codeqa/languages/config/dockerfile.ex new file mode 100644 index 00000000..e1ed3a69 --- /dev/null +++ b/lib/codeqa/languages/config/dockerfile.ex @@ -0,0 +1,35 @@ +defmodule CodeQA.Languages.Config.Dockerfile do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "dockerfile" + + @impl true + def extensions, do: ~w[Dockerfile] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + FROM RUN CMD LABEL EXPOSE ENV ADD COPY ENTRYPOINT VOLUME USER WORKDIR ARG + ONBUILD STOPSIGNAL HEALTHCHECK SHELL AS + ] + + @impl true + def operators, do: ~w[ + = \ + ] + + @impl true + def delimiters, do: ~w[ + ( ) , : # + ] ++ ~w( [ ] ) + + @impl true + def declaration_keywords, do: ~w[FROM] +end diff --git a/lib/codeqa/languages/config/makefile.ex b/lib/codeqa/languages/config/makefile.ex new file mode 100644 index 00000000..ffb45221 --- /dev/null +++ b/lib/codeqa/languages/config/makefile.ex @@ -0,0 +1,32 @@ +defmodule CodeQA.Languages.Config.Makefile do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "makefile" + + @impl true + def extensions, do: ~w[Makefile GNUmakefile mk] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl 
true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + ifeq ifneq ifdef ifndef else endif define endef include export unexport + override private vpath all clean install + ] + + @impl true + def operators, do: ~w[ + = := ::= ?= += != + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . : ; @ $ % # \ + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/config/terraform.ex b/lib/codeqa/languages/config/terraform.ex new file mode 100644 index 00000000..c35cb9f2 --- /dev/null +++ b/lib/codeqa/languages/config/terraform.ex @@ -0,0 +1,33 @@ +defmodule CodeQA.Languages.Config.Terraform do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "terraform" + + @impl true + def extensions, do: ~w[tf tfvars] + + @impl true + def comment_prefixes, do: ~w[# //] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + resource data variable output locals module provider terraform + required_providers backend for_each count depends_on lifecycle + source version true false null for if + ] + + @impl true + def operators, do: ~w[ + = == != <= >= && || ! ? : + ] + + @impl true + def delimiters, do: ~w[ + { } ( ) , . : = " # // + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/data/graphql.ex b/lib/codeqa/languages/data/graphql.ex new file mode 100644 index 00000000..47dbc51f --- /dev/null +++ b/lib/codeqa/languages/data/graphql.ex @@ -0,0 +1,32 @@ +defmodule CodeQA.Languages.Data.GraphQL do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "graphql" + + @impl true + def extensions, do: ~w[graphql gql] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + query mutation subscription fragment on type interface union enum input + scalar schema directive extend implements true false null + ] + + @impl true + def operators, do: ~w[ + = : ! 
| & + ] + + @impl true + def delimiters, do: ~w[ + { } ( ) , . : # @ ! + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/data/json.ex b/lib/codeqa/languages/data/json.ex new file mode 100644 index 00000000..0b1909e5 --- /dev/null +++ b/lib/codeqa/languages/data/json.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.Languages.Data.Json do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "json" + + @impl true + def extensions, do: ~w[json jsonc] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + true false null + ] + + @impl true + def operators, do: ~w[ + : + ] + + @impl true + def delimiters, do: ~w[ + { } , " ' + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/data/sql.ex b/lib/codeqa/languages/data/sql.ex new file mode 100644 index 00000000..ddc40181 --- /dev/null +++ b/lib/codeqa/languages/data/sql.ex @@ -0,0 +1,40 @@ +defmodule CodeQA.Languages.Data.Sql do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "sql" + + @impl true + def extensions, do: ~w[sql] + + @impl true + def comment_prefixes, do: ~w[--] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + SELECT FROM WHERE INSERT INTO UPDATE DELETE SET CREATE DROP ALTER TABLE + INDEX VIEW JOIN LEFT RIGHT INNER OUTER FULL CROSS ON AND OR NOT IN EXISTS + AS GROUP BY ORDER HAVING LIMIT OFFSET DISTINCT NULL TRUE FALSE PRIMARY KEY + FOREIGN REFERENCES CASCADE UNIQUE DEFAULT VALUES RETURNING WITH UNION + INTERSECT EXCEPT CASE WHEN THEN ELSE END IF BEGIN COMMIT ROLLBACK + ] + + @impl true + def operators, do: ~w[ + = != <> <= >= + - * / % LIKE BETWEEN IS IN + ] + + @impl true + def delimiters, do: ~w[ + ( ) , . 
; ' " -- /* + ] ++ ~w( [ ] ) + + @impl true + def statement_keywords, + do: + ~w[select insert update delete create drop alter truncate begin commit rollback call execute] +end diff --git a/lib/codeqa/languages/data/toml.ex b/lib/codeqa/languages/data/toml.ex new file mode 100644 index 00000000..1051c0dc --- /dev/null +++ b/lib/codeqa/languages/data/toml.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.Languages.Data.Toml do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "toml" + + @impl true + def extensions, do: ~w[toml] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + true false + ] + + @impl true + def operators, do: ~w[ + = + ] + + @impl true + def delimiters, do: ~w[ + { } , . : # " ' + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/data/yaml.ex b/lib/codeqa/languages/data/yaml.ex new file mode 100644 index 00000000..8beb0cbb --- /dev/null +++ b/lib/codeqa/languages/data/yaml.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.Languages.Data.Yaml do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "yaml" + + @impl true + def extensions, do: ~w[yml yaml] + + @impl true + def comment_prefixes, do: ~w[#] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + true false null yes no on off + ] + + @impl true + def operators, do: ~w[ + : | > & * ! + ] + + @impl true + def delimiters, do: ~w[ + { } , . 
# @ --- + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/language.ex b/lib/codeqa/languages/language.ex new file mode 100644 index 00000000..3ccd1728 --- /dev/null +++ b/lib/codeqa/languages/language.ex @@ -0,0 +1,181 @@ +defmodule CodeQA.Language do + @moduledoc false + @callback name() :: String.t() + @callback extensions() :: [String.t()] + @callback comment_prefixes() :: [String.t()] + @callback block_comments() :: [{String.t(), String.t()}] + @callback keywords() :: [String.t()] + @callback operators() :: [String.t()] + @callback delimiters() :: [String.t()] + + @callback declaration_keywords() :: [String.t()] + @callback branch_keywords() :: [String.t()] + @callback block_end_tokens() :: [String.t()] + @callback access_modifiers() :: [String.t()] + @callback statement_keywords() :: [String.t()] + + @callback function_keywords() :: [String.t()] + @callback module_keywords() :: [String.t()] + @callback import_keywords() :: [String.t()] + @callback test_keywords() :: [String.t()] + @callback uses_colon_indent?() :: boolean() + @callback divider_indicators() :: [String.t()] + + @optional_callbacks [ + declaration_keywords: 0, + branch_keywords: 0, + block_end_tokens: 0, + access_modifiers: 0, + statement_keywords: 0, + function_keywords: 0, + module_keywords: 0, + import_keywords: 0, + test_keywords: 0, + uses_colon_indent?: 0, + divider_indicators: 0 + ] + + defmacro __using__(_opts) do + quote do + @behaviour CodeQA.Language + def declaration_keywords, do: [] + def branch_keywords, do: [] + def block_end_tokens, do: [] + def access_modifiers, do: [] + def statement_keywords, do: [] + def function_keywords, do: [] + def module_keywords, do: [] + def import_keywords, do: [] + def test_keywords, do: [] + def uses_colon_indent?, do: false + def divider_indicators, do: ~w[-- - == === ~ * ** # // / =] + + defoverridable declaration_keywords: 0, + branch_keywords: 0, + block_end_tokens: 0, + access_modifiers: 0, + statement_keywords: 0, + function_keywords: 0, 
+ module_keywords: 0, + import_keywords: 0, + test_keywords: 0, + uses_colon_indent?: 0, + divider_indicators: 0 + end + end + + @spec all() :: [module()] + def all do + {:ok, modules} = :application.get_key(:codeqa, :modules) + Enum.filter(modules, &implements?/1) + end + + @spec all_keywords() :: [String.t()] + def all_keywords do + all() + |> Enum.flat_map(& &1.keywords()) + |> Enum.uniq() + end + + # find/1 never returns nil (unknown names resolve to CodeQA.Languages.Unknown), + # so the three lookups below need no nil clause; unrecognised languages get + # that fallback module's token lists. + @spec keywords(atom() | String.t()) :: MapSet.t() + def keywords(language), do: MapSet.new(find(language).keywords()) + + @spec operators(atom() | String.t()) :: MapSet.t() + def operators(language), do: MapSet.new(find(language).operators()) + + @spec delimiters(atom() | String.t()) :: MapSet.t() + def delimiters(language), do: MapSet.new(find(language).delimiters()) + + @spec declaration_keywords(module()) :: MapSet.t() + def declaration_keywords(mod), do: MapSet.new(mod.declaration_keywords()) + + @spec branch_keywords(module()) :: MapSet.t() + def branch_keywords(mod), do: MapSet.new(mod.branch_keywords()) + + @spec block_end_tokens(module()) :: MapSet.t() + def block_end_tokens(mod), do: MapSet.new(mod.block_end_tokens()) + + @spec access_modifiers(module()) :: MapSet.t() + def access_modifiers(mod), do: MapSet.new(mod.access_modifiers()) + + @spec statement_keywords(module()) :: MapSet.t() + def statement_keywords(mod), do: MapSet.new(mod.statement_keywords()) + + @spec function_keywords(module()) :: MapSet.t() + def function_keywords(mod), do: MapSet.new(mod.function_keywords()) + + @spec module_keywords(module()) :: MapSet.t() + def module_keywords(mod), do: MapSet.new(mod.module_keywords()) + + @spec import_keywords(module()) :: MapSet.t() + def import_keywords(mod), do: MapSet.new(mod.import_keywords()) + + @spec test_keywords(module()) :: MapSet.t() + def test_keywords(mod), do: 
MapSet.new(mod.test_keywords()) + + @spec divider_indicators(module()) :: MapSet.t() + def divider_indicators(mod), do: MapSet.new(mod.divider_indicators()) + + @spec find(atom() | String.t()) :: module() + def find(language) do + name = to_string(language) + Enum.find(all(), fn mod -> mod.name() == name end) || CodeQA.Languages.Unknown + end + + @spec detect(String.t()) :: module() + def detect(path) do + basename = Path.basename(path) + ext = path |> Path.extname() |> String.trim_leading(".") + + Enum.find(all(), fn mod -> + ext in mod.extensions() or (ext == "" and basename in mod.extensions()) + end) || CodeQA.Languages.Unknown + end + + @spec strip_comments(String.t(), module()) :: String.t() + def strip_comments(content, language_mod) do + content + |> strip_block_comments(language_mod.block_comments()) + |> strip_line_comments(language_mod.comment_prefixes()) + end + + defp strip_block_comments(content, []), do: content + + defp strip_block_comments(content, pairs) do + Enum.reduce(pairs, content, fn {open, close}, acc -> + regex = Regex.compile!(Regex.escape(open) <> ".*?" 
<> Regex.escape(close), [:dotall]) + + Regex.replace(regex, acc, fn match -> + String.replace(match, ~r/[^\n]/, "") + end) + end) + end + + defp strip_line_comments(content, []), do: content + + defp strip_line_comments(content, prefixes) do + pattern = Enum.map_join(prefixes, "|", &Regex.escape/1) + Regex.replace(Regex.compile!("(#{pattern}).*$", [:multiline]), content, "") + end + + defp implements?(module) do + # Each @behaviour declaration is stored as its own :behaviour attribute entry, + # so gather every entry instead of reading only the first one. + behaviours = Keyword.get_values(module.__info__(:attributes), :behaviour) + CodeQA.Language in List.flatten(behaviours) + rescue + # Best effort: modules that cannot be introspected are treated as non-languages. + _ -> false + end +end diff --git a/lib/codeqa/languages/markup/css.ex b/lib/codeqa/languages/markup/css.ex new file mode 100644 index 00000000..0b0af143 --- /dev/null +++ b/lib/codeqa/languages/markup/css.ex @@ -0,0 +1,32 @@ +defmodule CodeQA.Languages.Markup.Css do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "css" + + @impl true + def extensions, do: ~w[css scss sass less] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [{"/*", "*/"}] + + @impl true + def keywords, do: ~w[ + media keyframes import charset supports layer font-face from to + auto none inherit initial unset normal bold italic + ] + + @impl true + def operators, do: ~w[ + : ; > + ~ * = ^= $= *= ~= |= + ] + + @impl true + def delimiters, do: ~w[ + { } ( ) , . 
# : ; @ + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/markup/html.ex b/lib/codeqa/languages/markup/html.ex new file mode 100644 index 00000000..31a0fe8d --- /dev/null +++ b/lib/codeqa/languages/markup/html.ex @@ -0,0 +1,34 @@ +defmodule CodeQA.Languages.Markup.Html do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "html" + + @impl true + def extensions, do: ~w[html htm heex eex leex erb htmlbars hbs mustache jinja jinja2 njk liquid] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [{""}] + + @impl true + def keywords, do: ~w[ + html head body div span p a img input form button select option textarea + script style link meta title h1 h2 h3 h4 h5 h6 ul ol li table tr td th + header footer nav main section article aside figure figcaption + class id href src type name value rel action method placeholder + ] + + @impl true + def operators, do: ~w[ + < > / = & + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } , . : ; " ' # ! ? + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/markup/markdown.ex b/lib/codeqa/languages/markup/markdown.ex new file mode 100644 index 00000000..ee75d60c --- /dev/null +++ b/lib/codeqa/languages/markup/markdown.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.Languages.Markup.Markdown do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "markdown" + + @impl true + def extensions, do: ~w[md mdx] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + TODO NOTE FIXME WARNING IMPORTANT + ] + + @impl true + def operators, do: ~w[ + # ## ### #### ##### ###### > ``` ** * _ __ ~~ + ] + + @impl true + def delimiters, do: ~w[ + ( ) . ! ? 
` * _ ~ + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/markup/xml.ex b/lib/codeqa/languages/markup/xml.ex new file mode 100644 index 00000000..85c76687 --- /dev/null +++ b/lib/codeqa/languages/markup/xml.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.Languages.Markup.Xml do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "xml" + + @impl true + def extensions, do: ~w[xml svg xsl xslt xsd wsdl plist] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [{""}] + + @impl true + def keywords, do: ~w[ + xmlns version encoding standalone + ] + + @impl true + def operators, do: ~w[ + < > / = & + ] + + @impl true + def delimiters, do: ~w[ + ( ) , . : ; " ' # ! ? + ] ++ ~w( [ ] ) +end diff --git a/lib/codeqa/languages/unknown.ex b/lib/codeqa/languages/unknown.ex new file mode 100644 index 00000000..11a0f7ac --- /dev/null +++ b/lib/codeqa/languages/unknown.ex @@ -0,0 +1,31 @@ +defmodule CodeQA.Languages.Unknown do + @moduledoc false + use CodeQA.Language + + @impl true + def name, do: "unknown" + + @impl true + def extensions, do: [] + + @impl true + def comment_prefixes, do: [] + + @impl true + def block_comments, do: [] + + @impl true + def keywords, do: ~w[ + if else + ] + + @impl true + def operators, do: ~w[ + == != + ] + + @impl true + def delimiters, do: ~w[ + ( ) { } + ] +end diff --git a/lib/codeqa/metrics/codebase/codebase_metric.ex b/lib/codeqa/metrics/codebase/codebase_metric.ex new file mode 100644 index 00000000..8b275c82 --- /dev/null +++ b/lib/codeqa/metrics/codebase/codebase_metric.ex @@ -0,0 +1,14 @@ +defmodule CodeQA.Metrics.Codebase.CodebaseMetric do + @moduledoc """ + Behaviour for metrics that operate across an entire codebase. + + Unlike `FileMetric`, which analyzes a single file, codebase metrics receive + a map of all source files and can compute cross-file properties such as + duplication or structural similarity. + + See [software metrics](https://en.wikipedia.org/wiki/Software_metric). 
+ """ + + @callback name() :: String.t() + @callback analyze(%{String.t() => String.t()}, keyword()) :: map() +end diff --git a/lib/codeqa/metrics/codebase/near_duplicate_blocks_codebase.ex b/lib/codeqa/metrics/codebase/near_duplicate_blocks_codebase.ex new file mode 100644 index 00000000..2e821e98 --- /dev/null +++ b/lib/codeqa/metrics/codebase/near_duplicate_blocks_codebase.ex @@ -0,0 +1,44 @@ +defmodule CodeQA.Metrics.Codebase.NearDuplicateBlocksCodebase do + @moduledoc """ + Counts near-duplicate and exact-duplicate natural code blocks across the codebase. + + Detects blocks per file, pools them, and finds pairs across all files. + Includes pair source lists (capped by max_pairs_per_bucket). + + Configure in .codeqa.yml: + near_duplicate_blocks: + max_pairs_per_bucket: 50 + """ + + @behaviour CodeQA.Metrics.Codebase.CodebaseMetric + + alias CodeQA.Analysis.FileContextServer + alias CodeQA.Metrics.File.NearDuplicateBlocks + + @impl true + def name, do: "near_duplicate_blocks_codebase" + + @impl true + def analyze(files, opts \\ []) do + ndb_opts = Keyword.get(opts, :near_duplicate_blocks, []) + max_pairs = Keyword.get(ndb_opts, :max_pairs_per_bucket, nil) + workers = Keyword.get(opts, :workers, System.schedulers_online()) + + ndb_opts = + [include_pairs: true, max_pairs_per_bucket: max_pairs, workers: workers] ++ + Keyword.take(opts, [:on_progress]) + + pid = Keyword.fetch!(opts, :file_context_pid) + + all_blocks = + Enum.flat_map(files, fn {path, content} -> + ctx = FileContextServer.get(pid, content, path: path) + NearDuplicateBlocks.label_blocks(ctx.blocks, path) + end) + + result = NearDuplicateBlocks.analyze_from_blocks(all_blocks, ndb_opts) + + result + |> Map.reject(fn {k, _} -> k in ["block_count", "sub_block_count"] end) + end +end diff --git a/lib/codeqa/metrics/similarity.ex b/lib/codeqa/metrics/codebase/similarity.ex similarity index 65% rename from lib/codeqa/metrics/similarity.ex rename to lib/codeqa/metrics/codebase/similarity.ex index 
910e631d..e20e556f 100644 --- a/lib/codeqa/metrics/similarity.ex +++ b/lib/codeqa/metrics/codebase/similarity.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Similarity do +defmodule CodeQA.Metrics.Codebase.Similarity do @moduledoc """ Detects cross-file code duplication at the codebase level. @@ -9,23 +9,19 @@ defmodule CodeQA.Metrics.Similarity do See [winnowing](https://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf), [locality-sensitive hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing), and [normalized compression distance](https://en.wikipedia.org/wiki/Normalized_compression_distance). - - ## Options - - - `:show_ncd` — boolean, whether to compute per-pair NCD scores (default: `false`) - - `:ncd_paths` — list of file paths to compute similarity for (default: all files) - - `:ncd_top` — integer, max similar files to return per path (default: all) - - `:ncd_threshold` — minimum Jaccard similarity to consider as candidate (default: `0.20`) - - `:workers` — number of parallel workers (default: `System.schedulers_online/0`) - - `:on_progress` — include this key (any value) to enable progress output to stderr - - `:fp_stopwords` — `MapSet` of fingerprint hashes to exclude (default: empty) """ - @behaviour CodeQA.Metrics.CodebaseMetric + @behaviour CodeQA.Metrics.Codebase.CodebaseMetric + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.CLI.UI + alias CodeQA.Metrics.File.Winnowing @impl true def name, do: "similarity" + def keys, do: ["ncd_pairs", "cross_file_density"] + @spec analyze(map(), keyword()) :: map() @impl true def analyze(files, opts \\ []) @@ -37,6 +33,9 @@ defmodule CodeQA.Metrics.Similarity do def analyze(files, opts) do names = Map.keys(files) contents = Map.values(files) + has_progress = Keyword.has_key?(opts, :on_progress) + + if has_progress, do: IO.puts(:stderr, " Computing cross-file density...") result = %{ "cross_file_density" => cross_file_density(contents) @@ -80,38 +79,45 @@ defmodule 
CodeQA.Metrics.Similarity do if has_progress, do: IO.puts(:stderr, " 2/5 Computing Winnowing fingerprints...") result = - CodeQA.Telemetry.time(:ncd_fingerprinting, fn -> - contents - |> Enum.with_index() - |> Task.async_stream( - fn {content, i} -> - fp = compute_fingerprints(content, opts) - {i, fp} - end, max_concurrency: workers, timeout: :infinity) - |> Enum.map(fn {:ok, {i, fp}} -> - print_bar_progress(has_progress, i, length(contents), "Fingerprinting") + contents + |> Enum.with_index() + |> Task.async_stream( + fn {content, i} -> + fp = compute_fingerprints(content, opts) {i, fp} - end) - |> Map.new() + end, + max_concurrency: workers, + timeout: :infinity + ) + |> Enum.map(fn {:ok, {i, fp}} -> + maybe_print_fingerprint_progress(has_progress, i, length(contents)) + {i, fp} end) + |> Map.new() if has_progress, do: IO.puts(:stderr, "") result end + defp maybe_print_fingerprint_progress(false, _i, _total), do: :ok + + defp maybe_print_fingerprint_progress(true, i, total) do + if rem(i + 1, max(1, div(total, 20))) == 0 do + IO.write(:stderr, "\r" <> UI.progress_bar(i + 1, total, label: "Fingerprinting")) + end + end + defp build_inverted_index(fingerprints_by_id, has_progress) do if has_progress, do: IO.puts(:stderr, " 3/5 Building inverted index...") total = map_size(fingerprints_by_id) result = - CodeQA.Telemetry.time(:ncd_build_index, fn -> - fingerprints_by_id - |> Enum.with_index() - |> Enum.reduce(%{}, fn {{i, set}, idx}, acc -> - print_bar_progress(has_progress, idx, total, "Indexing") - index_fingerprint_set(set, i, acc) - end) + fingerprints_by_id + |> Enum.with_index() + |> Enum.reduce(%{}, fn {{i, set}, idx}, acc -> + maybe_print_index_progress(has_progress, idx, total) + index_fingerprint_set(set, i, acc) end) if has_progress, do: IO.puts(:stderr, "") @@ -124,6 +130,14 @@ defmodule CodeQA.Metrics.Similarity do end) end + defp maybe_print_index_progress(false, _idx, _total), do: :ok + + defp maybe_print_index_progress(true, idx, total) do + if 
rem(idx + 1, max(1, div(total, 20))) == 0 do + IO.write(:stderr, "\r" <> UI.progress_bar(idx + 1, total, label: "Indexing")) + end + end + defp find_candidate_pairs( fingerprints_by_id, inverted_index, @@ -136,37 +150,37 @@ defmodule CodeQA.Metrics.Similarity do if has_progress, do: IO.puts(:stderr, " 4/5 Identifying candidate pairs...") total = map_size(fingerprints_by_id) - names_tuple = List.to_tuple(names) candidates = - CodeQA.Telemetry.time(:ncd_lsh_filter, fn -> - fingerprints_by_id - |> Enum.with_index() - |> Task.async_stream( - fn {{i, set}, idx} -> - valid_pairs = - collect_valid_pairs( - i, - set, - inverted_index, - fingerprints_by_id, - names_tuple, - target_set, - threshold - ) - - {idx, valid_pairs} - end, max_concurrency: workers, timeout: :infinity) - |> Enum.reduce(%{}, fn {:ok, {idx, valid_pairs}}, acc -> - print_bar_progress(has_progress, idx, total, "LSH Filter") - merge_valid_pairs(valid_pairs, acc) - end) + fingerprints_by_id + |> Enum.with_index() + |> Task.async_stream( + fn {{i, set}, idx} -> + valid_pairs = + collect_valid_pairs( + i, + set, + inverted_index, + fingerprints_by_id, + names, + target_set, + threshold + ) + + {idx, valid_pairs} + end, + max_concurrency: workers, + timeout: :infinity + ) + |> Enum.reduce(%{}, fn {:ok, {idx, valid_pairs}}, acc -> + maybe_print_lsh_progress(has_progress, idx, total) + merge_valid_pairs(valid_pairs, acc) end) if has_progress, do: IO.puts(:stderr, "") Enum.map(candidates, fn {{i, j}, jaccard} -> - {elem(names_tuple, i), i, elem(names_tuple, j), j, jaccard} + {Enum.at(names, i), i, Enum.at(names, j), j, jaccard} end) end @@ -175,19 +189,19 @@ defmodule CodeQA.Metrics.Similarity do set, inverted_index, fingerprints_by_id, - names_tuple, + names, target_set, threshold ) do collisions = count_collisions(set, inverted_index, i) size_a = MapSet.size(set) - name_a = elem(names_tuple, i) + name_a = Enum.at(names, i) is_target_a = MapSet.member?(target_set, name_a) collisions - |> Enum.filter(fn {j, _} 
-> is_target_a or MapSet.member?(target_set, elem(names_tuple, j)) end) + |> Enum.filter(fn {j, _} -> is_target_a or MapSet.member?(target_set, Enum.at(names, j)) end) |> Enum.reduce([], fn {j, intersection}, acc_pairs -> jaccard = compute_jaccard(size_a, MapSet.size(Map.get(fingerprints_by_id, j)), intersection) if jaccard >= threshold, do: [{{i, j}, jaccard} | acc_pairs], else: acc_pairs @@ -217,11 +231,11 @@ defmodule CodeQA.Metrics.Similarity do end) end - defp print_bar_progress(false, _current, _total, _label), do: :ok + defp maybe_print_lsh_progress(false, _idx, _total), do: :ok - defp print_bar_progress(true, current, total, label) do - if rem(current + 1, max(1, div(total, 20))) == 0 do - IO.write(:stderr, "\r" <> CodeQA.CLI.UI.progress_bar(current + 1, total, label: label)) + defp maybe_print_lsh_progress(true, idx, total) do + if rem(idx + 1, max(1, div(total, 20))) == 0 do + IO.write(:stderr, "\r" <> UI.progress_bar(idx + 1, total, label: "LSH Filter")) end end @@ -240,26 +254,25 @@ defmodule CodeQA.Metrics.Similarity do counter = :counters.new(1, [:atomics]) start_time_ncd = System.monotonic_time(:millisecond) - CodeQA.Telemetry.time(:ncd_exact_compression_phase, fn -> - filtered_pairs - |> Task.async_stream( - fn {name_a, i, name_b, j, _jaccard} -> - ncd = compute_single_ncd(precomputed, i, j) - maybe_print_ncd_progress(has_progress, counter, total_pairs, start_time_ncd) - {name_a, name_b, ncd} - end, max_concurrency: workers, timeout: :infinity) - |> Enum.map(fn {:ok, res} -> res end) - end) + filtered_pairs + |> Task.async_stream( + fn {name_a, i, name_b, j, _jaccard} -> + ncd = compute_single_ncd(precomputed, i, j) + maybe_print_ncd_progress(has_progress, counter, total_pairs, start_time_ncd) + {name_a, name_b, ncd} + end, + max_concurrency: workers, + timeout: :infinity + ) + |> Enum.map(fn {:ok, res} -> res end) end defp compute_single_ncd(precomputed, i, j) do - CodeQA.Telemetry.time(:ncd_single_compression, fn -> - {a, ca} = elem(precomputed, 
i) - {b, cb} = elem(precomputed, j) - cab = byte_size(:zlib.compress([a, b])) - ncd = if max(ca, cb) > 0, do: (cab - min(ca, cb)) / max(ca, cb), else: 0.0 - Float.round(ncd, 4) - end) + {a, ca} = elem(precomputed, i) + {b, cb} = elem(precomputed, j) + cab = byte_size(:zlib.compress([a, b])) + ncd = if max(ca, cb) > 0, do: (cab - min(ca, cb)) / max(ca, cb), else: 0.0 + Float.round(ncd, 4) end defp maybe_print_ncd_progress(false, _counter, _total_pairs, _start_time), do: :ok @@ -275,8 +288,8 @@ defmodule CodeQA.Metrics.Similarity do eta_ms = round((total_pairs - c) * avg_time) output = - CodeQA.CLI.UI.progress_bar(c, total_pairs, - eta: CodeQA.CLI.UI.format_eta(eta_ms), + UI.progress_bar(c, total_pairs, + eta: UI.format_eta(eta_ms), label: "NCD Compression" ) @@ -316,13 +329,11 @@ defmodule CodeQA.Metrics.Similarity do end end - defp compute_fingerprints(content, opts) do - fp_stopwords = Keyword.get(opts, :fp_stopwords, MapSet.new()) - + defp compute_fingerprints(content, _opts) do content - |> CodeQA.Metrics.TokenNormalizer.normalize() - |> CodeQA.Metrics.Winnowing.kgrams(5) - |> Enum.reject(&MapSet.member?(fp_stopwords, &1)) + |> TokenNormalizer.normalize_structural() + |> Enum.map(& &1.kind) + |> Winnowing.kgrams(5) |> MapSet.new() end diff --git a/lib/codeqa/metrics/codebase_metric.ex b/lib/codeqa/metrics/codebase_metric.ex deleted file mode 100644 index 0b1284d6..00000000 --- a/lib/codeqa/metrics/codebase_metric.ex +++ /dev/null @@ -1,42 +0,0 @@ -defmodule CodeQA.Metrics.CodebaseMetric do - @moduledoc """ - Behaviour for metrics that operate across an entire codebase. - - Unlike `FileMetric`, which analyzes a single file, codebase metrics receive - a map of all source files and can compute cross-file properties such as - duplication or structural similarity. 
- - ## Common opts keys - - Implementations may accept keyword options including: - - `:workers` — number of parallel workers (default: `System.schedulers_online/0`) - - `:on_progress` — progress callback key (presence enables progress output) - - ## Minimal implementation - - defmodule MyCodebaseMetric do - @behaviour CodeQA.Metrics.CodebaseMetric - - @impl true - def name, do: "my_metric" - - @impl true - def analyze(files, _opts) do - %{"file_count" => map_size(files)} - end - end - - See [software metrics](https://en.wikipedia.org/wiki/Software_metric). - """ - - @typedoc "Map of file path to file content string." - @type file_map :: %{required(String.t()) => String.t()} - - @callback name() :: String.t() - @callback analyze(file_map(), keyword()) :: map() - - @doc "Human-readable description of what this metric measures." - @callback description() :: String.t() - - @optional_callbacks [description: 0] -end diff --git a/lib/codeqa/metrics/file/bradford.ex b/lib/codeqa/metrics/file/bradford.ex new file mode 100644 index 00000000..22b7bcee --- /dev/null +++ b/lib/codeqa/metrics/file/bradford.ex @@ -0,0 +1,99 @@ +defmodule CodeQA.Metrics.File.Bradford do + @moduledoc """ + Applies Bradford's concentration law to token density across lines. + + Lines are ranked by token count (densest first), then grouped into three + zones of equal total tokens. The ratio between zone sizes gives Bradford's + k values: how many more lines each successive zone needs to match the + token yield of the previous one. + + k ≈ 1 uniform density — tokens spread evenly across lines + k = 3–5 Bradford-like — a small dense core, long sparse tail + k >> 5 extreme concentration — a few lines carry almost all tokens + + k1 = zone2_lines / zone1_lines (core → middle transition) + k2 = zone3_lines / zone2_lines (middle → tail transition) + k_ratio = k2 / k1 (> 1 means tail is more stretched than core) + + In a perfect Bradford distribution k1 ≈ k2. 
In practice k2 > k1 is common + (moderate core, very stretched tail); k1 > k2 suggests extreme concentration + that levels off quickly. + + See [Bradford's law](https://en.wikipedia.org/wiki/Bradford%27s_law). + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "bradford" + + @impl true + def keys, do: ["k1", "k2", "k_ratio"] + + @spec analyze(map()) :: map() + @impl true + def analyze(%{tokens: []}) do + %{"k1" => 0.0, "k2" => 0.0, "k_ratio" => 0.0} + end + + def analyze(%{tokens: tokens}) do + # Count tokens per line using the .line field, then rank densest-first — + # this is Bradford's "sort sources by yield" step. + counts = + tokens + |> Enum.group_by(& &1.line) + |> Enum.map(fn {_line, toks} -> length(toks) end) + |> Enum.sort(:desc) + + total = Enum.sum(counts) + + # Need at least 3 lines and 3 tokens to form meaningful zones. + if total < 3 or length(counts) < 3 do + %{"k1" => 0.0, "k2" => 0.0, "k_ratio" => 0.0} + else + # Each zone should contain one third of all tokens. + # We find zone boundaries by walking the ranked list until each third is filled. + third = total / 3 + + # n1: lines in zone 1 (the dense core — fewest lines, highest token density) + # n2: lines in zone 2 (middle tier) + # n3: all remaining lines (the sparse tail) + {n1, rest} = count_until(counts, third) + {n2, _} = count_until(rest, third) + n3 = length(counts) - n1 - n2 + + # k1 > 1 always: the middle zone always needs more lines than the core. + # Higher k1 = more extreme concentration in the core (fewer lines do more work). + k1 = if n1 > 0, do: Float.round(n2 / n1, 4), else: 0.0 + + # k2 > 1 always: the tail always needs more lines than the middle. + # Higher k2 = longer sparse tail relative to the middle zone. 
+ k2 = if n2 > 0, do: Float.round(n3 / n2, 4), else: 0.0 + + # k_ratio = k2 / k1 + # > 1: the tail is more stretched than the core is concentrated (common — many trivial lines) + # < 1: the core is more extreme than the tail is sparse (god-function pattern) + # ≈ 1: a clean Bradford distribution where each zone multiplies evenly + k_ratio = if k1 > 0, do: Float.round(k2 / k1, 4), else: 0.0 + + %{"k1" => k1, "k2" => k2, "k_ratio" => k_ratio} + end + end + + # Walks the density-ranked list, consuming lines until the accumulated token + # count reaches the zone target. Returns {lines_consumed, remaining_list}. + # The remaining list is passed directly to the next zone's count_until call, + # so zones are computed in a single linear pass over the sorted counts. + defp count_until(counts, target), do: do_count(counts, target, 0, 0) + + defp do_count([], _target, n, _acc), do: {n, []} + + defp do_count([h | rest], target, n, acc) do + new_acc = acc + h + # Once we've accumulated enough tokens to fill the zone, stop and return + # the remainder so the next zone can continue from where we left off. + if new_acc >= target, + do: {n + 1, rest}, + else: do_count(rest, target, n + 1, new_acc) + end +end diff --git a/lib/codeqa/metrics/branching.ex b/lib/codeqa/metrics/file/branching.ex similarity index 70% rename from lib/codeqa/metrics/branching.ex rename to lib/codeqa/metrics/file/branching.ex index 2cfdbe1d..ce5e20a0 100644 --- a/lib/codeqa/metrics/branching.ex +++ b/lib/codeqa/metrics/file/branching.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Branching do +defmodule CodeQA.Metrics.File.Branching do @moduledoc """ Measures branching density as a proxy for cyclomatic complexity. @@ -12,7 +12,7 @@ defmodule CodeQA.Metrics.Branching do See [cyclomatic complexity](https://en.wikipedia.org/wiki/Cyclomatic_complexity). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric # Python: if elif else for while try except finally with match case # Ruby: if elsif else unless for while until case when begin rescue ensure @@ -43,18 +43,14 @@ defmodule CodeQA.Metrics.Branching do @impl true def name, do: "branching" - @spec analyze(map()) :: map() @impl true - def analyze(%{lines: lines, tokens: tokens}) do - non_blank_count = - lines - |> Tuple.to_list() - |> Enum.count(&(String.trim(&1) != "")) + def keys, do: ["branching_density", "branch_count", "non_blank_count", "max_nesting_depth"] - branch_count = - tokens - |> Tuple.to_list() - |> Enum.count(&MapSet.member?(@branching_keywords, &1)) + @spec analyze(CodeQA.Engine.FileContext.t()) :: map() + @impl true + def analyze(%{lines: lines, tokens: tokens, content: content}) do + non_blank_count = Enum.count(lines, &(String.trim(&1) != "")) + branch_count = Enum.count(tokens, &MapSet.member?(@branching_keywords, &1.content)) density = if non_blank_count > 0, @@ -64,7 +60,19 @@ defmodule CodeQA.Metrics.Branching do %{ "branching_density" => density, "branch_count" => branch_count, - "non_blank_count" => non_blank_count + "non_blank_count" => non_blank_count, + "max_nesting_depth" => max_nesting_depth(content) } end + + defp max_nesting_depth(content) do + content + |> String.graphemes() + |> Enum.reduce({0, 0}, fn + c, {depth, max} when c in ["(", "[", "{"] -> {depth + 1, max(depth + 1, max)} + c, {depth, max} when c in [")", "]", "}"] -> {max(depth - 1, 0), max} + _, acc -> acc + end) + |> elem(1) + end end diff --git a/lib/codeqa/metrics/file/brevity.ex b/lib/codeqa/metrics/file/brevity.ex new file mode 100644 index 00000000..bc0d9a62 --- /dev/null +++ b/lib/codeqa/metrics/file/brevity.ex @@ -0,0 +1,50 @@ +defmodule CodeQA.Metrics.File.Brevity do + @moduledoc """ + Measures how well Brevity law holds in the token distribution. + + Computes the Pearson correlation between token length and token frequency. 
+ A negative value indicates shorter tokens appear more often (law holds). + A positive value indicates longer tokens appear more often (law violated). + Also fits a log-log regression to capture the power-law slope. + + See [Brevity law](https://en.wikipedia.org/wiki/Brevity_law). + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "brevity" + + @impl true + def keys, do: ["correlation", "slope", "sample_size"] + + @spec analyze(map()) :: map() + @impl true + def analyze(%{token_counts: token_counts}) when map_size(token_counts) < 3 do + %{"correlation" => 0.0, "slope" => 0.0, "sample_size" => map_size(token_counts)} + end + + def analyze(%{token_counts: token_counts}) do + pairs = Enum.map(token_counts, fn {token, freq} -> {String.length(token), freq} end) + lengths = Enum.map(pairs, &elem(&1, 0)) + freqs = Enum.map(pairs, &elem(&1, 1)) + + %{ + "correlation" => CodeQA.Math.pearson_correlation_list(lengths, freqs), + "slope" => log_log_slope(lengths, freqs), + "sample_size" => map_size(token_counts) + } + end + + defp log_log_slope(lengths, freqs) do + log_lengths = lengths |> Enum.map(&:math.log(max(&1, 1))) |> Nx.tensor(type: :f64) + log_freqs = freqs |> Enum.map(&:math.log(max(&1, 1))) |> Nx.tensor(type: :f64) + + {slope, _intercept, _r_squared} = CodeQA.Math.linear_regression(log_lengths, log_freqs) + + case Nx.to_number(slope) do + val when is_float(val) -> Float.round(val, 4) + _ -> 0.0 + end + end +end diff --git a/lib/codeqa/metrics/casing_entropy.ex b/lib/codeqa/metrics/file/casing_entropy.ex similarity index 60% rename from lib/codeqa/metrics/casing_entropy.ex rename to lib/codeqa/metrics/file/casing_entropy.ex index cb380112..4256e0e6 100644 --- a/lib/codeqa/metrics/casing_entropy.ex +++ b/lib/codeqa/metrics/file/casing_entropy.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.CasingEntropy do +defmodule CodeQA.Metrics.File.CasingEntropy do @moduledoc """ Measures Shannon entropy of identifier casing styles in a file. 
@@ -12,31 +12,45 @@ defmodule CodeQA.Metrics.CasingEntropy do - `"pascal_case_count"`, `"camel_case_count"`, `"snake_case_count"`, `"macro_case_count"`, `"kebab_case_count"`, `"other_count"` — per-style counts (only keys for styles that appear are included) + - `"screaming_snake_density"` — ratio of MACRO_CASE identifiers to total identifiers See [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)) and [naming conventions](https://en.wikipedia.org/wiki/Naming_convention_(programming)). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric + + alias CodeQA.Metrics.File.Inflector @impl true def name, do: "casing_entropy" + @impl true + def keys, + do: [ + "entropy", + "pascal_case_count", + "camel_case_count", + "snake_case_count", + "macro_case_count", + "kebab_case_count", + "other_count", + "screaming_snake_density" + ] + @spec analyze(map()) :: map() @impl true - def analyze(%{identifiers: identifiers}) when tuple_size(identifiers) == 0 do - %{"entropy" => 0.0} + def analyze(%{identifiers: []}) do + %{"entropy" => 0.0, "screaming_snake_density" => 0.0} end def analyze(%{identifiers: identifiers}) do - identifiers_list = Tuple.to_list(identifiers) - counts = - identifiers_list - |> Enum.map(&CodeQA.Metrics.Inflector.detect_casing/1) + identifiers + |> Enum.map(&Inflector.detect_casing/1) |> Enum.frequencies() - total = length(identifiers_list) + total = length(identifiers) entropy = counts @@ -46,7 +60,10 @@ defmodule CodeQA.Metrics.CasingEntropy do acc - p * :math.log2(p) end) - %{"entropy" => Float.round(entropy, 4)} + macro_count = Map.get(counts, :macro_case, 0) + screaming_density = Float.round(macro_count / total, 4) + + %{"entropy" => Float.round(entropy, 4), "screaming_snake_density" => screaming_density} |> Map.merge(counts_to_output(counts)) end diff --git a/lib/codeqa/metrics/file/comment_structure.ex b/lib/codeqa/metrics/file/comment_structure.ex new file mode 100644 index 00000000..65bc0e0a --- 
/dev/null +++ b/lib/codeqa/metrics/file/comment_structure.ex @@ -0,0 +1,45 @@ +defmodule CodeQA.Metrics.File.CommentStructure do + @moduledoc """ + Measures comment density and annotation patterns. + + Counts lines that begin with a comment marker (language-agnostic: `#`, `//`, + `/*`, ` *`) relative to non-blank lines. Also counts TODO/FIXME/HACK/XXX + markers which indicate deferred work or known issues. + + ## Output keys + + - `"comment_line_ratio"` — comment lines / non-blank lines + - `"comment_line_count"` — raw count of comment lines + - `"todo_fixme_count"` — occurrences of TODO, FIXME, HACK, or XXX + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "comment_structure" + + @impl true + def keys, do: ["comment_line_ratio", "comment_line_count", "todo_fixme_count"] + + @comment_line ~r/^\s*(?:#|\/\/|\/\*|\*)/ + @todo_marker ~r/\b(?:TODO|FIXME|HACK|XXX)\b/ + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content, lines: lines}) do + non_blank = Enum.reject(lines, &(String.trim(&1) == "")) + non_blank_count = length(non_blank) + + comment_count = Enum.count(non_blank, &Regex.match?(@comment_line, &1)) + todo_count = @todo_marker |> Regex.scan(content) |> length() + + comment_ratio = + if non_blank_count > 0, do: Float.round(comment_count / non_blank_count, 4), else: 0.0 + + %{ + "comment_line_ratio" => comment_ratio, + "comment_line_count" => comment_count, + "todo_fixme_count" => todo_count + } + end +end diff --git a/lib/codeqa/metrics/compression.ex b/lib/codeqa/metrics/file/compression.ex similarity index 56% rename from lib/codeqa/metrics/compression.ex rename to lib/codeqa/metrics/file/compression.ex index fe687059..9f0981b9 100644 --- a/lib/codeqa/metrics/compression.ex +++ b/lib/codeqa/metrics/file/compression.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Compression do +defmodule CodeQA.Metrics.File.Compression do @moduledoc """ Measures file redundancy via zlib compression ratio. 
@@ -6,34 +6,49 @@ defmodule CodeQA.Metrics.Compression do original. A high compression ratio signals repetitive or boilerplate-heavy code. - `ctx.encoded` is the binary representation of the file content used for - compression, distinct from `ctx.content` which is the UTF-8 string. - See [Kolmogorov complexity](https://en.wikipedia.org/wiki/Kolmogorov_complexity) and [data compression ratio](https://en.wikipedia.org/wiki/Data_compression_ratio). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "compression" + @impl true + def keys, do: ["raw_bytes", "zlib_bytes", "zlib_ratio", "redundancy", "unique_line_ratio"] + @spec analyze(map()) :: map() @impl true def analyze(%{content: "", byte_count: 0}) do - %{"raw_bytes" => 0, "zlib_bytes" => 0, "zlib_ratio" => 0.0, "redundancy" => 0.0} + %{ + "raw_bytes" => 0, + "zlib_bytes" => 0, + "zlib_ratio" => 0.0, + "redundancy" => 0.0, + "unique_line_ratio" => 0.0 + } end def analyze(ctx) do raw_size = ctx.byte_count - zlib_data = :zlib.compress(ctx.encoded) + zlib_data = :zlib.compress(ctx.content) zlib_size = byte_size(zlib_data) + non_blank = ctx.lines |> Enum.reject(&(String.trim(&1) == "")) + + unique_line_ratio = + case length(non_blank) do + 0 -> 0.0 + n -> Float.round(length(Enum.uniq(non_blank)) / n, 4) + end + %{ "raw_bytes" => raw_size, "zlib_bytes" => zlib_size, "zlib_ratio" => Float.round(raw_size / max(1, zlib_size), 4), - "redundancy" => Float.round(max(0.0, 1.0 - zlib_size / raw_size), 4) + "redundancy" => Float.round(max(0.0, 1.0 - zlib_size / raw_size), 4), + "unique_line_ratio" => unique_line_ratio } end end diff --git a/lib/codeqa/metrics/entropy.ex b/lib/codeqa/metrics/file/entropy.ex similarity index 82% rename from lib/codeqa/metrics/entropy.ex rename to lib/codeqa/metrics/file/entropy.ex index 47564715..6533a21a 100644 --- a/lib/codeqa/metrics/entropy.ex +++ b/lib/codeqa/metrics/file/entropy.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Entropy 
do +defmodule CodeQA.Metrics.File.Entropy do @moduledoc """ Computes Shannon entropy at both character and token levels. @@ -11,11 +11,24 @@ defmodule CodeQA.Metrics.Entropy do See [Shannon entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "entropy" + @impl true + def keys, + do: [ + "char_entropy", + "char_max_entropy", + "char_normalized", + "token_entropy", + "token_max_entropy", + "token_normalized", + "vocab_size", + "total_tokens" + ] + @spec analyze(map()) :: map() @impl true def analyze(ctx) do @@ -30,13 +43,12 @@ defmodule CodeQA.Metrics.Entropy do compute_entropy(counts, total, "char") end - defp token_entropy(%{tokens: tokens, token_counts: _token_counts}) - when tuple_size(tokens) == 0 do + defp token_entropy(%{tokens: [], token_counts: _token_counts}) do Map.merge(zero_entropy_map("token"), %{"vocab_size" => 0, "total_tokens" => 0}) end defp token_entropy(%{tokens: tokens, token_counts: token_counts}) do - total = tuple_size(tokens) + total = length(tokens) vocab_size = map_size(token_counts) entropy_map = compute_entropy(token_counts, total, "token") diff --git a/lib/codeqa/metrics/file/file_metric.ex b/lib/codeqa/metrics/file/file_metric.ex new file mode 100644 index 00000000..5d127163 --- /dev/null +++ b/lib/codeqa/metrics/file/file_metric.ex @@ -0,0 +1,52 @@ +defmodule CodeQA.Metrics.File.FileMetric do + @moduledoc """ + Behaviour for metrics that analyze a single source file. + + Implementations receive a `CodeQA.Engine.FileContext` struct containing + pre-parsed data (tokens, identifiers, lines, etc.) and return a map of + metric key-value pairs. On error, return an empty map `%{}` rather than + raising. 
+ + ## Minimal implementation + + defmodule MyMetric do + @behaviour CodeQA.Metrics.FileMetric + + @impl true + def name, do: "my_metric" + + @impl true + def analyze(ctx) do + %{"value" => compute(ctx)} + end + end + + See [software metrics](https://en.wikipedia.org/wiki/Software_metric). + """ + + @callback name() :: String.t() + @callback analyze(CodeQA.Engine.FileContext.t()) :: map() + + @doc "List of metric keys returned by analyze/1." + @callback keys() :: [String.t()] + + @doc "Human-readable description of what this metric measures." + @callback description() :: String.t() + + @doc "Whether this metric is enabled. Defaults to true when not implemented." + @callback enabled?() :: boolean() + + @doc """ + Subtractive leave-one-out path. When implemented, the block-impact analyzer + uses this instead of a full re-run on the file-minus-block reconstruction: + it derives the new metric values from the unchanged baseline values for the + whole file plus the content of the block being removed. + + Must return the same map shape as `analyze/1` and produce values bit-equal + to what `analyze/1` would yield on the file-minus-block content. A goldfile + test asserts this. + """ + @callback analyze_loo(baseline :: map(), block_content :: String.t()) :: map() + + @optional_callbacks [description: 0, enabled?: 0, analyze_loo: 2] +end diff --git a/lib/codeqa/metrics/function_metrics.ex b/lib/codeqa/metrics/file/function_metrics.ex similarity index 92% rename from lib/codeqa/metrics/function_metrics.ex rename to lib/codeqa/metrics/file/function_metrics.ex index 7fd22627..6a9bb0c6 100644 --- a/lib/codeqa/metrics/function_metrics.ex +++ b/lib/codeqa/metrics/file/function_metrics.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.FunctionMetrics do +defmodule CodeQA.Metrics.File.FunctionMetrics do @moduledoc """ Estimates function-level structure metrics from source text. 
@@ -13,7 +13,7 @@ defmodule CodeQA.Metrics.FunctionMetrics do - C#: lines starting with access modifiers (`public`, `private`, etc.) """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric # Python, Ruby, Elixir: `def` family # JavaScript: `function` @@ -39,14 +39,23 @@ defmodule CodeQA.Metrics.FunctionMetrics do @impl true def name, do: "function_metrics" + @impl true + def keys, + do: [ + "function_count", + "avg_function_lines", + "max_function_lines", + "avg_param_count", + "max_param_count" + ] + @spec analyze(map()) :: map() @impl true def analyze(%{lines: lines}) do - lines_list = Tuple.to_list(lines) - total = length(lines_list) + total = length(lines) {func_indices, param_counts} = - lines_list + lines |> Enum.with_index() |> Enum.filter(fn {line, _} -> Regex.match?(@func_keyword_re, line) or Regex.match?(@csharp_method_re, line) diff --git a/lib/codeqa/metrics/halstead.ex b/lib/codeqa/metrics/file/halstead.ex similarity index 92% rename from lib/codeqa/metrics/halstead.ex rename to lib/codeqa/metrics/file/halstead.ex index ca38665f..157f67b5 100644 --- a/lib/codeqa/metrics/halstead.ex +++ b/lib/codeqa/metrics/file/halstead.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Halstead do +defmodule CodeQA.Metrics.File.Halstead do @moduledoc """ Implements Halstead software-science complexity metrics. @@ -9,11 +9,27 @@ defmodule CodeQA.Metrics.Halstead do See [Halstead complexity measures](https://en.wikipedia.org/wiki/Halstead_complexity_measures). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "halstead" + @impl true + def keys, + do: [ + "n1_unique_operators", + "n2_unique_operands", + "N1_total_operators", + "N2_total_operands", + "vocabulary", + "length", + "volume", + "difficulty", + "effort", + "estimated_bugs", + "time_to_implement_seconds" + ] + # Keyword operators for: # Python, Ruby, JavaScript, Elixir, C#, # Java, C++, Go, Rust, PHP, Swift, Shell, Kotlin diff --git a/lib/codeqa/metrics/heaps.ex b/lib/codeqa/metrics/file/heaps.ex similarity index 84% rename from lib/codeqa/metrics/heaps.ex rename to lib/codeqa/metrics/file/heaps.ex index edc390bc..b7cae9c3 100644 --- a/lib/codeqa/metrics/heaps.ex +++ b/lib/codeqa/metrics/file/heaps.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Heaps do +defmodule CodeQA.Metrics.File.Heaps do @moduledoc """ Fits Heaps' law to vocabulary growth in a file. @@ -9,25 +9,27 @@ defmodule CodeQA.Metrics.Heaps do See [Heaps' law](https://en.wikipedia.org/wiki/Heaps%27_law). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "heaps" + @impl true + def keys, do: ["k", "beta", "r_squared"] + @max_samples 50 @spec analyze(map()) :: map() @impl true - def analyze(%{tokens: tokens}) when tuple_size(tokens) == 0 do + def analyze(%{tokens: []}) do %{"k" => 0.0, "beta" => 0.0, "r_squared" => 0.0} end def analyze(%{tokens: tokens}) do - token_list = Tuple.to_list(tokens) - total = length(token_list) + total = length(tokens) interval = max(1, div(total, @max_samples)) - data_points = sample_vocabulary_growth(token_list, interval) + data_points = sample_vocabulary_growth(tokens, interval) if length(data_points) < 5 do %{"k" => 0.0, "beta" => 0.0, "r_squared" => 0.0} @@ -40,7 +42,7 @@ defmodule CodeQA.Metrics.Heaps do tokens |> Enum.with_index(1) |> Enum.reduce({MapSet.new(), []}, fn {token, i}, {seen, points} -> - seen = MapSet.put(seen, token) + seen = MapSet.put(seen, token.content) if rem(i, interval) == 0 do {seen, [{i, MapSet.size(seen)} | points]} diff --git a/lib/codeqa/metrics/identifier_length_variance.ex b/lib/codeqa/metrics/file/identifier_length_variance.ex similarity index 81% rename from lib/codeqa/metrics/identifier_length_variance.ex rename to lib/codeqa/metrics/file/identifier_length_variance.ex index 2203b100..424b95b5 100644 --- a/lib/codeqa/metrics/identifier_length_variance.ex +++ b/lib/codeqa/metrics/file/identifier_length_variance.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.IdentifierLengthVariance do +defmodule CodeQA.Metrics.File.IdentifierLengthVariance do @moduledoc """ Measures the mean, variance, and maximum length of identifiers. @@ -11,20 +11,22 @@ defmodule CodeQA.Metrics.IdentifierLengthVariance do and [variance](https://en.wikipedia.org/wiki/Variance). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "identifier_length_variance" + @impl true + def keys, do: ["mean", "variance", "std_dev", "max"] + @spec analyze(map()) :: map() @impl true - def analyze(%{identifiers: identifiers}) when tuple_size(identifiers) == 0 do + def analyze(%{identifiers: []}) do %{"mean" => 0.0, "variance" => 0.0, "std_dev" => 0.0, "max" => 0} end def analyze(%{identifiers: identifiers}) do - list = Tuple.to_list(identifiers) - lengths = Enum.map(list, &String.length/1) + lengths = Enum.map(identifiers, &String.length/1) n = length(lengths) mean = Enum.sum(lengths) / n diff --git a/lib/codeqa/metrics/indentation.ex b/lib/codeqa/metrics/file/indentation.ex similarity index 60% rename from lib/codeqa/metrics/indentation.ex rename to lib/codeqa/metrics/file/indentation.ex index ab44743b..75923b98 100644 --- a/lib/codeqa/metrics/indentation.ex +++ b/lib/codeqa/metrics/file/indentation.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Indentation do +defmodule CodeQA.Metrics.File.Indentation do @moduledoc """ Analyzes indentation depth patterns across non-blank lines. @@ -10,20 +10,27 @@ defmodule CodeQA.Metrics.Indentation do See [indentation style](https://en.wikipedia.org/wiki/Indentation_style). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "indentation" + @impl true + def keys, do: ["mean_depth", "variance", "max_depth", "uses_tabs", "blank_line_ratio"] + @spec analyze(map()) :: map() @impl true def analyze(%{lines: lines}) do - lines_list = Tuple.to_list(lines) + uses_tabs = Enum.any?(lines, &String.match?(&1, ~r/^\t/)) + + total_lines = length(lines) + blank_count = Enum.count(lines, &(String.trim(&1) == "")) - uses_tabs = Enum.any?(lines_list, &String.match?(&1, ~r/^\t/)) + blank_line_ratio = + if total_lines > 0, do: Float.round(blank_count / total_lines, 4), else: 0.0 depths = - lines_list + lines |> Enum.reject(&(String.trim(&1) == "")) |> Enum.map(fn line -> [leading] = Regex.run(~r/^\s*/, line) @@ -31,7 +38,13 @@ defmodule CodeQA.Metrics.Indentation do end) if depths == [] do - %{"mean_depth" => 0.0, "max_depth" => 0, "variance" => 0.0, "uses_tabs" => uses_tabs} + %{ + "mean_depth" => 0.0, + "max_depth" => 0, + "variance" => 0.0, + "uses_tabs" => uses_tabs, + "blank_line_ratio" => blank_line_ratio + } else n = length(depths) mean = Enum.sum(depths) / n @@ -45,7 +58,8 @@ defmodule CodeQA.Metrics.Indentation do "mean_depth" => Float.round(mean, 4), "variance" => Float.round(variance, 4), "max_depth" => Enum.max(depths), - "uses_tabs" => uses_tabs + "uses_tabs" => uses_tabs, + "blank_line_ratio" => blank_line_ratio } end end diff --git a/lib/codeqa/metrics/inflector.ex b/lib/codeqa/metrics/file/inflector.ex similarity index 89% rename from lib/codeqa/metrics/inflector.ex rename to lib/codeqa/metrics/file/inflector.ex index 7c495314..04e732cb 100644 --- a/lib/codeqa/metrics/inflector.ex +++ b/lib/codeqa/metrics/file/inflector.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Inflector do +defmodule CodeQA.Metrics.File.Inflector do @moduledoc """ Utility for detecting identifier casing styles. 
@@ -30,7 +30,8 @@ defmodule CodeQA.Metrics.Inflector do iex> CodeQA.Metrics.Inflector.detect_casing("FOO_BAR") :macro_case """ - @spec detect_casing(String.t()) :: :pascal_case | :camel_case | :snake_case | :macro_case | :kebab_case | :other + @spec detect_casing(String.t()) :: + :pascal_case | :camel_case | :snake_case | :macro_case | :kebab_case | :other def detect_casing(identifier) do cond do identifier =~ ~r/^[A-Z][a-zA-Z0-9]*$/ -> :pascal_case diff --git a/lib/codeqa/metrics/file/line_patterns.ex b/lib/codeqa/metrics/file/line_patterns.ex new file mode 100644 index 00000000..e8b2b452 --- /dev/null +++ b/lib/codeqa/metrics/file/line_patterns.ex @@ -0,0 +1,83 @@ +defmodule CodeQA.Metrics.File.LinePatterns do + @moduledoc """ + Structural line-level and nesting metrics. + + ## Output keys + + - `"blank_line_ratio"` — blank lines / total lines (spacing/organisation signal) + - `"unique_line_ratio"` — distinct non-blank trimmed lines / total non-blank lines + (low values indicate repetition or boilerplate) + - `"max_nesting_depth"` — maximum bracket nesting depth across `()`, `[]`, `{}` + (complexity proxy independent of branching keywords) + - `"string_literal_ratio"` — quoted string literal spans / total tokens + (high values may indicate magic strings or hardcoded data) + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "line_patterns" + + @impl true + def keys, + do: ["blank_line_ratio", "unique_line_ratio", "max_nesting_depth", "string_literal_ratio"] + + @string_literal ~r/(?:"[^"]*"|'[^']*')/ + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content, lines: lines, tokens: tokens}) do + total_lines = length(lines) + total_tokens = length(tokens) + + if total_lines == 0 do + %{ + "blank_line_ratio" => 0.0, + "unique_line_ratio" => 1.0, + "max_nesting_depth" => 0, + "string_literal_ratio" => 0.0 + } + else + blank_count = Enum.count(lines, &(String.trim(&1) == "")) + blank_ratio = Float.round(blank_count / 
total_lines, 4) + + non_blank = lines |> Enum.map(&String.trim/1) |> Enum.reject(&(&1 == "")) + + unique_ratio = + if non_blank == [], + do: 1.0, + else: Float.round(length(Enum.uniq(non_blank)) / length(non_blank), 4) + + string_count = @string_literal |> Regex.scan(content) |> length() + + string_ratio = + if total_tokens == 0, + do: 0.0, + else: Float.round(string_count / total_tokens, 4) + + %{ + "blank_line_ratio" => blank_ratio, + "unique_line_ratio" => unique_ratio, + "max_nesting_depth" => max_nesting_depth(content), + "string_literal_ratio" => string_ratio + } + end + end + + defp max_nesting_depth(content) do + content + |> String.graphemes() + |> Enum.reduce({0, 0}, fn + char, {depth, max_d} when char in ["(", "[", "{"] -> + new_depth = depth + 1 + {new_depth, max(max_d, new_depth)} + + char, {depth, max_d} when char in [")", "]", "}"] -> + {max(depth - 1, 0), max_d} + + _, acc -> + acc + end) + |> elem(1) + end +end diff --git a/lib/codeqa/metrics/magic_number_density.ex b/lib/codeqa/metrics/file/magic_number_density.ex similarity index 51% rename from lib/codeqa/metrics/magic_number_density.ex rename to lib/codeqa/metrics/file/magic_number_density.ex index 3e28bb4f..20428df6 100644 --- a/lib/codeqa/metrics/magic_number_density.ex +++ b/lib/codeqa/metrics/file/magic_number_density.ex @@ -1,10 +1,10 @@ -defmodule CodeQA.Metrics.MagicNumberDensity do +defmodule CodeQA.Metrics.File.MagicNumberDensity do @moduledoc """ - Measures the density of magic numbers in source code. + Measures the density of magic numbers and string literals in source code. - Counts numeric literals (excluding common constants 0, 1, 0.0, 1.0) as a - proportion of total tokens. A high density suggests unexplained constants - that should be extracted into named values. + Counts numeric literals (excluding common constants 0, 1, 0.0, 1.0) and + double-quoted string literals as proportions of total tokens. 
High densities + suggest unexplained constants or hardcoded values that should be extracted. Note: negative numbers (e.g. `-42`) are not detected since the minus sign is a separate token. @@ -12,22 +12,25 @@ defmodule CodeQA.Metrics.MagicNumberDensity do See [magic number](). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "magic_number_density" + @impl true + def keys, do: ["density", "magic_number_count", "string_literal_ratio"] + @number_re ~r/\b\d+\.?\d*(?:[eE][+-]?\d+)?\b/ @idiomatic_constants ~w[0 1 2 0.0 1.0 0.5] + @string_literal_re ~r/"(?:[^"\\]|\\.)*"/ @spec analyze(map()) :: map() @impl true def analyze(%{content: content, tokens: tokens}) do - token_list = Tuple.to_list(tokens) - total_tokens = length(token_list) + total_tokens = length(tokens) if total_tokens == 0 do - %{"density" => 0.0, "magic_number_count" => 0} + %{"density" => 0.0, "magic_number_count" => 0, "string_literal_ratio" => 0.0} else numbers = @number_re @@ -36,10 +39,12 @@ defmodule CodeQA.Metrics.MagicNumberDensity do |> Enum.reject(&(&1 in @idiomatic_constants)) magic_count = length(numbers) + string_count = @string_literal_re |> Regex.scan(content) |> length() %{ "density" => Float.round(magic_count / total_tokens, 4), - "magic_number_count" => magic_count + "magic_number_count" => magic_count, + "string_literal_ratio" => Float.round(string_count / total_tokens, 4) } end end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks.ex b/lib/codeqa/metrics/file/near_duplicate_blocks.ex new file mode 100644 index 00000000..e1e0c08a --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks.ex @@ -0,0 +1,198 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocks do + @moduledoc """ + Near-duplicate block detection using natural code blocks. + + Detects blocks via blank-line boundaries and sub-blocks via bracket/indentation rules. 
+ Compares structurally similar blocks by token-level edit distance, bucketed as a + percentage of the smaller block's token count. + + Distance buckets: + d0 = exact (0%), d1 ≤ 5%, d2 ≤ 10%, d3 ≤ 15%, d4 ≤ 20%, + d5 ≤ 25%, d6 ≤ 30%, d7 ≤ 40%, d8 ≤ 50% + """ + + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Language + alias CodeQA.Metrics.File.NearDuplicateBlocks.Candidates + alias CodeQA.Metrics.File.NearDuplicateBlocks.Distance + + @max_bucket 8 + + # --------------------------------------------------------------------------- + # Public API — distance helpers delegated to Distance submodule + # --------------------------------------------------------------------------- + + @doc "Standard Levenshtein distance between two token lists." + @spec token_edit_distance([String.t()], [String.t()]) :: non_neg_integer() + defdelegate token_edit_distance(a, b), to: Distance + + @doc "Map an edit distance and min token count to a percentage bucket 0–8, or nil if > 50%." + @spec percent_bucket(non_neg_integer(), non_neg_integer()) :: 0..8 | nil + defdelegate percent_bucket(ed, min_count), to: Distance + + # --------------------------------------------------------------------------- + # Public API — analysis entry points + # --------------------------------------------------------------------------- + + @doc """ + Analyze a list of `{path, content}` pairs for near-duplicate blocks. + Returns count keys `near_dup_block_d0..d8`, `block_count`, `sub_block_count`. + With `include_pairs: true` in opts, also returns `_pairs` keys. 
+ """ + @dialyzer {:nowarn_function, analyze: 2} + @spec analyze([{String.t(), String.t()}], keyword()) :: map() + def analyze(labeled_content, opts) do + all_blocks = + Enum.flat_map(labeled_content, fn {path, content} -> + lang_mod = Language.detect(path) + tokens = TokenNormalizer.normalize_structural(content) + + Parser.detect_blocks(tokens, lang_mod) + |> label_blocks(path) + end) + + analyze_from_blocks(all_blocks, opts) + end + + @doc """ + Analyze a pre-built list of labeled `Node.t()` structs for near-duplicate blocks. + Skips tokenization and block detection — use when blocks are already available. + Returns the same keys as `analyze/2`. + """ + @dialyzer {:nowarn_function, analyze_from_blocks: 2} + @spec analyze_from_blocks([Node.t()], keyword()) :: map() + def analyze_from_blocks(all_blocks, opts) do + workers = Keyword.get(opts, :workers, System.schedulers_online()) + max_pairs = Keyword.get(opts, :max_pairs_per_bucket, nil) + include_pairs = Keyword.get(opts, :include_pairs, false) + + block_count = length(all_blocks) + + find_pairs_opts = + [workers: workers, max_pairs_per_bucket: max_pairs] ++ + Keyword.take(opts, [:on_progress, :idf_max_freq]) + + # do_find_pairs computes sub_block_count from the decorated list it already + # builds, eliminating the separate NodeProtocol.children pass. + {buckets, sub_block_count} = do_find_pairs(all_blocks, find_pairs_opts) + + result = + for d <- 0..@max_bucket, into: %{} do + {"near_dup_block_d#{d}", Map.get(buckets, d, %{count: 0}).count} + end + + result = + Map.merge(result, %{"block_count" => block_count, "sub_block_count" => sub_block_count}) + + case include_pairs do + true -> + pairs_result = + for d <- 0..@max_bucket, into: %{} do + {"near_dup_block_d#{d}_pairs", + Map.get(buckets, d, %{pairs: []}).pairs |> format_pairs()} + end + + Map.merge(result, pairs_result) + + false -> + result + end + end + + @doc "Find near-duplicate pairs across a list of %Node{} structs." 
+ @spec find_pairs([Node.t()], keyword()) :: map() + def find_pairs(blocks, opts) do + {buckets, _sub_block_count} = do_find_pairs(blocks, opts) + buckets + end + + @doc false + def label_blocks(blocks, path) do + Enum.map(blocks, fn block -> + label = if block.start_line, do: "#{path}:#{block.start_line}", else: path + %{block | label: label} + end) + end + + # --------------------------------------------------------------------------- + # Internal pair-finding pipeline + # --------------------------------------------------------------------------- + + # Internal implementation returning {buckets, sub_block_count} so that + # analyze_from_blocks gets both without a redundant NodeProtocol.children pass. + defp do_find_pairs(blocks, opts) do + workers = Keyword.get(opts, :workers, System.schedulers_online()) + max_pairs = Keyword.get(opts, :max_pairs_per_bucket, nil) + idf_max_freq = Keyword.get(opts, :idf_max_freq, 1.0) + has_progress = Keyword.has_key?(opts, :on_progress) + + if length(blocks) < 2 do + {%{}, 0} + else + decorated = Candidates.decorate(blocks) + + # sub_block_count derived from the already-computed children_count in decorated. + sub_block_count = + Enum.reduce(decorated, 0, fn {_, _, _, _, _, cc, _, _}, acc -> acc + cc end) + + # IDF: prune bigrams that appear in more than idf_max_freq fraction of blocks. + # These are structural noise (e.g. "end nil", "return false") that inflate the + # candidate set without helping identify true duplicates. + pruned = Candidates.compute_frequent_bigrams(decorated, idf_max_freq) + + decorated = + if MapSet.size(pruned) > 0 do + Enum.map(decorated, &Candidates.prune_bigrams(&1, pruned)) + else + decorated + end + + {exact_index, shingle_index} = Candidates.build_indexes(decorated) + + total = length(decorated) + # Convert to tuple for O(1) indexed lookup inside the hot comparison loop. 
+ decorated_arr = List.to_tuple(decorated) + + if has_progress, + do: IO.puts(:stderr, " Comparing #{total} blocks for near-duplicates...") + + raw_pairs = + decorated + |> Flow.from_enumerable(max_demand: 10, stages: workers) + |> Flow.flat_map( + &Candidates.find_pairs_for_block(&1, decorated_arr, exact_index, shingle_index) + ) + |> Enum.to_list() + + {bucket_pairs(raw_pairs, max_pairs), sub_block_count} + end + end + + defp bucket_pairs(raw_pairs, max_pairs) do + Enum.reduce(raw_pairs, %{}, fn {bucket, pair}, acc -> + Map.update( + acc, + bucket, + %{count: 1, pairs: maybe_append([], pair, max_pairs, 0)}, + fn existing -> + %{ + count: existing.count + 1, + pairs: maybe_append(existing.pairs, pair, max_pairs, existing.count) + } + end + ) + end) + end + + # Uses the already-tracked count instead of length(list) to avoid an O(n) walk. + defp maybe_append(list, _pair, max, count) when is_integer(max) and count >= max, do: list + defp maybe_append(list, pair, _max, _count), do: [pair | list] + + defp format_pairs(pairs) do + Enum.map(pairs, fn {label_a, label_b} -> + %{"source_a" => label_a, "source_b" => label_b} + end) + end +end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks/candidates.ex b/lib/codeqa/metrics/file/near_duplicate_blocks/candidates.ex new file mode 100644 index 00000000..522f5481 --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks/candidates.ex @@ -0,0 +1,238 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocks.Candidates do + @moduledoc """ + Block fingerprinting, indexing, and candidate-pair matching for near-duplicate detection. 
+ + Handles: + - Canonical token-value extraction (stripping leading/trailing whitespace tokens) + - Exact-hash and shingle indexes for fast candidate lookup + - IDF-based bigram pruning to reduce structural-noise candidates + - Structural compatibility checks (child-count and line-ratio guards) + - Pair scoring and bucketing + """ + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Lexing.{NewlineToken, WhitespaceToken} + alias CodeQA.Metrics.File.NearDuplicateBlocks.Distance + + # Pre-compute token kind strings to avoid repeated function calls in the hot path. + @nl_kind NewlineToken.kind() + @ws_kind WhitespaceToken.kind() + + @doc """ + Decorate a list of blocks with pre-computed canonical values, hashes, bigrams, and + structural metadata. Each entry is an 8-tuple: + + {index, block, values, hash, len_values, children_count, newline_count, bigrams} + """ + @spec decorate([term()]) :: [tuple()] + def decorate(blocks) do + blocks + |> Enum.with_index() + |> Enum.map(fn {block, i} -> + values = canonical_values(NodeProtocol.flat_tokens(block)) + children_count = length(NodeProtocol.children(block)) + newline_count = Enum.count(values, &(&1 == @nl_kind)) + bigrams = Enum.chunk_every(values, 2, 1, :discard) + + {i, block, values, :erlang.phash2(values), length(values), children_count, newline_count, + bigrams} + end) + end + + @doc """ + Build both exact (hash → [idx]) and shingle (bigram_hash → [idx]) indexes in one pass, + using the pre-computed values from the decorated list. 
+ """ + @spec build_indexes([tuple()]) :: {map(), map()} + def build_indexes(decorated) do + Enum.reduce(decorated, {%{}, %{}}, fn {idx, _block, _values, hash, _len, _children, _newlines, + bigrams}, + {exact_acc, shingle_acc} -> + exact_acc = Map.update(exact_acc, hash, [idx], &[idx | &1]) + + shingle_acc = + bigrams + |> Enum.reduce(shingle_acc, fn bigram, sh_acc -> + h = :erlang.phash2(bigram) + Map.update(sh_acc, h, [idx], &[idx | &1]) + end) + + {exact_acc, shingle_acc} + end) + end + + @doc """ + Returns the set of bigram hashes that appear in more than `max_freq` fraction of blocks. + + Minimum threshold of 2 so a bigram must appear in 3+ blocks before being pruned — + prevents over-pruning when the total block count is very small. + """ + @spec compute_frequent_bigrams([tuple()], float()) :: MapSet.t() + def compute_frequent_bigrams(decorated, max_freq) do + total = length(decorated) + threshold = max(2, round(total * max_freq)) + + decorated + |> Enum.reduce(%{}, fn {_, _, _, _, _, _, _, bigrams}, acc -> + bigrams + |> Enum.uniq_by(&:erlang.phash2/1) + |> Enum.reduce(acc, fn bigram, a -> + Map.update(a, :erlang.phash2(bigram), 1, &(&1 + 1)) + end) + end) + |> Enum.filter(fn {_, count} -> count > threshold end) + |> Enum.map(&elem(&1, 0)) + |> MapSet.new() + end + + @doc "Remove bigrams whose hash is in the pruned set from a decorated tuple." + @spec prune_bigrams(tuple(), MapSet.t()) :: tuple() + def prune_bigrams({i, b, v, h, l, c, n, bigrams}, pruned) do + {i, b, v, h, l, c, n, Enum.reject(bigrams, &MapSet.member?(pruned, :erlang.phash2(&1)))} + end + + @doc """ + Find all near-duplicate pairs for a single block against the full decorated array. + Returns a list of `{bucket, {label_a, label_b}}` pairs. 
+ """ + @spec find_pairs_for_block(tuple(), tuple(), map(), map()) :: list() + def find_pairs_for_block( + {i, block_a, values_a, hash_a, len_a, children_a, newlines_a, bigrams_a}, + decorated_arr, + exact_index, + shingle_index + ) do + # For small exact-match lists (typically 0–3 entries) a plain list membership + # check avoids the overhead of constructing a MapSet. + exact_list = Map.get(exact_index, hash_a, []) + exact_set = if length(exact_list) > 3, do: MapSet.new(exact_list), else: nil + + # For d0 (exact), find hash-matching blocks and confirm with value equality + # to guard against phash2 collisions. + exact_pairs = + exact_list + |> Enum.filter(&(&1 > i)) + |> Enum.map(fn j -> + {_j, block_b, values_b, _hash_b, _len_b, children_b, newlines_b, _bigrams_b} = + elem(decorated_arr, j) + + if values_b == values_a and + structure_compatible?(children_a, newlines_a, children_b, newlines_b) do + {0, {block_a.label, block_b.label}} + else + nil + end + end) + |> Enum.reject(&is_nil/1) + + # For d1-d8 (near), use shingle index to find candidates. + min_shared = max(0, round(len_a * 0.5) - 1) + counter = :counters.new(tuple_size(decorated_arr), []) + + # Reduce bigrams → shingle index → counter array. We track the list of + # touched indices so the post-pass only iterates the candidates we actually + # encountered, not the full counter range. The first-touch check on the + # counter is O(1) (a single :counters.get), much cheaper than the previous + # HAMT-based Map.update accumulator on a per-block basis. + touched = + Enum.reduce(bigrams_a, [], fn bigram, touched_acc -> + h = :erlang.phash2(bigram) + + shingle_index + |> Map.get(h, []) + |> Enum.reduce(touched_acc, fn + j, acc when j > i -> + idx = j + 1 + old = :counters.get(counter, idx) + :counters.add(counter, idx, 1) + if old == 0, do: [j | acc], else: acc + + _j, acc -> + acc + end) + end) + + in_exact? 
= fn j -> + if exact_set, do: MapSet.member?(exact_set, j), else: j in exact_list + end + + near_pairs = + Enum.flat_map(touched, fn j -> + count = :counters.get(counter, j + 1) + + if count >= min_shared and not in_exact?.(j) do + near_pair_for_candidate( + j, + decorated_arr, + block_a, + values_a, + len_a, + children_a, + newlines_a + ) + else + [] + end + end) + + exact_pairs ++ near_pairs + end + + # --------------------------------------------------------------------------- + # Private helpers + # --------------------------------------------------------------------------- + + # Strip leading/trailing and tokens and extract kind values as strings. + # Optimised to 3 passes: one reduce (skip leading NL/WS + collect reversed kinds), + # one drop_while (strip trailing), one :lists.reverse. + defp canonical_values(tokens) do + {reversed, _in_content} = + Enum.reduce(tokens, {[], false}, fn t, {acc, in_content} -> + kind = t.kind + is_skip = kind == @nl_kind or kind == @ws_kind + + if in_content or not is_skip do + {[kind | acc], true} + else + {acc, false} + end + end) + + reversed + |> Enum.drop_while(&(&1 == @nl_kind or &1 == @ws_kind)) + |> :lists.reverse() + end + + defp near_pair_for_candidate(j, decorated_arr, block_a, values_a, len_a, children_a, newlines_a) do + {_j, block_b, values_b, _hash_b, len_b, children_b, newlines_b, _bigrams_b} = + elem(decorated_arr, j) + + min_count = min(len_a, len_b) + max_allowed = round(min_count * 0.5) + + if structure_compatible?(children_a, newlines_a, children_b, newlines_b) and + abs(len_a - len_b) <= max_allowed do + ed = Distance.token_edit_distance_bounded(values_a, values_b, max_allowed) + + case Distance.percent_bucket(ed, min_count) do + nil -> [] + bucket when bucket > 0 -> [{bucket, {block_a.label, block_b.label}}] + # ed=0 handled by exact_pairs above + _ -> [] + end + else + [] + end + end + + # Uses pre-computed children counts and newline counts from the decorated tuple + # so NodeProtocol.children/1 and 
Enum.count/2 are not called per candidate pair. + defp structure_compatible?(children_a, newlines_a, children_b, newlines_b) do + sub_diff = abs(children_a - children_b) + lines_a = newlines_a + 1 + lines_b = newlines_b + 1 + max_lines = max(lines_a, lines_b) + line_ratio = if max_lines > 0, do: abs(lines_a - lines_b) / max_lines, else: 0.0 + sub_diff <= 1 and line_ratio <= 0.30 + end +end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks/distance.ex b/lib/codeqa/metrics/file/near_duplicate_blocks/distance.ex new file mode 100644 index 00000000..475aa3e2 --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks/distance.ex @@ -0,0 +1,114 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocks.Distance do + @moduledoc """ + Token-level edit distance and percentage-bucket classification for near-duplicate detection. + + Provides standard Levenshtein distance, a bounded variant that short-circuits + when the distance already exceeds a threshold, and a bucket classifier that maps + an edit distance + minimum token count to a similarity bucket (d0–d8). + + Distance buckets: + d0 = exact (0%), d1 ≤ 5%, d2 ≤ 10%, d3 ≤ 15%, d4 ≤ 20%, + d5 ≤ 25%, d6 ≤ 30%, d7 ≤ 40%, d8 ≤ 50% + """ + + @bucket_thresholds [ + {0, 0.0}, + {1, 0.05}, + {2, 0.10}, + {3, 0.15}, + {4, 0.20}, + {5, 0.25}, + {6, 0.30}, + {7, 0.40}, + {8, 0.50} + ] + + @doc "Standard Levenshtein distance between two token lists." 
+ @spec token_edit_distance([String.t()], [String.t()]) :: non_neg_integer() + def token_edit_distance([], b), do: length(b) + def token_edit_distance(a, []), do: length(a) + + def token_edit_distance(a, b) do + a_arr = List.to_tuple(a) + b_arr = List.to_tuple(b) + lb = tuple_size(b_arr) + init_row = List.to_tuple(Enum.to_list(0..lb)) + result_row = levenshtein_rows(a_arr, b_arr, tuple_size(a_arr), lb, init_row, 1) + elem(result_row, lb) + end + + defp levenshtein_rows(_a, _b, la, _lb, prev, i) when i > la, do: prev + + defp levenshtein_rows(a, b, la, lb, prev, i) do + ai = elem(a, i - 1) + curr_reversed = levenshtein_cols(b, lb, prev, ai, [i], 1) + curr = List.to_tuple(:lists.reverse(curr_reversed)) + levenshtein_rows(a, b, la, lb, curr, i + 1) + end + + defp levenshtein_cols(_b, lb, _prev, _ai, acc, j) when j > lb, do: acc + + defp levenshtein_cols(b, lb, prev, ai, [last_val | _] = acc, j) do + cost = if ai == elem(b, j - 1), do: 0, else: 1 + val = min(elem(prev, j) + 1, min(last_val + 1, elem(prev, j - 1) + cost)) + levenshtein_cols(b, lb, prev, ai, [val | acc], j + 1) + end + + # Bounded Levenshtein: returns the edit distance, or max_distance + 1 if the + # distance would exceed max_distance. Bails after each row when the row minimum + # already exceeds max_distance — the final distance can only grow from there. 
+ @doc false + @spec token_edit_distance_bounded([String.t()], [String.t()], non_neg_integer()) :: + non_neg_integer() + def token_edit_distance_bounded([], b, _max), do: length(b) + def token_edit_distance_bounded(a, [], _max), do: length(a) + + def token_edit_distance_bounded(a, b, max_distance) do + a_arr = List.to_tuple(a) + b_arr = List.to_tuple(b) + lb = tuple_size(b_arr) + init_row = List.to_tuple(Enum.to_list(0..lb)) + levenshtein_rows_bounded(a_arr, b_arr, tuple_size(a_arr), lb, init_row, max_distance, 1) + end + + defp levenshtein_rows_bounded(_a, _b, la, lb, prev, _max, i) when i > la, do: elem(prev, lb) + + defp levenshtein_rows_bounded(a, b, la, lb, prev, max_distance, i) do + ai = elem(a, i - 1) + # levenshtein_cols_with_min tracks the row minimum as it builds, avoiding + # a separate O(lb) pass to find the min after the row is complete. + {curr_reversed, row_min} = levenshtein_cols_with_min(b, lb, prev, ai, {[i], i}, 1) + curr = List.to_tuple(:lists.reverse(curr_reversed)) + + if row_min > max_distance do + max_distance + 1 + else + levenshtein_rows_bounded(a, b, la, lb, curr, max_distance, i + 1) + end + end + + defp levenshtein_cols_with_min(_b, lb, _prev, _ai, acc_and_min, j) when j > lb, + do: acc_and_min + + defp levenshtein_cols_with_min(b, lb, prev, ai, {[last_val | _] = acc, min_val}, j) do + cost = if ai == elem(b, j - 1), do: 0, else: 1 + val = min(elem(prev, j) + 1, min(last_val + 1, elem(prev, j - 1) + cost)) + levenshtein_cols_with_min(b, lb, prev, ai, {[val | acc], min(min_val, val)}, j + 1) + end + + @doc "Map an edit distance and min token count to a percentage bucket 0–8, or nil if > 50%." 
+ @spec percent_bucket(non_neg_integer(), non_neg_integer()) :: 0..8 | nil + def percent_bucket(_ed, 0), do: nil + def percent_bucket(0, _min_count), do: 0 + + def percent_bucket(ed, min_count) do + pct = ed / min_count + + @bucket_thresholds + |> Enum.find(fn {bucket, threshold} -> bucket > 0 and pct <= threshold end) + |> case do + {bucket, _} -> bucket + nil -> nil + end + end +end diff --git a/lib/codeqa/metrics/file/near_duplicate_blocks_file.ex b/lib/codeqa/metrics/file/near_duplicate_blocks_file.ex new file mode 100644 index 00000000..7a15e749 --- /dev/null +++ b/lib/codeqa/metrics/file/near_duplicate_blocks_file.ex @@ -0,0 +1,39 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocksFile do + @moduledoc """ + Counts near-duplicate and exact-duplicate natural code blocks within a single file. + + Blocks are detected at blank-line boundaries with sub-block detection via bracket rules. + Distance is a percentage of the smaller block's token count, bucketed d0–d8. + Also reports block_count and sub_block_count as standalone metrics. 
+ """ + + @behaviour CodeQA.Metrics.File.FileMetric + + alias CodeQA.Metrics.File.NearDuplicateBlocks + + @impl true + def name, do: "near_duplicate_blocks_file" + + @impl true + def keys do + ["block_count", "sub_block_count"] ++ for(d <- 0..8, do: "near_dup_block_d#{d}") + end + + @impl true + def analyze(%{blocks: nil}), do: Map.new(keys(), fn k -> {k, 0} end) + + def analyze(%{path: path, blocks: blocks}) when is_list(blocks) do + NearDuplicateBlocks.analyze_from_blocks( + NearDuplicateBlocks.label_blocks(blocks, path || "unknown"), + [] + ) + |> Map.reject(fn {k, _} -> String.ends_with?(k, "_pairs") end) + end + + def analyze(ctx) do + path = ctx.path || "unknown" + + NearDuplicateBlocks.analyze([{path, ctx.content}], []) + |> Map.reject(fn {k, _} -> String.ends_with?(k, "_pairs") end) + end +end diff --git a/lib/codeqa/metrics/ngram.ex b/lib/codeqa/metrics/file/ngram.ex similarity index 71% rename from lib/codeqa/metrics/ngram.ex rename to lib/codeqa/metrics/file/ngram.ex index fb2b44b5..b100513c 100644 --- a/lib/codeqa/metrics/ngram.ex +++ b/lib/codeqa/metrics/file/ngram.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Ngram do +defmodule CodeQA.Metrics.File.Ngram do @moduledoc """ Computes bigram and trigram statistics over the token stream. @@ -10,15 +10,30 @@ defmodule CodeQA.Metrics.Ngram do and [hapax legomenon](https://en.wikipedia.org/wiki/Hapax_legomenon). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "ngram" + @impl true + def keys, + do: [ + "bigram_total", + "bigram_unique", + "bigram_repetition_rate", + "bigram_hapax_fraction", + "bigram_repeated_unique", + "trigram_total", + "trigram_unique", + "trigram_repetition_rate", + "trigram_hapax_fraction", + "trigram_repeated_unique" + ] + @spec analyze(map()) :: map() @impl true def analyze(ctx) do - tokens = Tuple.to_list(ctx.tokens) + tokens = Enum.map(ctx.tokens, & &1.content) bigram_stats = ngram_stats(tokens, 2) |> rename_keys("bigram") trigram_stats = ngram_stats(tokens, 3) |> rename_keys("trigram") @@ -27,7 +42,13 @@ defmodule CodeQA.Metrics.Ngram do end defp ngram_stats(tokens, n) when length(tokens) < n do - %{"total" => 0, "unique" => 0, "repetition_rate" => 0.0, "hapax_fraction" => 0.0, "repeated_unique" => 0} + %{ + "total" => 0, + "unique" => 0, + "repetition_rate" => 0.0, + "hapax_fraction" => 0.0, + "repeated_unique" => 0 + } end defp ngram_stats(tokens, n) do diff --git a/lib/codeqa/metrics/file/punctuation_density.ex b/lib/codeqa/metrics/file/punctuation_density.ex new file mode 100644 index 00000000..8b42ee41 --- /dev/null +++ b/lib/codeqa/metrics/file/punctuation_density.ex @@ -0,0 +1,96 @@ +defmodule CodeQA.Metrics.File.PunctuationDensity do + @moduledoc """ + Character-level punctuation and structural pattern metrics. + + Captures signals that character-level metrics miss: naming conventions using + `?`/`!` suffixes, chained method calls (dots), non-standard bracket adjacency, + and numeric bracket pair patterns. 
+ """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "punctuation_density" + + @impl true + def keys do + [ + "question_mark_density", + "exclamation_density", + "dot_count", + "id_nonalpha_suffix_density", + "bracket_nonalpha_prefix_count", + "bracket_nonalpha_suffix_count", + "bracket_number_pair_count", + "arrow_density", + "colon_suffix_density" + ] + end + + # identifier-like token (starts with letter/underscore) ending with non-alphanumeric non-whitespace + @id_nonalpha_suffix ~r/[a-zA-Z_]\w*[^\w\s]/ + # opening bracket immediately preceded by non-alphanumeric non-whitespace (e.g. `?(`, `==[`) + @bracket_nonalpha_prefix ~r/[^\w\s\(\[\{][\(\[\{]/ + # closing bracket immediately followed by non-alphanumeric non-whitespace (e.g. `}.`, `)?`) + @bracket_nonalpha_suffix ~r/[\)\]\}][^\w\s\)\]\}]/ + # number (with optional underscores) wrapped in brackets: (42), [1_000], (3.14) + @bracket_number_pair ~r/[\(\[]\d[\d_]*(?:\.\d+)?[\)\]]/ + # arrow operators: -> and => + @arrow ~r/->|=>/ + # identifier immediately followed by colon (keyword args, dict keys, labels) + @colon_suffix ~r/[a-zA-Z_]\w*:/ + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content, tokens: tokens}) do + total_chars = String.length(content) + total_tokens = length(tokens) + + if total_chars == 0 do + %{ + "question_mark_density" => 0.0, + "exclamation_density" => 0.0, + "dot_count" => 0, + "id_nonalpha_suffix_density" => 0.0, + "bracket_nonalpha_prefix_count" => 0, + "bracket_nonalpha_suffix_count" => 0, + "bracket_number_pair_count" => 0, + "arrow_density" => 0.0, + "colon_suffix_density" => 0.0 + } + else + qmarks = count_char(content, "?") + excls = count_char(content, "!") + dots = count_char(content, ".") + + id_suffix_count = count_matches(content, @id_nonalpha_suffix) + bracket_prefix = count_matches(content, @bracket_nonalpha_prefix) + bracket_suffix = count_matches(content, @bracket_nonalpha_suffix) + bracket_num = count_matches(content, 
@bracket_number_pair) + + id_denom = max(total_tokens, 1) + arrows = count_matches(content, @arrow) + colon_suffixes = count_matches(content, @colon_suffix) + + %{ + "question_mark_density" => Float.round(qmarks / total_chars, 6), + "exclamation_density" => Float.round(excls / total_chars, 6), + "dot_count" => dots, + "id_nonalpha_suffix_density" => Float.round(id_suffix_count / id_denom, 4), + "bracket_nonalpha_prefix_count" => bracket_prefix, + "bracket_nonalpha_suffix_count" => bracket_suffix, + "bracket_number_pair_count" => bracket_num, + "arrow_density" => Float.round(arrows / id_denom, 4), + "colon_suffix_density" => Float.round(colon_suffixes / id_denom, 4) + } + end + end + + defp count_char(content, char) do + content |> String.graphemes() |> Enum.count(&(&1 == char)) + end + + defp count_matches(content, regex) do + regex |> Regex.scan(content) |> length() + end +end diff --git a/lib/codeqa/metrics/readability.ex b/lib/codeqa/metrics/file/readability.ex similarity index 89% rename from lib/codeqa/metrics/readability.ex rename to lib/codeqa/metrics/file/readability.ex index 5ffa9e1f..3e1bd2c4 100644 --- a/lib/codeqa/metrics/readability.ex +++ b/lib/codeqa/metrics/file/readability.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Readability do +defmodule CodeQA.Metrics.File.Readability do @moduledoc """ Computes adapted Flesch and Fog readability indices for source code. @@ -10,17 +10,27 @@ defmodule CodeQA.Metrics.Readability do and [Gunning fog index](https://en.wikipedia.org/wiki/Gunning_fog_index). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "readability" + @impl true + def keys, + do: [ + "avg_tokens_per_line", + "avg_line_length", + "avg_sub_words_per_id", + "flesch_adapted", + "fog_adapted", + "total_lines" + ] + @spec analyze(map()) :: map() @impl true def analyze(ctx) do lines = ctx.lines - |> Tuple.to_list() |> Enum.filter(fn line -> trimmed = String.trim(line) trimmed != "" and not String.starts_with?(trimmed, "#") @@ -42,11 +52,11 @@ defmodule CodeQA.Metrics.Readability do defp compute_readability(ctx, lines) do total_lines = length(lines) - total_tokens = tuple_size(ctx.tokens) + total_tokens = length(ctx.tokens) avg_tokens = total_tokens / total_lines avg_line_length = lines |> Enum.map(&String.length/1) |> Enum.sum() |> Kernel./(total_lines) - words = Tuple.to_list(ctx.words) + words = ctx.words {avg_sub_words, complex_fraction} = if words != [] do diff --git a/lib/codeqa/metrics/file/rfc.ex b/lib/codeqa/metrics/file/rfc.ex new file mode 100644 index 00000000..5416c684 --- /dev/null +++ b/lib/codeqa/metrics/file/rfc.ex @@ -0,0 +1,81 @@ +defmodule CodeQA.Metrics.File.RFC do + @moduledoc """ + Response For a Class (RFC) — a coupling metric from the Chidamber & Kemerer suite. + + RFC ≈ number of distinct methods/functions reachable from this file, counting + both locally-defined functions and distinct external call targets. + + Formula: `RFC = function_def_count + |distinct_call_targets|` + + Computed from the token stream without requiring a real AST: + - Function definitions are detected by function-keyword tokens (`def`, `fn`, etc.) + followed by an `` token. + - Call targets are detected by `` tokens immediately followed by `(`. + Duplicates are collapsed to a set. + + Higher RFC values indicate a module with more responsibility and more external + coupling, correlating empirically with higher fault density. 
+ + See [CK metrics suite](https://en.wikipedia.org/wiki/Programming_complexity#Chidamber_and_Kemerer_metrics). + """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @func_keywords MapSet.new(~w[ + def defp defmacro defmacrop defguard defdelegate + function func fun fn + sub proc method + ]) + + @impl true + def name, do: "rfc" + + @impl true + def keys, do: ["rfc_count", "rfc_density", "function_def_count", "distinct_call_count"] + + @impl true + def description, + do: "Response For a Class: function definitions + distinct call targets (CK suite)" + + @spec analyze(CodeQA.Engine.FileContext.t()) :: map() + @impl true + def analyze(%{tokens: tokens, line_count: line_count}) do + {func_def_count, call_targets} = scan_tokens(tokens) + + distinct_call_count = MapSet.size(call_targets) + rfc_count = func_def_count + distinct_call_count + + density = + if line_count > 0, + do: Float.round(rfc_count / line_count, 4), + else: 0.0 + + %{ + "rfc_count" => rfc_count, + "rfc_density" => density, + "function_def_count" => func_def_count, + "distinct_call_count" => distinct_call_count + } + end + + # Single pass: detect function definitions and call sites simultaneously. + # Uses a sliding window of two adjacent tokens. 
+ defp scan_tokens(tokens) do + tokens + |> Enum.zip(Enum.drop(tokens, 1)) + |> Enum.reduce({0, MapSet.new()}, fn {tok, next}, {defs, calls} -> + cond do + # Function definition: keyword followed by an identifier + MapSet.member?(@func_keywords, tok.content) and next.kind == "" -> + {defs + 1, calls} + + # Call site: identifier followed by open paren + tok.kind == "" and next.content == "(" -> + {defs, MapSet.put(calls, tok.content)} + + true -> + {defs, calls} + end + end) + end +end diff --git a/lib/codeqa/metrics/file/separator_counts.ex b/lib/codeqa/metrics/file/separator_counts.ex new file mode 100644 index 00000000..62586560 --- /dev/null +++ b/lib/codeqa/metrics/file/separator_counts.ex @@ -0,0 +1,44 @@ +defmodule CodeQA.Metrics.File.SeparatorCounts do + @moduledoc """ + Counts dividing characters (`_`, `-`, `/`, `.`) in source code. + + These separators appear in identifiers (snake_case, kebab-case), + paths, and dotted access. Their frequency can distinguish naming + conventions and structural patterns across languages. 
+ """ + + @behaviour CodeQA.Metrics.File.FileMetric + + @impl true + def name, do: "separator_counts" + + @impl true + def keys, do: ["underscore_count", "hyphen_count", "slash_count", "dot_count"] + + @spec analyze(map()) :: map() + @impl true + def analyze(%{content: content}) do + %{ + "underscore_count" => count(content, "_"), + "hyphen_count" => count(content, "-"), + "slash_count" => count(content, "/"), + "dot_count" => count(content, ".") + } + end + + @impl true + def analyze_loo(baseline, block_content) do + %{ + "underscore_count" => baseline["underscore_count"] - count(block_content, "_"), + "hyphen_count" => baseline["hyphen_count"] - count(block_content, "-"), + "slash_count" => baseline["slash_count"] - count(block_content, "/"), + "dot_count" => baseline["dot_count"] - count(block_content, ".") + } + end + + defp count(content, char) do + content + |> String.graphemes() + |> Enum.count(&(&1 == char)) + end +end diff --git a/lib/codeqa/metrics/symbol_density.ex b/lib/codeqa/metrics/file/symbol_density.ex similarity index 85% rename from lib/codeqa/metrics/symbol_density.ex rename to lib/codeqa/metrics/file/symbol_density.ex index 67459a0c..3e71bf34 100644 --- a/lib/codeqa/metrics/symbol_density.ex +++ b/lib/codeqa/metrics/file/symbol_density.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.SymbolDensity do +defmodule CodeQA.Metrics.File.SymbolDensity do @moduledoc """ Measures the density of non-word, non-whitespace symbols in source code. @@ -9,11 +9,14 @@ defmodule CodeQA.Metrics.SymbolDensity do See [code readability](https://en.wikipedia.org/wiki/Computer_programming#Readability_of_source_code). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "symbol_density" + @impl true + def keys, do: ["density", "symbol_count", "distinct_symbol_types"] + @spec analyze(map()) :: map() @impl true def analyze(%{content: content}) do diff --git a/lib/codeqa/metrics/vocabulary.ex b/lib/codeqa/metrics/file/vocabulary.ex similarity index 91% rename from lib/codeqa/metrics/vocabulary.ex rename to lib/codeqa/metrics/file/vocabulary.ex index d9ef6374..496cc68a 100644 --- a/lib/codeqa/metrics/vocabulary.ex +++ b/lib/codeqa/metrics/file/vocabulary.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Vocabulary do +defmodule CodeQA.Metrics.File.Vocabulary do @moduledoc """ Analyzes vocabulary diversity using type-token ratio (TTR) and MATTR. @@ -14,19 +14,22 @@ defmodule CodeQA.Metrics.Vocabulary do and [MATTR](https://doi.org/10.3758/BRM.42.2.381). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "vocabulary" + @impl true + def keys, do: ["raw_ttr", "mattr", "unique_identifiers", "total_identifiers", "vocabulary"] + @window_size 100 @spec analyze(map()) :: map() @impl true def analyze(ctx) do - identifiers = Tuple.to_list(ctx.identifiers) + identifiers = ctx.identifiers total = length(identifiers) - vocabulary = ctx.words |> Tuple.to_list() |> Enum.uniq() |> Enum.sort() + vocabulary = ctx.words |> Enum.uniq() |> Enum.sort() if total == 0 do %{ diff --git a/lib/codeqa/metrics/vowel_density.ex b/lib/codeqa/metrics/file/vowel_density.ex similarity index 86% rename from lib/codeqa/metrics/vowel_density.ex rename to lib/codeqa/metrics/file/vowel_density.ex index 84ea39e7..f3f53de5 100644 --- a/lib/codeqa/metrics/vowel_density.ex +++ b/lib/codeqa/metrics/file/vowel_density.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.VowelDensity do +defmodule CodeQA.Metrics.File.VowelDensity do @moduledoc """ Measures the density of vowels in identifiers. 
@@ -9,17 +9,20 @@ defmodule CodeQA.Metrics.VowelDensity do See [identifier naming](https://en.wikipedia.org/wiki/Identifier_(computer_languages)). """ - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @vowels MapSet.new(~c"aeiouyAEIOUY") @impl true def name, do: "vowel_density" + @impl true + def keys, do: ["density", "vowel_count", "total_chars"] + @spec analyze(map()) :: map() @impl true def analyze(%{identifiers: identifiers}) do - list = Tuple.to_list(identifiers) + list = identifiers if list == [] do %{"density" => 0.0, "vowel_count" => 0, "total_chars" => 0} diff --git a/lib/codeqa/metrics/winnowing.ex b/lib/codeqa/metrics/file/winnowing.ex similarity index 96% rename from lib/codeqa/metrics/winnowing.ex rename to lib/codeqa/metrics/file/winnowing.ex index 9c8961ca..d725a388 100644 --- a/lib/codeqa/metrics/winnowing.ex +++ b/lib/codeqa/metrics/file/winnowing.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Winnowing do +defmodule CodeQA.Metrics.File.Winnowing do @moduledoc """ Generates structural fingerprints using k-grams. diff --git a/lib/codeqa/metrics/zipf.ex b/lib/codeqa/metrics/file/zipf.ex similarity index 86% rename from lib/codeqa/metrics/zipf.ex rename to lib/codeqa/metrics/file/zipf.ex index 4948c3d9..b03a07c5 100644 --- a/lib/codeqa/metrics/zipf.ex +++ b/lib/codeqa/metrics/file/zipf.ex @@ -1,4 +1,4 @@ -defmodule CodeQA.Metrics.Zipf do +defmodule CodeQA.Metrics.File.Zipf do @moduledoc """ Fits Zipf's law to the token frequency distribution. @@ -9,21 +9,24 @@ defmodule CodeQA.Metrics.Zipf do See [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law). 
""" - @behaviour CodeQA.Metrics.FileMetric + @behaviour CodeQA.Metrics.File.FileMetric @impl true def name, do: "zipf" + @impl true + def keys, do: ["exponent", "r_squared", "vocab_size", "total_tokens"] + @spec analyze(map()) :: map() @impl true - def analyze(%{tokens: tokens, token_counts: _token_counts}) when tuple_size(tokens) == 0 do + def analyze(%{tokens: [], token_counts: _token_counts}) do %{"exponent" => 0.0, "r_squared" => 0.0, "vocab_size" => 0, "total_tokens" => 0} end def analyze(%{tokens: tokens, token_counts: token_counts}) do frequencies = token_counts |> Map.values() |> Enum.sort(:desc) vocab_size = length(frequencies) - total_tokens = tuple_size(tokens) + total_tokens = length(tokens) if vocab_size < 3 do %{ diff --git a/lib/codeqa/metrics/file_metric.ex b/lib/codeqa/metrics/file_metric.ex deleted file mode 100644 index 75a6f61b..00000000 --- a/lib/codeqa/metrics/file_metric.ex +++ /dev/null @@ -1,37 +0,0 @@ -defmodule CodeQA.Metrics.FileMetric do - @moduledoc """ - Behaviour for metrics that analyze a single source file. - - Implementations receive a `CodeQA.Pipeline.FileContext` struct containing - pre-parsed data (tokens, identifiers, lines, etc.) and return a map of - metric key-value pairs. On error, return an empty map `%{}` rather than - raising. - - ## Minimal implementation - - defmodule MyMetric do - @behaviour CodeQA.Metrics.FileMetric - - @impl true - def name, do: "my_metric" - - @impl true - def analyze(ctx) do - %{"value" => compute(ctx)} - end - end - - See [software metrics](https://en.wikipedia.org/wiki/Software_metric). - """ - - @callback name() :: String.t() - @callback analyze(CodeQA.Pipeline.FileContext.t()) :: map() - - @doc "Human-readable description of what this metric measures." - @callback description() :: String.t() - - @doc "Whether this metric is enabled. Defaults to true when not implemented." 
- @callback enabled?() :: boolean() - - @optional_callbacks [description: 0, enabled?: 0] -end diff --git a/lib/codeqa/metrics/post_processing/menzerath.ex b/lib/codeqa/metrics/post_processing/menzerath.ex new file mode 100644 index 00000000..4b5b10cf --- /dev/null +++ b/lib/codeqa/metrics/post_processing/menzerath.ex @@ -0,0 +1,282 @@ +defmodule CodeQA.Metrics.PostProcessing.Menzerath do + @moduledoc """ + Measures structural hierarchy conformance using Menzerath's law. + + ## Block-level score + + For each parsed block in a file, computes: + + ratio = block.line_count / parent.line_count + + Root blocks use the file's line count as parent. Ratio close to 1.0 means the block + dominates its parent (poor decomposition). Low ratio means the block is small relative + to its parent (good decomposition). + + For internal nodes that have children, also computes `avg_child_ratio` — the mean ratio + of direct children. High `avg_child_ratio` means this node failed to decompose its + children into small enough pieces. 
+ + ## Codebase-level score + + Collects `{function_count, avg_function_lines}` pairs from all files and computes: + - Pearson correlation (negative = law holds across the codebase) + - Power-law exponent `b` from `y = a · x^b` fit on log-log scale + - R² of the fit + """ + + @behaviour CodeQA.Metrics.PostProcessing.PostProcessingMetric + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Languages.Unknown + + @violation_threshold 0.6 + + @impl true + def name, do: "menzerath" + + @impl true + def analyze(pipeline_result, files_map, _opts) do + file_scores = + Map.new(files_map, fn {path, content} -> + {path, %{"menzerath" => score_file(content)}} + end) + + codebase_score = compute_codebase_score(pipeline_result) + + %{ + "files" => file_scores, + "codebase" => %{"menzerath" => codebase_score} + } + end + + # --- file-level scoring --- + + defp score_file("") do + %{ + "blocks" => [], + "mean_ratio" => 0.0, + "max_ratio" => 0.0, + "violation_count" => 0, + "insight" => "Empty file." + } + end + + defp score_file(content) do + file_lines = content |> String.split("\n") |> length() + root_tokens = TokenNormalizer.normalize_structural(content) + top_nodes = Parser.detect_blocks(root_tokens, Unknown) + + blocks = Enum.map(top_nodes, &score_node(&1, file_lines)) + all_ratios = collect_ratios(blocks) + n = length(all_ratios) + + mean_ratio = if(n == 0, do: 0.0, else: round4(Enum.sum(all_ratios) / n)) + max_ratio = if(n == 0, do: 0.0, else: round4(Enum.max(all_ratios))) + violation_count = Enum.count(all_ratios, &(&1 >= @violation_threshold)) + + %{ + "blocks" => blocks, + "mean_ratio" => mean_ratio, + "max_ratio" => max_ratio, + "violation_count" => violation_count, + "insight" => file_insight(mean_ratio, max_ratio, violation_count, length(top_nodes)) + } + end + + defp file_insight(_mean, _max, _violations, 0), + do: "No blocks detected." 
+ + defp file_insight(_mean, _max, 0, _block_count), + do: "Well decomposed — all blocks are small relative to their parents." + + defp file_insight(_mean, max_ratio, violations, _block_count) when max_ratio >= 0.9, + do: + "#{violations} block(s) nearly span the entire file — the file is not decomposed into meaningful pieces." + + defp file_insight(mean_ratio, _max, violations, _block_count) when mean_ratio >= 0.5, + do: + "#{violations} violation(s); blocks are large on average (mean ratio #{mean_ratio}) — the file likely needs to be split or its blocks extracted." + + defp file_insight(_mean, _max, violations, _block_count), + do: + "#{violations} block(s) dominate their parent context — consider extracting those into separate functions or modules." + + defp score_node(node, parent_lines) do + ratio = if parent_lines > 0, do: round4(node.line_count / parent_lines), else: 0.0 + + children = Enum.map(node.children, &score_node(&1, node.line_count)) + + base = %{ + "start_line" => node.start_line, + "end_line" => node.end_line, + "line_count" => node.line_count, + "parent_lines" => parent_lines, + "ratio" => ratio, + "insight" => block_insight(ratio, []), + "children" => children + } + + case children do + [] -> + base + + kids -> + child_ratios = Enum.map(kids, & &1["ratio"]) + avg = round4(Enum.sum(child_ratios) / length(child_ratios)) + + base + |> Map.put("avg_child_ratio", avg) + |> Map.put("insight", block_insight(ratio, avg_child_ratio: avg)) + end + end + + defp block_insight(ratio, opts) do + avg_child_ratio = Keyword.get(opts, :avg_child_ratio) + + cond do + ratio >= 0.9 -> + "Block spans nearly the entire parent — no meaningful decomposition at this level." + + (ratio >= @violation_threshold and avg_child_ratio) && + avg_child_ratio >= @violation_threshold -> + "Block is large relative to its parent and its own children are also large — nested decomposition failure." 
+ + ratio >= @violation_threshold -> + "Block is large relative to its parent — consider splitting or extracting." + + avg_child_ratio && avg_child_ratio >= @violation_threshold -> + "Block is reasonably sized but its children are too large — this block should be broken down further." + + true -> + nil + end + end + + defp collect_ratios(blocks) do + Enum.flat_map(blocks, fn block -> + [block["ratio"] | collect_ratios(block["children"])] + end) + end + + # --- codebase-level scoring --- + + defp compute_codebase_score(pipeline_result) do + pairs = + pipeline_result + |> Map.get("files", %{}) + |> Enum.flat_map(fn {_path, file_data} -> + fm = get_in(file_data, ["metrics", "function_metrics"]) || %{} + count = fm["function_count"] + avg = fm["avg_function_lines"] + + if is_number(count) and is_number(avg) and count > 0 do + [{count * 1.0, avg * 1.0}] + else + [] + end + end) + + n = length(pairs) + + if n < 3 do + %{ + "correlation" => nil, + "exponent" => nil, + "r_squared" => nil, + "sample_size" => n, + "insight" => + "Not enough files with function data to compute Menzerath conformance (need ≥ 3, got #{n})." + } + else + xs = Enum.map(pairs, &elem(&1, 0)) + ys = Enum.map(pairs, &elem(&1, 1)) + correlation = round4(pearson(xs, ys)) + {exponent, r_squared} = fit_power_law(xs, ys) + + %{ + "correlation" => correlation, + "exponent" => if(exponent, do: round4(exponent), else: nil), + "r_squared" => if(r_squared, do: round4(r_squared), else: nil), + "sample_size" => n, + "insight" => codebase_insight(correlation, r_squared) + } + end + end + + defp codebase_insight(correlation, r_squared) do + fit_quality = if r_squared && r_squared >= 0.5, do: " (strong fit, R²=#{r_squared})", else: "" + + cond do + correlation <= -0.3 -> + "Menzerath's law holds#{fit_quality} — larger files tend to have shorter functions, indicating healthy decomposition." + + correlation >= 0.3 -> + "Menzerath's law violated#{fit_quality} — larger files have longer functions. 
Files are growing without being decomposed; consider splitting large files or extracting functions." + + true -> + "Weak Menzerath signal (correlation #{correlation}) — no clear relationship between file size and function length. Decomposition patterns are inconsistent across the codebase." + end + end + + defp pearson(xs, ys) do + n = length(xs) + sum_x = Enum.sum(xs) + sum_y = Enum.sum(ys) + sum_xy = Enum.zip(xs, ys) |> Enum.reduce(0.0, fn {x, y}, acc -> acc + x * y end) + sum_x2 = Enum.reduce(xs, 0.0, fn x, acc -> acc + x * x end) + sum_y2 = Enum.reduce(ys, 0.0, fn y, acc -> acc + y * y end) + + num = n * sum_xy - sum_x * sum_y + den = :math.sqrt((n * sum_x2 - sum_x * sum_x) * (n * sum_y2 - sum_y * sum_y)) + + if den == 0.0, do: 0.0, else: num / den + end + + defp fit_power_law(xs, ys) do + # Linearize: log(y) = log(a) + b * log(x), fit via OLS on log-log scale + pairs = + Enum.zip(xs, ys) + |> Enum.filter(fn {x, y} -> x > 0 and y > 0 end) + + if length(pairs) < 2 do + {nil, nil} + else + log_xs = Enum.map(pairs, fn {x, _} -> :math.log(x) end) + log_ys = Enum.map(pairs, fn {_, y} -> :math.log(y) end) + + n = length(pairs) + sum_lx = Enum.sum(log_xs) + sum_ly = Enum.sum(log_ys) + sum_lx2 = Enum.reduce(log_xs, 0.0, fn x, acc -> acc + x * x end) + sum_lxly = Enum.zip(log_xs, log_ys) |> Enum.reduce(0.0, fn {x, y}, acc -> acc + x * y end) + + denom = n * sum_lx2 - sum_lx * sum_lx + + if denom == 0.0 do + {nil, nil} + else + fit_power_law_coefficients(log_xs, log_ys, sum_lx, sum_ly, sum_lxly, n, denom) + end + end + end + + defp fit_power_law_coefficients(log_xs, log_ys, sum_lx, sum_ly, sum_lxly, n, denom) do + b = (n * sum_lxly - sum_lx * sum_ly) / denom + log_a = (sum_ly - b * sum_lx) / n + mean_ly = sum_ly / n + + ss_tot = Enum.reduce(log_ys, 0.0, fn ly, acc -> acc + (ly - mean_ly) ** 2 end) + + ss_res = + Enum.zip(log_xs, log_ys) + |> Enum.reduce(0.0, fn {lx, ly}, acc -> + acc + (ly - (log_a + b * lx)) ** 2 + end) + + r_squared = if ss_tot == 0.0, do: 0.0, else: 
1.0 - ss_res / ss_tot + {b, r_squared} + end + + defp round4(v), do: Float.round(v * 1.0, 4) +end diff --git a/lib/codeqa/metrics/post_processing/post_processing_metric.ex b/lib/codeqa/metrics/post_processing/post_processing_metric.ex new file mode 100644 index 00000000..c4b7bc05 --- /dev/null +++ b/lib/codeqa/metrics/post_processing/post_processing_metric.ex @@ -0,0 +1,21 @@ +defmodule CodeQA.Metrics.PostProcessing.PostProcessingMetric do + @moduledoc """ + Behaviour for post-processing metrics that derive values from the full pipeline result. + + Post-processing metrics run after both file and codebase metrics complete. They receive + the full result tree and the raw files map, and return a partial result map that is + deep-merged into the pipeline result. + """ + + @doc "Unique name used as the key in the output." + @callback name() :: String.t() + + @doc """ + Analyze the pipeline result and return a partial result map to be deep-merged. + + The returned map should use the same top-level structure as the pipeline result: + `%{"files" => %{path => additions}, "codebase" => additions}`. + Only keys present in the return value are merged; absent keys are left unchanged. + """ + @callback analyze(pipeline_result :: map(), files_map :: map(), opts :: keyword()) :: map() +end diff --git a/lib/codeqa/metrics/token_normalizer.ex b/lib/codeqa/metrics/token_normalizer.ex deleted file mode 100644 index 6967e6a4..00000000 --- a/lib/codeqa/metrics/token_normalizer.ex +++ /dev/null @@ -1,45 +0,0 @@ -defmodule CodeQA.Metrics.TokenNormalizer do - @moduledoc """ - Abstracts raw source code into language-agnostic structural tokens. - - See [lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis). - """ - - # Note for future: This module can be extended with a second parameter - # normalize(code, language \\ :agnostic) to load specific regex dictionaries. - - @doc """ - Normalizes source code into a list of structural tokens. 
- - Replaces string literals with ``, numeric literals with ``, - and identifiers/keywords with ``. Remaining punctuation is split into - individual tokens, with common multi-character operators kept together. - - ## Examples - - iex> CodeQA.Metrics.TokenNormalizer.normalize("x = 42") - ["", "=", ""] - - """ - @spec normalize(String.t()) :: [String.t()] - def normalize(code) do - code - # 1. Strings (single and double quotes, handling escaped quotes) - |> String.replace(~r/"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'/, " ") - # 2. Numbers (integers and floats) - |> String.replace(~r/\b\d+(\.\d+)?\b/, " ") - # 3. Identifiers/Keywords (negative lookbehind/ahead to avoid clobbering // tags) - |> String.replace(~r/(?)/, " ") - # 4. Split by whitespace to extract the tokens and remaining structural punctuation - |> String.split(~r/\s+/, trim: true) - # 5. Further split punctuation, keeping common multi-char operators together - |> Enum.flat_map(&split_punctuation/1) - end - - defp split_punctuation(token) when token in ["", "", ""], do: [token] - - defp split_punctuation(text) do - Regex.scan(~r/->|=>|<>|\|>|::|\.\.\.|<-|!=|==|<=|>=|\+\+|--|&&|\|\||[^\w\s]/, text) - |> List.flatten() - end -end diff --git a/lib/codeqa/pipeline.ex b/lib/codeqa/pipeline.ex deleted file mode 100644 index bcd256cb..00000000 --- a/lib/codeqa/pipeline.ex +++ /dev/null @@ -1,109 +0,0 @@ -defmodule CodeQA.Pipeline do - @moduledoc "Pre-computed shared context for file-level metrics." - - defmodule FileContext do - @moduledoc "Immutable pre-computed data shared across all file metrics." 
- @enforce_keys [ - :content, - :tokens, - :token_counts, - :words, - :identifiers, - :lines, - :encoded, - :byte_count, - :line_count - ] - defstruct @enforce_keys - - @type t :: %__MODULE__{ - content: String.t(), - tokens: tuple(), - token_counts: map(), - words: tuple(), - identifiers: tuple(), - lines: tuple(), - encoded: String.t(), - byte_count: non_neg_integer(), - line_count: non_neg_integer() - } - end - - @word_re ~r/\b[a-zA-Z_]\w*\b/u - - # Reserved words and keywords for: - # Python, Ruby, JavaScript, Elixir, C#, - # Java, C++, Go, Rust, PHP, Swift, Shell, Kotlin - @keywords MapSet.new(~w[ - if else elif elsif unless - for foreach while until do - return break continue yield pass - try except finally rescue ensure after catch throw raise begin end throws - case when switch cond match default fallthrough - with as and or not in is - import from require use using alias namespace package - class def defp defmodule defmacro defmacrop defprotocol defimpl defguard defdelegate - module interface struct enum delegate event protocol extension - function fn func fun new delete typeof instanceof void - var let val const static public private protected internal - sealed override virtual abstract final readonly open - async await receive suspend - self super this Self - extends implements - null undefined nil None nullptr - true false True False - bool int float double long short byte char boolean string decimal object dynamic - ref out params get set value inout - lambda del global nonlocal assert - type typealias - synchronized volatile transient native strictfp - auto register extern signed unsigned typedef sizeof union - template typename operator inline friend explicit mutable constexpr decltype noexcept - func chan go select defer range - mut impl trait pub mod crate dyn unsafe loop where move - echo print array list mixed never - actor init deinit lazy open some any rethrows willSet didSet - then fi done esac local export source unset declare - fun val 
object data companion reified infix vararg expect actual - ]) - - @spec build_file_context(String.t(), keyword()) :: FileContext.t() - def build_file_context(content, opts \\ []) when is_binary(content) do - stopwords = Keyword.get(opts, :word_stopwords, MapSet.new()) - - tokens = content |> String.split() |> List.to_tuple() - token_list = Tuple.to_list(tokens) - token_counts = Enum.frequencies(token_list) - - words = - Regex.scan(@word_re, content) - |> List.flatten() - |> Enum.reject(&MapSet.member?(stopwords, &1)) - |> List.to_tuple() - - word_list = Tuple.to_list(words) - identifiers = word_list |> Enum.reject(&MapSet.member?(@keywords, &1)) |> List.to_tuple() - lines = content |> String.split("\n") |> trim_trailing_empty() |> List.to_tuple() - encoded = content - - %FileContext{ - content: content, - tokens: tokens, - token_counts: token_counts, - words: words, - identifiers: identifiers, - lines: lines, - encoded: encoded, - byte_count: byte_size(content), - line_count: tuple_size(lines) - } - end - - defp trim_trailing_empty(lines) do - # Match Python's str.splitlines() behavior - case List.last(lines) do - "" -> List.delete_at(lines, -1) - _ -> lines - end - end -end diff --git a/lib/codeqa/stopwords.ex b/lib/codeqa/stopwords.ex deleted file mode 100644 index bd33374f..00000000 --- a/lib/codeqa/stopwords.ex +++ /dev/null @@ -1,63 +0,0 @@ -defmodule CodeQA.Stopwords do - @moduledoc "Finds highly frequent items across a codebase to act as stopwords." - - @doc """ - Finds items that appear in more than the specified threshold of files. - `extractor` is a function that takes a file's content and returns an Enumerable of items. 
- """ - def find_stopwords(files, extractor, opts \\ []) do - threshold_ratio = Keyword.get(opts, :stopwords_threshold, 0.15) - total_docs = map_size(files) - min_docs = max(1, round(total_docs * threshold_ratio)) - workers = Keyword.get(opts, :workers, System.schedulers_online()) - has_progress = Keyword.get(opts, :progress, false) - label = Keyword.get(opts, :progress_label, "") - - counter = :counters.new(1, [:atomics]) - start_time = System.monotonic_time(:millisecond) - - files - |> Task.async_stream( - fn {_path, content} -> - res = - content - |> extractor.() - |> MapSet.new() - - if has_progress do - :counters.add(counter, 1, 1) - completed = :counters.get(counter, 1) - print_progress(completed, total_docs, start_time, label) - end - - res - end, max_concurrency: workers, timeout: :infinity) - |> Enum.reduce(%{}, fn {:ok, unique_items_in_file}, doc_freqs -> - Enum.reduce(unique_items_in_file, doc_freqs, fn item, acc -> - Map.update(acc, item, 1, &(&1 + 1)) - end) - end) - |> Enum.filter(fn {_item, count} -> count >= min_docs end) - |> Enum.map(fn {item, _count} -> item end) - |> MapSet.new() - end - - defp print_progress(completed, total, start_time, label) do - now = System.monotonic_time(:millisecond) - elapsed = max(now - start_time, 1) - avg_time = elapsed / completed - eta_ms = round((total - completed) * avg_time) - - output = - CodeQA.CLI.UI.progress_bar(completed, total, - eta: CodeQA.CLI.UI.format_eta(eta_ms), - label: label - ) - - IO.write(:stderr, "\r" <> output) - - if completed == total do - IO.puts(:stderr, "") - end - end -end diff --git a/lib/codeqa/summarizer.ex b/lib/codeqa/summarizer.ex deleted file mode 100644 index d6d9c924..00000000 --- a/lib/codeqa/summarizer.ex +++ /dev/null @@ -1,126 +0,0 @@ -defmodule CodeQA.Summarizer do - @moduledoc false - - @codebase_direction_metrics [ - {"complexity", "halstead", "mean_volume"}, - {"readability", "readability", "mean_flesch_adapted"}, - {"entropy", "entropy", "mean_char_entropy"}, - 
{"redundancy", "compression", "mean_redundancy"} - ] - - @file_direction_metrics [ - {"complexity", "halstead", "volume"}, - {"readability", "readability", "flesch_adapted"}, - {"entropy", "entropy", "char_entropy"}, - {"redundancy", "compression", "redundancy"} - ] - - @threshold_stable 0.05 - @threshold_slight 0.20 - - def summarize_codebase(comparison) do - files = Map.get(comparison, "files", %{}) - codebase = Map.get(comparison, "codebase", %{}) - - file_counts = count_statuses(files) - directions = compute_codebase_directions(codebase) - gist = build_codebase_gist(file_counts, directions) - - %{"gist" => gist, "file_counts" => file_counts, "directions" => directions} - end - - def summarize_file(_path, %{"status" => "added"} = data) do - lines = get_in(data, ["head", "lines"]) || 0 - %{"gist" => "new file (#{lines} lines)", "status" => "added", "lines" => lines} - end - - def summarize_file(_path, %{"status" => "deleted"} = data) do - lines = get_in(data, ["base", "lines"]) || 0 - %{"gist" => "removed (#{lines} lines)", "status" => "deleted", "lines" => lines} - end - - def summarize_file(_path, %{"status" => "modified"} = data) do - directions = compute_file_directions(data) - gist = build_file_gist(directions) - %{"gist" => gist, "status" => "modified", "directions" => directions} - end - - defp count_statuses(files) do - Enum.reduce(files, %{"added" => 0, "modified" => 0, "deleted" => 0}, fn {_path, data}, acc -> - status = Map.get(data, "status", "modified") - Map.update!(acc, status, &(&1 + 1)) - end) - end - - defp compute_codebase_directions(codebase) do - base_agg = get_in(codebase, ["base", "aggregate"]) || %{} - delta_agg = get_in(codebase, ["delta", "aggregate"]) || %{} - - Map.new(@codebase_direction_metrics, fn {dir_key, metric, agg_key} -> - base_val = get_in(base_agg, [metric, agg_key]) - delta_val = get_in(delta_agg, [metric, agg_key]) - {dir_key, classify_change(base_val, delta_val)} - end) - end - - defp compute_file_directions(file_data) do 
- base_metrics = get_in(file_data, ["base", "metrics"]) || %{} - delta_metrics = get_in(file_data, ["delta", "metrics"]) || %{} - - Map.new(@file_direction_metrics, fn {dir_key, metric, key} -> - base_val = get_in(base_metrics, [metric, key]) - delta_val = get_in(delta_metrics, [metric, key]) - {dir_key, classify_change(base_val, delta_val)} - end) - end - - defp classify_change(nil, _), do: "stable" - defp classify_change(_, nil), do: "stable" - defp classify_change(0, _), do: "stable" - defp classify_change(+0.0, _), do: "stable" - - defp classify_change(base_val, delta_val) do - ratio = abs(delta_val) / abs(base_val) - - cond do - ratio < @threshold_stable -> "stable" - ratio < @threshold_slight and delta_val > 0 -> "increased slightly" - ratio < @threshold_slight -> "decreased slightly" - delta_val > 0 -> "increased" - true -> "decreased" - end - end - - defp build_file_gist(directions) do - parts = - directions - |> Enum.reject(fn {_, d} -> d == "stable" end) - |> Enum.map(fn {k, d} -> "#{k} #{d}" end) - - if parts == [], do: "all metrics stable", else: Enum.join(parts, ", ") - end - - defp build_codebase_gist(file_counts, directions) do - file_parts = - [ - {"added", file_counts["added"]}, - {"modified", file_counts["modified"]}, - {"deleted", file_counts["deleted"]} - ] - |> Enum.filter(fn {_, c} -> c > 0 end) - |> Enum.map(fn {s, c} -> "#{c} #{s}" end) - - file_summary = if file_parts == [], do: "no changes", else: Enum.join(file_parts, ", ") - - dir_parts = - directions - |> Enum.reject(fn {_, d} -> d == "stable" end) - |> Enum.map(fn {k, d} -> "#{k} #{d}" end) - - if dir_parts == [] do - "#{file_summary} — all metrics stable" - else - "#{file_summary} — #{Enum.join(dir_parts, ", ")}" - end - end -end diff --git a/lib/codeqa/telemetry.ex b/lib/codeqa/telemetry.ex deleted file mode 100644 index 3f5d22d0..00000000 --- a/lib/codeqa/telemetry.ex +++ /dev/null @@ -1,68 +0,0 @@ -defmodule CodeQA.Telemetry do - @moduledoc "Simple concurrent telemetry tracker 
using ETS." - - def setup do - if :ets.info(:codeqa_telemetry) == :undefined do - :ets.new(:codeqa_telemetry, [:named_table, :public, :set, write_concurrency: true]) - end - - :ok - end - - def time(metric_name, fun) do - if :ets.info(:codeqa_telemetry) != :undefined do - start_time = System.monotonic_time(:microsecond) - result = fun.() - end_time = System.monotonic_time(:microsecond) - duration = end_time - start_time - - :ets.update_counter(:codeqa_telemetry, metric_name, {2, duration}, {metric_name, 0}) - - :ets.update_counter( - :codeqa_telemetry, - "#{metric_name}_count", - {2, 1}, - {"#{metric_name}_count", 0} - ) - - result - else - fun.() - end - end - - defp format_metric_line({name, total_time_us}) do - count = - case :ets.lookup(:codeqa_telemetry, "#{name}_count") do - [{_, c}] -> c - _ -> 1 - end - - total_ms = Float.round(total_time_us / 1000, 2) - avg_ms = Float.round(total_ms / count, 2) - - String.pad_trailing(to_string(name), 30) <> - " | Total: #{String.pad_trailing(to_string(total_ms) <> "ms", 12)}" <> - " | Count: #{String.pad_trailing(to_string(count), 6)}" <> - " | Avg: #{avg_ms}ms" - end - - def print_report do - if :ets.info(:codeqa_telemetry) != :undefined do - IO.puts(:stderr, " ---- Telemetry Report (Wall-clock times) ---") - metrics = :ets.tab2list(:codeqa_telemetry) - - # Group totals and counts - totals = - Enum.filter(metrics, fn {k, _} -> not String.ends_with?(to_string(k), "_count") end) - - totals - |> Enum.sort_by(fn {_, time} -> time end, :desc) - |> Enum.each(&IO.puts(:stderr, format_metric_line(&1))) - - IO.puts(:stderr, "------------------------------------------- -") - end - end -end diff --git a/lib/mix/tasks/codeqa/sample_report.ex b/lib/mix/tasks/codeqa/sample_report.ex new file mode 100644 index 00000000..1bc5cf0d --- /dev/null +++ b/lib/mix/tasks/codeqa/sample_report.ex @@ -0,0 +1,210 @@ +defmodule Mix.Tasks.Codeqa.SampleReport do + use Mix.Task + + @shortdoc "Evaluates combined metric formulas against good/bad sample 
code" + + @moduledoc """ + Runs combined metric formulas against sample files and prints a separation table. + + mix codeqa.sample_report + mix codeqa.sample_report --category variable_naming + mix codeqa.sample_report --verbose + mix codeqa.sample_report --output results.json + mix codeqa.sample_report --apply-scalars + mix codeqa.sample_report --apply-languages + mix codeqa.sample_report --apply-languages --category variable_naming + mix codeqa.sample_report --file path/to/file.ex + + A ratio ≥ 2x means the formula meaningfully separates good from bad code. + A ratio < 1.5x is flagged as weak; < 1.0x is marked ✗ (wrong direction). + + `--apply-scalars` rewrites the YAML config files with suggested scalars derived + from the sample data. Metrics with ratio in the deadzone (0.995–1.005) are + excluded. All non-deadzoned metrics are written, including ones not previously + in the YAML. + + `--file` analyzes a single file or directory and prints all combined metric + behavior scores, grouped by category, sorted worst-first. 
+ """ + + @switches [ + category: :string, + verbose: :boolean, + output: :string, + report: :string, + apply_scalars: :boolean, + apply_languages: :boolean, + file: :string, + top: :integer + ] + + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + + def run(args) do + Mix.Task.run("app.start") + {opts, _, _} = OptionParser.parse(args, switches: @switches) + + results = SampleRunner.run(opts) + + results + |> Enum.group_by(& &1.category) + |> Enum.each(&print_category(&1, opts)) + + if path = opts[:output] do + File.write!(path, Jason.encode!(results, pretty: true)) + IO.puts("\nResults written to #{path}") + end + + if path = opts[:report] do + report = SampleRunner.build_metric_report(opts) + File.write!(path, Jason.encode!(report, pretty: true)) + IO.puts("\nMetric report written to #{path}") + end + + if opts[:apply_scalars] do + stats = SampleRunner.apply_scalars(opts) + IO.puts("\nApplied scalars to YAML configs:") + Enum.each(stats, &print_scalar_stats/1) + end + + if opts[:apply_languages] do + stats = SampleRunner.apply_languages(opts) + IO.puts("\nApplied language coverage to YAML configs:") + + Enum.each(stats, fn %{category: cat, behaviors_with_languages: n} -> + IO.puts(" #{cat}: #{n} behaviors with language coverage") + end) + end + + if path = opts[:file] do + print_file_scores(path, opts) + end + end + + defp print_category({category, results}, opts) do + IO.puts("\n#{category}") + IO.puts(String.duplicate("-", 75)) + + IO.puts( + " " <> + pad("behavior", 35) <> + pad("bad", 9) <> + pad("good", 9) <> + pad("ratio", 13) <> + "ok?" 
+ ) + + Enum.each(results, &print_row(&1, opts)) + end + + defp print_row(r, opts) do + ratio_str = + "#{r.ratio}x" <> + cond do + not r.direction_ok -> "" + r.ratio < 1.5 -> " (weak)" + true -> "" + end + + ok = if r.direction_ok, do: "✓", else: "✗" + + IO.puts( + " " <> + pad(r.behavior, 35) <> + pad(fmt(r.bad_score), 9) <> + pad(fmt(r.good_score), 9) <> + pad(ratio_str, 13) <> + ok + ) + + if opts[:verbose] do + Enum.each(r.metric_detail, &print_metric_detail/1) + end + end + + defp print_metric_detail(m) do + scalar_str = if m.scalar >= 0, do: "+#{m.scalar}", else: "#{m.scalar}" + + IO.puts( + " " <> + pad("#{m.group}.#{m.key}", 45) <> + pad(scalar_str, 7) <> + pad(fmt(m.bad), 8) <> + pad(fmt(m.good), 8) <> + "#{m.ratio}x" + ) + end + + defp print_file_scores(path, opts) do + expanded = Path.expand(path) + + files = + cond do + File.dir?(expanded) -> + Collector.collect_files(expanded) + + File.regular?(expanded) -> + %{Path.basename(expanded) => File.read!(expanded)} + + true -> + IO.puts("\nPath not found: #{path}") + nil + end + + if files && map_size(files) > 0 do + IO.puts("\nAnalyzing #{map_size(files)} file(s) at: #{path}") + + aggregate = + files + |> Analyzer.analyze_codebase() + |> get_in(["codebase", "aggregate"]) + + top_n = opts[:top] || 15 + issues = SampleRunner.diagnose_aggregate(aggregate, top: top_n) + IO.puts("\nTop #{top_n} likely issues (by cosine similarity):") + IO.puts(String.duplicate("-", 75)) + IO.puts(" " <> pad("behavior", 38) <> pad("cosine", 9) <> "score") + Enum.each(issues, &print_issue_row/1) + + IO.puts("\nFull breakdown by category:") + combined = SampleRunner.score_aggregate(aggregate) + IO.puts("") + Enum.each(combined, &print_combined_category/1) + else + IO.puts("\nNo supported files found at: #{path}") + end + end + + defp print_issue_row(%{category: cat, behavior: b, cosine: cos, score: s, top_metrics: metrics}) do + IO.puts(" " <> pad("#{cat}.#{b}", 38) <> pad(fmt(cos), 9) <> fmt(s)) + + Enum.each(metrics, fn %{metric: 
m, contribution: c} -> + IO.puts(" " <> pad(m, 44) <> fmt(c)) + end) + end + + defp print_combined_category(%{name: name, behaviors: behaviors}) do + IO.puts(name) + IO.puts(String.duplicate("-", 60)) + + IO.puts(" " <> pad("behavior", 40) <> "score") + + behaviors + |> Enum.sort_by(& &1.score) + |> Enum.each(fn %{behavior: b, score: s} -> + flag = if s < 0.0, do: " ⚠", else: "" + IO.puts(" " <> pad(b, 40) <> fmt(s) <> flag) + end) + + IO.puts("") + end + + defp print_scalar_stats(%{category: cat, updated: u, deadzoned: d, skipped: s}) do + IO.puts(" #{pad(cat, 30)} #{u} written #{d} deadzoned #{s} skipped (no samples)") + end + + defp fmt(f), do: :erlang.float_to_binary(f / 1, decimals: 4) + defp pad(s, n), do: String.pad_trailing(to_string(s), n) +end diff --git a/lib/mix/tasks/codeqa/signal_debug.ex b/lib/mix/tasks/codeqa/signal_debug.ex new file mode 100644 index 00000000..3852dec5 --- /dev/null +++ b/lib/mix/tasks/codeqa/signal_debug.ex @@ -0,0 +1,183 @@ +defmodule Mix.Tasks.Codeqa.SignalDebug do + use Mix.Task + + @shortdoc "Shows structural signal emissions when splitting a file into blocks" + + @moduledoc """ + Runs each structural signal over a file and prints its emissions step by step. + + mix codeqa.signal_debug path/to/file.ex + mix codeqa.signal_debug path/to/file.py --signal keyword + mix codeqa.signal_debug path/to/file.ex --show-tokens + + Options: + --signal Only show a specific signal (e.g. 
keyword, blank, bracket) + --show-tokens Print the full token list before signal output + """ + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.Language + + alias CodeQA.AST.Signals.Structural.{ + AccessModifierSignal, + BlankLineSignal, + BracketSignal, + BranchSplitSignal, + ColonIndentSignal, + CommentDividerSignal, + KeywordSignal, + SQLBlockSignal, + TripleQuoteSignal + } + + @switches [signal: :string, show_tokens: :boolean] + + @all_signals [ + %TripleQuoteSignal{}, + %BlankLineSignal{}, + %KeywordSignal{}, + %BranchSplitSignal{}, + %AccessModifierSignal{}, + %CommentDividerSignal{}, + %SQLBlockSignal{}, + %BracketSignal{}, + %ColonIndentSignal{} + ] + + @impl Mix.Task + def run(args) do + {opts, positional, _} = OptionParser.parse(args, strict: @switches) + + path = + case positional do + [p | _] -> p + [] -> Mix.raise("Usage: mix codeqa.signal_debug [--signal ] [--show-tokens]") + end + + unless File.exists?(path), do: Mix.raise("File not found: #{path}") + + content = File.read!(path) + lang_mod = Language.detect(path) + tokens = TokenNormalizer.normalize_structural(content) + lines = String.split(content, "\n") + + Mix.shell().info("File: #{path}") + Mix.shell().info("Language: #{lang_mod.name()}") + Mix.shell().info("Tokens: #{length(tokens)}") + Mix.shell().info("Lines: #{length(lines)}") + Mix.shell().info("") + + if opts[:show_tokens] do + print_tokens(tokens) + end + + signals = filter_signals(@all_signals, opts[:signal]) + + emissions_per_signal = + SignalStream.run(tokens, signals, lang_mod) + + Enum.zip(signals, emissions_per_signal) + |> Enum.each(fn {signal, emissions} -> + print_signal_section(signal, emissions, tokens, lines) + end) + end + + defp filter_signals(signals, nil), do: signals + + defp filter_signals(signals, name_filter) do + Enum.filter(signals, fn signal -> + module_name = + signal.__struct__ + |> Module.split() + |> List.last() + |> String.downcase() + + 
String.contains?(module_name, String.downcase(name_filter)) + end) + end + + defp print_tokens(tokens) do + Mix.shell().info("=== TOKEN LIST ===") + + tokens + |> Enum.with_index() + |> Enum.each(fn {token, idx} -> + Mix.shell().info( + " [#{idx}] line #{token.line} col #{token.col} #{inspect(token.kind)} #{inspect(token.content)}" + ) + end) + + Mix.shell().info("") + end + + defp print_signal_section(signal, emissions, tokens, lines) do + name = signal.__struct__ |> Module.split() |> List.last() + separator = String.duplicate("─", 60) + + Mix.shell().info(separator) + Mix.shell().info("SIGNAL: #{name}") + Mix.shell().info("Emissions: #{length(emissions)}") + Mix.shell().info("") + + if Enum.empty?(emissions) do + Mix.shell().info(" (no emissions)") + else + Enum.each(emissions, fn {_source, group, emission_name, value} -> + print_emission(group, emission_name, value, tokens, lines) + end) + end + + Mix.shell().info("") + end + + defp print_emission(:split, name, token_idx, tokens, lines) do + token = Enum.at(tokens, token_idx) + + line_num = token && token.line + line_src = line_num && Enum.at(lines, line_num - 1) + + Mix.shell().info(" [SPLIT :#{name}] token[#{token_idx}] → line #{line_num}") + + if line_src do + Mix.shell().info(" #{String.trim_trailing(line_src)}") + end + + if token do + Mix.shell().info(" ^ #{inspect(token.kind)} #{inspect(token.content)}") + end + + Mix.shell().info("") + end + + defp print_emission(:enclosure, name, {start_idx, end_idx}, tokens, lines) do + start_token = Enum.at(tokens, start_idx) + end_token = Enum.at(tokens, end_idx) + + start_line = start_token && start_token.line + end_line = end_token && end_token.line + + Mix.shell().info( + " [ENCLOSURE :#{name}] tokens[#{start_idx}..#{end_idx}] lines #{start_line}–#{end_line}" + ) + + if start_line do + Mix.shell().info( + " open: #{inspect(Enum.at(lines, start_line - 1) |> String.trim_trailing())}" + ) + end + + if end_line && end_line != start_line do + Mix.shell().info( + " 
close: #{inspect(Enum.at(lines, end_line - 1) |> String.trim_trailing())}" + ) + end + + Mix.shell().info("") + end + + defp print_emission(group, name, value, _tokens, _lines) do + Mix.shell().info(" [:#{group} :#{name}] #{inspect(value)}") + Mix.shell().info("") + end +end diff --git a/mix.exs b/mix.exs index 2e55bfe8..cb2f4133 100644 --- a/mix.exs +++ b/mix.exs @@ -11,7 +11,13 @@ defmodule CodeQA.MixProject do escript: [main_module: CodeQA.CLI], elixirc_paths: elixirc_paths(Mix.env()), preferred_envs: [precommit: :test], - aliases: aliases() + aliases: aliases(), + dialyzer: [ + ignore_warnings: ".dialyzer_ignore.exs", + plt_local_path: "priv/plts", + plt_core_path: "priv/plts" + ], + consolidate_protocols: Mix.env() != :test ] end @@ -30,6 +36,12 @@ defmodule CodeQA.MixProject do "compile --warnings-as-errors", "deps.unlock --unused", "format" + ], + health: [ + "run -e 'CodeQA.CLI.main([\"health-report\", \".\", \"--ignore-paths\", \"test/**\"])'" + ], + "health.progress": [ + "run -e 'CodeQA.CLI.main([\"health-report\", \".\", \"--ignore-paths\", \"test/**\", \"--progress\"])'" ] ] end diff --git a/priv/combined_metrics/code_smells.yml b/priv/combined_metrics/code_smells.yml new file mode 100644 index 00000000..cd4e559b --- /dev/null +++ b/priv/combined_metrics/code_smells.yml @@ -0,0 +1,592 @@ +consistent_string_quote_style: + _doc: "Files should use a single, consistent string quoting style throughout." 
+ _languages: [elixir] + _log_baseline: -18.9887 + branching: + mean_branching_density: 0.0243 + mean_non_blank_count: -0.0248 + brevity: + mean_sample_size: -0.0656 + casing_entropy: + mean_entropy: -0.0405 + mean_pascal_case_count: -0.1743 + mean_snake_case_count: -0.0505 + compression: + mean_raw_bytes: -0.0672 + mean_redundancy: 0.0207 + mean_unique_line_ratio: -0.0338 + mean_zlib_bytes: -0.1085 + mean_zlib_ratio: 0.0413 + entropy: + mean_char_max_entropy: -0.0077 + mean_token_entropy: -0.0178 + mean_token_max_entropy: -0.0141 + mean_total_tokens: -0.0783 + mean_vocab_size: -0.0656 + function_metrics: + mean_avg_function_lines: -0.0226 + mean_max_function_lines: -0.0644 + halstead: + mean_N1_total_operators: -0.1087 + mean_N2_total_operands: -0.2297 + mean_difficulty: -0.1017 + mean_effort: -0.2949 + mean_estimated_bugs: -0.1934 + mean_length: -0.1541 + mean_n1_unique_operators: -0.0790 + mean_n2_unique_operands: -0.2071 + mean_time_to_implement_seconds: -0.2949 + mean_vocabulary: -0.1721 + mean_volume: -0.1933 + heaps: + mean_k: -0.0368 + mean_r_squared: -0.0080 + identifier_length_variance: + mean_mean: 0.0059 + mean_std_dev: 0.0235 + mean_variance: 0.0470 + indentation: + mean_blank_line_ratio: 0.0205 + mean_variance: 0.0246 + line_patterns: + mean_blank_line_ratio: 0.0205 + mean_string_literal_ratio: -0.1616 + mean_unique_line_ratio: -0.0365 + magic_number_density: + mean_string_literal_ratio: 0.3018 + near_duplicate_blocks_file: + mean_near_dup_block_d2: 0.9542 + mean_near_dup_block_d3: -0.6021 + mean_near_dup_block_d4: -0.9542 + mean_near_dup_block_d5: 0.9542 + mean_near_dup_block_d6: -0.6021 + mean_sub_block_count: -0.1804 + ngram: + mean_bigram_hapax_fraction: -0.0101 + mean_bigram_repeated_unique: -0.0915 + mean_bigram_repetition_rate: 0.0277 + mean_bigram_total: -0.0785 + mean_bigram_unique: -0.1146 + mean_trigram_repeated_unique: -0.1104 + mean_trigram_repetition_rate: 0.0499 + mean_trigram_total: -0.0787 + mean_trigram_unique: -0.1182 + 
punctuation_density: + mean_arrow_density: 0.0674 + mean_bracket_nonalpha_prefix_count: 1.0103 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: 0.0458 + mean_dot_count: -0.1743 + mean_exclamation_density: 0.0424 + mean_id_nonalpha_suffix_density: 0.0783 + readability: + mean_avg_line_length: -0.0444 + mean_avg_tokens_per_line: -0.0535 + mean_flesch_adapted: 0.0046 + mean_fog_adapted: -0.0301 + mean_total_lines: -0.0248 + separator_counts: + mean_dot_count: -0.1743 + mean_underscore_count: -0.0644 + symbol_density: + mean_density: -0.0325 + mean_distinct_symbol_types: -0.0966 + mean_symbol_count: -0.0999 + vocabulary: + mean_mattr: -0.0187 + mean_raw_ttr: -0.0050 + mean_total_identifiers: -0.0666 + mean_unique_identifiers: -0.0714 + vowel_density: + mean_total_chars: -0.0607 + zipf: + mean_total_tokens: -0.0783 + mean_vocab_size: -0.0656 + +no_dead_code_after_return: + _doc: "There should be no unreachable statements after a return or early exit." + _languages: [elixir] + _log_baseline: -62.7495 + branching: + mean_branch_count: -2.0000 + mean_branching_density: -1.4201 + mean_non_blank_count: -0.5815 + brevity: + mean_sample_size: -0.2610 + casing_entropy: + mean_entropy: -0.2430 + mean_other_count: -0.8708 + mean_pascal_case_count: -0.5752 + mean_snake_case_count: -0.3559 + compression: + mean_raw_bytes: -0.4531 + mean_redundancy: -0.0467 + mean_zlib_bytes: -0.3558 + mean_zlib_ratio: -0.0974 + entropy: + mean_char_entropy: 0.0250 + mean_char_max_entropy: -0.0205 + mean_char_normalized: 0.0455 + mean_token_entropy: -0.0475 + mean_token_max_entropy: -0.0575 + mean_token_normalized: 0.0099 + mean_total_tokens: -0.3093 + mean_vocab_size: -0.2610 + function_metrics: + mean_avg_function_lines: -0.4255 + mean_avg_param_count: 0.1143 + mean_function_count: -0.1143 + mean_max_function_lines: -0.5062 + halstead: + mean_N1_total_operators: -0.2185 + mean_N2_total_operands: -0.4051 + mean_difficulty: -0.1769 + mean_effort: -0.5126 + 
mean_estimated_bugs: -0.3357 + mean_length: -0.2795 + mean_n1_unique_operators: -0.0857 + mean_n2_unique_operands: -0.3139 + mean_time_to_implement_seconds: -0.5126 + mean_vocabulary: -0.2525 + mean_volume: -0.3357 + heaps: + mean_k: -0.1169 + identifier_length_variance: + mean_max: -0.4367 + mean_mean: -0.0159 + mean_std_dev: -0.2804 + mean_variance: -0.5607 + indentation: + mean_blank_line_ratio: 0.2883 + mean_mean_depth: -0.4448 + mean_variance: -0.6173 + line_patterns: + mean_blank_line_ratio: 0.2883 + mean_string_literal_ratio: -0.8289 + mean_unique_line_ratio: -0.0289 + magic_number_density: + mean_density: 0.2821 + mean_string_literal_ratio: -0.8289 + near_duplicate_blocks_file: + mean_block_count: -0.1083 + mean_near_dup_block_d0: 1.1292 + mean_near_dup_block_d5: 1.1292 + mean_near_dup_block_d7: -0.7124 + mean_near_dup_block_d8: 1.1292 + mean_sub_block_count: -0.3612 + ngram: + mean_bigram_hapax_fraction: 0.0142 + mean_bigram_repeated_unique: -0.3335 + mean_bigram_repetition_rate: -0.0114 + mean_bigram_total: -0.3100 + mean_bigram_unique: -0.3022 + mean_trigram_hapax_fraction: -0.0576 + mean_trigram_repeated_unique: -0.0894 + mean_trigram_repetition_rate: 0.0890 + mean_trigram_total: -0.3107 + mean_trigram_unique: -0.3313 + punctuation_density: + mean_arrow_density: -1.1156 + mean_bracket_nonalpha_prefix_count: 1.0397 + mean_bracket_nonalpha_suffix_count: -0.4541 + mean_colon_suffix_density: 0.3588 + mean_dot_count: -1.0081 + mean_id_nonalpha_suffix_density: 0.0111 + readability: + mean_avg_line_length: 0.1309 + mean_avg_sub_words_per_id: -0.0415 + mean_avg_tokens_per_line: 0.2722 + mean_flesch_adapted: 0.0243 + mean_fog_adapted: -0.3299 + mean_total_lines: -0.5815 + separator_counts: + mean_dot_count: -1.0081 + mean_hyphen_count: -1.1292 + mean_underscore_count: -0.6750 + symbol_density: + mean_density: 0.2141 + mean_symbol_count: -0.2386 + vocabulary: + mean_mattr: -0.0424 + mean_raw_ttr: 0.0435 + mean_total_identifiers: -0.4061 + mean_unique_identifiers: 
-0.3626 + vowel_density: + mean_total_chars: -0.4220 + zipf: + mean_exponent: -0.0067 + mean_total_tokens: -0.3093 + mean_vocab_size: -0.2610 + +no_debug_print_statements: + _doc: "Debug output (`console.log`, `IO.inspect`, `fmt.Println`) must not be left in committed code." + _languages: [elixir, go, javascript, python, ruby] + _log_baseline: -88.2885 + branching: + mean_branch_count: 0.2378 + mean_branching_density: 0.7072 + mean_max_nesting_depth: 0.1175 + mean_non_blank_count: -0.3222 + brevity: + mean_sample_size: -0.0776 + casing_entropy: + mean_camel_case_count: -0.4777 + mean_entropy: 0.2378 + mean_other_count: 0.0409 + mean_pascal_case_count: -0.5178 + mean_snake_case_count: -0.7116 + compression: + mean_raw_bytes: -0.3758 + mean_redundancy: -0.0974 + mean_unique_line_ratio: -0.0654 + mean_zlib_bytes: -0.2114 + mean_zlib_ratio: -0.1430 + entropy: + mean_char_entropy: 0.0136 + mean_char_max_entropy: 0.0206 + mean_token_entropy: 0.0326 + mean_token_max_entropy: -0.0104 + mean_token_normalized: 0.0414 + mean_total_tokens: -0.4317 + mean_vocab_size: -0.0776 + function_metrics: + mean_avg_function_lines: -0.5945 + mean_avg_param_count: -0.0572 + mean_function_count: 0.0513 + mean_max_function_lines: -0.5630 + halstead: + mean_N1_total_operators: -0.2976 + mean_N2_total_operands: -0.3928 + mean_difficulty: -0.1881 + mean_effort: -0.6575 + mean_estimated_bugs: -0.3863 + mean_length: -0.3347 + mean_n1_unique_operators: -0.0361 + mean_n2_unique_operands: -0.2661 + mean_time_to_implement_seconds: -0.6575 + mean_vocabulary: -0.1934 + mean_volume: -0.3862 + heaps: + mean_beta: 0.0567 + mean_k: 0.0771 + mean_r_squared: -0.0186 + identifier_length_variance: + mean_max: -0.0146 + mean_mean: 0.0926 + mean_std_dev: -0.0267 + mean_variance: -0.0755 + indentation: + mean_blank_line_ratio: 0.1672 + mean_max_depth: -0.1656 + mean_mean_depth: -0.2127 + mean_variance: 0.1646 + line_patterns: + mean_blank_line_ratio: 0.1672 + mean_max_nesting_depth: 0.1175 + 
mean_string_literal_ratio: -0.6400 + mean_unique_line_ratio: -0.0422 + magic_number_density: + mean_density: 0.8678 + mean_magic_number_count: 0.3203 + mean_string_literal_ratio: -0.6501 + near_duplicate_blocks_file: + mean_near_dup_block_d0: -0.6126 + mean_near_dup_block_d5: 1.2615 + mean_near_dup_block_d6: 0.5236 + mean_near_dup_block_d7: 0.1585 + mean_near_dup_block_d8: 1.0702 + mean_sub_block_count: 0.1879 + ngram: + mean_bigram_repeated_unique: -0.1860 + mean_bigram_repetition_rate: -0.1042 + mean_bigram_total: -0.4331 + mean_bigram_unique: -0.2201 + mean_trigram_hapax_fraction: -0.0213 + mean_trigram_repeated_unique: -0.1930 + mean_trigram_repetition_rate: -0.0683 + mean_trigram_total: -0.4345 + mean_trigram_unique: -0.3080 + punctuation_density: + mean_arrow_density: 0.8097 + mean_bracket_nonalpha_prefix_count: -1.6350 + mean_bracket_nonalpha_suffix_count: -1.0342 + mean_colon_suffix_density: -0.4483 + mean_dot_count: -0.4489 + mean_exclamation_density: 1.6684 + mean_id_nonalpha_suffix_density: -0.1102 + readability: + mean_avg_line_length: -0.0110 + mean_avg_sub_words_per_id: 0.0409 + mean_avg_tokens_per_line: -0.0639 + mean_flesch_adapted: -0.0331 + mean_fog_adapted: -0.0304 + mean_total_lines: -0.3222 + separator_counts: + mean_dot_count: -0.4489 + mean_hyphen_count: 0.1350 + mean_slash_count: 2.0000 + mean_underscore_count: 0.1201 + symbol_density: + mean_density: -0.0960 + mean_distinct_symbol_types: 0.0354 + mean_symbol_count: -0.4722 + vocabulary: + mean_mattr: 0.3241 + mean_raw_ttr: 0.3641 + mean_total_identifiers: -0.5984 + mean_unique_identifiers: -0.1231 + vowel_density: + mean_total_chars: -0.4638 + zipf: + mean_exponent: -0.1608 + mean_total_tokens: -0.4317 + mean_vocab_size: -0.0776 + +no_fixme_comments: + _doc: "FIXME, XXX, and HACK comments indicate known problems that should be resolved before merging." 
+ _languages: [elixir, go, javascript, python, ruby] + _log_baseline: 11.3113 + branching: + mean_branch_count: 0.1713 + mean_branching_density: 0.1042 + mean_max_nesting_depth: 0.0518 + mean_non_blank_count: 0.0570 + brevity: + mean_sample_size: -0.1049 + casing_entropy: + mean_camel_case_count: 0.1803 + mean_entropy: 0.0464 + mean_macro_case_count: 0.2871 + mean_other_count: 0.0237 + mean_pascal_case_count: 0.0230 + mean_screaming_snake_density: -2.0000 + mean_snake_case_count: -0.0374 + comment_structure: + mean_comment_line_count: -0.6230 + mean_comment_line_ratio: -0.8044 + mean_todo_fixme_count: -1.0293 + compression: + mean_raw_bytes: 0.0311 + mean_redundancy: 0.0289 + mean_zlib_bytes: -0.0094 + mean_zlib_ratio: 0.0402 + entropy: + mean_char_entropy: 0.0042 + mean_char_normalized: 0.0044 + mean_token_entropy: -0.0313 + mean_token_max_entropy: -0.0222 + mean_token_normalized: -0.0092 + mean_total_tokens: 0.0768 + mean_vocab_size: -0.1049 + function_metrics: + mean_avg_function_lines: 0.0522 + mean_avg_param_count: 0.0093 + mean_function_count: 0.0138 + mean_max_function_lines: 0.1258 + halstead: + mean_N1_total_operators: 0.1260 + mean_N2_total_operands: 0.0296 + mean_difficulty: 0.1341 + mean_effort: 0.2123 + mean_estimated_bugs: 0.0709 + mean_length: 0.0866 + mean_n2_unique_operands: -0.1071 + mean_time_to_implement_seconds: 0.2123 + mean_vocabulary: -0.0827 + mean_volume: 0.0709 + heaps: + mean_beta: -0.0733 + mean_k: 0.1138 + identifier_length_variance: + mean_mean: 0.0175 + mean_std_dev: 0.0355 + mean_variance: 0.0707 + indentation: + mean_blank_line_ratio: 0.0568 + mean_max_depth: 0.0525 + mean_mean_depth: 0.0511 + mean_variance: 0.1576 + line_patterns: + mean_blank_line_ratio: 0.0568 + mean_max_nesting_depth: 0.0518 + mean_string_literal_ratio: 0.1690 + mean_unique_line_ratio: 0.0069 + magic_number_density: + mean_density: -0.0712 + mean_string_literal_ratio: 0.1213 + near_duplicate_blocks_file: + mean_block_count: 0.0522 + mean_near_dup_block_d0: 
0.6667 + mean_near_dup_block_d2: -0.3795 + mean_near_dup_block_d4: -0.2116 + mean_near_dup_block_d5: 0.2871 + mean_near_dup_block_d6: -0.6667 + mean_near_dup_block_d8: -0.0553 + mean_sub_block_count: 0.1304 + ngram: + mean_bigram_hapax_fraction: -0.0337 + mean_bigram_repeated_unique: 0.1067 + mean_bigram_repetition_rate: 0.0765 + mean_bigram_total: 0.0770 + mean_bigram_unique: 0.0192 + mean_trigram_hapax_fraction: -0.0128 + mean_trigram_repeated_unique: 0.1226 + mean_trigram_repetition_rate: 0.0851 + mean_trigram_total: 0.0771 + mean_trigram_unique: 0.0513 + punctuation_density: + mean_arrow_density: 0.0655 + mean_bracket_nonalpha_prefix_count: 0.4987 + mean_bracket_nonalpha_suffix_count: 0.2338 + mean_colon_suffix_density: -0.2241 + mean_dot_count: 0.3139 + mean_exclamation_density: 0.3359 + mean_id_nonalpha_suffix_density: 0.0318 + mean_question_mark_density: 0.6230 + readability: + mean_avg_line_length: 0.0230 + mean_avg_sub_words_per_id: 0.0184 + mean_avg_tokens_per_line: -0.0239 + mean_flesch_adapted: -0.0152 + mean_fog_adapted: -0.0037 + mean_total_lines: 0.0927 + separator_counts: + mean_dot_count: 0.3139 + mean_hyphen_count: -0.0251 + mean_slash_count: -0.1931 + mean_underscore_count: 0.0971 + symbol_density: + mean_density: 0.1198 + mean_distinct_symbol_types: 0.0283 + mean_symbol_count: 0.1467 + vocabulary: + mean_mattr: -0.1170 + mean_raw_ttr: -0.1141 + mean_total_identifiers: -0.0122 + mean_unique_identifiers: -0.1331 + zipf: + mean_exponent: 0.0693 + mean_r_squared: 0.0061 + mean_total_tokens: 0.0768 + mean_vocab_size: -0.1049 + +no_nested_ternary: + _doc: "Nested conditional expressions (ternary-within-ternary) are harder to read than a plain if-else." 
+ _languages: [elixir] + _log_baseline: 8.0040 + branching: + mean_branch_count: -0.5662 + mean_branching_density: -0.3441 + mean_max_nesting_depth: 0.1824 + mean_non_blank_count: -0.2221 + brevity: + mean_sample_size: 0.0486 + casing_entropy: + mean_entropy: 0.2311 + mean_other_count: 0.7455 + mean_pascal_case_count: 0.1237 + mean_snake_case_count: 0.1138 + compression: + mean_raw_bytes: -0.0141 + mean_redundancy: -0.0117 + mean_unique_line_ratio: 0.1154 + mean_zlib_bytes: 0.0170 + mean_zlib_ratio: -0.0312 + entropy: + mean_char_entropy: 0.0689 + mean_char_max_entropy: 0.0024 + mean_char_normalized: 0.0665 + mean_token_entropy: -0.0014 + mean_token_max_entropy: 0.0110 + mean_token_normalized: -0.0124 + mean_total_tokens: 0.1324 + mean_vocab_size: 0.0486 + function_metrics: + mean_avg_function_lines: -0.7403 + mean_avg_param_count: -0.0277 + mean_function_count: 0.5579 + mean_max_function_lines: -0.4954 + halstead: + mean_N1_total_operators: 0.1382 + mean_N2_total_operands: 0.1252 + mean_difficulty: 0.0773 + mean_effort: 0.2218 + mean_estimated_bugs: 0.1445 + mean_length: 0.1335 + mean_n1_unique_operators: 0.0128 + mean_n2_unique_operands: 0.0608 + mean_time_to_implement_seconds: 0.2218 + mean_vocabulary: 0.0480 + mean_volume: 0.1445 + heaps: + mean_beta: -0.0334 + mean_k: 0.0563 + identifier_length_variance: + mean_max: 0.0170 + mean_mean: -0.0112 + mean_std_dev: -0.0060 + mean_variance: -0.0120 + indentation: + mean_blank_line_ratio: 0.3825 + mean_max_depth: -0.2891 + mean_mean_depth: -0.2922 + mean_variance: -0.5254 + line_patterns: + mean_blank_line_ratio: 0.3825 + mean_max_nesting_depth: 0.1824 + mean_string_literal_ratio: 0.0146 + mean_unique_line_ratio: 0.1591 + magic_number_density: + mean_density: -0.1634 + mean_magic_number_count: -0.0310 + mean_string_literal_ratio: 0.0146 + near_duplicate_blocks_file: + mean_block_count: 0.0885 + mean_near_dup_block_d7: -0.1824 + mean_sub_block_count: 0.5472 + ngram: + mean_bigram_hapax_fraction: -0.0464 + 
mean_bigram_repeated_unique: 0.1405 + mean_bigram_repetition_rate: 0.0564 + mean_bigram_total: 0.1327 + mean_bigram_unique: 0.0600 + mean_trigram_hapax_fraction: -0.0321 + mean_trigram_repeated_unique: 0.1699 + mean_trigram_repetition_rate: 0.1003 + mean_trigram_total: 0.1331 + mean_trigram_unique: 0.0704 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.5781 + mean_bracket_nonalpha_suffix_count: 0.7295 + mean_colon_suffix_density: -0.6851 + mean_dot_count: -0.1824 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.2589 + readability: + mean_avg_line_length: 0.2148 + mean_avg_sub_words_per_id: 0.0173 + mean_avg_tokens_per_line: 0.3545 + mean_flesch_adapted: -0.0367 + mean_fog_adapted: 0.3545 + mean_total_lines: -0.2221 + separator_counts: + mean_dot_count: -0.1824 + mean_hyphen_count: -0.1067 + mean_underscore_count: 0.3101 + symbol_density: + mean_density: 0.2615 + mean_distinct_symbol_types: 0.0377 + mean_symbol_count: 0.2475 + vocabulary: + mean_mattr: -0.0587 + mean_raw_ttr: -0.0515 + mean_total_identifiers: 0.1551 + mean_unique_identifiers: 0.1036 + vowel_density: + mean_total_chars: 0.1439 + zipf: + mean_exponent: 0.0240 + mean_r_squared: 0.0111 + mean_total_tokens: 0.1324 + mean_vocab_size: 0.0486 + diff --git a/priv/combined_metrics/consistency.yml b/priv/combined_metrics/consistency.yml new file mode 100644 index 00000000..902817d2 --- /dev/null +++ b/priv/combined_metrics/consistency.yml @@ -0,0 +1,334 @@ +consistent_casing_within_file: + _doc: "A file should use one naming convention throughout — no mixing of camelCase and snake_case for the same kind of identifier." 
+ _log_baseline: -0.6750 + brevity: + mean_sample_size: -0.0471 + casing_entropy: + mean_camel_case_count: -2.0000 + mean_entropy: -0.4254 + mean_snake_case_count: 0.2663 + compression: + mean_raw_bytes: 0.0213 + mean_redundancy: 0.0219 + mean_zlib_bytes: -0.0194 + mean_zlib_ratio: 0.0407 + entropy: + mean_char_entropy: -0.0126 + mean_char_max_entropy: -0.0170 + mean_char_normalized: 0.0044 + mean_token_entropy: -0.0090 + mean_token_max_entropy: -0.0101 + mean_vocab_size: -0.0471 + halstead: + mean_difficulty: 0.0629 + mean_effort: 0.0530 + mean_estimated_bugs: -0.0099 + mean_n2_unique_operands: -0.0629 + mean_time_to_implement_seconds: 0.0530 + mean_vocabulary: -0.0456 + mean_volume: -0.0099 + heaps: + mean_beta: -0.0232 + mean_k: 0.0253 + identifier_length_variance: + mean_mean: 0.0337 + mean_std_dev: 0.0139 + mean_variance: 0.0278 + ngram: + mean_bigram_hapax_fraction: -0.0071 + mean_bigram_repetition_rate: 0.0267 + mean_bigram_unique: -0.0197 + mean_trigram_hapax_fraction: -0.0122 + mean_trigram_repeated_unique: 0.0698 + mean_trigram_repetition_rate: 0.0874 + mean_trigram_unique: -0.0172 + readability: + mean_avg_line_length: 0.0221 + separator_counts: + mean_underscore_count: 0.4311 + symbol_density: + mean_density: -0.0214 + vocabulary: + mean_mattr: -0.0680 + mean_raw_ttr: -0.0735 + mean_unique_identifiers: -0.0735 + vowel_density: + mean_total_chars: 0.0337 + zipf: + mean_exponent: 0.0265 + mean_vocab_size: -0.0471 + +consistent_error_return_shape: + _doc: "All functions in a module should return errors in the same shape — mixed `nil`, `false`, and `{:error, _}` returns are confusing." 
+ _log_baseline: -2.6048 + branching: + mean_branch_count: -0.2800 + mean_branching_density: 0.5563 + mean_max_nesting_depth: -0.4366 + mean_non_blank_count: -0.3834 + brevity: + mean_sample_size: -0.0588 + casing_entropy: + mean_entropy: 0.0596 + mean_pascal_case_count: 0.2800 + mean_snake_case_count: -0.0268 + compression: + mean_raw_bytes: -0.1755 + mean_redundancy: -0.0215 + mean_unique_line_ratio: 0.0369 + mean_zlib_bytes: -0.2211 + mean_zlib_ratio: -0.0103 + entropy: + mean_char_entropy: 0.0605 + mean_char_normalized: 0.0641 + mean_token_entropy: -0.0194 + mean_token_max_entropy: -0.0167 + mean_total_tokens: -0.0672 + mean_vocab_size: -0.0588 + function_metrics: + mean_avg_function_lines: -0.5108 + mean_function_count: -0.0833 + mean_max_function_lines: -0.4535 + halstead: + mean_N1_total_operators: 0.0624 + mean_N2_total_operands: -0.0870 + mean_difficulty: 0.1365 + mean_effort: 0.2447 + mean_estimated_bugs: 0.0150 + mean_length: 0.0128 + mean_n1_unique_operators: 0.0412 + mean_n2_unique_operands: -0.2129 + mean_time_to_implement_seconds: 0.2447 + mean_vocabulary: -0.1263 + mean_volume: 0.0150 + heaps: + mean_beta: 0.0694 + mean_k: -0.0226 + identifier_length_variance: + mean_mean: -0.0575 + mean_std_dev: -0.0511 + mean_variance: -0.0858 + indentation: + mean_blank_line_ratio: 0.5151 + mean_max_depth: -0.4917 + mean_mean_depth: -0.4787 + mean_variance: -0.8229 + line_patterns: + mean_blank_line_ratio: 0.5151 + mean_max_nesting_depth: -0.4366 + mean_string_literal_ratio: -0.5234 + mean_unique_line_ratio: 0.0441 + magic_number_density: + mean_density: -0.2062 + mean_string_literal_ratio: -0.5234 + near_duplicate_blocks_file: + mean_block_count: -0.1566 + mean_near_dup_block_d0: -1.3562 + mean_near_dup_block_d4: 1.3562 + mean_near_dup_block_d5: 1.3562 + mean_near_dup_block_d6: 1.3562 + mean_near_dup_block_d7: 1.3562 + mean_near_dup_block_d8: -0.7933 + mean_sub_block_count: 0.0308 + ngram: + mean_bigram_hapax_fraction: -0.4505 + mean_bigram_repeated_unique: 
0.3514 + mean_bigram_repetition_rate: 0.3102 + mean_bigram_total: -0.0673 + mean_bigram_unique: -0.2362 + mean_trigram_hapax_fraction: -0.3293 + mean_trigram_repeated_unique: 0.3390 + mean_trigram_repetition_rate: 0.5429 + mean_trigram_total: -0.0675 + mean_trigram_unique: -0.2662 + punctuation_density: + mean_arrow_density: -2.0000 + mean_bracket_nonalpha_prefix_count: 0.1865 + mean_bracket_nonalpha_suffix_count: 0.3180 + mean_colon_suffix_density: 0.7587 + mean_dot_count: 0.6069 + mean_id_nonalpha_suffix_density: 0.3082 + readability: + mean_avg_line_length: 0.3857 + mean_avg_tokens_per_line: 0.5377 + mean_flesch_adapted: -0.0433 + mean_fog_adapted: 0.1454 + mean_total_lines: -0.3834 + separator_counts: + mean_dot_count: 0.6069 + mean_hyphen_count: -1.1258 + mean_underscore_count: 0.0416 + symbol_density: + mean_density: 0.1508 + mean_distinct_symbol_types: -0.0567 + vocabulary: + mean_mattr: -0.1541 + mean_raw_ttr: -0.1719 + mean_unique_identifiers: -0.0425 + zipf: + mean_exponent: -0.0773 + mean_r_squared: 0.0157 + mean_total_tokens: -0.0672 + mean_vocab_size: -0.0588 + +consistent_function_style: + _doc: "A module should not mix one-liner and multi-clause function definitions for the same concern." 
+ _log_baseline: -0.1780 + branching: + mean_branch_count: -0.1610 + mean_branching_density: -0.3349 + mean_max_nesting_depth: -0.1610 + mean_non_blank_count: 0.1738 + brevity: + mean_sample_size: 0.0028 + casing_entropy: + mean_entropy: -0.0534 + mean_other_count: -0.2753 + mean_pascal_case_count: -0.0379 + mean_snake_case_count: 0.0199 + compression: + mean_raw_bytes: 0.0313 + mean_redundancy: 0.0188 + mean_unique_line_ratio: -0.0440 + mean_zlib_bytes: 0.0037 + mean_zlib_ratio: 0.0276 + entropy: + mean_char_entropy: -0.0072 + mean_char_normalized: -0.0071 + mean_token_entropy: 0.0058 + mean_token_normalized: 0.0052 + mean_vocab_size: 0.0028 + function_metrics: + mean_avg_function_lines: 0.0608 + mean_avg_param_count: -0.0099 + mean_function_count: 0.0939 + mean_max_function_lines: -0.0797 + halstead: + mean_N2_total_operands: 0.0471 + mean_difficulty: 0.0186 + mean_effort: 0.0362 + mean_estimated_bugs: 0.0176 + mean_length: 0.0157 + mean_n1_unique_operators: -0.0122 + mean_n2_unique_operands: 0.0162 + mean_time_to_implement_seconds: 0.0362 + mean_vocabulary: 0.0091 + mean_volume: 0.0176 + heaps: + mean_beta: 0.0024 + identifier_length_variance: + mean_mean: 0.0076 + mean_variance: 0.0038 + indentation: + mean_blank_line_ratio: -0.0991 + mean_max_depth: -0.1143 + mean_mean_depth: -0.0203 + mean_variance: -0.1730 + line_patterns: + mean_blank_line_ratio: -0.0991 + mean_max_nesting_depth: -0.1610 + mean_unique_line_ratio: -0.0456 + near_duplicate_blocks_file: + mean_block_count: 0.2753 + mean_near_dup_block_d0: 0.9145 + mean_near_dup_block_d7: 0.1610 + mean_near_dup_block_d8: 0.5506 + mean_sub_block_count: 0.0594 + ngram: + mean_bigram_hapax_fraction: 0.0037 + mean_bigram_repeated_unique: -0.0041 + mean_bigram_repetition_rate: -0.0091 + mean_bigram_unique: 0.0065 + mean_trigram_repeated_unique: -0.0058 + punctuation_density: + mean_arrow_density: 2.0000 + mean_bracket_nonalpha_suffix_count: -0.0781 + mean_colon_suffix_density: -0.1318 + mean_dot_count: -0.0204 + 
mean_id_nonalpha_suffix_density: -0.0132 + readability: + mean_avg_line_length: -0.1471 + mean_avg_sub_words_per_id: 0.0030 + mean_avg_tokens_per_line: -0.1751 + mean_flesch_adapted: 0.0147 + mean_fog_adapted: -0.1412 + mean_total_lines: 0.1738 + separator_counts: + mean_dot_count: -0.0204 + mean_hyphen_count: -0.1460 + mean_underscore_count: 0.0287 + symbol_density: + mean_density: -0.0473 + mean_symbol_count: -0.0159 + vocabulary: + mean_mattr: -0.0025 + mean_raw_ttr: -0.0051 + mean_total_identifiers: 0.0098 + mean_unique_identifiers: 0.0047 + vowel_density: + mean_total_chars: 0.0175 + zipf: + mean_exponent: -0.0054 + mean_vocab_size: 0.0028 + +same_concept_same_name: + _doc: "The same domain concept should use the same name throughout a file — mixing `user`, `usr`, and `u` for the same thing harms readability." + _log_baseline: -15.1568 + brevity: + mean_sample_size: -1.3457 + compression: + mean_raw_bytes: 0.1773 + mean_redundancy: 0.3935 + mean_unique_line_ratio: -0.3251 + mean_zlib_bytes: -0.8263 + mean_zlib_ratio: 1.0033 + entropy: + mean_char_entropy: -0.1808 + mean_char_normalized: -0.1800 + mean_token_entropy: -0.3546 + mean_token_max_entropy: -0.2899 + mean_vocab_size: -1.3457 + halstead: + mean_difficulty: 1.8665 + mean_effort: 1.5662 + mean_estimated_bugs: -0.2997 + mean_n2_unique_operands: -1.8665 + mean_time_to_implement_seconds: 1.5662 + mean_vocabulary: -1.3857 + mean_volume: -0.3003 + heaps: + mean_beta: -0.5870 + mean_k: 0.5102 + identifier_length_variance: + mean_mean: 0.3431 + mean_std_dev: -0.4791 + mean_variance: -0.9580 + line_patterns: + mean_unique_line_ratio: -0.6939 + ngram: + mean_bigram_hapax_fraction: -0.6466 + mean_bigram_repeated_unique: -0.2091 + mean_bigram_repetition_rate: 0.6530 + mean_bigram_unique: -1.1746 + mean_trigram_hapax_fraction: -0.6625 + mean_trigram_repeated_unique: 1.2887 + mean_trigram_repetition_rate: 1.6149 + mean_trigram_unique: -0.9875 + readability: + mean_avg_line_length: 0.1837 + mean_avg_sub_words_per_id: 
-0.1771 + mean_flesch_adapted: 0.2348 + separator_counts: + mean_underscore_count: -0.5711 + symbol_density: + mean_density: -0.1807 + vocabulary: + mean_mattr: -1.8899 + mean_raw_ttr: -1.9969 + mean_unique_identifiers: -2.0000 + vowel_density: + mean_total_chars: 0.3432 + zipf: + mean_exponent: 0.7698 + mean_vocab_size: -1.3457 + diff --git a/priv/combined_metrics/dependencies.yml b/priv/combined_metrics/dependencies.yml new file mode 100644 index 00000000..93c0d128 --- /dev/null +++ b/priv/combined_metrics/dependencies.yml @@ -0,0 +1,326 @@ +import_count_under_10: + _doc: "Files should import fewer than 10 modules; high import counts signal excessive coupling." + _languages: [elixir] + _log_baseline: 7.1916 + branching: + mean_branch_count: 0.2110 + mean_branching_density: -1.0683 + mean_max_nesting_depth: 0.1234 + mean_non_blank_count: -0.0219 + brevity: + mean_sample_size: 0.0119 + casing_entropy: + mean_entropy: -0.0389 + mean_pascal_case_count: -0.1657 + mean_snake_case_count: -0.0025 + comment_structure: + mean_comment_line_ratio: -1.2802 + compression: + mean_raw_bytes: -0.0133 + mean_redundancy: -0.0135 + mean_unique_line_ratio: -0.0046 + mean_zlib_bytes: 0.0144 + mean_zlib_ratio: -0.0277 + entropy: + mean_char_entropy: 0.0035 + mean_char_max_entropy: 0.0088 + mean_char_normalized: -0.0053 + mean_token_entropy: -0.0040 + mean_token_max_entropy: 0.0026 + mean_token_normalized: -0.0066 + mean_total_tokens: -0.0251 + mean_vocab_size: 0.0119 + function_metrics: + mean_avg_function_lines: -0.0688 + mean_avg_param_count: -0.0555 + mean_function_count: 0.1234 + mean_max_function_lines: 0.0944 + mean_max_param_count: -0.1234 + halstead: + mean_N1_total_operators: -0.0138 + mean_N2_total_operands: -0.0464 + mean_difficulty: -0.0353 + mean_effort: -0.0606 + mean_estimated_bugs: -0.0253 + mean_length: -0.0260 + mean_n1_unique_operators: 0.0111 + mean_time_to_implement_seconds: -0.0606 + mean_vocabulary: 0.0032 + mean_volume: -0.0253 + heaps: + mean_beta: -0.0893 + 
mean_k: 0.3293 + mean_r_squared: 0.0101 + identifier_length_variance: + mean_max: 0.0679 + mean_mean: 0.0648 + mean_std_dev: 0.0688 + mean_variance: 0.1375 + indentation: + mean_blank_line_ratio: 0.1478 + mean_max_depth: -0.0876 + mean_mean_depth: -0.0397 + mean_variance: -0.2328 + line_patterns: + mean_blank_line_ratio: 0.1478 + mean_max_nesting_depth: 0.1234 + mean_string_literal_ratio: 0.0265 + mean_unique_line_ratio: -0.0050 + magic_number_density: + mean_density: 0.5219 + mean_magic_number_count: 0.4898 + mean_string_literal_ratio: 0.0265 + near_duplicate_blocks_file: + mean_block_count: 0.0765 + mean_sub_block_count: 0.1110 + ngram: + mean_bigram_repeated_unique: 0.0034 + mean_bigram_repetition_rate: -0.0129 + mean_bigram_total: -0.0252 + mean_bigram_unique: 0.0024 + mean_trigram_hapax_fraction: -0.0051 + mean_trigram_repeated_unique: 0.0257 + mean_trigram_repetition_rate: -0.0296 + mean_trigram_total: -0.0252 + mean_trigram_unique: 0.0062 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.5922 + mean_bracket_nonalpha_suffix_count: 0.1086 + mean_colon_suffix_density: -0.1389 + mean_dot_count: -0.1234 + mean_id_nonalpha_suffix_density: -0.0141 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0110 + mean_avg_sub_words_per_id: 0.0173 + mean_flesch_adapted: -0.0204 + mean_fog_adapted: 0.2028 + mean_total_lines: -0.0265 + symbol_density: + mean_density: 0.0223 + mean_distinct_symbol_types: 0.0643 + mean_symbol_count: 0.0087 + vocabulary: + mean_mattr: -0.0031 + mean_raw_ttr: 0.0573 + mean_total_identifiers: -0.0573 + vowel_density: + mean_total_chars: 0.0075 + zipf: + mean_exponent: -0.0152 + mean_r_squared: 0.0050 + mean_total_tokens: -0.0251 + mean_vocab_size: 0.0119 + +low_coupling: + _doc: "Modules should depend on few external symbols — a low unique-operand count relative to total is a proxy for loose coupling."
+ _languages: [elixir] + _log_baseline: -38.2335 + branching: + mean_branch_count: 0.0745 + mean_branching_density: 0.2097 + mean_max_nesting_depth: -0.1353 + mean_non_blank_count: -0.1353 + brevity: + mean_sample_size: -0.1276 + casing_entropy: + mean_entropy: -0.0870 + mean_pascal_case_count: -0.3722 + mean_snake_case_count: -0.1302 + compression: + mean_raw_bytes: -0.1657 + mean_redundancy: 0.0126 + mean_unique_line_ratio: -0.0296 + mean_zlib_bytes: -0.1918 + mean_zlib_ratio: 0.0262 + entropy: + mean_char_entropy: -0.0044 + mean_char_max_entropy: -0.0152 + mean_char_normalized: 0.0108 + mean_token_entropy: -0.0215 + mean_token_max_entropy: -0.0285 + mean_token_normalized: 0.0070 + mean_total_tokens: -0.1602 + mean_vocab_size: -0.1276 + function_metrics: + mean_avg_function_lines: -0.3103 + mean_function_count: 0.1353 + mean_max_function_lines: -0.3573 + halstead: + mean_N1_total_operators: -0.1645 + mean_N2_total_operands: -0.1785 + mean_difficulty: -0.1429 + mean_effort: -0.3500 + mean_estimated_bugs: -0.2072 + mean_length: -0.1700 + mean_n1_unique_operators: -0.1406 + mean_n2_unique_operands: -0.1763 + mean_time_to_implement_seconds: -0.3500 + mean_vocabulary: -0.1655 + mean_volume: -0.2072 + heaps: + mean_beta: -0.0557 + mean_k: 0.1362 + mean_r_squared: -0.0234 + identifier_length_variance: + mean_max: -0.0427 + mean_mean: 0.0133 + mean_std_dev: -0.0321 + mean_variance: -0.0642 + indentation: + mean_blank_line_ratio: -0.0752 + mean_max_depth: -0.0352 + mean_mean_depth: -0.1381 + mean_variance: -0.2519 + line_patterns: + mean_blank_line_ratio: -0.0752 + mean_max_nesting_depth: -0.1353 + mean_string_literal_ratio: 0.1282 + mean_unique_line_ratio: -0.0312 + magic_number_density: + mean_density: -2.0000 + mean_string_literal_ratio: 0.1282 + near_duplicate_blocks_file: + mean_block_count: 0.1123 + mean_near_dup_block_d4: 0.2314 + mean_near_dup_block_d8: 0.2314 + mean_sub_block_count: -0.0902 + ngram: + mean_bigram_hapax_fraction: -0.0247 + 
mean_bigram_repeated_unique: -0.1792 + mean_bigram_repetition_rate: 0.0301 + mean_bigram_total: -0.1605 + mean_bigram_unique: -0.2135 + mean_trigram_hapax_fraction: -0.0265 + mean_trigram_repeated_unique: -0.1784 + mean_trigram_repetition_rate: 0.0750 + mean_trigram_total: -0.1608 + mean_trigram_unique: -0.2352 + punctuation_density: + mean_arrow_density: -0.0373 + mean_bracket_nonalpha_prefix_count: -0.4412 + mean_bracket_nonalpha_suffix_count: 0.2314 + mean_colon_suffix_density: -0.0705 + mean_dot_count: -0.2609 + mean_exclamation_density: 1.8877 + mean_id_nonalpha_suffix_density: -0.0113 + readability: + mean_avg_line_length: -0.0307 + mean_avg_sub_words_per_id: 0.0032 + mean_avg_tokens_per_line: -0.0248 + mean_fog_adapted: 0.0082 + mean_total_lines: -0.1353 + symbol_density: + mean_density: -0.0137 + mean_distinct_symbol_types: -0.0960 + mean_symbol_count: -0.1794 + vocabulary: + mean_mattr: -0.0823 + mean_raw_ttr: 0.0349 + mean_total_identifiers: -0.1801 + mean_unique_identifiers: -0.1453 + vowel_density: + mean_total_chars: -0.1669 + zipf: + mean_exponent: 0.0065 + mean_r_squared: -0.0205 + mean_total_tokens: -0.1602 + mean_vocab_size: -0.1276 + +no_wildcard_imports: + _doc: "Wildcard imports (`import *`, `using Module`) pollute the local namespace and hide dependencies." 
+ _languages: [elixir] + _log_baseline: -8.9685 + branching: + mean_branching_density: 0.0249 + mean_non_blank_count: -0.0268 + brevity: + mean_sample_size: -0.0077 + casing_entropy: + mean_entropy: -0.0054 + mean_snake_case_count: 0.0163 + compression: + mean_raw_bytes: 0.0310 + mean_unique_line_ratio: -0.0046 + mean_zlib_bytes: 0.0331 + entropy: + mean_total_tokens: 0.0131 + mean_vocab_size: -0.0077 + function_metrics: + mean_avg_function_lines: -0.0263 + halstead: + mean_N1_total_operators: 0.0202 + mean_N2_total_operands: 0.0271 + mean_difficulty: 0.0600 + mean_effort: 0.0778 + mean_estimated_bugs: 0.0179 + mean_length: 0.0228 + mean_n2_unique_operands: -0.0329 + mean_time_to_implement_seconds: 0.0778 + mean_vocabulary: -0.0230 + mean_volume: 0.0178 + heaps: + mean_beta: -0.0537 + mean_k: 0.1998 + mean_r_squared: -0.0155 + identifier_length_variance: + mean_mean: 0.0438 + mean_std_dev: 0.0473 + mean_variance: 0.0945 + indentation: + mean_blank_line_ratio: 0.0763 + mean_mean_depth: -0.0117 + mean_variance: 0.0042 + line_patterns: + mean_blank_line_ratio: 0.0763 + mean_string_literal_ratio: -0.3463 + mean_unique_line_ratio: -0.0053 + magic_number_density: + mean_density: 1.1035 + mean_magic_number_count: 1.1312 + mean_string_literal_ratio: -0.3463 + near_duplicate_blocks_file: + mean_near_dup_block_d6: -0.3309 + mean_near_dup_block_d7: 0.3309 + mean_near_dup_block_d8: 0.3309 + mean_sub_block_count: 0.0355 + ngram: + mean_bigram_hapax_fraction: 0.0182 + mean_bigram_repeated_unique: -0.0089 + mean_bigram_repetition_rate: -0.0149 + mean_bigram_total: 0.0131 + mean_bigram_unique: 0.0308 + mean_trigram_hapax_fraction: 0.0094 + mean_trigram_repeated_unique: -0.0263 + mean_trigram_repetition_rate: -0.0255 + mean_trigram_total: 0.0132 + mean_trigram_unique: 0.0274 + punctuation_density: + mean_arrow_density: -0.0139 + mean_bracket_nonalpha_prefix_count: -0.5656 + mean_bracket_nonalpha_suffix_count: -0.0908 + mean_colon_suffix_density: 2.0000 + mean_dot_count: -0.0137 + 
mean_id_nonalpha_suffix_density: 0.0143 + readability: + mean_avg_line_length: 0.0591 + mean_avg_sub_words_per_id: 0.0084 + mean_avg_tokens_per_line: 0.0399 + mean_flesch_adapted: -0.0142 + mean_fog_adapted: 0.0290 + mean_total_lines: -0.0268 + symbol_density: + mean_density: -0.0266 + mean_distinct_symbol_types: -0.0817 + mean_symbol_count: 0.0042 + vocabulary: + mean_mattr: 0.0259 + mean_raw_ttr: -0.0117 + mean_total_identifiers: 0.0116 + vowel_density: + mean_total_chars: 0.0554 + zipf: + mean_exponent: -0.0270 + mean_total_tokens: 0.0131 + mean_vocab_size: -0.0077 + diff --git a/priv/combined_metrics/documentation.yml b/priv/combined_metrics/documentation.yml new file mode 100644 index 00000000..fba47a32 --- /dev/null +++ b/priv/combined_metrics/documentation.yml @@ -0,0 +1,637 @@ +docstring_is_nonempty: + _doc: "Docstrings must contain meaningful content, not just a placeholder or empty string." + _languages: [elixir] + _log_baseline: 28.4942 + branching: + mean_branch_count: 0.3089 + mean_branching_density: 0.2652 + mean_non_blank_count: 0.0437 + brevity: + mean_sample_size: 0.1931 + casing_entropy: + mean_entropy: 0.0676 + mean_other_count: 0.0709 + mean_pascal_case_count: 0.3089 + mean_snake_case_count: 0.1382 + compression: + mean_raw_bytes: 0.1245 + mean_redundancy: -0.0198 + mean_unique_line_ratio: 0.0053 + mean_zlib_bytes: 0.1557 + mean_zlib_ratio: -0.0312 + entropy: + mean_char_entropy: 0.0065 + mean_char_max_entropy: 0.0102 + mean_char_normalized: -0.0036 + mean_token_entropy: 0.0408 + mean_token_max_entropy: 0.0400 + mean_total_tokens: 0.1038 + mean_vocab_size: 0.1931 + function_metrics: + mean_avg_function_lines: 0.0357 + mean_avg_param_count: 0.0131 + mean_function_count: -0.0290 + mean_max_function_lines: 0.0329 + halstead: + mean_N1_total_operators: 0.0456 + mean_N2_total_operands: -0.0027 + mean_difficulty: 0.0706 + mean_effort: 0.1098 + mean_estimated_bugs: 0.0392 + mean_length: 0.0289 + mean_n1_unique_operators: 0.0913 + 
mean_n2_unique_operands: 0.0179 + mean_time_to_implement_seconds: 0.1098 + mean_vocabulary: 0.0465 + mean_volume: 0.0392 + heaps: + mean_beta: 0.0242 + mean_k: 0.0556 + identifier_length_variance: + mean_mean: 0.0042 + mean_std_dev: -0.0168 + mean_variance: -0.0336 + indentation: + mean_blank_line_ratio: 0.0413 + mean_mean_depth: -0.0330 + mean_variance: -0.0309 + line_patterns: + mean_blank_line_ratio: 0.0413 + mean_string_literal_ratio: 0.1078 + mean_unique_line_ratio: 0.0072 + magic_number_density: + mean_density: 0.0693 + mean_magic_number_count: 0.1709 + mean_string_literal_ratio: 0.1078 + near_duplicate_blocks_file: + mean_block_count: 0.0907 + mean_near_dup_block_d5: -0.2709 + mean_near_dup_block_d8: 0.1000 + mean_sub_block_count: -0.0061 + ngram: + mean_bigram_hapax_fraction: 0.0378 + mean_bigram_repeated_unique: 0.0767 + mean_bigram_repetition_rate: -0.0528 + mean_bigram_total: 0.1039 + mean_bigram_unique: 0.1635 + mean_trigram_hapax_fraction: 0.0158 + mean_trigram_repeated_unique: 0.0692 + mean_trigram_repetition_rate: -0.0615 + mean_trigram_total: 0.1041 + mean_trigram_unique: 0.1386 + punctuation_density: + mean_arrow_density: -0.0651 + mean_bracket_nonalpha_prefix_count: 0.0450 + mean_bracket_nonalpha_suffix_count: 0.1000 + mean_colon_suffix_density: -0.0260 + mean_dot_count: 0.1435 + mean_exclamation_density: -2.0000 + mean_id_nonalpha_suffix_density: -0.0474 + readability: + mean_avg_line_length: 0.0834 + mean_avg_sub_words_per_id: -0.0071 + mean_avg_tokens_per_line: 0.0601 + mean_fog_adapted: 0.0452 + mean_total_lines: 0.0437 + symbol_density: + mean_density: -0.0578 + mean_distinct_symbol_types: 0.0505 + mean_symbol_count: 0.0664 + vocabulary: + mean_mattr: 0.1382 + mean_raw_ttr: 0.0976 + mean_total_identifiers: 0.1534 + mean_unique_identifiers: 0.2510 + vowel_density: + mean_total_chars: 0.1576 + zipf: + mean_exponent: -0.0353 + mean_r_squared: 0.0037 + mean_total_tokens: 0.1038 + mean_vocab_size: 0.1931 + +file_has_license_header: + _doc: "Source 
files should begin with a license or copyright header." + _languages: [elixir] + _log_baseline: 5.8777 + branching: + mean_branching_density: -0.0081 + mean_non_blank_count: 0.0080 + brevity: + mean_sample_size: 0.0263 + casing_entropy: + mean_entropy: 0.0296 + mean_pascal_case_count: 0.0957 + mean_snake_case_count: 0.0039 + comment_structure: + mean_comment_line_ratio: -2.0000 + compression: + mean_raw_bytes: 0.0104 + mean_redundancy: -0.0059 + mean_zlib_bytes: 0.0200 + mean_zlib_ratio: -0.0095 + entropy: + mean_char_entropy: 0.0028 + mean_char_max_entropy: 0.0052 + mean_token_entropy: 0.0042 + mean_token_max_entropy: 0.0054 + mean_total_tokens: 0.0091 + mean_vocab_size: 0.0263 + halstead: + mean_N1_total_operators: 0.0051 + mean_N2_total_operands: 0.0185 + mean_difficulty: -0.0273 + mean_effort: -0.0113 + mean_estimated_bugs: 0.0159 + mean_length: 0.0095 + mean_n2_unique_operands: 0.0458 + mean_time_to_implement_seconds: -0.0113 + mean_vocabulary: 0.0306 + mean_volume: 0.0160 + heaps: + mean_beta: -0.0113 + mean_k: 0.0614 + identifier_length_variance: + mean_mean: -0.0048 + indentation: + mean_blank_line_ratio: 0.0206 + mean_mean_depth: -0.0080 + mean_variance: 0.0154 + line_patterns: + mean_blank_line_ratio: 0.0206 + mean_string_literal_ratio: -0.0104 + magic_number_density: + mean_density: 0.1920 + mean_magic_number_count: 0.1973 + mean_string_literal_ratio: -0.0104 + near_duplicate_blocks_file: + mean_block_count: 0.0650 + mean_sub_block_count: 0.0089 + ngram: + mean_bigram_hapax_fraction: 0.0086 + mean_bigram_repetition_rate: -0.0091 + mean_bigram_total: 0.0091 + mean_bigram_unique: 0.0182 + mean_trigram_hapax_fraction: 0.0031 + mean_trigram_repetition_rate: -0.0091 + mean_trigram_total: 0.0091 + mean_trigram_unique: 0.0133 + punctuation_density: + mean_arrow_density: -0.0105 + mean_colon_suffix_density: -0.0104 + mean_dot_count: 0.0423 + readability: + mean_avg_tokens_per_line: 0.0091 + mean_fog_adapted: 0.0060 + symbol_density: + mean_density: -0.0042 + 
mean_symbol_count: 0.0065 + vocabulary: + mean_mattr: 0.0108 + mean_raw_ttr: 0.0207 + mean_total_identifiers: 0.0187 + mean_unique_identifiers: 0.0395 + vowel_density: + mean_total_chars: 0.0139 + zipf: + mean_exponent: -0.0055 + mean_total_tokens: 0.0091 + mean_vocab_size: 0.0263 + +file_has_module_docstring: + _doc: "Files should have a module-level docstring explaining purpose and usage." + _languages: [elixir] + _log_baseline: 24.1681 + branching: + mean_branch_count: 0.3854 + mean_branching_density: -2.0000 + mean_non_blank_count: 0.0908 + brevity: + mean_sample_size: 0.2219 + casing_entropy: + mean_entropy: -0.0210 + mean_pascal_case_count: 0.0929 + mean_snake_case_count: 0.1544 + compression: + mean_raw_bytes: 0.1161 + mean_redundancy: -0.0256 + mean_unique_line_ratio: 0.0122 + mean_zlib_bytes: 0.1676 + mean_zlib_ratio: -0.0514 + entropy: + mean_char_max_entropy: 0.0126 + mean_char_normalized: -0.0120 + mean_token_entropy: 0.0441 + mean_token_max_entropy: 0.0457 + mean_total_tokens: 0.0837 + mean_vocab_size: 0.2219 + function_metrics: + mean_avg_function_lines: 0.0166 + mean_max_function_lines: 0.1014 + halstead: + mean_N1_total_operators: 0.0448 + mean_N2_total_operands: 0.0268 + mean_difficulty: 0.0971 + mean_effort: 0.1486 + mean_estimated_bugs: 0.0515 + mean_length: 0.0387 + mean_n1_unique_operators: 0.1116 + mean_n2_unique_operands: 0.0412 + mean_time_to_implement_seconds: 0.1486 + mean_vocabulary: 0.0602 + mean_volume: 0.0515 + heaps: + mean_beta: -0.0925 + mean_k: 0.5760 + mean_r_squared: -0.0049 + identifier_length_variance: + mean_mean: -0.0101 + mean_std_dev: -0.0477 + mean_variance: -0.0954 + indentation: + mean_blank_line_ratio: 0.0686 + mean_mean_depth: -0.0240 + mean_variance: -0.0634 + line_patterns: + mean_blank_line_ratio: 0.0686 + mean_string_literal_ratio: 0.1425 + mean_unique_line_ratio: 0.0141 + magic_number_density: + mean_density: 0.0812 + mean_magic_number_count: 0.1599 + mean_string_literal_ratio: 0.1425 + near_duplicate_blocks_file: 
+ mean_block_count: 0.0586 + mean_sub_block_count: 0.0098 + ngram: + mean_bigram_hapax_fraction: 0.0500 + mean_bigram_repeated_unique: 0.0539 + mean_bigram_repetition_rate: -0.0497 + mean_bigram_total: 0.0838 + mean_bigram_unique: 0.1493 + mean_trigram_hapax_fraction: 0.0283 + mean_trigram_repeated_unique: 0.0225 + mean_trigram_repetition_rate: -0.0657 + mean_trigram_total: 0.0839 + mean_trigram_unique: 0.1235 + punctuation_density: + mean_colon_suffix_density: 0.0341 + mean_dot_count: 0.0777 + mean_exclamation_density: -0.1014 + mean_id_nonalpha_suffix_density: -0.0339 + readability: + mean_avg_line_length: 0.0257 + mean_avg_sub_words_per_id: -0.0181 + mean_avg_tokens_per_line: -0.0071 + mean_flesch_adapted: 0.0205 + mean_fog_adapted: -0.0266 + mean_total_lines: 0.0908 + symbol_density: + mean_density: -0.0727 + mean_distinct_symbol_types: 0.0618 + mean_symbol_count: 0.0433 + vocabulary: + mean_mattr: 0.0532 + mean_raw_ttr: 0.1353 + mean_total_identifiers: 0.1326 + mean_unique_identifiers: 0.2679 + vowel_density: + mean_total_chars: 0.1226 + zipf: + mean_exponent: -0.0467 + mean_total_tokens: 0.0837 + mean_vocab_size: 0.2219 + +file_has_no_commented_out_code: + _doc: "Files should not contain commented-out code blocks left from development." 
+ _languages: [elixir] + _log_baseline: -8.5677 + branching: + mean_branching_density: 0.0368 + mean_non_blank_count: -0.0367 + brevity: + mean_sample_size: -0.0046 + casing_entropy: + mean_entropy: -0.0091 + mean_pascal_case_count: -0.0597 + mean_snake_case_count: -0.0126 + comment_structure: + mean_comment_line_count: -0.9901 + mean_comment_line_ratio: 0.3578 + compression: + mean_raw_bytes: -0.0068 + mean_redundancy: 0.0077 + mean_zlib_bytes: -0.0179 + mean_zlib_ratio: 0.0111 + entropy: + mean_char_entropy: -0.0026 + mean_char_max_entropy: -0.0061 + mean_char_normalized: 0.0035 + mean_total_tokens: -0.0158 + mean_vocab_size: -0.0046 + function_metrics: + mean_avg_function_lines: -0.0992 + mean_function_count: 0.0686 + mean_max_function_lines: -0.1247 + halstead: + mean_N1_total_operators: -0.0058 + mean_N2_total_operands: -0.0546 + mean_difficulty: 0.0608 + mean_effort: 0.0253 + mean_estimated_bugs: -0.0355 + mean_length: -0.0224 + mean_n1_unique_operators: 0.0171 + mean_n2_unique_operands: -0.0984 + mean_time_to_implement_seconds: 0.0253 + mean_vocabulary: -0.0628 + mean_volume: -0.0356 + heaps: + mean_beta: -0.0499 + mean_k: 0.1958 + mean_r_squared: -0.0200 + identifier_length_variance: + mean_mean: 0.0169 + mean_std_dev: 0.0264 + mean_variance: 0.0527 + indentation: + mean_blank_line_ratio: 0.0551 + mean_max_depth: 0.0324 + mean_mean_depth: 0.0564 + mean_variance: 0.0552 + line_patterns: + mean_blank_line_ratio: 0.0551 + mean_string_literal_ratio: -0.0818 + mean_unique_line_ratio: -0.0077 + magic_number_density: + mean_density: 2.0000 + mean_string_literal_ratio: -0.0818 + near_duplicate_blocks_file: + mean_block_count: -0.0474 + mean_sub_block_count: -0.0454 + ngram: + mean_bigram_hapax_fraction: 0.0101 + mean_bigram_repeated_unique: -0.0414 + mean_bigram_repetition_rate: -0.0019 + mean_bigram_total: -0.0158 + mean_bigram_unique: -0.0223 + mean_trigram_hapax_fraction: -0.0019 + mean_trigram_repeated_unique: -0.0273 + mean_trigram_repetition_rate: 0.0258 + 
mean_trigram_total: -0.0159 + mean_trigram_unique: -0.0338 + punctuation_density: + mean_arrow_density: 0.1869 + mean_bracket_nonalpha_prefix_count: -0.1247 + mean_bracket_nonalpha_suffix_count: -0.0885 + mean_colon_suffix_density: -0.1285 + mean_dot_count: -0.0411 + mean_exclamation_density: -0.1956 + mean_id_nonalpha_suffix_density: 0.0028 + mean_question_mark_density: -0.2494 + readability: + mean_avg_line_length: 0.0371 + mean_avg_sub_words_per_id: -0.0018 + mean_avg_tokens_per_line: -0.0943 + mean_flesch_adapted: 0.0114 + mean_fog_adapted: -0.0779 + mean_total_lines: 0.0785 + symbol_density: + mean_density: -0.0172 + mean_symbol_count: -0.0237 + vocabulary: + mean_mattr: -0.0327 + mean_raw_ttr: 0.0060 + mean_total_identifiers: -0.0246 + mean_unique_identifiers: -0.0186 + vowel_density: + mean_total_chars: -0.0077 + zipf: + mean_exponent: -0.0043 + mean_total_tokens: -0.0158 + mean_vocab_size: -0.0046 + +function_has_docstring: + _doc: "Public functions should have a docstring describing behaviour, params, and return value." 
+ _languages: [elixir] + _log_baseline: 41.6283 + branching: + mean_branch_count: 0.5279 + mean_branching_density: 0.3832 + mean_non_blank_count: 0.1446 + brevity: + mean_sample_size: 0.2608 + casing_entropy: + mean_entropy: -0.0026 + mean_other_count: 0.3105 + mean_pascal_case_count: 0.1852 + mean_snake_case_count: 0.2708 + comment_structure: + mean_comment_line_ratio: -2.0000 + compression: + mean_raw_bytes: 0.2251 + mean_redundancy: -0.0242 + mean_unique_line_ratio: -0.0264 + mean_zlib_bytes: 0.2718 + mean_zlib_ratio: -0.0468 + entropy: + mean_char_entropy: 0.0081 + mean_char_max_entropy: 0.0163 + mean_char_normalized: -0.0082 + mean_token_entropy: 0.0517 + mean_token_max_entropy: 0.0557 + mean_token_normalized: -0.0040 + mean_total_tokens: 0.2284 + mean_vocab_size: 0.2608 + function_metrics: + mean_avg_function_lines: 0.0289 + mean_avg_param_count: 0.0202 + mean_function_count: 0.0999 + mean_max_function_lines: 0.1368 + halstead: + mean_N1_total_operators: 0.1175 + mean_N2_total_operands: 0.0799 + mean_difficulty: 0.0232 + mean_effort: 0.1555 + mean_estimated_bugs: 0.1324 + mean_length: 0.1035 + mean_n1_unique_operators: 0.0939 + mean_n2_unique_operands: 0.1507 + mean_time_to_implement_seconds: 0.1555 + mean_vocabulary: 0.1288 + mean_volume: 0.1324 + heaps: + mean_beta: 0.0660 + mean_k: -0.0612 + mean_r_squared: -0.0041 + identifier_length_variance: + mean_mean: -0.0191 + mean_std_dev: -0.0493 + mean_variance: -0.0985 + indentation: + mean_blank_line_ratio: 0.1003 + mean_max_depth: -0.1288 + mean_mean_depth: -0.0904 + mean_variance: -0.2118 + line_patterns: + mean_blank_line_ratio: 0.1003 + mean_string_literal_ratio: 0.5931 + mean_unique_line_ratio: -0.0135 + magic_number_density: + mean_density: 0.1744 + mean_magic_number_count: 0.4104 + mean_string_literal_ratio: 0.5931 + near_duplicate_blocks_file: + mean_block_count: 0.2288 + mean_near_dup_block_d6: -0.3105 + mean_near_dup_block_d7: 0.3105 + mean_near_dup_block_d8: -0.1816 + mean_sub_block_count: 0.0349 + 
ngram: + mean_bigram_hapax_fraction: 0.0560 + mean_bigram_repeated_unique: 0.1917 + mean_bigram_repetition_rate: -0.0476 + mean_bigram_total: 0.2288 + mean_bigram_unique: 0.2856 + mean_trigram_hapax_fraction: 0.0480 + mean_trigram_repeated_unique: 0.1175 + mean_trigram_repetition_rate: -0.1025 + mean_trigram_total: 0.2292 + mean_trigram_unique: 0.2807 + punctuation_density: + mean_arrow_density: -0.3619 + mean_bracket_nonalpha_prefix_count: 0.0999 + mean_bracket_nonalpha_suffix_count: 0.2024 + mean_colon_suffix_density: -0.0297 + mean_dot_count: 0.1816 + mean_exclamation_density: -0.3105 + mean_id_nonalpha_suffix_density: -0.1019 + mean_question_mark_density: -0.2377 + readability: + mean_avg_line_length: 0.0861 + mean_avg_sub_words_per_id: -0.0113 + mean_avg_tokens_per_line: 0.0890 + mean_flesch_adapted: 0.0026 + mean_fog_adapted: 0.0948 + mean_total_lines: 0.1394 + symbol_density: + mean_density: -0.0353 + mean_distinct_symbol_types: 0.0427 + mean_symbol_count: 0.1896 + vocabulary: + mean_mattr: 0.1769 + mean_raw_ttr: 0.0666 + mean_total_identifiers: 0.2541 + mean_unique_identifiers: 0.3207 + vowel_density: + mean_total_chars: 0.2350 + zipf: + mean_exponent: 0.0025 + mean_total_tokens: 0.2284 + mean_vocab_size: 0.2608 + +function_todo_comment_in_body: + _doc: "Functions should not contain TODO/FIXME comments indicating unfinished work." 
+ _languages: [elixir] + _log_baseline: 7.2394 + branching: + mean_branch_count: -0.0287 + mean_branching_density: -0.0435 + mean_non_blank_count: 0.0147 + brevity: + mean_sample_size: -0.0084 + casing_entropy: + mean_entropy: 0.0157 + mean_pascal_case_count: 0.0410 + mean_snake_case_count: -0.0125 + comment_structure: + mean_comment_line_count: -0.5392 + mean_comment_line_ratio: 0.7796 + mean_todo_fixme_count: -0.5392 + compression: + mean_raw_bytes: 0.0082 + mean_unique_line_ratio: 0.0028 + mean_zlib_bytes: 0.0074 + entropy: + mean_char_entropy: 0.0026 + mean_char_normalized: 0.0026 + mean_token_max_entropy: -0.0017 + mean_token_normalized: 0.0020 + mean_total_tokens: 0.0157 + mean_vocab_size: -0.0084 + function_metrics: + mean_avg_function_lines: -0.0250 + mean_avg_param_count: -0.0354 + mean_function_count: 0.0354 + mean_max_function_lines: -0.0182 + halstead: + mean_N1_total_operators: 0.0224 + mean_N2_total_operands: -0.0309 + mean_difficulty: 0.0451 + mean_effort: 0.0375 + mean_estimated_bugs: -0.0076 + mean_length: 0.0035 + mean_n2_unique_operands: -0.0761 + mean_time_to_implement_seconds: 0.0375 + mean_vocabulary: -0.0540 + mean_volume: -0.0076 + heaps: + mean_beta: -0.0498 + mean_k: 0.1608 + mean_r_squared: -0.0095 + identifier_length_variance: + mean_mean: 0.0061 + mean_std_dev: 0.0128 + mean_variance: 0.0257 + indentation: + mean_blank_line_ratio: 0.0593 + mean_mean_depth: -0.0184 + mean_variance: -0.0277 + line_patterns: + mean_blank_line_ratio: 0.0593 + mean_string_literal_ratio: -0.0151 + mean_unique_line_ratio: 0.0033 + magic_number_density: + mean_density: -2.0000 + mean_string_literal_ratio: -0.0151 + near_duplicate_blocks_file: + mean_block_count: 0.0317 + mean_sub_block_count: 0.0281 + ngram: + mean_bigram_hapax_fraction: -0.0187 + mean_bigram_repeated_unique: 0.0464 + mean_bigram_repetition_rate: 0.0098 + mean_bigram_total: 0.0157 + mean_bigram_unique: 0.0136 + mean_trigram_hapax_fraction: -0.0109 + mean_trigram_repeated_unique: 0.0479 + 
mean_trigram_repetition_rate: 0.0123 + mean_trigram_total: 0.0157 + mean_trigram_unique: 0.0149 + punctuation_density: + mean_arrow_density: -0.0161 + mean_bracket_nonalpha_prefix_count: -0.0287 + mean_colon_suffix_density: -0.0293 + mean_dot_count: 0.0485 + mean_id_nonalpha_suffix_density: 0.0062 + mean_question_mark_density: -0.0287 + readability: + mean_avg_line_length: 0.0123 + mean_avg_sub_words_per_id: 0.0073 + mean_avg_tokens_per_line: -0.0224 + mean_flesch_adapted: -0.0053 + mean_fog_adapted: -0.0109 + mean_total_lines: 0.0381 + symbol_density: + mean_density: 0.0116 + mean_distinct_symbol_types: -0.0140 + mean_symbol_count: 0.0200 + vocabulary: + mean_mattr: -0.0525 + mean_raw_ttr: -0.0250 + mean_unique_identifiers: -0.0236 + vowel_density: + mean_total_chars: 0.0076 + zipf: + mean_total_tokens: 0.0157 + mean_vocab_size: -0.0084 + diff --git a/priv/combined_metrics/error_handling.yml b/priv/combined_metrics/error_handling.yml new file mode 100644 index 00000000..b09f542e --- /dev/null +++ b/priv/combined_metrics/error_handling.yml @@ -0,0 +1,335 @@ +does_not_swallow_errors: + _doc: "Errors must be handled or re-raised — empty rescue/catch blocks silently hide failures." 
+ _languages: [elixir] + _log_baseline: 86.0584 + branching: + mean_branch_count: -0.1041 + mean_branching_density: -0.2095 + mean_max_nesting_depth: 0.5405 + mean_non_blank_count: 0.1054 + brevity: + mean_sample_size: 0.2830 + casing_entropy: + mean_entropy: -0.1412 + mean_other_count: -1.6214 + mean_pascal_case_count: 0.8391 + mean_snake_case_count: 0.4785 + compression: + mean_raw_bytes: 0.3818 + mean_redundancy: 0.0202 + mean_unique_line_ratio: 0.1028 + mean_zlib_bytes: 0.3399 + mean_zlib_ratio: 0.0419 + entropy: + mean_char_entropy: 0.0445 + mean_char_max_entropy: 0.0347 + mean_char_normalized: 0.0098 + mean_token_entropy: 0.0223 + mean_token_max_entropy: 0.0620 + mean_token_normalized: -0.0397 + mean_total_tokens: 0.4926 + mean_vocab_size: 0.2830 + function_metrics: + mean_avg_function_lines: 0.1005 + mean_max_function_lines: 0.2243 + halstead: + mean_N1_total_operators: 0.4699 + mean_N2_total_operands: 0.2900 + mean_difficulty: 0.2395 + mean_effort: 0.6960 + mean_estimated_bugs: 0.4564 + mean_length: 0.4072 + mean_n1_unique_operators: 0.1859 + mean_n2_unique_operands: 0.2364 + mean_time_to_implement_seconds: 0.6960 + mean_vocabulary: 0.2190 + mean_volume: 0.4565 + heaps: + mean_beta: -0.0869 + mean_k: 0.2466 + identifier_length_variance: + mean_std_dev: -0.1168 + mean_variance: -0.2335 + indentation: + mean_blank_line_ratio: 0.0451 + mean_max_depth: 0.1740 + mean_mean_depth: 0.1043 + mean_variance: 0.3416 + line_patterns: + mean_blank_line_ratio: 0.0451 + mean_max_nesting_depth: 0.5405 + mean_string_literal_ratio: 0.2524 + mean_unique_line_ratio: 0.1413 + magic_number_density: + mean_string_literal_ratio: 0.2524 + near_duplicate_blocks_file: + mean_near_dup_block_d0: -0.5405 + mean_near_dup_block_d7: -0.3162 + mean_near_dup_block_d8: 0.8566 + mean_sub_block_count: 0.3065 + ngram: + mean_bigram_hapax_fraction: -0.0373 + mean_bigram_repeated_unique: 0.4011 + mean_bigram_repetition_rate: 0.0995 + mean_bigram_total: 0.4937 + mean_bigram_unique: 0.3266 + 
mean_trigram_hapax_fraction: -0.0651 + mean_trigram_repeated_unique: 0.5672 + mean_trigram_repetition_rate: 0.2299 + mean_trigram_total: 0.4949 + mean_trigram_unique: 0.3376 + punctuation_density: + mean_arrow_density: -0.3177 + mean_bracket_nonalpha_prefix_count: 0.7888 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: 0.2150 + mean_dot_count: 0.5172 + mean_exclamation_density: -0.5217 + mean_id_nonalpha_suffix_density: 0.0181 + mean_question_mark_density: -0.4364 + readability: + mean_avg_line_length: 0.2905 + mean_avg_sub_words_per_id: 0.0219 + mean_avg_tokens_per_line: 0.3872 + mean_flesch_adapted: -0.0373 + mean_fog_adapted: 0.3019 + mean_total_lines: 0.1054 + symbol_density: + mean_density: 0.2563 + mean_distinct_symbol_types: 0.0400 + mean_symbol_count: 0.6378 + vocabulary: + mean_mattr: 0.0350 + mean_raw_ttr: -0.0769 + mean_total_identifiers: 0.4896 + mean_unique_identifiers: 0.4127 + vowel_density: + mean_total_chars: 0.4927 + zipf: + mean_exponent: 0.0933 + mean_total_tokens: 0.4926 + mean_vocab_size: 0.2830 + +error_message_is_descriptive: + _doc: "Error values should carry a meaningful message, not just a bare atom or empty string." 
+ _languages: [elixir] + _log_baseline: 52.7053 + branching: + mean_branch_count: 0.0664 + mean_branching_density: -0.0540 + mean_max_nesting_depth: 0.3900 + mean_non_blank_count: 0.1204 + brevity: + mean_sample_size: 0.3136 + casing_entropy: + mean_entropy: 0.1147 + mean_pascal_case_count: 2.0000 + mean_snake_case_count: 0.5117 + compression: + mean_raw_bytes: 0.3028 + mean_redundancy: 0.0104 + mean_unique_line_ratio: -0.0126 + mean_zlib_bytes: 0.2771 + mean_zlib_ratio: 0.0257 + entropy: + mean_char_entropy: 0.0161 + mean_char_max_entropy: 0.0487 + mean_char_normalized: -0.0326 + mean_token_entropy: 0.0596 + mean_token_max_entropy: 0.0685 + mean_token_normalized: -0.0089 + mean_total_tokens: 0.3002 + mean_vocab_size: 0.3136 + function_metrics: + mean_avg_function_lines: 0.1160 + mean_max_function_lines: 0.0713 + halstead: + mean_N1_total_operators: 0.1787 + mean_N2_total_operands: 0.0463 + mean_difficulty: 0.0136 + mean_effort: 0.1822 + mean_estimated_bugs: 0.1686 + mean_length: 0.1370 + mean_n1_unique_operators: 0.1179 + mean_n2_unique_operands: 0.1506 + mean_time_to_implement_seconds: 0.1822 + mean_vocabulary: 0.1415 + mean_volume: 0.1686 + heaps: + mean_beta: 0.0120 + mean_k: 0.1259 + mean_r_squared: 0.0073 + identifier_length_variance: + mean_mean: -0.0908 + mean_std_dev: -0.0799 + mean_variance: -0.1597 + indentation: + mean_blank_line_ratio: -0.1098 + mean_max_depth: 0.1754 + mean_mean_depth: 0.1108 + mean_variance: 0.1967 + line_patterns: + mean_blank_line_ratio: -0.1098 + mean_max_nesting_depth: 0.3900 + mean_string_literal_ratio: 0.3673 + mean_unique_line_ratio: 0.0304 + magic_number_density: + mean_string_literal_ratio: 0.3673 + near_duplicate_blocks_file: + mean_near_dup_block_d6: -1.0566 + mean_near_dup_block_d8: -0.6667 + mean_sub_block_count: 0.0621 + ngram: + mean_bigram_hapax_fraction: -0.0059 + mean_bigram_repeated_unique: 0.3150 + mean_bigram_total: 0.3008 + mean_bigram_unique: 0.3055 + mean_trigram_hapax_fraction: -0.0298 + 
mean_trigram_repeated_unique: 0.4104 + mean_trigram_repetition_rate: 0.0227 + mean_trigram_total: 0.3014 + mean_trigram_unique: 0.3075 + punctuation_density: + mean_arrow_density: -0.0591 + mean_bracket_nonalpha_prefix_count: 1.4250 + mean_bracket_nonalpha_suffix_count: 0.3350 + mean_colon_suffix_density: -0.0854 + mean_id_nonalpha_suffix_density: -0.0694 + readability: + mean_avg_line_length: 0.1895 + mean_avg_tokens_per_line: 0.1798 + mean_flesch_adapted: -0.0175 + mean_fog_adapted: 0.1420 + mean_total_lines: 0.1204 + symbol_density: + mean_distinct_symbol_types: 0.0664 + mean_symbol_count: 0.3056 + vocabulary: + mean_mattr: -0.0179 + mean_raw_ttr: -0.1153 + mean_total_identifiers: 0.5114 + mean_unique_identifiers: 0.3962 + vowel_density: + mean_total_chars: 0.4207 + zipf: + mean_r_squared: 0.0056 + mean_total_tokens: 0.3002 + mean_vocab_size: 0.3136 + +returns_typed_error: + _doc: "Functions should signal failure via a typed return (e.g. `{:error, reason}`) rather than returning `nil` or `false`." 
+ _languages: [elixir] + _log_baseline: 120.8554 + branching: + mean_branch_count: -0.1286 + mean_branching_density: -0.1895 + mean_max_nesting_depth: 1.1292 + mean_non_blank_count: 0.0608 + brevity: + mean_sample_size: 0.2322 + casing_entropy: + mean_entropy: -0.3072 + mean_other_count: -0.2697 + mean_pascal_case_count: 0.7124 + mean_snake_case_count: 0.6125 + compression: + mean_raw_bytes: 0.4375 + mean_redundancy: 0.0334 + mean_unique_line_ratio: 0.1471 + mean_zlib_bytes: 0.3486 + mean_zlib_ratio: 0.0889 + entropy: + mean_char_entropy: 0.0854 + mean_char_max_entropy: 0.0427 + mean_char_normalized: 0.0426 + mean_token_entropy: -0.0120 + mean_token_max_entropy: 0.0531 + mean_token_normalized: -0.0651 + mean_total_tokens: 0.6727 + mean_vocab_size: 0.2322 + function_metrics: + mean_avg_function_lines: 0.0904 + mean_avg_param_count: 0.0054 + mean_function_count: -0.0556 + mean_max_function_lines: 0.0823 + halstead: + mean_N1_total_operators: 0.7914 + mean_N2_total_operands: 0.5495 + mean_difficulty: 0.4516 + mean_effort: 1.2300 + mean_estimated_bugs: 0.7784 + mean_length: 0.7139 + mean_n1_unique_operators: 0.2105 + mean_n2_unique_operands: 0.3084 + mean_time_to_implement_seconds: 1.2300 + mean_vocabulary: 0.2747 + mean_volume: 0.7785 + heaps: + mean_beta: -0.2332 + mean_k: 0.4822 + mean_r_squared: 0.0110 + identifier_length_variance: + mean_max: 0.1996 + mean_mean: 0.1313 + mean_std_dev: 0.2519 + mean_variance: 0.5039 + indentation: + mean_blank_line_ratio: -0.1515 + mean_mean_depth: -0.0287 + mean_variance: 0.0372 + line_patterns: + mean_blank_line_ratio: -0.1515 + mean_max_nesting_depth: 1.1292 + mean_string_literal_ratio: -0.6750 + mean_unique_line_ratio: 0.1454 + magic_number_density: + mean_string_literal_ratio: -0.6750 + near_duplicate_blocks_file: + mean_block_count: -0.0980 + mean_near_dup_block_d0: -1.4248 + mean_near_dup_block_d6: 0.7124 + mean_near_dup_block_d7: -1.0081 + mean_near_dup_block_d8: -2.0000 + mean_sub_block_count: 0.7384 + ngram: + 
mean_bigram_hapax_fraction: -0.1410 + mean_bigram_repeated_unique: 0.4891 + mean_bigram_repetition_rate: 0.1302 + mean_bigram_total: 0.6740 + mean_bigram_unique: 0.3101 + mean_trigram_hapax_fraction: -0.0547 + mean_trigram_repeated_unique: 0.5003 + mean_trigram_repetition_rate: 0.2370 + mean_trigram_total: 0.6753 + mean_trigram_unique: 0.3580 + punctuation_density: + mean_arrow_density: -0.8033 + mean_bracket_nonalpha_prefix_count: -0.1874 + mean_colon_suffix_density: -0.8583 + mean_dot_count: 1.1292 + mean_id_nonalpha_suffix_density: 0.0810 + mean_question_mark_density: -0.6568 + readability: + mean_avg_line_length: 0.3955 + mean_avg_sub_words_per_id: 0.0948 + mean_avg_tokens_per_line: 0.6118 + mean_flesch_adapted: -0.1272 + mean_fog_adapted: 0.6637 + mean_total_lines: 0.0608 + symbol_density: + mean_density: 0.5813 + mean_distinct_symbol_types: 0.2134 + mean_symbol_count: 1.0187 + vocabulary: + mean_mattr: -0.2229 + mean_raw_ttr: -0.2020 + mean_total_identifiers: 0.4979 + mean_unique_identifiers: 0.2957 + vowel_density: + mean_total_chars: 0.6292 + zipf: + mean_exponent: 0.1047 + mean_r_squared: 0.0253 + mean_total_tokens: 0.6727 + mean_vocab_size: 0.2322 + diff --git a/priv/combined_metrics/file_structure.yml b/priv/combined_metrics/file_structure.yml new file mode 100644 index 00000000..10ae0b2e --- /dev/null +++ b/priv/combined_metrics/file_structure.yml @@ -0,0 +1,574 @@ +has_consistent_indentation: + _doc: "Files should use a single, consistent indentation style with no mixed tabs and spaces." 
+ _log_baseline: -8.2745 + branching: + mean_branching_density: 0.0144 + mean_non_blank_count: -0.0302 + brevity: + mean_sample_size: -0.0053 + casing_entropy: + mean_entropy: 0.0021 + mean_snake_case_count: -0.0030 + compression: + mean_raw_bytes: -0.0218 + mean_redundancy: -0.0026 + mean_unique_line_ratio: -0.0381 + mean_zlib_bytes: -0.0128 + mean_zlib_ratio: -0.0065 + entropy: + mean_char_entropy: 0.0079 + mean_char_normalized: 0.0096 + mean_token_entropy: -0.0018 + mean_total_tokens: -0.0033 + mean_vocab_size: -0.0053 + function_metrics: + mean_avg_function_lines: -0.0128 + mean_max_function_lines: -0.0176 + halstead: + mean_N1_total_operators: -0.0035 + mean_difficulty: -0.0061 + mean_effort: -0.0038 + mean_estimated_bugs: -0.0025 + mean_length: -0.0023 + mean_n1_unique_operators: -0.0081 + mean_time_to_implement_seconds: -0.0038 + mean_vocabulary: -0.0030 + mean_volume: -0.0026 + identifier_length_variance: + mean_mean: 0.0089 + mean_std_dev: -0.0025 + mean_variance: -0.0054 + indentation: + mean_blank_line_ratio: 0.0250 + mean_max_depth: -0.2075 + mean_mean_depth: -0.0900 + mean_variance: -0.3941 + line_patterns: + mean_blank_line_ratio: 0.0250 + mean_string_literal_ratio: 0.0093 + mean_unique_line_ratio: -0.0077 + magic_number_density: + mean_density: 2.0000 + mean_magic_number_count: 0.2373 + mean_string_literal_ratio: 0.0093 + ngram: + mean_bigram_hapax_fraction: -0.0042 + mean_bigram_repeated_unique: -0.0026 + mean_bigram_repetition_rate: 0.0023 + mean_bigram_total: -0.0033 + mean_bigram_unique: -0.0051 + mean_trigram_hapax_fraction: -0.0055 + mean_trigram_repetition_rate: 0.0058 + mean_trigram_total: -0.0033 + mean_trigram_unique: -0.0048 + punctuation_density: + mean_colon_suffix_density: 0.0078 + readability: + mean_avg_tokens_per_line: 0.0087 + mean_fog_adapted: 0.0083 + mean_total_lines: -0.0302 + separator_counts: + mean_hyphen_count: -0.0221 + symbol_density: + mean_density: 0.0145 + mean_symbol_count: -0.0026 + vocabulary: + mean_mattr: 0.0043 + 
mean_raw_ttr: 0.0043 + mean_total_identifiers: -0.0025 + vowel_density: + mean_total_chars: 0.0031 + zipf: + mean_total_tokens: -0.0033 + mean_vocab_size: -0.0053 + +line_count_under_300: + _doc: "Files should be under 300 lines; longer files typically violate single responsibility." + _log_baseline: -48.1609 + branching: + mean_branch_count: -0.4508 + mean_branching_density: -0.2446 + mean_non_blank_count: -0.2063 + brevity: + mean_sample_size: -0.2062 + casing_entropy: + mean_entropy: 0.0413 + mean_other_count: -0.6011 + mean_pascal_case_count: 0.1036 + mean_snake_case_count: -0.2080 + compression: + mean_raw_bytes: -0.2263 + mean_redundancy: -0.0026 + mean_unique_line_ratio: 0.0519 + mean_zlib_bytes: -0.2194 + mean_zlib_ratio: -0.0069 + entropy: + mean_char_entropy: -0.0072 + mean_char_max_entropy: -0.0245 + mean_char_normalized: 0.0173 + mean_token_entropy: -0.0264 + mean_token_max_entropy: -0.0433 + mean_token_normalized: 0.0169 + mean_total_tokens: -0.1807 + mean_vocab_size: -0.2062 + function_metrics: + mean_avg_function_lines: 0.1338 + mean_avg_param_count: -0.0931 + mean_function_count: -0.3274 + mean_max_function_lines: 0.0222 + mean_max_param_count: -0.1036 + halstead: + mean_N1_total_operators: -0.1746 + mean_N2_total_operands: -0.1868 + mean_difficulty: 0.0070 + mean_effort: -0.2194 + mean_estimated_bugs: -0.2264 + mean_length: -0.1785 + mean_n1_unique_operators: -0.0814 + mean_n2_unique_operands: -0.2752 + mean_time_to_implement_seconds: -0.2194 + mean_vocabulary: -0.2238 + mean_volume: -0.2264 + heaps: + mean_beta: -0.0687 + mean_k: 0.0978 + mean_r_squared: -0.0094 + identifier_length_variance: + mean_max: -0.0671 + mean_mean: -0.0614 + mean_std_dev: 0.0205 + mean_variance: 0.0411 + indentation: + mean_blank_line_ratio: -0.4899 + mean_max_depth: 0.0301 + mean_mean_depth: 0.0114 + mean_variance: 0.1685 + line_patterns: + mean_blank_line_ratio: -0.4899 + mean_string_literal_ratio: 0.0039 + mean_unique_line_ratio: 0.0561 + magic_number_density: + 
mean_density: 1.4051 + mean_magic_number_count: -0.4114 + mean_string_literal_ratio: 0.0039 + near_duplicate_blocks_file: + mean_block_count: 0.5617 + mean_near_dup_block_d7: 0.1772 + mean_near_dup_block_d8: 0.1772 + mean_sub_block_count: 1.0591 + ngram: + mean_bigram_hapax_fraction: -0.0655 + mean_bigram_repeated_unique: -0.1356 + mean_bigram_repetition_rate: 0.0296 + mean_bigram_total: -0.1809 + mean_bigram_unique: -0.2260 + mean_trigram_hapax_fraction: -0.0366 + mean_trigram_repeated_unique: -0.1208 + mean_trigram_repetition_rate: 0.0506 + mean_trigram_total: -0.1812 + mean_trigram_unique: -0.2220 + punctuation_density: + mean_arrow_density: -0.2511 + mean_bracket_nonalpha_prefix_count: -0.2342 + mean_bracket_nonalpha_suffix_count: -0.3472 + mean_bracket_number_pair_count: -0.1772 + mean_colon_suffix_density: -0.2045 + mean_dot_count: -0.0341 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: -0.0125 + readability: + mean_avg_line_length: -0.0217 + mean_avg_sub_words_per_id: -0.0148 + mean_avg_tokens_per_line: 0.0256 + mean_flesch_adapted: 0.0146 + mean_fog_adapted: 0.0323 + mean_total_lines: -0.2063 + separator_counts: + mean_dot_count: -0.0341 + mean_hyphen_count: -0.2532 + mean_underscore_count: -0.3087 + symbol_density: + mean_density: 0.0758 + mean_distinct_symbol_types: -0.0604 + mean_symbol_count: -0.1504 + vocabulary: + mean_mattr: -0.1396 + mean_raw_ttr: -0.0669 + mean_total_identifiers: -0.1838 + mean_unique_identifiers: -0.2507 + vowel_density: + mean_total_chars: -0.2452 + zipf: + mean_exponent: 0.0102 + mean_r_squared: -0.0067 + mean_total_tokens: -0.1807 + mean_vocab_size: -0.2062 + +line_length_under_120: + _doc: "Lines should be under 120 characters to avoid horizontal scrolling." 
+ _log_baseline: -6.3790 + branching: + mean_branching_density: -0.1942 + mean_non_blank_count: 0.1944 + brevity: + mean_sample_size: -0.0200 + casing_entropy: + mean_entropy: -0.0047 + mean_snake_case_count: 0.0074 + compression: + mean_raw_bytes: 0.0170 + mean_redundancy: 0.0140 + mean_unique_line_ratio: 0.0133 + mean_zlib_bytes: -0.0077 + mean_zlib_ratio: 0.0247 + entropy: + mean_char_entropy: -0.0087 + mean_char_normalized: -0.0076 + mean_token_entropy: -0.0022 + mean_token_max_entropy: -0.0041 + mean_token_normalized: 0.0019 + mean_total_tokens: -0.0030 + mean_vocab_size: -0.0200 + function_metrics: + mean_avg_function_lines: 0.2084 + mean_avg_param_count: -0.0276 + mean_max_function_lines: 0.2570 + mean_max_param_count: -0.0944 + halstead: + mean_N1_total_operators: -0.0033 + mean_N2_total_operands: 0.0022 + mean_difficulty: 0.0219 + mean_effort: 0.0160 + mean_estimated_bugs: -0.0059 + mean_n1_unique_operators: -0.0081 + mean_n2_unique_operands: -0.0278 + mean_time_to_implement_seconds: 0.0160 + mean_vocabulary: -0.0228 + mean_volume: -0.0059 + heaps: + mean_beta: -0.0068 + mean_k: 0.0086 + identifier_length_variance: + mean_mean: -0.0207 + mean_std_dev: -0.0480 + mean_variance: -0.0960 + indentation: + mean_blank_line_ratio: -0.0420 + mean_max_depth: 0.1137 + mean_mean_depth: 0.1254 + mean_variance: 0.2595 + line_patterns: + mean_blank_line_ratio: -0.0420 + mean_string_literal_ratio: -0.0264 + mean_unique_line_ratio: 0.0181 + magic_number_density: + mean_density: 0.0052 + mean_string_literal_ratio: -0.0264 + near_duplicate_blocks_file: + mean_sub_block_count: 0.0477 + ngram: + mean_bigram_hapax_fraction: -0.0141 + mean_bigram_repeated_unique: 0.0257 + mean_bigram_repetition_rate: 0.0141 + mean_bigram_total: -0.0030 + mean_bigram_unique: -0.0113 + mean_trigram_hapax_fraction: 0.0017 + mean_trigram_repeated_unique: -0.0134 + mean_trigram_total: -0.0030 + mean_trigram_unique: -0.0043 + punctuation_density: + mean_bracket_nonalpha_prefix_count: -0.0807 + 
mean_bracket_nonalpha_suffix_count: -0.1362 + mean_colon_suffix_density: 0.0705 + mean_dot_count: -0.0069 + mean_id_nonalpha_suffix_density: 0.0093 + mean_question_mark_density: 2.0000 + readability: + mean_avg_line_length: -0.1816 + mean_avg_sub_words_per_id: -0.0066 + mean_avg_tokens_per_line: -0.1974 + mean_flesch_adapted: 0.0402 + mean_fog_adapted: -0.2009 + mean_total_lines: 0.1944 + separator_counts: + mean_dot_count: -0.0069 + mean_underscore_count: -0.0349 + symbol_density: + mean_density: -0.0247 + mean_distinct_symbol_types: -0.0130 + mean_symbol_count: -0.0078 + vocabulary: + mean_mattr: -0.0231 + mean_raw_ttr: -0.0300 + mean_total_identifiers: 0.0067 + mean_unique_identifiers: -0.0232 + vowel_density: + mean_total_chars: -0.0140 + zipf: + mean_exponent: 0.0039 + mean_total_tokens: -0.0030 + mean_vocab_size: -0.0200 + +no_magic_numbers: + _doc: "Numeric literals should be extracted to named constants rather than used inline." + _log_baseline: 111.4823 + branching: + mean_branch_count: -0.4352 + mean_branching_density: -0.9103 + mean_non_blank_count: 0.4762 + brevity: + mean_sample_size: 0.3955 + casing_entropy: + mean_entropy: -0.5234 + mean_snake_case_count: 0.9072 + compression: + mean_raw_bytes: 0.7713 + mean_redundancy: 0.1328 + mean_unique_line_ratio: 0.1073 + mean_zlib_bytes: 0.5072 + mean_zlib_ratio: 0.2642 + entropy: + mean_char_entropy: 0.0481 + mean_char_normalized: 0.0481 + mean_token_entropy: 0.0769 + mean_token_max_entropy: 0.0825 + mean_total_tokens: 0.4877 + mean_vocab_size: 0.3955 + function_metrics: + mean_avg_function_lines: -0.5888 + mean_avg_param_count: -0.1339 + mean_function_count: 0.5327 + mean_max_function_lines: -0.2655 + halstead: + mean_N1_total_operators: 0.1749 + mean_N2_total_operands: 0.4966 + mean_difficulty: -0.0338 + mean_effort: 0.3387 + mean_estimated_bugs: 0.3723 + mean_length: 0.3056 + mean_n1_unique_operators: -0.0901 + mean_n2_unique_operands: 0.4402 + mean_time_to_implement_seconds: 0.3387 + mean_vocabulary: 
0.3159 + mean_volume: 0.3724 + heaps: + mean_beta: -0.1294 + mean_k: 0.7952 + mean_r_squared: -0.0645 + identifier_length_variance: + mean_max: 0.2172 + mean_mean: 0.4886 + mean_std_dev: 0.4918 + mean_variance: 0.9835 + indentation: + mean_blank_line_ratio: 0.3137 + mean_mean_depth: -0.4612 + mean_variance: -0.5503 + line_patterns: + mean_blank_line_ratio: 0.3137 + mean_string_literal_ratio: -0.5060 + mean_unique_line_ratio: 0.1502 + magic_number_density: + mean_density: -1.2903 + mean_magic_number_count: -0.8032 + mean_string_literal_ratio: -0.5060 + near_duplicate_blocks_file: + mean_block_count: -0.1911 + mean_near_dup_block_d0: -1.6546 + mean_near_dup_block_d7: -1.0789 + mean_sub_block_count: 0.3466 + ngram: + mean_bigram_hapax_fraction: -0.1520 + mean_bigram_repeated_unique: 0.7630 + mean_bigram_repetition_rate: 0.1469 + mean_bigram_total: 0.4887 + mean_bigram_unique: 0.4248 + mean_trigram_hapax_fraction: 0.0849 + mean_trigram_repeated_unique: 0.0415 + mean_trigram_repetition_rate: -0.2233 + mean_trigram_total: 0.4896 + mean_trigram_unique: 0.5215 + punctuation_density: + mean_arrow_density: -1.4573 + mean_bracket_nonalpha_suffix_count: 0.5999 + mean_colon_suffix_density: 0.5811 + mean_id_nonalpha_suffix_density: -0.1238 + mean_question_mark_density: -0.8032 + readability: + mean_avg_line_length: 0.3048 + mean_avg_sub_words_per_id: 0.3883 + mean_flesch_adapted: -0.7069 + mean_fog_adapted: 2.0000 + mean_total_lines: 0.4762 + separator_counts: + mean_hyphen_count: -0.8032 + mean_underscore_count: 1.7114 + symbol_density: + mean_density: -0.3071 + mean_symbol_count: 0.4654 + vocabulary: + mean_mattr: 0.3553 + mean_raw_ttr: -0.0669 + mean_total_identifiers: 0.7640 + mean_unique_identifiers: 0.6968 + vowel_density: + mean_total_chars: 1.2526 + zipf: + mean_exponent: -0.1353 + mean_r_squared: -0.0320 + mean_total_tokens: 0.4877 + mean_vocab_size: 0.3955 + +single_responsibility: + _doc: "Each file should have one primary concern — low complexity spread across few, 
focused functions." + _log_baseline: -38.1040 + branching: + mean_branch_count: -0.0678 + mean_branching_density: 0.1364 + mean_max_nesting_depth: -0.1093 + mean_non_blank_count: -0.2043 + brevity: + mean_sample_size: -0.0864 + casing_entropy: + mean_entropy: -0.0206 + mean_other_count: -0.7475 + mean_pascal_case_count: 0.0470 + mean_snake_case_count: -0.1543 + compression: + mean_raw_bytes: -0.1908 + mean_redundancy: -0.0351 + mean_unique_line_ratio: 0.0316 + mean_zlib_bytes: -0.1293 + mean_zlib_ratio: -0.0616 + entropy: + mean_char_entropy: 0.0078 + mean_char_max_entropy: -0.0021 + mean_char_normalized: 0.0099 + mean_token_entropy: 0.0014 + mean_token_max_entropy: -0.0182 + mean_token_normalized: 0.0196 + mean_total_tokens: -0.1489 + mean_vocab_size: -0.0864 + function_metrics: + mean_avg_function_lines: 0.1696 + mean_avg_param_count: -0.0805 + mean_function_count: -0.4114 + mean_max_param_count: -0.2962 + halstead: + mean_N1_total_operators: -0.1395 + mean_N2_total_operands: -0.1701 + mean_difficulty: 0.0527 + mean_effort: -0.1183 + mean_estimated_bugs: -0.1710 + mean_length: -0.1498 + mean_n1_unique_operators: 0.0541 + mean_n2_unique_operands: -0.1687 + mean_time_to_implement_seconds: -0.1183 + mean_vocabulary: -0.0965 + mean_volume: -0.1710 + heaps: + mean_beta: -0.0154 + mean_k: 0.0801 + mean_r_squared: -0.0163 + identifier_length_variance: + mean_max: -0.0836 + mean_mean: -0.0508 + mean_std_dev: -0.0865 + mean_variance: -0.1729 + indentation: + mean_blank_line_ratio: 0.0458 + mean_mean_depth: -0.0476 + mean_variance: -0.0931 + line_patterns: + mean_blank_line_ratio: 0.0458 + mean_max_nesting_depth: -0.1093 + mean_string_literal_ratio: -0.1759 + mean_unique_line_ratio: 0.0324 + magic_number_density: + mean_density: 0.1469 + mean_string_literal_ratio: -0.1759 + near_duplicate_blocks_file: + mean_block_count: -0.2284 + mean_near_dup_block_d0: -0.2962 + mean_near_dup_block_d7: -0.3737 + mean_sub_block_count: -0.1348 + ngram: + mean_bigram_hapax_fraction: 0.0075 
+ mean_bigram_repeated_unique: -0.1303 + mean_bigram_repetition_rate: -0.0207 + mean_bigram_total: -0.1492 + mean_bigram_unique: -0.1162 + mean_trigram_hapax_fraction: 0.0132 + mean_trigram_repeated_unique: -0.1793 + mean_trigram_repetition_rate: -0.0466 + mean_trigram_total: -0.1495 + mean_trigram_unique: -0.1273 + punctuation_density: + mean_arrow_density: -0.1462 + mean_bracket_nonalpha_prefix_count: -0.0859 + mean_bracket_nonalpha_suffix_count: -0.4201 + mean_colon_suffix_density: -0.4720 + mean_dot_count: -0.0630 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: -0.0499 + mean_question_mark_density: 0.4596 + readability: + mean_avg_line_length: 0.0137 + mean_avg_sub_words_per_id: -0.0377 + mean_avg_tokens_per_line: 0.0553 + mean_flesch_adapted: 0.0348 + mean_fog_adapted: -0.0587 + mean_total_lines: -0.2043 + separator_counts: + mean_dot_count: -0.0630 + mean_hyphen_count: -0.1453 + mean_slash_count: 0.3737 + mean_underscore_count: -0.4685 + symbol_density: + mean_density: 0.0683 + mean_distinct_symbol_types: 0.0284 + mean_symbol_count: -0.1225 + vocabulary: + mean_mattr: -0.0285 + mean_raw_ttr: 0.0110 + mean_total_identifiers: -0.1419 + mean_unique_identifiers: -0.1309 + vowel_density: + mean_total_chars: -0.1927 + zipf: + mean_exponent: -0.0209 + mean_r_squared: -0.0043 + mean_total_tokens: -0.1489 + mean_vocab_size: -0.0864 + +uses_standard_indentation_width: + _doc: "Indentation should use consistent multiples of 2 or 4 spaces throughout the file." 
+ _log_baseline: -17.9172 + compression: + mean_raw_bytes: -0.2512 + mean_redundancy: -0.0906 + mean_zlib_bytes: -0.0351 + mean_zlib_ratio: -0.2161 + entropy: + mean_char_entropy: 0.1510 + mean_char_normalized: 0.1510 + function_metrics: + mean_avg_function_lines: 0.0361 + indentation: + mean_blank_line_ratio: 0.2077 + mean_max_depth: -1.0000 + mean_mean_depth: -1.0000 + mean_variance: -2.0000 + line_patterns: + mean_blank_line_ratio: 0.2077 + near_duplicate_blocks_file: + mean_near_dup_block_d3: -1.0000 + mean_near_dup_block_d4: 1.0000 + punctuation_density: + mean_exclamation_density: 0.2630 + mean_question_mark_density: 0.2630 + readability: + mean_avg_line_length: -0.2644 + symbol_density: + mean_density: 0.2512 + diff --git a/priv/combined_metrics/function_design.yml b/priv/combined_metrics/function_design.yml new file mode 100644 index 00000000..e34ba2ab --- /dev/null +++ b/priv/combined_metrics/function_design.yml @@ -0,0 +1,862 @@ +boolean_function_has_question_mark: + _doc: "Functions returning a boolean should end with `?` (Elixir/Ruby) or start with `is_`/`has_` (JS/Python)." 
+ _log_baseline: -6.4663 + brevity: + mean_sample_size: 0.0127 + casing_entropy: + mean_camel_case_count: 0.3410 + mean_entropy: 0.0137 + mean_snake_case_count: -0.0205 + compression: + mean_raw_bytes: -0.0088 + mean_zlib_bytes: -0.0060 + entropy: + mean_token_max_entropy: 0.0036 + mean_token_normalized: -0.0045 + mean_total_tokens: 0.0278 + mean_vocab_size: 0.0127 + halstead: + mean_difficulty: -0.0101 + mean_effort: -0.0070 + mean_n2_unique_operands: 0.0049 + mean_time_to_implement_seconds: -0.0070 + heaps: + mean_beta: -0.0136 + mean_k: 0.0266 + identifier_length_variance: + mean_max: 0.0167 + mean_mean: -0.0315 + mean_std_dev: -0.0588 + mean_variance: -0.0896 + line_patterns: + mean_string_literal_ratio: -0.0125 + magic_number_density: + mean_string_literal_ratio: -0.0125 + near_duplicate_blocks_file: + mean_near_dup_block_d5: 0.2707 + mean_near_dup_block_d6: 1.0745 + mean_near_dup_block_d8: 0.4628 + ngram: + mean_bigram_repeated_unique: 0.0212 + mean_bigram_repetition_rate: 0.0188 + mean_bigram_total: 0.0281 + mean_bigram_unique: 0.0114 + mean_trigram_repeated_unique: 0.0334 + mean_trigram_repetition_rate: 0.0258 + mean_trigram_total: 0.0283 + mean_trigram_unique: 0.0227 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 2.0000 + mean_colon_suffix_density: -0.0269 + mean_exclamation_density: 0.0234 + mean_id_nonalpha_suffix_density: 0.0246 + mean_question_mark_density: 1.1524 + readability: + mean_avg_line_length: -0.0092 + mean_avg_sub_words_per_id: -0.0097 + mean_avg_tokens_per_line: 0.0248 + mean_flesch_adapted: 0.0078 + mean_fog_adapted: 0.0564 + symbol_density: + mean_density: 0.0691 + mean_distinct_symbol_types: 0.0138 + mean_symbol_count: 0.0600 + vocabulary: + mean_mattr: 0.0250 + mean_raw_ttr: 0.0250 + mean_unique_identifiers: 0.0175 + vowel_density: + mean_total_chars: -0.0370 + zipf: + mean_exponent: 0.0154 + mean_r_squared: -0.0050 + mean_total_tokens: 0.0278 + mean_vocab_size: 0.0127 + +cyclomatic_complexity_under_10: + _doc: "Functions 
should have a cyclomatic complexity under 10." + _log_baseline: -1.4896 + branching: + mean_branch_count: -0.2373 + mean_branching_density: -0.1952 + mean_non_blank_count: -0.0421 + casing_entropy: + mean_entropy: 0.0964 + mean_other_count: 0.3306 + mean_snake_case_count: 0.0321 + compression: + mean_raw_bytes: -0.0162 + mean_redundancy: -0.0172 + mean_unique_line_ratio: -0.0305 + mean_zlib_bytes: 0.0186 + mean_zlib_ratio: -0.0347 + entropy: + mean_char_entropy: 0.0349 + mean_char_max_entropy: 0.0016 + mean_char_normalized: 0.0333 + mean_token_entropy: -0.0050 + mean_token_normalized: -0.0050 + mean_total_tokens: 0.0437 + function_metrics: + mean_avg_function_lines: -0.4757 + mean_function_count: 0.4636 + mean_max_function_lines: -0.5038 + halstead: + mean_N1_total_operators: 0.0708 + mean_N2_total_operands: 0.0358 + mean_difficulty: 0.0472 + mean_effort: 0.1087 + mean_estimated_bugs: 0.0615 + mean_length: 0.0605 + mean_n1_unique_operators: 0.0114 + mean_time_to_implement_seconds: 0.1087 + mean_vocabulary: 0.0043 + mean_volume: 0.0615 + heaps: + mean_beta: -0.0367 + mean_k: 0.0672 + mean_r_squared: 0.0049 + identifier_length_variance: + mean_mean: 0.0130 + mean_std_dev: 0.0120 + mean_variance: 0.0240 + indentation: + mean_blank_line_ratio: 0.1655 + mean_max_depth: -0.2086 + mean_mean_depth: -0.2901 + mean_variance: -0.4637 + line_patterns: + mean_blank_line_ratio: 0.1655 + mean_string_literal_ratio: -0.0439 + mean_unique_line_ratio: 0.0055 + magic_number_density: + mean_density: -0.0329 + mean_string_literal_ratio: -0.0439 + near_duplicate_blocks_file: + mean_block_count: 0.1013 + mean_near_dup_block_d8: -0.2086 + mean_sub_block_count: 0.0994 + ngram: + mean_bigram_hapax_fraction: -0.0068 + mean_bigram_repeated_unique: 0.0301 + mean_bigram_repetition_rate: 0.0115 + mean_bigram_total: 0.0438 + mean_bigram_unique: 0.0192 + mean_trigram_hapax_fraction: -0.0027 + mean_trigram_repeated_unique: 0.0456 + mean_trigram_repetition_rate: 0.0055 + mean_trigram_total: 0.0440 + 
mean_trigram_unique: 0.0388 + punctuation_density: + mean_arrow_density: -0.4960 + mean_bracket_nonalpha_prefix_count: 0.2488 + mean_bracket_nonalpha_suffix_count: 0.3306 + mean_colon_suffix_density: 0.2760 + mean_dot_count: -0.3005 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0616 + readability: + mean_avg_line_length: 0.0257 + mean_avg_sub_words_per_id: 0.0067 + mean_avg_tokens_per_line: 0.0858 + mean_flesch_adapted: -0.0140 + mean_fog_adapted: 0.1035 + mean_total_lines: -0.0421 + symbol_density: + mean_density: 0.0552 + mean_distinct_symbol_types: -0.0172 + mean_symbol_count: 0.0391 + vocabulary: + mean_mattr: -0.0361 + mean_raw_ttr: -0.0361 + mean_total_identifiers: 0.0441 + mean_unique_identifiers: 0.0080 + vowel_density: + mean_total_chars: 0.0572 + zipf: + mean_exponent: 0.0120 + mean_r_squared: 0.0057 + mean_total_tokens: 0.0437 + +has_verb_in_name: + _doc: "Function names should contain a verb describing the action performed." + _log_baseline: 14.8350 + compression: + mean_raw_bytes: 0.0816 + mean_redundancy: -0.0390 + mean_zlib_bytes: 0.2011 + mean_zlib_ratio: -0.1195 + identifier_length_variance: + mean_max: 0.7747 + mean_mean: 0.2058 + mean_std_dev: 1.0000 + mean_variance: 2.0000 + punctuation_density: + mean_exclamation_density: -0.1076 + readability: + mean_avg_line_length: 0.0846 + mean_avg_sub_words_per_id: 0.1330 + mean_flesch_adapted: -0.1324 + mean_fog_adapted: 1.3261 + symbol_density: + mean_density: -0.0828 + vowel_density: + mean_total_chars: 0.2058 + +is_less_than_20_lines: + _doc: "Functions should be 20 lines or fewer." 
+ _log_baseline: 23.9658 + branching: + mean_branch_count: -0.0820 + mean_branching_density: -0.1010 + mean_max_nesting_depth: -0.1156 + mean_non_blank_count: 0.0188 + brevity: + mean_sample_size: 0.0165 + casing_entropy: + mean_entropy: 0.0577 + mean_other_count: 0.6266 + mean_pascal_case_count: 0.0440 + mean_snake_case_count: 0.0910 + compression: + mean_raw_bytes: 0.0746 + mean_redundancy: 0.0227 + mean_unique_line_ratio: -0.0334 + mean_zlib_bytes: 0.0366 + mean_zlib_ratio: 0.0379 + entropy: + mean_char_entropy: 0.0020 + mean_token_entropy: -0.0041 + mean_token_max_entropy: 0.0035 + mean_token_normalized: -0.0076 + mean_total_tokens: 0.0759 + mean_vocab_size: 0.0165 + function_metrics: + mean_avg_function_lines: -0.3598 + mean_avg_param_count: 0.1156 + mean_function_count: 0.3705 + mean_max_function_lines: -0.4532 + mean_max_param_count: 0.0820 + halstead: + mean_N1_total_operators: 0.0857 + mean_N2_total_operands: 0.0965 + mean_difficulty: 0.0624 + mean_effort: 0.1550 + mean_estimated_bugs: 0.0926 + mean_length: 0.0895 + mean_n1_unique_operators: -0.0097 + mean_n2_unique_operands: 0.0245 + mean_time_to_implement_seconds: 0.1550 + mean_vocabulary: 0.0143 + mean_volume: 0.0926 + heaps: + mean_k: -0.0254 + identifier_length_variance: + mean_mean: 0.0122 + mean_std_dev: 0.0297 + mean_variance: 0.0593 + indentation: + mean_blank_line_ratio: -0.0440 + mean_mean_depth: -0.0962 + mean_variance: -0.1115 + line_patterns: + mean_blank_line_ratio: -0.0440 + mean_max_nesting_depth: -0.1156 + mean_string_literal_ratio: -0.0774 + mean_unique_line_ratio: -0.0188 + magic_number_density: + mean_density: 0.0389 + mean_magic_number_count: 0.1156 + mean_string_literal_ratio: -0.0774 + near_duplicate_blocks_file: + mean_block_count: 0.2797 + mean_near_dup_block_d8: 0.3133 + mean_sub_block_count: 0.1886 + ngram: + mean_bigram_hapax_fraction: -0.0508 + mean_bigram_repeated_unique: 0.1067 + mean_bigram_repetition_rate: 0.0562 + mean_bigram_total: 0.0760 + mean_bigram_unique: 0.0228 + 
mean_trigram_hapax_fraction: -0.0300 + mean_trigram_repeated_unique: 0.1516 + mean_trigram_repetition_rate: 0.1014 + mean_trigram_total: 0.0761 + mean_trigram_unique: 0.0386 + punctuation_density: + mean_arrow_density: -0.3892 + mean_bracket_nonalpha_prefix_count: 0.0418 + mean_bracket_nonalpha_suffix_count: 0.0476 + mean_colon_suffix_density: 0.0941 + mean_dot_count: 0.0717 + mean_exclamation_density: -0.0820 + mean_id_nonalpha_suffix_density: 0.0518 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0576 + mean_avg_sub_words_per_id: -0.0034 + mean_avg_tokens_per_line: 0.0570 + mean_flesch_adapted: -0.0039 + mean_fog_adapted: 0.0868 + mean_total_lines: 0.0188 + symbol_density: + mean_density: -0.0084 + mean_distinct_symbol_types: 0.0127 + mean_symbol_count: 0.0662 + vocabulary: + mean_mattr: -0.0390 + mean_raw_ttr: -0.0717 + mean_total_identifiers: 0.0965 + mean_unique_identifiers: 0.0248 + vowel_density: + mean_total_chars: 0.1087 + zipf: + mean_exponent: 0.0225 + mean_r_squared: 0.0030 + mean_total_tokens: 0.0759 + mean_vocab_size: 0.0165 + +nesting_depth_under_4: + _doc: "Code should not nest deeper than 4 levels." 
+ _log_baseline: 1.0611 + branching: + mean_branch_count: -0.3267 + mean_branching_density: -0.2061 + mean_max_nesting_depth: 0.2061 + mean_non_blank_count: -0.1206 + brevity: + mean_sample_size: 0.0178 + casing_entropy: + mean_entropy: -0.0207 + mean_other_count: 0.2917 + mean_pascal_case_count: -0.2725 + mean_snake_case_count: 0.0787 + compression: + mean_raw_bytes: -0.0069 + mean_redundancy: -0.0076 + mean_unique_line_ratio: -0.0028 + mean_zlib_bytes: 0.0117 + mean_zlib_ratio: -0.0186 + entropy: + mean_char_entropy: 0.0715 + mean_char_max_entropy: -0.0017 + mean_char_normalized: 0.0732 + mean_token_entropy: -0.0118 + mean_token_max_entropy: 0.0042 + mean_token_normalized: -0.0161 + mean_total_tokens: 0.1047 + mean_vocab_size: 0.0178 + function_metrics: + mean_avg_function_lines: -0.6349 + mean_function_count: 0.5787 + mean_max_function_lines: -0.3375 + halstead: + mean_N1_total_operators: 0.1525 + mean_N2_total_operands: 0.0950 + mean_difficulty: 0.0991 + mean_effort: 0.2385 + mean_estimated_bugs: 0.1394 + mean_length: 0.1347 + mean_n1_unique_operators: 0.0229 + mean_n2_unique_operands: 0.0188 + mean_time_to_implement_seconds: 0.2385 + mean_vocabulary: 0.0202 + mean_volume: 0.1394 + heaps: + mean_beta: -0.0464 + mean_k: 0.0845 + identifier_length_variance: + mean_mean: 0.0770 + mean_std_dev: 0.1858 + mean_variance: 0.3716 + indentation: + mean_blank_line_ratio: 0.5622 + mean_max_depth: -0.3155 + mean_mean_depth: -0.3651 + mean_variance: -0.6050 + line_patterns: + mean_blank_line_ratio: 0.5622 + mean_max_nesting_depth: 0.2061 + mean_string_literal_ratio: -0.1046 + mean_unique_line_ratio: 0.0786 + magic_number_density: + mean_string_literal_ratio: -0.1046 + near_duplicate_blocks_file: + mean_block_count: 0.0856 + mean_sub_block_count: 0.1999 + ngram: + mean_bigram_hapax_fraction: -0.0645 + mean_bigram_repeated_unique: 0.1420 + mean_bigram_repetition_rate: 0.0534 + mean_bigram_total: 0.1049 + mean_bigram_unique: 0.0292 + mean_trigram_hapax_fraction: -0.0170 + 
mean_trigram_repeated_unique: 0.1274 + mean_trigram_repetition_rate: 0.0457 + mean_trigram_total: 0.1052 + mean_trigram_unique: 0.0686 + punctuation_density: + mean_arrow_density: 0.9701 + mean_bracket_nonalpha_prefix_count: 0.1748 + mean_bracket_nonalpha_suffix_count: 0.9451 + mean_colon_suffix_density: 0.8804 + mean_dot_count: -0.2520 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0946 + mean_question_mark_density: -0.1977 + readability: + mean_avg_line_length: 0.1152 + mean_avg_sub_words_per_id: 0.0220 + mean_avg_tokens_per_line: 0.2252 + mean_flesch_adapted: -0.0374 + mean_fog_adapted: 0.2252 + mean_total_lines: -0.1206 + symbol_density: + mean_density: 0.1426 + mean_symbol_count: 0.1355 + vocabulary: + mean_mattr: -0.0269 + mean_raw_ttr: -0.0269 + mean_total_identifiers: 0.0774 + mean_unique_identifiers: 0.0505 + vowel_density: + mean_total_chars: 0.1544 + zipf: + mean_exponent: 0.0250 + mean_r_squared: 0.0156 + mean_total_tokens: 0.1047 + mean_vocab_size: 0.0178 + +no_boolean_parameter: + _doc: "Functions should not take boolean parameters — a flag usually means the function does two things." 
+ _log_baseline: 13.6290 + branching: + mean_branch_count: -2.0000 + mean_branching_density: 1.0271 + mean_max_nesting_depth: -0.3263 + mean_non_blank_count: -0.0383 + brevity: + mean_sample_size: -0.0253 + casing_entropy: + mean_entropy: 0.0049 + mean_pascal_case_count: 0.1180 + mean_snake_case_count: 0.0931 + compression: + mean_raw_bytes: 0.0435 + mean_redundancy: 0.0777 + mean_unique_line_ratio: -0.0656 + mean_zlib_bytes: -0.1055 + mean_zlib_ratio: 0.1490 + entropy: + mean_char_entropy: 0.0152 + mean_char_normalized: 0.0153 + mean_token_entropy: -0.0129 + mean_token_max_entropy: -0.0055 + mean_token_normalized: -0.0073 + mean_total_tokens: 0.0692 + mean_vocab_size: -0.0253 + function_metrics: + mean_avg_function_lines: -0.3850 + mean_avg_param_count: -0.2935 + mean_function_count: 0.4338 + mean_max_function_lines: -0.5579 + halstead: + mean_N1_total_operators: 0.0393 + mean_N2_total_operands: 0.0832 + mean_difficulty: 0.0207 + mean_effort: 0.0660 + mean_estimated_bugs: 0.0453 + mean_length: 0.0543 + mean_n1_unique_operators: -0.0806 + mean_n2_unique_operands: -0.0181 + mean_time_to_implement_seconds: 0.0660 + mean_vocabulary: -0.0374 + mean_volume: 0.0453 + heaps: + mean_beta: -0.0314 + mean_k: 0.0620 + identifier_length_variance: + mean_mean: 0.0125 + mean_std_dev: 0.1858 + mean_variance: 0.3715 + indentation: + mean_blank_line_ratio: 0.4402 + mean_max_depth: -0.5579 + mean_mean_depth: -0.2880 + mean_variance: -0.8414 + line_patterns: + mean_blank_line_ratio: 0.4402 + mean_max_nesting_depth: -0.3263 + mean_string_literal_ratio: 0.0206 + mean_unique_line_ratio: 0.0101 + magic_number_density: + mean_string_literal_ratio: 0.0206 + near_duplicate_blocks_file: + mean_block_count: 0.4338 + mean_near_dup_block_d0: 1.7685 + mean_near_dup_block_d2: 1.1158 + mean_near_dup_block_d4: 1.6737 + mean_near_dup_block_d5: 1.6737 + mean_near_dup_block_d6: 1.7685 + mean_near_dup_block_d7: -0.8842 + mean_near_dup_block_d8: 0.5579 + mean_sub_block_count: 0.2775 + ngram: + 
mean_bigram_hapax_fraction: -0.1940 + mean_bigram_repeated_unique: 0.1467 + mean_bigram_repetition_rate: 0.1504 + mean_bigram_total: 0.0694 + mean_bigram_unique: -0.1127 + mean_trigram_hapax_fraction: -0.2208 + mean_trigram_repeated_unique: 0.3783 + mean_trigram_repetition_rate: 0.3150 + mean_trigram_total: 0.0695 + mean_trigram_unique: -0.1019 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.1152 + mean_bracket_nonalpha_suffix_count: 0.1075 + mean_colon_suffix_density: -0.2793 + mean_dot_count: 0.1538 + mean_exclamation_density: -0.0435 + mean_id_nonalpha_suffix_density: 0.0357 + readability: + mean_avg_line_length: 0.0817 + mean_avg_sub_words_per_id: 0.0160 + mean_avg_tokens_per_line: 0.1075 + mean_flesch_adapted: -0.0254 + mean_fog_adapted: 0.2928 + mean_total_lines: -0.0383 + symbol_density: + mean_density: 0.0479 + mean_symbol_count: 0.0916 + vocabulary: + mean_mattr: -0.0916 + mean_raw_ttr: -0.1091 + mean_total_identifiers: 0.0962 + mean_unique_identifiers: -0.0129 + vowel_density: + mean_total_chars: 0.1087 + zipf: + mean_exponent: 0.0374 + mean_total_tokens: 0.0692 + mean_vocab_size: -0.0253 + +no_magic_numbers: + _doc: "Numeric literals should be named constants, not inline magic numbers." 
+ _log_baseline: 45.8808 + branching: + mean_branch_count: -0.2708 + mean_branching_density: -0.1682 + mean_non_blank_count: -0.1029 + brevity: + mean_sample_size: 0.1527 + casing_entropy: + mean_entropy: -0.2908 + mean_snake_case_count: 0.4279 + compression: + mean_raw_bytes: 0.3823 + mean_redundancy: 0.0584 + mean_unique_line_ratio: 0.2269 + mean_zlib_bytes: 0.2473 + mean_zlib_ratio: 0.1350 + entropy: + mean_char_entropy: 0.0661 + mean_char_normalized: 0.0624 + mean_token_entropy: 0.0148 + mean_token_max_entropy: 0.0355 + mean_token_normalized: -0.0207 + mean_total_tokens: 0.2834 + mean_vocab_size: 0.1527 + function_metrics: + mean_avg_function_lines: -0.8758 + mean_function_count: 0.4111 + halstead: + mean_N1_total_operators: 0.1953 + mean_N2_total_operands: 0.2960 + mean_difficulty: 0.0408 + mean_effort: 0.3105 + mean_estimated_bugs: 0.2698 + mean_length: 0.2359 + mean_n1_unique_operators: -0.0413 + mean_n2_unique_operands: 0.2139 + mean_time_to_implement_seconds: 0.3105 + mean_vocabulary: 0.1447 + mean_volume: 0.2697 + heaps: + mean_beta: -0.1129 + mean_k: 0.5236 + mean_r_squared: -0.0256 + identifier_length_variance: + mean_max: 0.0987 + mean_mean: 0.3721 + mean_std_dev: 0.3878 + mean_variance: 0.7757 + indentation: + mean_blank_line_ratio: 0.2374 + mean_mean_depth: -0.3518 + mean_variance: -0.4760 + line_patterns: + mean_blank_line_ratio: 0.2374 + mean_string_literal_ratio: -0.2880 + mean_unique_line_ratio: 0.2337 + magic_number_density: + mean_density: -0.2831 + mean_string_literal_ratio: -0.2880 + near_duplicate_blocks_file: + mean_block_count: -0.7894 + mean_near_dup_block_d0: -1.1158 + mean_near_dup_block_d7: -1.1158 + mean_sub_block_count: 0.2708 + ngram: + mean_bigram_hapax_fraction: -0.1437 + mean_bigram_repeated_unique: 0.4787 + mean_bigram_repetition_rate: 0.1545 + mean_bigram_total: 0.2844 + mean_bigram_unique: 0.1437 + mean_trigram_hapax_fraction: -0.0207 + mean_trigram_repeated_unique: 0.2787 + mean_trigram_repetition_rate: 0.1465 + 
mean_trigram_total: 0.2854 + mean_trigram_unique: 0.1843 + punctuation_density: + mean_arrow_density: -1.1699 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: 1.9476 + mean_id_nonalpha_suffix_density: 0.3448 + readability: + mean_avg_line_length: 0.5035 + mean_avg_sub_words_per_id: 0.2699 + mean_avg_tokens_per_line: 0.3863 + mean_flesch_adapted: -0.3819 + mean_fog_adapted: 1.0656 + mean_total_lines: -0.1029 + symbol_density: + mean_density: -0.0314 + mean_distinct_symbol_types: 0.0644 + mean_symbol_count: 0.3512 + vocabulary: + mean_mattr: 0.0058 + mean_raw_ttr: -0.0081 + mean_total_identifiers: 0.3908 + mean_unique_identifiers: 0.3826 + vowel_density: + mean_total_chars: 0.7629 + zipf: + mean_exponent: 0.0164 + mean_r_squared: 0.0321 + mean_total_tokens: 0.2834 + mean_vocab_size: 0.1527 + +parameter_count_under_4: + _doc: "Functions should take fewer than 4 parameters." + _log_baseline: 1.9637 + branching: + mean_non_blank_count: 0.0967 + brevity: + mean_sample_size: 0.0261 + casing_entropy: + mean_entropy: 0.5731 + mean_other_count: 0.5408 + mean_pascal_case_count: 0.2329 + mean_snake_case_count: -0.0351 + compression: + mean_raw_bytes: -0.0343 + mean_redundancy: -0.0308 + mean_unique_line_ratio: -0.0166 + mean_zlib_bytes: 0.0291 + mean_zlib_ratio: -0.0634 + entropy: + mean_char_entropy: 0.0082 + mean_char_max_entropy: 0.0175 + mean_char_normalized: -0.0093 + mean_token_entropy: 0.0206 + mean_token_max_entropy: 0.0063 + mean_token_normalized: 0.0144 + mean_total_tokens: -0.0335 + mean_vocab_size: 0.0261 + function_metrics: + mean_avg_function_lines: 0.1262 + mean_avg_param_count: -0.3179 + mean_function_count: -0.0320 + mean_max_function_lines: 0.2037 + mean_max_param_count: -0.2847 + halstead: + mean_N1_total_operators: -0.0264 + mean_N2_total_operands: -0.0507 + mean_difficulty: 0.0180 + mean_effort: -0.0147 + mean_estimated_bugs: -0.0326 + mean_length: -0.0353 + mean_n1_unique_operators: 0.0613 + mean_n2_unique_operands: -0.0074 + 
mean_time_to_implement_seconds: -0.0147 + mean_vocabulary: 0.0108 + mean_volume: -0.0327 + heaps: + mean_beta: 0.0179 + mean_k: -0.0082 + mean_r_squared: -0.0062 + identifier_length_variance: + mean_mean: -0.0239 + mean_std_dev: -0.0185 + mean_variance: -0.0371 + indentation: + mean_blank_line_ratio: 0.0518 + mean_max_depth: 0.1362 + mean_mean_depth: 0.0506 + mean_variance: 0.1451 + line_patterns: + mean_blank_line_ratio: 0.0518 + mean_string_literal_ratio: 0.1674 + mean_unique_line_ratio: -0.0137 + magic_number_density: + mean_density: -2.0000 + mean_string_literal_ratio: 0.1674 + near_duplicate_blocks_file: + mean_block_count: 0.0967 + mean_near_dup_block_d7: -0.4658 + ngram: + mean_bigram_hapax_fraction: 0.0479 + mean_bigram_repeated_unique: -0.0222 + mean_bigram_repetition_rate: -0.0480 + mean_bigram_total: -0.0336 + mean_bigram_unique: 0.0376 + mean_trigram_hapax_fraction: 0.0610 + mean_trigram_repeated_unique: -0.1263 + mean_trigram_repetition_rate: -0.1619 + mean_trigram_total: -0.0337 + mean_trigram_unique: 0.0524 + punctuation_density: + mean_bracket_nonalpha_prefix_count: -0.0967 + mean_bracket_nonalpha_suffix_count: -0.1131 + mean_colon_suffix_density: 0.0056 + mean_dot_count: 0.9099 + mean_id_nonalpha_suffix_density: -0.0434 + readability: + mean_avg_line_length: -0.1345 + mean_avg_sub_words_per_id: -0.0093 + mean_avg_tokens_per_line: -0.1302 + mean_flesch_adapted: 0.0271 + mean_fog_adapted: -0.1290 + mean_total_lines: 0.0967 + symbol_density: + mean_density: 0.0124 + mean_distinct_symbol_types: 0.1042 + mean_symbol_count: -0.0218 + vocabulary: + mean_mattr: 0.0150 + mean_raw_ttr: 0.0153 + mean_total_identifiers: -0.0153 + vowel_density: + mean_total_chars: -0.0393 + zipf: + mean_exponent: 0.0101 + mean_r_squared: -0.0074 + mean_total_tokens: -0.0335 + mean_vocab_size: 0.0261 + +uses_ternary_expression: + _doc: "Simple conditional assignments should use inline expressions rather than full if-blocks." 
+ _log_baseline: -4.5289 + branching: + mean_branch_count: -0.4160 + mean_branching_density: 0.1134 + mean_non_blank_count: -0.5296 + brevity: + mean_sample_size: 0.0095 + casing_entropy: + mean_entropy: 0.0068 + mean_snake_case_count: -0.0141 + compression: + mean_raw_bytes: -0.0819 + mean_redundancy: -0.0629 + mean_unique_line_ratio: 0.1604 + mean_zlib_bytes: 0.0148 + mean_zlib_ratio: -0.0967 + entropy: + mean_char_entropy: 0.0664 + mean_char_normalized: 0.0636 + mean_token_entropy: -0.0078 + mean_token_normalized: -0.0101 + mean_total_tokens: 0.0859 + mean_vocab_size: 0.0095 + function_metrics: + mean_avg_function_lines: -0.6785 + mean_function_count: 0.2434 + mean_max_function_lines: -0.4160 + halstead: + mean_N1_total_operators: 0.1567 + mean_N2_total_operands: 0.0551 + mean_difficulty: 0.0844 + mean_effort: 0.2135 + mean_estimated_bugs: 0.1291 + mean_length: 0.1267 + mean_n1_unique_operators: 0.0293 + mean_time_to_implement_seconds: 0.2135 + mean_vocabulary: 0.0101 + mean_volume: 0.1291 + heaps: + mean_beta: -0.0301 + mean_k: 0.0594 + identifier_length_variance: + mean_mean: 0.0749 + mean_std_dev: 0.0535 + mean_variance: 0.1070 + indentation: + mean_blank_line_ratio: 0.5054 + mean_max_depth: -0.2434 + mean_mean_depth: -0.3243 + mean_variance: -0.5454 + line_patterns: + mean_blank_line_ratio: 0.5054 + mean_string_literal_ratio: -0.0855 + mean_unique_line_ratio: 0.1630 + magic_number_density: + mean_density: -0.0859 + mean_string_literal_ratio: -0.0855 + near_duplicate_blocks_file: + mean_block_count: -0.2821 + mean_near_dup_block_d0: -2.0000 + mean_sub_block_count: 0.2434 + ngram: + mean_bigram_hapax_fraction: -0.0546 + mean_bigram_repeated_unique: 0.2141 + mean_bigram_repetition_rate: 0.0785 + mean_bigram_total: 0.0863 + mean_bigram_unique: 0.0432 + mean_trigram_hapax_fraction: -0.0165 + mean_trigram_repeated_unique: 0.1339 + mean_trigram_repetition_rate: 0.1178 + mean_trigram_total: 0.0866 + mean_trigram_unique: 0.0400 + punctuation_density: + 
mean_bracket_nonalpha_prefix_count: 0.2713 + mean_bracket_nonalpha_suffix_count: 0.5255 + mean_bracket_number_pair_count: 0.4160 + mean_colon_suffix_density: 1.7729 + mean_dot_count: -1.1679 + mean_id_nonalpha_suffix_density: 0.1908 + readability: + mean_avg_line_length: 0.4657 + mean_avg_sub_words_per_id: 0.0312 + mean_avg_tokens_per_line: 0.6155 + mean_flesch_adapted: -0.0682 + mean_fog_adapted: 0.5360 + mean_total_lines: -0.5296 + symbol_density: + mean_density: 0.3167 + mean_distinct_symbol_types: 0.0364 + mean_symbol_count: 0.2350 + vocabulary: + mean_mattr: -0.0068 + mean_raw_ttr: -0.0068 + mean_total_identifiers: -0.0117 + mean_unique_identifiers: -0.0185 + vowel_density: + mean_total_chars: 0.0632 + zipf: + mean_exponent: 0.0320 + mean_r_squared: 0.0133 + mean_total_tokens: 0.0859 + mean_vocab_size: 0.0095 + diff --git a/priv/combined_metrics/naming_conventions.yml b/priv/combined_metrics/naming_conventions.yml new file mode 100644 index 00000000..83ce0781 --- /dev/null +++ b/priv/combined_metrics/naming_conventions.yml @@ -0,0 +1,268 @@ +class_name_is_noun: + _doc: "Class and module names should be nouns describing what they represent, not verbs or gerunds." 
+ _languages: [elixir] + _log_baseline: 2.9861 + brevity: + mean_sample_size: 0.7106 + compression: + mean_raw_bytes: 0.1346 + mean_redundancy: -0.0605 + mean_zlib_bytes: 0.2139 + mean_zlib_ratio: -0.0794 + entropy: + mean_token_entropy: 0.1236 + mean_token_max_entropy: 0.1716 + mean_token_normalized: -0.0484 + mean_vocab_size: 0.7106 + halstead: + mean_difficulty: -1.1493 + mean_effort: -0.9669 + mean_estimated_bugs: 0.1818 + mean_n2_unique_operands: 1.1492 + mean_time_to_implement_seconds: -0.9669 + mean_vocabulary: 0.7462 + mean_volume: 0.1823 + heaps: + mean_beta: 0.4086 + mean_k: -0.6266 + identifier_length_variance: + mean_max: -0.4031 + mean_mean: 0.3287 + mean_std_dev: -0.8347 + mean_variance: -1.6695 + ngram: + mean_bigram_hapax_fraction: 0.2542 + mean_bigram_repeated_unique: -0.5967 + mean_bigram_repetition_rate: -0.9599 + mean_bigram_unique: 0.6173 + mean_trigram_hapax_fraction: 0.2449 + mean_trigram_repeated_unique: -2.0000 + mean_trigram_repetition_rate: -1.9547 + mean_trigram_unique: 0.6002 + punctuation_density: + mean_exclamation_density: -0.3314 + readability: + mean_avg_line_length: 0.1418 + symbol_density: + mean_density: -0.1381 + vocabulary: + mean_mattr: 1.4020 + mean_raw_ttr: 1.4020 + mean_unique_identifiers: 1.4020 + vowel_density: + mean_total_chars: 0.3287 + zipf: + mean_exponent: -0.2180 + mean_vocab_size: 0.7106 + +file_name_matches_primary_export: + _doc: "The file name should match the primary class or module it exports (e.g. `user.js` exports `User`)." + _fix_hint: "Rename the file to match the primary module it defines" + _languages: [elixir] + _log_baseline: 0.0000 + casing_entropy: + mean_pascal_case_count: 0.0000 + vocabulary: + mean_unique_identifiers: 0.0000 + +function_name_is_not_single_word: + _doc: "Single-word function names like `run`, `process`, or `handle` are too vague to convey intent." 
+ _languages: [elixir] + _log_baseline: 17.8470 + compression: + mean_raw_bytes: 0.2434 + mean_redundancy: 0.0776 + mean_zlib_bytes: 0.1029 + mean_zlib_ratio: 0.1405 + entropy: + mean_char_entropy: 0.0241 + mean_char_normalized: 0.0241 + identifier_length_variance: + mean_max: 0.7685 + mean_mean: 0.5825 + mean_std_dev: 1.0000 + mean_variance: 2.0000 + readability: + mean_avg_line_length: 0.2559 + mean_avg_sub_words_per_id: 0.3083 + mean_flesch_adapted: -0.3181 + mean_fog_adapted: 1.3258 + symbol_density: + mean_density: -0.2431 + vowel_density: + mean_total_chars: 0.5825 + +function_name_matches_return_type: + _doc: "Functions prefixed with `get_`, `fetch_`, or `find_` should return the thing they name." + _languages: [elixir] + _log_baseline: 7.5638 + branching: + mean_max_nesting_depth: 0.1335 + brevity: + mean_sample_size: 0.0257 + casing_entropy: + mean_entropy: 0.0310 + mean_other_count: 0.0347 + mean_snake_case_count: -0.0296 + compression: + mean_raw_bytes: -0.0190 + mean_redundancy: -0.0180 + mean_unique_line_ratio: -0.0104 + mean_zlib_bytes: 0.0143 + mean_zlib_ratio: -0.0332 + entropy: + mean_char_entropy: 0.0079 + mean_char_max_entropy: 0.0071 + mean_token_max_entropy: 0.0059 + mean_token_normalized: -0.0045 + mean_total_tokens: 0.0030 + mean_vocab_size: 0.0257 + halstead: + mean_N1_total_operators: 0.0392 + mean_N2_total_operands: -0.0539 + mean_difficulty: 0.0029 + mean_effort: 0.0080 + mean_estimated_bugs: 0.0050 + mean_n1_unique_operators: 0.0629 + mean_n2_unique_operands: 0.0060 + mean_time_to_implement_seconds: 0.0080 + mean_vocabulary: 0.0218 + mean_volume: 0.0050 + heaps: + mean_beta: 0.0291 + mean_k: -0.0519 + mean_r_squared: 0.0038 + identifier_length_variance: + mean_max: 0.1082 + mean_std_dev: 0.0326 + mean_variance: 0.0653 + line_patterns: + mean_max_nesting_depth: 0.1335 + mean_string_literal_ratio: -0.0027 + mean_unique_line_ratio: -0.0108 + magic_number_density: + mean_density: -0.0108 + mean_string_literal_ratio: -0.0027 + 
near_duplicate_blocks_file: + mean_near_dup_block_d0: -0.5899 + mean_near_dup_block_d5: -0.2282 + mean_near_dup_block_d7: 0.2282 + mean_sub_block_count: 0.0314 + ngram: + mean_bigram_hapax_fraction: 0.0106 + mean_bigram_repeated_unique: 0.0095 + mean_bigram_repetition_rate: -0.0167 + mean_bigram_total: 0.0030 + mean_bigram_unique: 0.0261 + mean_trigram_hapax_fraction: 0.0174 + mean_trigram_repeated_unique: -0.0297 + mean_trigram_repetition_rate: -0.0444 + mean_trigram_total: 0.0030 + mean_trigram_unique: 0.0245 + punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.1048 + mean_colon_suffix_density: -0.0027 + mean_dot_count: 0.1335 + mean_id_nonalpha_suffix_density: 0.0266 + mean_question_mark_density: -2.0000 + readability: + mean_avg_line_length: 0.0175 + mean_avg_sub_words_per_id: 0.0087 + mean_avg_tokens_per_line: 0.0030 + mean_flesch_adapted: -0.0107 + mean_fog_adapted: 0.0058 + symbol_density: + mean_density: 0.0633 + mean_distinct_symbol_types: 0.0639 + mean_symbol_count: 0.0442 + vocabulary: + mean_mattr: 0.0350 + mean_raw_ttr: 0.0299 + mean_total_identifiers: -0.0225 + mean_unique_identifiers: 0.0074 + vowel_density: + mean_total_chars: -0.0235 + zipf: + mean_exponent: -0.0047 + mean_r_squared: 0.0105 + mean_total_tokens: 0.0030 + mean_vocab_size: 0.0257 + +test_name_starts_with_verb: + _doc: "Test descriptions should start with a verb: `creates`, `raises`, `returns`, not a noun phrase." 
+ _languages: [elixir] + _log_baseline: 7.8915 + branching: + mean_branch_count: 1.9977 + mean_branching_density: 2.0000 + brevity: + mean_sample_size: 0.0694 + casing_entropy: + mean_entropy: -0.0711 + mean_snake_case_count: 0.1381 + compression: + mean_raw_bytes: 0.0914 + mean_redundancy: 0.0182 + mean_zlib_bytes: 0.0482 + mean_zlib_ratio: 0.0431 + entropy: + mean_char_max_entropy: 0.0064 + mean_char_normalized: -0.0121 + mean_token_entropy: 0.0259 + mean_token_max_entropy: 0.0155 + mean_token_normalized: 0.0104 + mean_total_tokens: 0.0600 + mean_vocab_size: 0.0694 + halstead: + mean_N1_total_operators: 0.0411 + mean_difficulty: 0.0577 + mean_effort: 0.0855 + mean_estimated_bugs: 0.0277 + mean_length: 0.0240 + mean_n1_unique_operators: 0.0577 + mean_time_to_implement_seconds: 0.0855 + mean_vocabulary: 0.0164 + mean_volume: 0.0278 + heaps: + mean_beta: -0.0149 + mean_k: 0.0795 + mean_r_squared: -0.0081 + identifier_length_variance: + mean_std_dev: -0.0192 + mean_variance: -0.0384 + line_patterns: + mean_string_literal_ratio: -0.0611 + magic_number_density: + mean_string_literal_ratio: -0.0611 + ngram: + mean_bigram_hapax_fraction: -0.0506 + mean_bigram_repeated_unique: 0.1209 + mean_bigram_repetition_rate: 0.0150 + mean_bigram_total: 0.0602 + mean_bigram_unique: 0.0621 + mean_trigram_hapax_fraction: -0.0206 + mean_trigram_repeated_unique: 0.0961 + mean_trigram_repetition_rate: 0.0117 + mean_trigram_total: 0.0603 + mean_trigram_unique: 0.0596 + punctuation_density: + mean_arrow_density: -0.1129 + mean_colon_suffix_density: -0.0591 + mean_id_nonalpha_suffix_density: -0.0602 + readability: + mean_avg_line_length: 0.0943 + mean_avg_tokens_per_line: 0.0600 + mean_fog_adapted: 0.0600 + symbol_density: + mean_density: -0.0912 + vocabulary: + mean_mattr: 0.0463 + mean_total_identifiers: 0.1129 + mean_unique_identifiers: 0.1161 + vowel_density: + mean_total_chars: 0.1122 + zipf: + mean_exponent: -0.0239 + mean_total_tokens: 0.0600 + mean_vocab_size: 0.0694 + diff --git 
a/priv/combined_metrics/scope_and_assignment.yml b/priv/combined_metrics/scope_and_assignment.yml new file mode 100644 index 00000000..c33ac845 --- /dev/null +++ b/priv/combined_metrics/scope_and_assignment.yml @@ -0,0 +1,674 @@ +declared_close_to_use: + _doc: "Variables should be declared near their first use, not hoisted to the top of the function." + _log_baseline: -44.7729 + branching: + mean_branch_count: -0.3390 + mean_branching_density: -0.1842 + mean_non_blank_count: -0.1592 + brevity: + mean_sample_size: -0.1806 + casing_entropy: + mean_camel_case_count: -0.2981 + mean_entropy: 0.0653 + mean_other_count: -0.0412 + mean_pascal_case_count: -0.0760 + mean_snake_case_count: -0.3047 + comment_structure: + mean_comment_line_count: -2.0000 + mean_comment_line_ratio: -1.8774 + compression: + mean_raw_bytes: -0.2184 + mean_redundancy: -0.0281 + mean_unique_line_ratio: 0.0060 + mean_zlib_bytes: -0.1781 + mean_zlib_ratio: -0.0415 + entropy: + mean_char_entropy: 0.0068 + mean_char_max_entropy: -0.0078 + mean_char_normalized: 0.0146 + mean_token_entropy: -0.0400 + mean_token_max_entropy: -0.0376 + mean_total_tokens: -0.1836 + mean_vocab_size: -0.1806 + function_metrics: + mean_avg_function_lines: -0.1689 + mean_max_function_lines: -0.1306 + halstead: + mean_N1_total_operators: -0.1260 + mean_N2_total_operands: -0.2537 + mean_difficulty: -0.1345 + mean_effort: -0.3512 + mean_estimated_bugs: -0.2227 + mean_length: -0.1843 + mean_n1_unique_operators: -0.0976 + mean_n2_unique_operands: -0.2209 + mean_time_to_implement_seconds: -0.3512 + mean_vocabulary: -0.1908 + mean_volume: -0.2227 + heaps: + mean_beta: -0.0243 + identifier_length_variance: + mean_mean: 0.0266 + mean_std_dev: 0.0070 + mean_variance: 0.0144 + indentation: + mean_blank_line_ratio: 0.0709 + mean_mean_depth: -0.0132 + mean_variance: 0.0652 + line_patterns: + mean_blank_line_ratio: 0.0709 + mean_string_literal_ratio: 0.1211 + magic_number_density: + mean_density: -0.2075 + mean_magic_number_count: -0.3961 + 
mean_string_literal_ratio: 0.1211 + near_duplicate_blocks_file: + mean_block_count: -0.0380 + mean_near_dup_block_d0: -0.7925 + mean_near_dup_block_d8: 1.1610 + mean_sub_block_count: 0.0116 + ngram: + mean_bigram_repeated_unique: -0.1810 + mean_bigram_repetition_rate: 0.0038 + mean_bigram_total: -0.1839 + mean_bigram_unique: -0.1871 + mean_trigram_hapax_fraction: -0.0077 + mean_trigram_repeated_unique: -0.1259 + mean_trigram_repetition_rate: 0.0540 + mean_trigram_total: -0.1843 + mean_trigram_unique: -0.1904 + punctuation_density: + mean_arrow_density: 0.1987 + mean_bracket_nonalpha_prefix_count: -0.1993 + mean_bracket_nonalpha_suffix_count: -0.0283 + mean_colon_suffix_density: 0.1708 + mean_id_nonalpha_suffix_density: 0.0916 + mean_question_mark_density: 0.5000 + readability: + mean_avg_line_length: -0.0057 + mean_avg_sub_words_per_id: 0.0055 + mean_avg_tokens_per_line: -0.0774 + mean_fog_adapted: -0.0368 + mean_total_lines: -0.1115 + separator_counts: + mean_slash_count: -1.4037 + mean_underscore_count: -0.1887 + symbol_density: + mean_density: 0.0980 + mean_distinct_symbol_types: -0.0329 + mean_symbol_count: -0.1211 + vocabulary: + mean_mattr: -0.0686 + mean_raw_ttr: 0.0736 + mean_total_identifiers: -0.2738 + mean_unique_identifiers: -0.2075 + vowel_density: + mean_total_chars: -0.2512 + zipf: + mean_exponent: -0.0102 + mean_total_tokens: -0.1836 + mean_vocab_size: -0.1806 + +mutated_after_initial_assignment: + _doc: "Variables should not be reassigned after their initial value — prefer introducing a new name." 
+ _log_baseline: 6.2569 + branching: + mean_branch_count: 0.1519 + mean_branching_density: 0.2073 + mean_max_nesting_depth: 0.0856 + mean_non_blank_count: -0.0553 + brevity: + mean_sample_size: 0.0068 + casing_entropy: + mean_entropy: -0.0947 + mean_pascal_case_count: -0.2061 + mean_snake_case_count: -0.0436 + compression: + mean_raw_bytes: -0.0496 + mean_redundancy: -0.0291 + mean_unique_line_ratio: -0.0110 + mean_zlib_bytes: 0.0022 + mean_zlib_ratio: -0.0518 + entropy: + mean_char_entropy: -0.0034 + mean_char_max_entropy: 0.0039 + mean_char_normalized: -0.0074 + mean_token_entropy: 0.0082 + mean_token_max_entropy: 0.0015 + mean_token_normalized: 0.0067 + mean_total_tokens: -0.0392 + mean_vocab_size: 0.0068 + function_metrics: + mean_avg_function_lines: -0.1001 + mean_max_function_lines: -0.0511 + halstead: + mean_N1_total_operators: -0.0218 + mean_N2_total_operands: -0.0623 + mean_difficulty: -0.0341 + mean_effort: -0.0690 + mean_estimated_bugs: -0.0348 + mean_length: -0.0382 + mean_n1_unique_operators: 0.0337 + mean_n2_unique_operands: 0.0056 + mean_time_to_implement_seconds: -0.0690 + mean_vocabulary: 0.0149 + mean_volume: -0.0348 + heaps: + mean_beta: 0.0300 + mean_k: -0.0775 + mean_r_squared: 0.0063 + identifier_length_variance: + mean_mean: -0.0249 + mean_std_dev: 0.0286 + mean_variance: 0.0571 + indentation: + mean_blank_line_ratio: -0.1139 + mean_max_depth: 0.2725 + mean_mean_depth: 0.0979 + mean_variance: 0.5878 + line_patterns: + mean_blank_line_ratio: -0.1139 + mean_max_nesting_depth: 0.0856 + mean_string_literal_ratio: 0.0397 + mean_unique_line_ratio: -0.0203 + magic_number_density: + mean_density: -0.6790 + mean_magic_number_count: -0.7131 + mean_string_literal_ratio: 0.0397 + near_duplicate_blocks_file: + mean_sub_block_count: -0.1967 + ngram: + mean_bigram_hapax_fraction: 0.0087 + mean_bigram_repeated_unique: -0.0100 + mean_bigram_repetition_rate: -0.0293 + mean_bigram_total: -0.0393 + mean_bigram_unique: 0.0068 + mean_trigram_hapax_fraction: 0.0021 
+ mean_trigram_repeated_unique: -0.0086 + mean_trigram_repetition_rate: -0.0546 + mean_trigram_total: -0.0394 + punctuation_density: + mean_arrow_density: -1.5022 + mean_bracket_nonalpha_prefix_count: 0.0497 + mean_bracket_nonalpha_suffix_count: 0.4473 + mean_colon_suffix_density: 0.3529 + mean_dot_count: -0.1332 + mean_exclamation_density: -2.0000 + mean_id_nonalpha_suffix_density: 0.0542 + readability: + mean_avg_line_length: 0.0073 + mean_avg_sub_words_per_id: -0.0018 + mean_avg_tokens_per_line: 0.0161 + mean_fog_adapted: 0.0169 + mean_total_lines: -0.0553 + separator_counts: + mean_dot_count: -0.1332 + mean_hyphen_count: 0.3267 + mean_underscore_count: -0.0617 + symbol_density: + mean_density: 0.0095 + mean_distinct_symbol_types: 0.0436 + mean_symbol_count: -0.0402 + vocabulary: + mean_mattr: 0.0885 + mean_raw_ttr: 0.0564 + mean_total_identifiers: -0.0623 + mean_unique_identifiers: -0.0059 + vowel_density: + mean_total_chars: -0.0872 + zipf: + mean_exponent: -0.0305 + mean_r_squared: 0.0040 + mean_total_tokens: -0.0392 + mean_vocab_size: 0.0068 + +reassigned_multiple_times: + _doc: "A variable reassigned many times is a sign the name is too generic or the function does too much." 
+ _log_baseline: -6.7462 + branching: + mean_max_nesting_depth: 0.0680 + mean_non_blank_count: 0.0226 + brevity: + mean_sample_size: 0.0344 + casing_entropy: + mean_entropy: 0.0328 + mean_other_count: 0.1073 + mean_pascal_case_count: -0.1015 + mean_screaming_snake_density: -2.0000 + mean_snake_case_count: -0.1201 + compression: + mean_raw_bytes: -0.0613 + mean_redundancy: -0.0497 + mean_unique_line_ratio: -0.0222 + mean_zlib_bytes: 0.0248 + mean_zlib_ratio: -0.0851 + entropy: + mean_char_entropy: 0.0021 + mean_char_max_entropy: 0.0066 + mean_char_normalized: -0.0041 + mean_token_entropy: 0.0105 + mean_token_max_entropy: 0.0072 + mean_token_normalized: 0.0033 + mean_total_tokens: -0.0838 + mean_vocab_size: 0.0344 + function_metrics: + mean_avg_function_lines: 0.5612 + mean_function_count: 0.0833 + mean_max_function_lines: 0.5399 + halstead: + mean_N1_total_operators: -0.0755 + mean_N2_total_operands: -0.1513 + mean_difficulty: -0.1428 + mean_effort: -0.2522 + mean_estimated_bugs: -0.1029 + mean_length: -0.1057 + mean_n1_unique_operators: 0.0270 + mean_n2_unique_operands: 0.0094 + mean_time_to_implement_seconds: -0.2522 + mean_vocabulary: 0.0139 + mean_volume: -0.1029 + heaps: + mean_beta: 0.0677 + mean_k: -0.1329 + mean_r_squared: 0.0059 + identifier_length_variance: + mean_max: 0.0594 + mean_mean: 0.0509 + mean_std_dev: 0.1441 + mean_variance: 0.2867 + indentation: + mean_blank_line_ratio: 0.0585 + mean_max_depth: 0.1513 + mean_mean_depth: 0.0393 + mean_variance: 0.2588 + line_patterns: + mean_blank_line_ratio: 0.0585 + mean_max_nesting_depth: 0.0680 + mean_string_literal_ratio: 0.1161 + mean_unique_line_ratio: -0.0221 + magic_number_density: + mean_density: -0.1020 + mean_magic_number_count: -0.1906 + mean_string_literal_ratio: 0.1161 + near_duplicate_blocks_file: + mean_block_count: 0.0393 + mean_near_dup_block_d1: -0.2586 + mean_near_dup_block_d8: -0.2586 + mean_sub_block_count: -0.0637 + ngram: + mean_bigram_hapax_fraction: 0.0798 + mean_bigram_repeated_unique: 
-0.1513 + mean_bigram_repetition_rate: -0.0805 + mean_bigram_total: -0.0839 + mean_bigram_unique: 0.0180 + mean_trigram_hapax_fraction: 0.0653 + mean_trigram_repeated_unique: -0.2098 + mean_trigram_repetition_rate: -0.1158 + mean_trigram_total: -0.0841 + mean_trigram_unique: -0.0052 + punctuation_density: + mean_arrow_density: 0.2665 + mean_bracket_nonalpha_prefix_count: -0.0325 + mean_bracket_nonalpha_suffix_count: 0.1513 + mean_bracket_number_pair_count: -0.1906 + mean_colon_suffix_density: 0.0229 + mean_dot_count: -0.1300 + mean_id_nonalpha_suffix_density: -0.0584 + readability: + mean_avg_line_length: -0.0872 + mean_avg_sub_words_per_id: 0.0183 + mean_avg_tokens_per_line: -0.1102 + mean_flesch_adapted: -0.0030 + mean_fog_adapted: -0.1102 + mean_total_lines: 0.0226 + separator_counts: + mean_dot_count: -0.1300 + mean_hyphen_count: 0.2586 + mean_slash_count: -0.2586 + mean_underscore_count: 0.2460 + symbol_density: + mean_density: -0.0051 + mean_distinct_symbol_types: 0.0118 + mean_symbol_count: -0.0622 + vocabulary: + mean_mattr: 0.1928 + mean_raw_ttr: 0.1782 + mean_total_identifiers: -0.1113 + mean_unique_identifiers: 0.0580 + vowel_density: + mean_total_chars: -0.0602 + zipf: + mean_exponent: -0.0522 + mean_r_squared: 0.0034 + mean_total_tokens: -0.0838 + mean_vocab_size: 0.0344 + +scope_is_minimal: + _doc: "Variables should be scoped as narrowly as possible — not declared at a wider scope than needed." 
+ _log_baseline: -7.6942 + branching: + mean_branch_count: -0.1072 + mean_branching_density: -0.0452 + mean_non_blank_count: -0.0619 + brevity: + mean_sample_size: -0.0368 + casing_entropy: + mean_entropy: -0.0341 + mean_other_count: -0.1823 + mean_snake_case_count: -0.0471 + comment_structure: + mean_comment_line_count: -0.4075 + mean_comment_line_ratio: 2.0000 + compression: + mean_raw_bytes: -0.1109 + mean_redundancy: -0.0367 + mean_unique_line_ratio: -0.1005 + mean_zlib_bytes: -0.0343 + mean_zlib_ratio: -0.0767 + entropy: + mean_char_entropy: 0.0481 + mean_char_normalized: 0.0481 + mean_token_entropy: 0.0037 + mean_token_max_entropy: -0.0081 + mean_token_normalized: 0.0118 + mean_total_tokens: 0.0143 + mean_vocab_size: -0.0368 + function_metrics: + mean_avg_function_lines: -0.2553 + mean_avg_param_count: -0.1037 + mean_function_count: 0.1691 + mean_max_function_lines: -0.8149 + mean_max_param_count: -0.4075 + halstead: + mean_N1_total_operators: 0.0334 + mean_N2_total_operands: -0.0363 + mean_difficulty: 0.1281 + mean_effort: 0.1258 + mean_length: 0.0079 + mean_n1_unique_operators: 0.0751 + mean_n2_unique_operands: -0.0892 + mean_time_to_implement_seconds: 0.1258 + mean_vocabulary: -0.0460 + heaps: + mean_beta: -0.0099 + mean_k: -0.0341 + mean_r_squared: 0.0103 + identifier_length_variance: + mean_mean: -0.0320 + mean_std_dev: 0.0864 + mean_variance: 0.1729 + indentation: + mean_blank_line_ratio: -0.1076 + mean_max_depth: -0.4694 + mean_mean_depth: -0.4430 + mean_variance: -1.1640 + line_patterns: + mean_blank_line_ratio: -0.1076 + mean_string_literal_ratio: -0.0144 + mean_unique_line_ratio: -0.0307 + magic_number_density: + mean_string_literal_ratio: -0.0144 + near_duplicate_blocks_file: + mean_block_count: -0.1691 + mean_sub_block_count: 0.0447 + ngram: + mean_bigram_hapax_fraction: -0.0078 + mean_bigram_repeated_unique: 0.0406 + mean_bigram_total: 0.0144 + mean_bigram_unique: 0.0265 + mean_trigram_hapax_fraction: 0.0113 + mean_trigram_repeated_unique: 0.0074 
+ mean_trigram_repetition_rate: -0.0382 + mean_trigram_total: 0.0144 + mean_trigram_unique: 0.0419 + punctuation_density: + mean_arrow_density: -0.2974 + mean_bracket_nonalpha_prefix_count: 0.7364 + mean_bracket_nonalpha_suffix_count: 0.1691 + mean_colon_suffix_density: -0.1217 + mean_dot_count: 0.3290 + mean_id_nonalpha_suffix_density: 0.0100 + readability: + mean_avg_line_length: -0.0336 + mean_avg_sub_words_per_id: -0.0359 + mean_avg_tokens_per_line: 0.0563 + mean_flesch_adapted: 0.0319 + mean_fog_adapted: 0.0564 + mean_total_lines: -0.0420 + separator_counts: + mean_dot_count: 0.3290 + mean_underscore_count: -0.1941 + symbol_density: + mean_density: 0.1427 + mean_distinct_symbol_types: 0.0336 + mean_symbol_count: 0.0316 + vocabulary: + mean_raw_ttr: -0.0313 + mean_total_identifiers: -0.0577 + mean_unique_identifiers: -0.0891 + vowel_density: + mean_total_chars: -0.0897 + zipf: + mean_exponent: 0.0070 + mean_total_tokens: 0.0143 + mean_vocab_size: -0.0368 + +shadowed_by_inner_scope: + _doc: "Inner-scope names that shadow outer-scope names cause confusion about which value is in play." 
+ _log_baseline: -32.0799 + branching: + mean_branching_density: 2.0000 + mean_max_nesting_depth: -0.1450 + mean_non_blank_count: -0.1418 + brevity: + mean_sample_size: -0.0786 + casing_entropy: + mean_entropy: 0.1132 + mean_pascal_case_count: -0.0306 + mean_snake_case_count: -0.2452 + comment_structure: + mean_comment_line_count: -1.1073 + mean_comment_line_ratio: 0.8936 + compression: + mean_raw_bytes: -0.1107 + mean_unique_line_ratio: -0.0068 + mean_zlib_bytes: -0.1099 + entropy: + mean_char_entropy: 0.0202 + mean_char_max_entropy: -0.0068 + mean_char_normalized: 0.0270 + mean_token_entropy: -0.0147 + mean_token_max_entropy: -0.0178 + mean_token_normalized: 0.0030 + mean_total_tokens: -0.1540 + mean_vocab_size: -0.0786 + function_metrics: + mean_avg_function_lines: -0.1314 + mean_max_function_lines: -0.2043 + halstead: + mean_N1_total_operators: -0.0341 + mean_N2_total_operands: -0.2093 + mean_difficulty: -0.1558 + mean_effort: -0.2907 + mean_estimated_bugs: -0.1349 + mean_length: -0.1141 + mean_n1_unique_operators: -0.0504 + mean_n2_unique_operands: -0.1040 + mean_time_to_implement_seconds: -0.2907 + mean_vocabulary: -0.0894 + mean_volume: -0.1349 + heaps: + mean_beta: 0.0296 + mean_k: -0.0817 + mean_r_squared: -0.0030 + identifier_length_variance: + mean_mean: 0.1622 + mean_std_dev: 0.2441 + mean_variance: 0.4883 + indentation: + mean_blank_line_ratio: 0.0798 + mean_mean_depth: -0.0595 + mean_variance: -0.0634 + line_patterns: + mean_blank_line_ratio: 0.0798 + mean_max_nesting_depth: -0.1450 + mean_string_literal_ratio: 0.1539 + mean_unique_line_ratio: -0.0187 + magic_number_density: + mean_density: 0.1539 + mean_string_literal_ratio: 0.1539 + ngram: + mean_bigram_hapax_fraction: 0.0443 + mean_bigram_repeated_unique: -0.2754 + mean_bigram_repetition_rate: -0.0922 + mean_bigram_total: -0.1545 + mean_bigram_unique: -0.1141 + mean_trigram_hapax_fraction: 0.0321 + mean_trigram_repeated_unique: -0.4412 + mean_trigram_repetition_rate: -0.2046 + mean_trigram_total: 
-0.1550 + mean_trigram_unique: -0.1252 + punctuation_density: + mean_arrow_density: 0.1539 + mean_colon_suffix_density: 0.1539 + mean_dot_count: -0.0206 + mean_id_nonalpha_suffix_density: 0.0780 + readability: + mean_avg_line_length: 0.0827 + mean_avg_sub_words_per_id: 0.0859 + mean_avg_tokens_per_line: -0.1060 + mean_flesch_adapted: -0.0728 + mean_fog_adapted: -0.1060 + mean_total_lines: -0.0480 + separator_counts: + mean_dot_count: -0.0206 + mean_underscore_count: 0.6826 + symbol_density: + mean_density: -0.0169 + mean_distinct_symbol_types: -0.0561 + mean_symbol_count: -0.1275 + vocabulary: + mean_mattr: 0.1103 + mean_raw_ttr: 0.1343 + mean_total_identifiers: -0.2135 + mean_unique_identifiers: -0.0792 + vowel_density: + mean_total_chars: -0.0513 + zipf: + mean_exponent: -0.0364 + mean_r_squared: 0.0058 + mean_total_tokens: -0.1540 + mean_vocab_size: -0.0786 + +used_only_once: + _doc: "A variable used only once is a candidate for inlining — it rarely adds clarity over a direct expression." 
+ _log_baseline: -37.0606 + branching: + mean_branch_count: -0.2490 + mean_branching_density: -0.4526 + mean_max_nesting_depth: 0.3756 + mean_non_blank_count: -0.1648 + brevity: + mean_sample_size: -0.1367 + casing_entropy: + mean_camel_case_count: -0.6401 + mean_entropy: 0.1031 + mean_snake_case_count: -0.3410 + comment_structure: + mean_comment_line_ratio: 0.2167 + compression: + mean_raw_bytes: -0.2646 + mean_redundancy: -0.0574 + mean_unique_line_ratio: -0.0557 + mean_zlib_bytes: -0.1898 + mean_zlib_ratio: -0.0716 + entropy: + mean_token_entropy: -0.0310 + mean_token_max_entropy: -0.0279 + mean_total_tokens: -0.1912 + mean_vocab_size: -0.1367 + function_metrics: + mean_avg_function_lines: -0.0842 + mean_max_function_lines: -0.0181 + halstead: + mean_N1_total_operators: -0.1122 + mean_N2_total_operands: -0.3657 + mean_difficulty: -0.0847 + mean_effort: -0.2955 + mean_estimated_bugs: -0.2562 + mean_length: -0.2119 + mean_n1_unique_operators: 0.0156 + mean_n2_unique_operands: -0.2778 + mean_time_to_implement_seconds: -0.2955 + mean_vocabulary: -0.2073 + mean_volume: -0.2563 + heaps: + mean_beta: 0.0237 + mean_k: -0.0587 + mean_r_squared: -0.0061 + identifier_length_variance: + mean_mean: -0.0354 + mean_std_dev: 0.0671 + mean_variance: 0.1339 + indentation: + mean_blank_line_ratio: 0.1012 + mean_max_depth: 0.5085 + mean_mean_depth: 0.1325 + mean_variance: 0.8783 + line_patterns: + mean_blank_line_ratio: 0.1012 + mean_max_nesting_depth: 0.3756 + mean_string_literal_ratio: 0.1006 + mean_unique_line_ratio: -0.0692 + magic_number_density: + mean_density: -0.6247 + mean_magic_number_count: -0.7735 + mean_string_literal_ratio: 0.1006 + near_duplicate_blocks_file: + mean_near_dup_block_d0: 0.4526 + mean_near_dup_block_d1: 2.0000 + mean_near_dup_block_d7: 0.6562 + mean_sub_block_count: -0.3167 + ngram: + mean_bigram_hapax_fraction: -0.0797 + mean_bigram_repeated_unique: 0.0377 + mean_bigram_repetition_rate: 0.1844 + mean_bigram_total: -0.1916 + mean_bigram_unique: -0.2779 
+ mean_trigram_hapax_fraction: -0.0334 + mean_trigram_repeated_unique: 0.1364 + mean_trigram_repetition_rate: 0.2670 + mean_trigram_total: -0.1921 + mean_trigram_unique: -0.2343 + punctuation_density: + mean_arrow_density: 0.2929 + mean_bracket_nonalpha_prefix_count: -0.1228 + mean_bracket_nonalpha_suffix_count: 0.1242 + mean_colon_suffix_density: 0.1986 + mean_id_nonalpha_suffix_density: 0.0506 + mean_question_mark_density: 0.2491 + readability: + mean_avg_line_length: -0.1265 + mean_avg_tokens_per_line: -0.0534 + mean_fog_adapted: 0.0120 + mean_total_lines: -0.1653 + separator_counts: + mean_hyphen_count: 0.3212 + mean_underscore_count: -0.1733 + symbol_density: + mean_density: 0.1222 + mean_distinct_symbol_types: -0.0098 + mean_symbol_count: -0.1384 + vocabulary: + mean_mattr: 0.0895 + mean_raw_ttr: 0.1161 + mean_total_identifiers: -0.2782 + mean_unique_identifiers: -0.2003 + vowel_density: + mean_total_chars: -0.3381 + zipf: + mean_exponent: 0.0324 + mean_r_squared: -0.0076 + mean_total_tokens: -0.1912 + mean_vocab_size: -0.1367 + diff --git a/priv/combined_metrics/testing.yml b/priv/combined_metrics/testing.yml new file mode 100644 index 00000000..1d036f93 --- /dev/null +++ b/priv/combined_metrics/testing.yml @@ -0,0 +1,441 @@ +reasonable_test_to_code_ratio: + _doc: "There should be an adequate number of test cases relative to the code being tested." 
+ _languages: [elixir] + _log_baseline: 11.2157 + branching: + mean_branch_count: 0.1869 + mean_branching_density: 0.0352 + mean_non_blank_count: 0.1517 + brevity: + mean_sample_size: 0.0290 + casing_entropy: + mean_entropy: 0.0656 + mean_pascal_case_count: 0.2097 + mean_snake_case_count: 0.0455 + comment_structure: + mean_comment_line_count: -0.5246 + mean_comment_line_ratio: 0.5016 + compression: + mean_raw_bytes: 0.1290 + mean_redundancy: 0.0346 + mean_unique_line_ratio: -0.0556 + mean_zlib_bytes: 0.0559 + mean_zlib_ratio: 0.0732 + entropy: + mean_char_entropy: -0.0074 + mean_char_normalized: -0.0064 + mean_token_entropy: 0.0033 + mean_token_max_entropy: 0.0063 + mean_token_normalized: -0.0029 + mean_total_tokens: 0.1093 + mean_vocab_size: 0.0290 + function_metrics: + mean_avg_function_lines: -0.0273 + mean_function_count: -0.0257 + mean_max_function_lines: -0.0776 + halstead: + mean_N1_total_operators: 0.1006 + mean_N2_total_operands: 0.1082 + mean_difficulty: 0.0314 + mean_effort: 0.1446 + mean_estimated_bugs: 0.1132 + mean_length: 0.1034 + mean_n1_unique_operators: -0.0102 + mean_n2_unique_operands: 0.0667 + mean_time_to_implement_seconds: 0.1446 + mean_vocabulary: 0.0443 + mean_volume: 0.1132 + heaps: + mean_beta: -0.0095 + mean_k: 0.0107 + mean_r_squared: 0.0171 + identifier_length_variance: + mean_mean: 0.0278 + mean_std_dev: 0.0092 + mean_variance: 0.0185 + indentation: + mean_blank_line_ratio: -0.0175 + mean_max_depth: 0.1093 + mean_mean_depth: 0.1424 + mean_variance: 0.3988 + line_patterns: + mean_blank_line_ratio: -0.0175 + mean_string_literal_ratio: 0.2536 + mean_unique_line_ratio: -0.0582 + magic_number_density: + mean_density: 0.3159 + mean_magic_number_count: 0.4248 + mean_string_literal_ratio: 0.2536 + near_duplicate_blocks_file: + mean_block_count: 0.3136 + mean_near_dup_block_d0: 0.6699 + mean_near_dup_block_d3: 0.1869 + mean_near_dup_block_d4: 0.5246 + mean_near_dup_block_d5: 0.1869 + mean_near_dup_block_d7: 0.7475 + mean_near_dup_block_d8: 
0.1869 + mean_sub_block_count: 0.0723 + ngram: + mean_bigram_hapax_fraction: -0.0173 + mean_bigram_repeated_unique: 0.0681 + mean_bigram_repetition_rate: 0.0346 + mean_bigram_total: 0.1095 + mean_bigram_unique: 0.0370 + mean_trigram_hapax_fraction: -0.0158 + mean_trigram_repeated_unique: 0.1118 + mean_trigram_repetition_rate: 0.0487 + mean_trigram_total: 0.1097 + mean_trigram_unique: 0.0623 + punctuation_density: + mean_bracket_number_pair_count: 0.1869 + mean_colon_suffix_density: -0.1100 + mean_dot_count: 0.2776 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: -0.0296 + readability: + mean_avg_line_length: -0.0303 + mean_avg_tokens_per_line: -0.0880 + mean_flesch_adapted: 0.0106 + mean_fog_adapted: -0.0829 + mean_total_lines: 0.1973 + symbol_density: + mean_density: -0.0353 + mean_distinct_symbol_types: -0.0284 + mean_symbol_count: 0.0960 + vocabulary: + mean_mattr: -0.0271 + mean_raw_ttr: -0.0488 + mean_total_identifiers: 0.0704 + mean_unique_identifiers: 0.0216 + vowel_density: + mean_total_chars: 0.0982 + zipf: + mean_exponent: 0.0408 + mean_r_squared: -0.0086 + mean_total_tokens: 0.1093 + mean_vocab_size: 0.0290 + +test_has_assertion: + _doc: "Every test body must contain at least one assertion — a test without assertions proves nothing." 
+ _languages: [elixir] + _log_baseline: -10.8081 + branching: + mean_branch_count: 0.0918 + mean_branching_density: 0.1642 + mean_non_blank_count: -0.0727 + brevity: + mean_sample_size: -0.0555 + casing_entropy: + mean_entropy: -0.0026 + mean_other_count: -0.1294 + mean_pascal_case_count: -0.0450 + mean_snake_case_count: -0.0568 + comment_structure: + mean_comment_line_count: -0.6211 + mean_comment_line_ratio: 0.6522 + compression: + mean_raw_bytes: -0.0140 + mean_redundancy: 0.0137 + mean_unique_line_ratio: -0.0037 + mean_zlib_bytes: -0.0369 + mean_zlib_ratio: 0.0229 + entropy: + mean_char_entropy: 0.0022 + mean_char_max_entropy: -0.0025 + mean_char_normalized: 0.0048 + mean_token_entropy: -0.0124 + mean_token_max_entropy: -0.0118 + mean_vocab_size: -0.0555 + halstead: + mean_N1_total_operators: 0.0184 + mean_N2_total_operands: -0.0481 + mean_difficulty: 0.0979 + mean_effort: 0.0682 + mean_estimated_bugs: -0.0297 + mean_length: -0.0087 + mean_n1_unique_operators: 0.0130 + mean_n2_unique_operands: -0.1329 + mean_time_to_implement_seconds: 0.0682 + mean_vocabulary: -0.0965 + mean_volume: -0.0297 + heaps: + mean_beta: -0.0415 + mean_k: 0.0917 + mean_r_squared: -0.0091 + identifier_length_variance: + mean_mean: 0.0215 + mean_std_dev: 0.0214 + mean_variance: 0.0429 + indentation: + mean_blank_line_ratio: 0.0261 + mean_max_depth: -0.1294 + mean_mean_depth: -0.0289 + mean_variance: -0.0253 + line_patterns: + mean_blank_line_ratio: 0.0261 + mean_string_literal_ratio: -0.0139 + mean_unique_line_ratio: -0.0056 + magic_number_density: + mean_string_literal_ratio: -0.0139 + near_duplicate_blocks_file: + mean_block_count: -0.0376 + mean_near_dup_block_d0: -0.3507 + mean_near_dup_block_d5: 0.2212 + mean_near_dup_block_d6: -0.4425 + mean_near_dup_block_d7: 0.2212 + mean_near_dup_block_d8: -0.5719 + mean_sub_block_count: 0.0228 + ngram: + mean_bigram_hapax_fraction: -0.0509 + mean_bigram_repeated_unique: 0.0678 + mean_bigram_repetition_rate: 0.0505 + mean_bigram_unique: -0.0434 + 
mean_trigram_hapax_fraction: -0.0376 + mean_trigram_repeated_unique: 0.1149 + mean_trigram_repetition_rate: 0.0832 + mean_trigram_unique: -0.0336 + punctuation_density: + mean_arrow_density: 2.0000 + mean_bracket_nonalpha_prefix_count: -0.0712 + mean_bracket_nonalpha_suffix_count: -0.0492 + mean_colon_suffix_density: -0.0460 + mean_dot_count: 0.0108 + mean_id_nonalpha_suffix_density: 0.0127 + readability: + mean_avg_line_length: 0.0691 + mean_avg_sub_words_per_id: 0.0042 + mean_avg_tokens_per_line: 0.0284 + mean_flesch_adapted: -0.0065 + mean_fog_adapted: 0.0284 + mean_total_lines: -0.0284 + symbol_density: + mean_density: 0.0263 + mean_distinct_symbol_types: -0.0194 + mean_symbol_count: 0.0126 + vocabulary: + mean_mattr: -0.0607 + mean_raw_ttr: -0.0243 + mean_total_identifiers: -0.0553 + mean_unique_identifiers: -0.0796 + vowel_density: + mean_total_chars: -0.0338 + zipf: + mean_exponent: 0.0248 + mean_r_squared: -0.0049 + mean_vocab_size: -0.0555 + +test_name_describes_behavior: + _doc: "Test names should describe the expected behaviour, not just the method under test." 
+ _languages: [elixir] + _log_baseline: 57.2080 + branching: + mean_branch_count: 2.0000 + mean_branching_density: -1.5965 + mean_non_blank_count: 0.2388 + brevity: + mean_sample_size: 0.1814 + casing_entropy: + mean_entropy: -0.1610 + mean_pascal_case_count: 0.0729 + mean_snake_case_count: 0.4125 + compression: + mean_raw_bytes: 0.3524 + mean_redundancy: 0.0412 + mean_unique_line_ratio: -0.0357 + mean_zlib_bytes: 0.2483 + mean_zlib_ratio: 0.1041 + entropy: + mean_char_entropy: -0.0171 + mean_char_max_entropy: 0.0138 + mean_char_normalized: -0.0308 + mean_token_entropy: 0.0449 + mean_token_max_entropy: 0.0420 + mean_total_tokens: 0.2704 + mean_vocab_size: 0.1814 + halstead: + mean_N1_total_operators: 0.2985 + mean_N2_total_operands: 0.1414 + mean_difficulty: 0.2542 + mean_effort: 0.5528 + mean_estimated_bugs: 0.2986 + mean_length: 0.2432 + mean_n1_unique_operators: 0.3063 + mean_n2_unique_operands: 0.1935 + mean_time_to_implement_seconds: 0.5528 + mean_vocabulary: 0.2268 + mean_volume: 0.2986 + heaps: + mean_beta: 0.0106 + mean_k: -0.0084 + identifier_length_variance: + mean_mean: 0.0422 + mean_std_dev: -0.0249 + mean_variance: -0.0498 + indentation: + mean_blank_line_ratio: -0.1184 + mean_max_depth: 0.3691 + mean_mean_depth: 0.3712 + mean_variance: 0.8827 + line_patterns: + mean_blank_line_ratio: -0.1184 + mean_string_literal_ratio: -0.1419 + mean_unique_line_ratio: -0.0535 + magic_number_density: + mean_string_literal_ratio: -0.1419 + near_duplicate_blocks_file: + mean_block_count: 0.0868 + mean_near_dup_block_d0: 0.6309 + mean_near_dup_block_d7: -0.2619 + mean_sub_block_count: 0.0868 + ngram: + mean_bigram_hapax_fraction: 0.0841 + mean_bigram_repeated_unique: 0.1923 + mean_bigram_repetition_rate: -0.0199 + mean_bigram_total: 0.2709 + mean_bigram_unique: 0.2850 + mean_trigram_hapax_fraction: 0.0812 + mean_trigram_repeated_unique: 0.1582 + mean_trigram_repetition_rate: -0.0339 + mean_trigram_total: 0.2715 + mean_trigram_unique: 0.2823 + punctuation_density: + 
mean_colon_suffix_density: -0.2725 + mean_dot_count: 0.0701 + mean_id_nonalpha_suffix_density: -0.0527 + readability: + mean_avg_line_length: 0.1184 + mean_avg_tokens_per_line: 0.0316 + mean_total_lines: 0.2388 + symbol_density: + mean_density: -0.1391 + mean_distinct_symbol_types: 0.0729 + mean_symbol_count: 0.2136 + vocabulary: + mean_mattr: -0.0698 + mean_raw_ttr: -0.1544 + mean_total_identifiers: 0.3273 + mean_unique_identifiers: 0.1730 + vowel_density: + mean_total_chars: 0.3695 + zipf: + mean_r_squared: 0.0095 + mean_total_tokens: 0.2704 + mean_vocab_size: 0.1814 + +test_single_concept: + _doc: "Each test should verify a single concept — tests covering multiple things are harder to diagnose when they fail." + _languages: [elixir] + _log_baseline: 37.2588 + branching: + mean_branch_count: 0.3696 + mean_branching_density: -2.0000 + mean_max_nesting_depth: 0.1534 + mean_non_blank_count: 0.2620 + brevity: + mean_sample_size: 0.0495 + casing_entropy: + mean_entropy: -0.0830 + mean_other_count: 0.3696 + mean_pascal_case_count: -0.0146 + mean_snake_case_count: 0.1912 + comment_structure: + mean_comment_line_count: -1.0376 + mean_comment_line_ratio: 1.0694 + compression: + mean_raw_bytes: 0.1970 + mean_redundancy: 0.0534 + mean_unique_line_ratio: -0.1814 + mean_zlib_bytes: 0.0851 + mean_zlib_ratio: 0.1119 + entropy: + mean_char_entropy: -0.0169 + mean_char_normalized: -0.0187 + mean_token_entropy: 0.0065 + mean_token_max_entropy: 0.0104 + mean_token_normalized: -0.0039 + mean_total_tokens: 0.1633 + mean_vocab_size: 0.0495 + function_metrics: + mean_avg_function_lines: 0.8129 + mean_avg_param_count: -0.0628 + mean_function_count: 0.0628 + mean_max_function_lines: 1.3538 + halstead: + mean_N1_total_operators: 0.1507 + mean_N2_total_operands: 0.0958 + mean_difficulty: 0.1387 + mean_effort: 0.2736 + mean_estimated_bugs: 0.1348 + mean_length: 0.1284 + mean_n1_unique_operators: 0.0628 + mean_n2_unique_operands: 0.0199 + mean_time_to_implement_seconds: 0.2736 + 
mean_vocabulary: 0.0302 + mean_volume: 0.1349 + heaps: + mean_beta: -0.0648 + mean_k: 0.1502 + mean_r_squared: 0.0046 + identifier_length_variance: + mean_mean: 0.0255 + mean_std_dev: 0.0733 + mean_variance: 0.1466 + indentation: + mean_blank_line_ratio: -0.0702 + mean_max_depth: 0.2162 + mean_mean_depth: 0.1207 + mean_variance: 0.3787 + line_patterns: + mean_blank_line_ratio: -0.0702 + mean_max_nesting_depth: 0.1534 + mean_string_literal_ratio: 0.3581 + mean_unique_line_ratio: -0.1894 + magic_number_density: + mean_density: -0.0115 + mean_magic_number_count: 0.1534 + mean_string_literal_ratio: 0.3581 + near_duplicate_blocks_file: + mean_block_count: 0.5858 + mean_near_dup_block_d0: 0.3696 + mean_near_dup_block_d4: 0.3696 + mean_sub_block_count: 0.1857 + ngram: + mean_bigram_hapax_fraction: -0.0471 + mean_bigram_repeated_unique: 0.1345 + mean_bigram_repetition_rate: 0.0547 + mean_bigram_total: 0.1635 + mean_bigram_unique: 0.0656 + mean_trigram_hapax_fraction: -0.0703 + mean_trigram_repeated_unique: 0.2632 + mean_trigram_repetition_rate: 0.1415 + mean_trigram_total: 0.1638 + mean_trigram_unique: 0.0672 + punctuation_density: + mean_arrow_density: -0.1592 + mean_bracket_nonalpha_prefix_count: 0.0712 + mean_colon_suffix_density: -0.0922 + mean_dot_count: 0.0317 + mean_id_nonalpha_suffix_density: -0.0343 + readability: + mean_avg_line_length: -0.1064 + mean_avg_sub_words_per_id: 0.0125 + mean_avg_tokens_per_line: -0.1758 + mean_flesch_adapted: 0.0098 + mean_fog_adapted: -0.1758 + mean_total_lines: 0.3391 + symbol_density: + mean_density: -0.0634 + mean_symbol_count: 0.1338 + vocabulary: + mean_mattr: -0.0701 + mean_raw_ttr: -0.1129 + mean_total_identifiers: 0.1594 + mean_unique_identifiers: 0.0464 + vowel_density: + mean_total_chars: 0.1849 + zipf: + mean_exponent: 0.0281 + mean_r_squared: -0.0039 + mean_total_tokens: 0.1633 + mean_vocab_size: 0.0495 + diff --git a/priv/combined_metrics/type_and_value.yml b/priv/combined_metrics/type_and_value.yml new file mode 100644 
index 00000000..b9737213 --- /dev/null +++ b/priv/combined_metrics/type_and_value.yml @@ -0,0 +1,563 @@ +boolean_assigned_from_comparison: + _doc: "Boolean variables should be assigned directly from comparisons or predicate calls, not set via conditionals." + _log_baseline: 2.8516 + branching: + mean_branch_count: -0.8402 + mean_branching_density: 0.3349 + mean_max_nesting_depth: 0.1944 + mean_non_blank_count: -0.2723 + brevity: + mean_sample_size: 0.0454 + casing_entropy: + mean_entropy: -0.1192 + mean_other_count: -0.1944 + mean_pascal_case_count: 0.1137 + mean_snake_case_count: 0.1080 + compression: + mean_raw_bytes: -0.0537 + mean_redundancy: -0.0549 + mean_unique_line_ratio: 0.0903 + mean_zlib_bytes: 0.0350 + mean_zlib_ratio: -0.0887 + entropy: + mean_char_entropy: 0.0469 + mean_char_max_entropy: 0.0084 + mean_char_normalized: 0.0385 + mean_token_entropy: 0.0094 + mean_token_max_entropy: 0.0103 + mean_total_tokens: 0.0172 + mean_vocab_size: 0.0454 + function_metrics: + mean_avg_function_lines: -0.3955 + mean_function_count: 0.1137 + mean_max_function_lines: -0.4184 + halstead: + mean_N1_total_operators: 0.0223 + mean_N2_total_operands: 0.0441 + mean_difficulty: -0.1054 + mean_effort: -0.0588 + mean_estimated_bugs: 0.0466 + mean_length: 0.0317 + mean_n1_unique_operators: -0.0359 + mean_n2_unique_operands: 0.1137 + mean_time_to_implement_seconds: -0.0588 + mean_vocabulary: 0.0635 + mean_volume: 0.0466 + heaps: + mean_beta: 0.0212 + mean_k: -0.0531 + mean_r_squared: 0.0046 + identifier_length_variance: + mean_max: 0.0208 + mean_mean: 0.0168 + mean_std_dev: -0.0019 + mean_variance: -0.0038 + indentation: + mean_blank_line_ratio: 0.1081 + mean_max_depth: -0.2570 + mean_mean_depth: -0.2106 + mean_variance: -0.3879 + line_patterns: + mean_blank_line_ratio: 0.1081 + mean_max_nesting_depth: 0.1944 + mean_string_literal_ratio: -0.0182 + mean_unique_line_ratio: 0.1190 + magic_number_density: + mean_density: 0.1772 + mean_magic_number_count: 0.1944 + 
mean_string_literal_ratio: -0.0182 + near_duplicate_blocks_file: + mean_block_count: -0.2570 + mean_near_dup_block_d0: -0.3081 + mean_sub_block_count: 0.2455 + ngram: + mean_bigram_hapax_fraction: 0.0123 + mean_bigram_repeated_unique: -0.0200 + mean_bigram_repetition_rate: -0.0701 + mean_bigram_total: 0.0173 + mean_bigram_unique: 0.0466 + mean_trigram_hapax_fraction: 0.0153 + mean_trigram_repeated_unique: -0.1345 + mean_trigram_repetition_rate: -0.1804 + mean_trigram_total: 0.0174 + mean_trigram_unique: 0.0434 + punctuation_density: + mean_arrow_density: 0.9500 + mean_bracket_nonalpha_prefix_count: 0.1137 + mean_colon_suffix_density: 0.0338 + mean_dot_count: 0.1570 + mean_exclamation_density: -2.0000 + mean_id_nonalpha_suffix_density: 0.1365 + readability: + mean_avg_line_length: 0.2263 + mean_avg_sub_words_per_id: 0.0384 + mean_avg_tokens_per_line: 0.2895 + mean_flesch_adapted: -0.0583 + mean_fog_adapted: 0.2895 + mean_total_lines: -0.2723 + separator_counts: + mean_dot_count: 0.1570 + mean_hyphen_count: -0.5832 + mean_underscore_count: 0.1351 + symbol_density: + mean_density: 0.1390 + mean_distinct_symbol_types: 0.0351 + mean_symbol_count: 0.0855 + vocabulary: + mean_raw_ttr: -0.0051 + mean_total_identifiers: 0.1004 + mean_unique_identifiers: 0.0953 + vowel_density: + mean_total_chars: 0.1172 + zipf: + mean_exponent: -0.0211 + mean_r_squared: 0.0120 + mean_total_tokens: 0.0172 + mean_vocab_size: 0.0454 + +hardcoded_url_or_path: + _doc: "URLs, file paths, and host names should be configuration values, not inline string literals." 
+ _log_baseline: 57.6828 + branching: + mean_max_nesting_depth: 0.4526 + brevity: + mean_sample_size: 0.1491 + casing_entropy: + mean_entropy: -0.0622 + mean_other_count: -0.9458 + mean_pascal_case_count: 0.4526 + mean_snake_case_count: 0.0807 + compression: + mean_raw_bytes: 0.3137 + mean_redundancy: 0.0589 + mean_unique_line_ratio: 0.0620 + mean_zlib_bytes: 0.2242 + mean_zlib_ratio: 0.0896 + entropy: + mean_char_entropy: 0.0090 + mean_char_normalized: 0.0133 + mean_token_entropy: 0.0319 + mean_token_max_entropy: 0.0321 + mean_total_tokens: 0.2263 + mean_vocab_size: 0.1491 + function_metrics: + mean_avg_function_lines: -0.4231 + mean_avg_param_count: 0.4526 + mean_function_count: 0.3756 + mean_max_param_count: 0.4526 + halstead: + mean_N1_total_operators: 0.2692 + mean_N2_total_operands: 0.3555 + mean_difficulty: 0.0776 + mean_effort: 0.4552 + mean_estimated_bugs: 0.3775 + mean_length: 0.2955 + mean_n1_unique_operators: 0.1560 + mean_n2_unique_operands: 0.4338 + mean_time_to_implement_seconds: 0.4552 + mean_vocabulary: 0.3487 + mean_volume: 0.3775 + heaps: + mean_beta: 0.0218 + mean_k: -0.0742 + identifier_length_variance: + mean_mean: 0.3229 + mean_std_dev: 0.2786 + mean_variance: 0.5571 + indentation: + mean_blank_line_ratio: 0.1827 + mean_mean_depth: -0.1314 + mean_variance: -0.1681 + line_patterns: + mean_blank_line_ratio: 0.1827 + mean_max_nesting_depth: 0.4526 + mean_string_literal_ratio: -0.0095 + mean_unique_line_ratio: 0.0639 + magic_number_density: + mean_density: -0.1946 + mean_string_literal_ratio: -0.0095 + near_duplicate_blocks_file: + mean_near_dup_block_d0: -0.7737 + mean_near_dup_block_d7: -2.0000 + mean_sub_block_count: 0.4147 + ngram: + mean_bigram_hapax_fraction: -0.0739 + mean_bigram_repeated_unique: 0.4206 + mean_bigram_repetition_rate: 0.0333 + mean_bigram_total: 0.2268 + mean_bigram_unique: 0.2470 + mean_trigram_repeated_unique: 0.3267 + mean_trigram_repetition_rate: -0.1301 + mean_trigram_total: 0.2273 + mean_trigram_unique: 0.3319 + 
punctuation_density: + mean_bracket_nonalpha_prefix_count: 0.5246 + mean_bracket_nonalpha_suffix_count: 0.9458 + mean_colon_suffix_density: -0.2240 + mean_dot_count: 0.0519 + mean_exclamation_density: 0.1397 + mean_id_nonalpha_suffix_density: -0.0717 + mean_question_mark_density: -0.3462 + readability: + mean_avg_line_length: 0.3239 + mean_avg_sub_words_per_id: 0.2015 + mean_avg_tokens_per_line: 0.2263 + mean_flesch_adapted: -0.2352 + mean_fog_adapted: 0.4805 + separator_counts: + mean_dot_count: 0.0519 + mean_slash_count: -0.5872 + mean_underscore_count: 1.5120 + symbol_density: + mean_density: -0.0101 + mean_symbol_count: 0.3038 + vocabulary: + mean_mattr: 0.1813 + mean_raw_ttr: 0.0726 + mean_total_identifiers: 0.0850 + mean_unique_identifiers: 0.1575 + vowel_density: + mean_total_chars: 0.4079 + zipf: + mean_r_squared: 0.0163 + mean_total_tokens: 0.2263 + mean_vocab_size: 0.1491 + +no_empty_string_initial: + _doc: "Initialising a variable to an empty string and reassigning later signals missing structure." 
+ _log_baseline: -5.2140 + branching: + mean_branch_count: -0.1509 + mean_branching_density: -0.0146 + mean_max_nesting_depth: 0.0360 + mean_non_blank_count: -0.0973 + brevity: + mean_sample_size: 0.0215 + casing_entropy: + mean_entropy: 0.0015 + mean_other_count: -0.1377 + mean_pascal_case_count: 0.0650 + mean_screaming_snake_density: -1.4394 + mean_snake_case_count: -0.0016 + compression: + mean_raw_bytes: -0.0402 + mean_redundancy: -0.0283 + mean_unique_line_ratio: 0.0029 + mean_zlib_bytes: -0.0041 + mean_zlib_ratio: -0.0348 + entropy: + mean_char_entropy: 0.0151 + mean_char_max_entropy: 0.0021 + mean_char_normalized: 0.0130 + mean_token_entropy: 0.0194 + mean_token_max_entropy: 0.0055 + mean_token_normalized: 0.0140 + mean_total_tokens: -0.0095 + mean_vocab_size: 0.0215 + function_metrics: + mean_avg_function_lines: 0.0337 + mean_function_count: 0.1377 + mean_max_function_lines: 0.0057 + halstead: + mean_N1_total_operators: 0.0288 + mean_N2_total_operands: -0.0392 + mean_difficulty: -0.0209 + mean_effort: -0.0057 + mean_estimated_bugs: 0.0052 + mean_length: 0.0035 + mean_n1_unique_operators: 0.0154 + mean_n2_unique_operands: -0.0068 + mean_time_to_implement_seconds: -0.0057 + mean_vocabulary: 0.0014 + mean_volume: 0.0052 + heaps: + mean_beta: 0.0191 + mean_k: -0.0225 + identifier_length_variance: + mean_max: 0.0398 + mean_mean: -0.0269 + mean_std_dev: -0.0088 + mean_variance: -0.0270 + indentation: + mean_blank_line_ratio: -0.0374 + mean_max_depth: -0.0257 + mean_mean_depth: -0.0491 + mean_variance: -0.0047 + line_patterns: + mean_blank_line_ratio: -0.0374 + mean_max_nesting_depth: 0.0360 + mean_string_literal_ratio: -0.1459 + mean_unique_line_ratio: -0.0016 + magic_number_density: + mean_density: 0.1538 + mean_magic_number_count: 0.1869 + mean_string_literal_ratio: -0.1459 + near_duplicate_blocks_file: + mean_block_count: -0.0962 + mean_near_dup_block_d0: -0.5606 + mean_near_dup_block_d4: 0.2962 + mean_near_dup_block_d8: -0.2962 + mean_sub_block_count: 0.1045 
+ ngram: + mean_bigram_repeated_unique: 0.0065 + mean_bigram_repetition_rate: -0.0091 + mean_bigram_total: -0.0095 + mean_trigram_hapax_fraction: -0.0027 + mean_trigram_repeated_unique: 0.0056 + mean_trigram_repetition_rate: 0.0124 + mean_trigram_total: -0.0095 + mean_trigram_unique: -0.0170 + punctuation_density: + mean_arrow_density: -0.0278 + mean_bracket_nonalpha_prefix_count: -0.0345 + mean_bracket_nonalpha_suffix_count: 0.0463 + mean_colon_suffix_density: 0.1503 + mean_dot_count: 0.0512 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0806 + readability: + mean_avg_line_length: 0.0536 + mean_avg_sub_words_per_id: -0.0116 + mean_avg_tokens_per_line: 0.0704 + mean_flesch_adapted: 0.0051 + mean_fog_adapted: 0.0674 + mean_total_lines: -0.0973 + separator_counts: + mean_dot_count: 0.0512 + mean_hyphen_count: 0.1377 + mean_slash_count: 0.3738 + mean_underscore_count: -0.1093 + symbol_density: + mean_density: 0.0072 + mean_distinct_symbol_types: 0.0051 + mean_symbol_count: -0.0197 + vocabulary: + mean_mattr: 0.0067 + mean_raw_ttr: 0.0068 + mean_total_identifiers: 0.0059 + mean_unique_identifiers: 0.0122 + vowel_density: + mean_total_chars: -0.0176 + zipf: + mean_exponent: -0.0117 + mean_r_squared: -0.0027 + mean_total_tokens: -0.0095 + mean_vocab_size: 0.0215 + +no_implicit_null_initial: + _doc: "Initialising a variable to `nil`/`null` and assigning it later in a branch signals missing structure." 
+ _log_baseline: -3.2593 + branching: + mean_branch_count: 0.0293 + mean_branching_density: 0.0871 + mean_non_blank_count: -0.0578 + brevity: + mean_sample_size: 0.0132 + casing_entropy: + mean_entropy: 0.0660 + mean_other_count: 0.1247 + mean_screaming_snake_density: 0.0448 + mean_snake_case_count: -0.0534 + compression: + mean_raw_bytes: -0.0246 + mean_redundancy: -0.0100 + mean_unique_line_ratio: -0.0159 + mean_zlib_bytes: -0.0071 + mean_zlib_ratio: -0.0174 + entropy: + mean_char_entropy: 0.0071 + mean_char_max_entropy: 0.0063 + mean_token_entropy: -0.0025 + mean_token_max_entropy: 0.0029 + mean_token_normalized: -0.0055 + mean_vocab_size: 0.0132 + function_metrics: + mean_avg_function_lines: -0.1325 + mean_avg_param_count: -0.0029 + mean_function_count: 0.0440 + mean_max_function_lines: -0.1618 + halstead: + mean_N1_total_operators: 0.0268 + mean_N2_total_operands: -0.0416 + mean_difficulty: 0.0085 + mean_effort: 0.0109 + mean_estimated_bugs: 0.0024 + mean_n1_unique_operators: 0.0393 + mean_n2_unique_operands: -0.0108 + mean_time_to_implement_seconds: 0.0109 + mean_vocabulary: 0.0038 + mean_volume: 0.0024 + heaps: + mean_beta: -0.0079 + mean_k: 0.0418 + mean_r_squared: -0.0034 + identifier_length_variance: + mean_mean: 0.0185 + mean_std_dev: 0.0177 + mean_variance: 0.0354 + indentation: + mean_blank_line_ratio: -0.1146 + mean_mean_depth: 0.0089 + mean_variance: 0.1759 + line_patterns: + mean_blank_line_ratio: -0.1146 + mean_string_literal_ratio: -0.0022 + mean_unique_line_ratio: -0.0135 + magic_number_density: + mean_density: -0.0066 + mean_string_literal_ratio: -0.0022 + near_duplicate_blocks_file: + mean_block_count: -0.1493 + mean_sub_block_count: 0.0422 + ngram: + mean_bigram_hapax_fraction: -0.0181 + mean_bigram_repeated_unique: 0.0178 + mean_bigram_repetition_rate: 0.0335 + mean_bigram_unique: -0.0287 + mean_trigram_hapax_fraction: -0.0209 + mean_trigram_repeated_unique: 0.0959 + mean_trigram_repetition_rate: 0.0860 + mean_trigram_unique: -0.0245 + 
punctuation_density: + mean_arrow_density: -1.4388 + mean_bracket_nonalpha_prefix_count: 0.0807 + mean_bracket_nonalpha_suffix_count: 0.1035 + mean_colon_suffix_density: 0.1166 + mean_exclamation_density: 2.0000 + mean_id_nonalpha_suffix_density: 0.0685 + readability: + mean_avg_line_length: 0.0358 + mean_avg_sub_words_per_id: 0.0035 + mean_avg_tokens_per_line: 0.0586 + mean_flesch_adapted: -0.0082 + mean_fog_adapted: 0.0549 + mean_total_lines: -0.0578 + separator_counts: + mean_hyphen_count: 0.4264 + mean_slash_count: 0.0885 + mean_underscore_count: -0.0054 + symbol_density: + mean_density: 0.0549 + mean_distinct_symbol_types: 0.0342 + mean_symbol_count: 0.0303 + vocabulary: + mean_mattr: 0.0540 + mean_raw_ttr: 0.0466 + mean_total_identifiers: -0.0466 + vowel_density: + mean_total_chars: -0.0281 + zipf: + mean_exponent: -0.0074 + mean_r_squared: 0.0047 + mean_vocab_size: 0.0132 + +no_magic_value_assigned: + _doc: "Literal strings and numbers assigned to variables should be named constants, not inline values." 
+ _log_baseline: -9.5635 + branching: + mean_branch_count: -0.2035 + mean_branching_density: -0.1140 + mean_non_blank_count: -0.0893 + brevity: + mean_sample_size: -0.0122 + casing_entropy: + mean_entropy: -0.0411 + mean_other_count: -0.3211 + mean_snake_case_count: -0.0502 + compression: + mean_raw_bytes: -0.1800 + mean_redundancy: -0.0342 + mean_unique_line_ratio: 0.0140 + mean_zlib_bytes: -0.1207 + mean_zlib_ratio: -0.0593 + entropy: + mean_char_max_entropy: -0.0144 + mean_char_normalized: 0.0154 + mean_token_entropy: 0.0837 + mean_token_normalized: 0.0863 + mean_total_tokens: -0.1270 + mean_vocab_size: -0.0122 + function_metrics: + mean_avg_function_lines: -1.1104 + mean_avg_param_count: -0.3099 + mean_function_count: 0.8801 + mean_max_function_lines: -0.5872 + halstead: + mean_N1_total_operators: 0.2796 + mean_N2_total_operands: -0.0340 + mean_difficulty: -0.1408 + mean_effort: 0.0099 + mean_estimated_bugs: 0.1506 + mean_length: 0.1539 + mean_n1_unique_operators: -0.0859 + mean_n2_unique_operands: 0.0209 + mean_time_to_implement_seconds: 0.0099 + mean_vocabulary: -0.0140 + mean_volume: 0.1507 + heaps: + mean_beta: 0.0889 + mean_k: -0.2042 + mean_r_squared: 0.0230 + identifier_length_variance: + mean_max: -0.0438 + mean_mean: -0.1501 + mean_std_dev: -0.2041 + mean_variance: -0.4083 + indentation: + mean_blank_line_ratio: -0.1918 + mean_mean_depth: -0.2808 + mean_variance: -0.1572 + line_patterns: + mean_blank_line_ratio: -0.1918 + mean_string_literal_ratio: -1.7484 + mean_unique_line_ratio: 0.0086 + magic_number_density: + mean_string_literal_ratio: -1.7484 + near_duplicate_blocks_file: + mean_block_count: 0.1491 + mean_near_dup_block_d0: 0.7737 + mean_sub_block_count: 0.2805 + ngram: + mean_bigram_hapax_fraction: 0.0998 + mean_bigram_repeated_unique: -0.1265 + mean_bigram_repetition_rate: -0.2414 + mean_bigram_total: -0.1274 + mean_bigram_unique: 0.1208 + mean_trigram_hapax_fraction: 0.1034 + mean_trigram_repeated_unique: -0.9134 + 
mean_trigram_repetition_rate: -0.8787 + mean_trigram_total: -0.1278 + mean_trigram_unique: 0.1009 + punctuation_density: + mean_arrow_density: -0.3964 + mean_bracket_nonalpha_prefix_count: -0.7737 + mean_bracket_nonalpha_suffix_count: 2.0000 + mean_colon_suffix_density: -0.2477 + mean_id_nonalpha_suffix_density: -0.0427 + mean_question_mark_density: 0.9772 + readability: + mean_avg_line_length: -0.0927 + mean_avg_sub_words_per_id: -0.1514 + mean_avg_tokens_per_line: -0.0377 + mean_flesch_adapted: 0.2297 + mean_fog_adapted: -0.5835 + mean_total_lines: -0.0893 + separator_counts: + mean_hyphen_count: -0.5246 + mean_underscore_count: -0.4596 + symbol_density: + mean_density: -0.0945 + mean_symbol_count: -0.2742 + vocabulary: + mean_mattr: 0.0346 + mean_raw_ttr: 0.0360 + mean_total_identifiers: -0.0550 + mean_unique_identifiers: -0.0191 + vowel_density: + mean_total_chars: -0.2051 + zipf: + mean_exponent: -0.0525 + mean_r_squared: 0.0208 + mean_total_tokens: -0.1270 + mean_vocab_size: -0.0122 + diff --git a/priv/combined_metrics/variable_naming.yml b/priv/combined_metrics/variable_naming.yml new file mode 100644 index 00000000..f07e7891 --- /dev/null +++ b/priv/combined_metrics/variable_naming.yml @@ -0,0 +1,1289 @@ +boolean_has_is_has_prefix: + _doc: "Boolean variables should be prefixed with `is_`, `has_`, or `can_`." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: 22.4319 + brevity: + mean_sample_size: 0.0752 + casing_entropy: + mean_camel_case_count: 2.0000 + mean_entropy: 0.4870 + mean_snake_case_count: -0.2309 + compression: + mean_raw_bytes: 0.1698 + mean_redundancy: 0.0581 + mean_zlib_bytes: 0.0387 + mean_zlib_ratio: 0.1318 + entropy: + mean_char_entropy: 0.0136 + mean_token_entropy: 0.0198 + mean_token_max_entropy: 0.0173 + mean_vocab_size: 0.0752 + halstead: + mean_difficulty: -0.0725 + mean_effort: -0.0594 + mean_estimated_bugs: 0.0109 + mean_n2_unique_operands: 0.0694 + mean_time_to_implement_seconds: -0.0594 + mean_vocabulary: 0.0494 + mean_volume: 0.0110 + heaps: + mean_k: 0.1146 + identifier_length_variance: + mean_max: 0.3229 + mean_mean: 0.3109 + mean_std_dev: 0.3325 + mean_variance: 0.6646 + ngram: + mean_bigram_hapax_fraction: 0.0164 + mean_bigram_repeated_unique: -0.0209 + mean_bigram_repetition_rate: -0.0199 + mean_bigram_unique: 0.0182 + mean_trigram_repeated_unique: -0.0226 + mean_trigram_repetition_rate: -0.0134 + punctuation_density: + mean_exclamation_density: -0.1826 + mean_question_mark_density: -0.1720 + readability: + mean_avg_line_length: 0.1754 + mean_avg_sub_words_per_id: 0.3932 + mean_flesch_adapted: -0.4857 + mean_fog_adapted: 0.5482 + separator_counts: + mean_underscore_count: 1.8116 + symbol_density: + mean_density: -0.1660 + vocabulary: + mean_mattr: 0.1186 + mean_raw_ttr: 0.1173 + mean_unique_identifiers: 0.1175 + vowel_density: + mean_total_chars: 0.3117 + zipf: + mean_exponent: -0.0403 + mean_r_squared: 0.0110 + mean_vocab_size: 0.0752 + +collection_name_is_plural: + _doc: "Variables holding a collection should use a plural name." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: 24.0478 + brevity: + mean_sample_size: -0.5320 + casing_entropy: + mean_camel_case_count: 0.4724 + mean_entropy: 0.1726 + mean_snake_case_count: -0.2009 + compression: + mean_raw_bytes: 0.8299 + mean_redundancy: 0.3207 + mean_zlib_bytes: 0.1239 + mean_zlib_ratio: 0.7180 + entropy: + mean_token_entropy: -0.1027 + mean_token_max_entropy: -0.1240 + mean_vocab_size: -0.5320 + halstead: + mean_N2_total_operands: -0.0506 + mean_difficulty: 0.7075 + mean_effort: 0.5207 + mean_estimated_bugs: -0.1558 + mean_n2_unique_operands: -0.7698 + mean_time_to_implement_seconds: 0.5207 + mean_vocabulary: -0.5251 + mean_volume: -0.1559 + heaps: + mean_k: -0.7238 + identifier_length_variance: + mean_mean: 1.6364 + mean_std_dev: -0.9858 + mean_variance: -2.0000 + ngram: + mean_bigram_hapax_fraction: -0.1392 + mean_bigram_repetition_rate: 0.1490 + mean_bigram_unique: -0.1850 + mean_trigram_repeated_unique: -0.1677 + punctuation_density: + mean_arrow_density: 0.0702 + mean_colon_suffix_density: -0.7988 + mean_question_mark_density: -0.5639 + readability: + mean_avg_line_length: 0.8649 + mean_avg_sub_words_per_id: 0.1285 + mean_flesch_adapted: -0.1311 + mean_fog_adapted: 0.8035 + separator_counts: + mean_underscore_count: 0.6811 + symbol_density: + mean_density: -0.8598 + vocabulary: + mean_mattr: -0.6972 + mean_raw_ttr: -0.7582 + mean_total_identifiers: -0.1337 + mean_unique_identifiers: -0.8807 + vowel_density: + mean_total_chars: 1.4857 + zipf: + mean_exponent: 0.1576 + mean_r_squared: -0.0933 + mean_vocab_size: -0.5320 + +loop_var_is_single_letter: + _doc: "Loop index variables (`i`, `j`, `k`) are acceptable inside loop bodies." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: -32.9785 + brevity: + mean_sample_size: -0.1049 + casing_entropy: + mean_camel_case_count: -2.0000 + mean_entropy: -0.3919 + mean_snake_case_count: 0.2033 + comment_structure: + mean_comment_line_ratio: 0.0080 + compression: + mean_raw_bytes: -0.4302 + mean_redundancy: -0.1176 + mean_unique_line_ratio: -0.0195 + mean_zlib_bytes: -0.1067 + mean_zlib_ratio: -0.3218 + entropy: + mean_char_normalized: -0.0122 + mean_token_entropy: -0.0181 + mean_token_max_entropy: -0.0244 + mean_vocab_size: -0.1049 + function_metrics: + mean_max_function_lines: -0.0254 + halstead: + mean_N1_total_operators: 0.0117 + mean_difficulty: 0.1678 + mean_effort: 0.1521 + mean_estimated_bugs: -0.0119 + mean_n1_unique_operators: 0.0342 + mean_n2_unique_operands: -0.1437 + mean_time_to_implement_seconds: 0.1521 + mean_vocabulary: -0.0770 + mean_volume: -0.0118 + heaps: + mean_beta: -0.0911 + mean_k: 0.1233 + mean_r_squared: -0.0216 + identifier_length_variance: + mean_max: -0.5833 + mean_mean: -0.8498 + mean_std_dev: -0.9251 + mean_variance: -1.8576 + indentation: + mean_max_depth: -0.0956 + mean_mean_depth: -0.0126 + mean_variance: -0.0698 + line_patterns: + mean_unique_line_ratio: -0.0229 + ngram: + mean_bigram_repeated_unique: -0.0377 + mean_bigram_repetition_rate: 0.0174 + mean_bigram_unique: -0.0494 + mean_trigram_hapax_fraction: -0.0166 + mean_trigram_repetition_rate: 0.0370 + mean_trigram_unique: -0.0433 + punctuation_density: + mean_id_nonalpha_suffix_density: 0.0170 + readability: + mean_avg_line_length: -0.4449 + mean_avg_sub_words_per_id: -0.3465 + mean_avg_tokens_per_line: 0.0100 + mean_flesch_adapted: 0.4102 + mean_fog_adapted: -1.3612 + separator_counts: + mean_hyphen_count: 0.1363 + mean_underscore_count: -1.4177 + symbol_density: + mean_density: 0.4330 + mean_distinct_symbol_types: 0.0533 + mean_symbol_count: 0.0087 + vocabulary: + mean_mattr: -0.1052 + mean_raw_ttr: -0.1626 + mean_unique_identifiers: -0.1618 + 
vowel_density: + mean_total_chars: -0.8389 + zipf: + mean_exponent: 0.0112 + mean_r_squared: -0.0134 + mean_vocab_size: -0.1049 + +name_contains_and: + _doc: "Variable names containing `and` signal a variable that holds two concerns." + _languages: [elixir, javascript, ruby] + _log_baseline: -2.9877 + branching: + mean_branch_count: -0.3666 + mean_branching_density: -0.3925 + mean_non_blank_count: 0.0242 + brevity: + mean_sample_size: 0.0107 + casing_entropy: + mean_camel_case_count: -0.2172 + mean_entropy: 0.0678 + mean_other_count: 0.6301 + mean_pascal_case_count: 0.0894 + mean_snake_case_count: 0.1042 + comment_structure: + mean_comment_line_ratio: -0.0282 + compression: + mean_raw_bytes: -0.0626 + mean_redundancy: -0.0240 + mean_unique_line_ratio: 0.0672 + mean_zlib_bytes: -0.0158 + mean_zlib_ratio: -0.0478 + entropy: + mean_char_max_entropy: 0.0084 + mean_char_normalized: -0.0068 + mean_token_normalized: -0.0068 + mean_total_tokens: 0.0583 + mean_vocab_size: 0.0107 + function_metrics: + mean_avg_function_lines: -0.3218 + mean_avg_param_count: -0.0939 + mean_function_count: 0.3368 + mean_max_function_lines: 0.0231 + halstead: + mean_N1_total_operators: 0.0512 + mean_N2_total_operands: 0.0805 + mean_difficulty: 0.0870 + mean_effort: 0.1544 + mean_estimated_bugs: 0.0665 + mean_length: 0.0626 + mean_n1_unique_operators: 0.0264 + mean_n2_unique_operands: 0.0189 + mean_time_to_implement_seconds: 0.1544 + mean_vocabulary: 0.0205 + mean_volume: 0.0665 + heaps: + mean_beta: -0.0639 + mean_k: 0.2017 + identifier_length_variance: + mean_max: -0.3666 + mean_mean: -0.2347 + mean_std_dev: -0.4600 + mean_variance: -0.9236 + indentation: + mean_blank_line_ratio: -0.0595 + mean_max_depth: -0.1211 + mean_mean_depth: -0.1378 + mean_variance: -0.2812 + line_patterns: + mean_blank_line_ratio: -0.0595 + mean_string_literal_ratio: -0.3480 + mean_unique_line_ratio: 0.0808 + magic_number_density: + mean_density: 0.3971 + mean_magic_number_count: 0.4541 + mean_string_literal_ratio: 
-0.0605 + near_duplicate_blocks_file: + mean_block_count: 0.1874 + mean_near_dup_block_d0: 1.2114 + mean_near_dup_block_d3: 1.3353 + mean_near_dup_block_d4: 1.7204 + mean_near_dup_block_d5: 2.0000 + mean_near_dup_block_d6: 0.3458 + mean_near_dup_block_d7: -0.2294 + mean_near_dup_block_d8: 0.5102 + mean_sub_block_count: 0.2831 + ngram: + mean_bigram_hapax_fraction: 0.0107 + mean_bigram_repeated_unique: 0.0144 + mean_bigram_total: 0.0584 + mean_bigram_unique: 0.0410 + mean_trigram_hapax_fraction: -0.0161 + mean_trigram_repeated_unique: 0.1073 + mean_trigram_repetition_rate: 0.0834 + mean_trigram_total: 0.0585 + mean_trigram_unique: 0.0248 + punctuation_density: + mean_arrow_density: -0.0894 + mean_bracket_nonalpha_suffix_count: -0.1211 + mean_colon_suffix_density: -0.4936 + mean_dot_count: -0.2504 + mean_exclamation_density: 0.7124 + mean_id_nonalpha_suffix_density: -0.0280 + mean_question_mark_density: 0.7124 + readability: + mean_avg_line_length: -0.0925 + mean_avg_sub_words_per_id: -0.2424 + mean_avg_tokens_per_line: 0.0317 + mean_flesch_adapted: 0.3817 + mean_fog_adapted: -0.9412 + mean_total_lines: 0.0244 + separator_counts: + mean_dot_count: -0.2504 + mean_underscore_count: -0.6180 + symbol_density: + mean_density: 0.0832 + mean_distinct_symbol_types: 0.0748 + mean_symbol_count: 0.0212 + vocabulary: + mean_mattr: -0.0887 + mean_raw_ttr: -0.0633 + mean_total_identifiers: 0.0782 + mean_unique_identifiers: 0.0162 + vowel_density: + mean_total_chars: -0.1561 + zipf: + mean_exponent: 0.0059 + mean_total_tokens: 0.0583 + mean_vocab_size: 0.0107 + +name_contains_type_suffix: + _doc: "Type suffixes in names (`userString`, `nameList`) are redundant noise." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: -33.1356 + branching: + mean_branch_count: -0.4150 + mean_branching_density: -0.4125 + brevity: + mean_sample_size: -0.1936 + casing_entropy: + mean_camel_case_count: -1.4300 + mean_entropy: -0.3631 + mean_other_count: -2.0000 + mean_pascal_case_count: -0.1660 + mean_snake_case_count: 0.1449 + compression: + mean_raw_bytes: -0.2768 + mean_redundancy: -0.1061 + mean_zlib_bytes: -0.1005 + mean_zlib_ratio: -0.1770 + entropy: + mean_char_entropy: -0.0082 + mean_token_entropy: -0.0294 + mean_token_max_entropy: -0.0396 + mean_token_normalized: 0.0102 + mean_total_tokens: -0.0142 + mean_vocab_size: -0.1936 + halstead: + mean_N1_total_operators: -0.0138 + mean_N2_total_operands: -0.0198 + mean_difficulty: 0.2824 + mean_effort: 0.2335 + mean_estimated_bugs: -0.0503 + mean_length: -0.0161 + mean_n1_unique_operators: 0.0679 + mean_n2_unique_operands: -0.2321 + mean_time_to_implement_seconds: 0.2335 + mean_vocabulary: -0.1655 + mean_volume: -0.0503 + heaps: + mean_beta: -0.1004 + mean_k: 0.1792 + identifier_length_variance: + mean_max: -0.3735 + mean_mean: -0.4788 + mean_std_dev: -0.5916 + mean_variance: -1.1882 + line_patterns: + mean_string_literal_ratio: 0.0109 + ngram: + mean_bigram_hapax_fraction: -0.0337 + mean_bigram_repeated_unique: 0.0355 + mean_bigram_repetition_rate: 0.0884 + mean_bigram_total: -0.0143 + mean_bigram_unique: -0.0700 + mean_trigram_hapax_fraction: -0.0406 + mean_trigram_repeated_unique: 0.2397 + mean_trigram_repetition_rate: 0.2387 + mean_trigram_total: -0.0143 + mean_trigram_unique: -0.0591 + punctuation_density: + mean_colon_suffix_density: -0.1444 + mean_id_nonalpha_suffix_density: -0.0281 + mean_question_mark_density: 0.5850 + readability: + mean_avg_line_length: -0.2896 + mean_avg_sub_words_per_id: -0.3400 + mean_avg_tokens_per_line: -0.0147 + mean_flesch_adapted: 0.4136 + mean_fog_adapted: -1.0490 + separator_counts: + mean_underscore_count: -1.7225 + symbol_density: + mean_density: 
0.2546 + mean_symbol_count: -0.0213 + vocabulary: + mean_mattr: -0.2373 + mean_raw_ttr: -0.2260 + mean_total_identifiers: -0.0765 + mean_unique_identifiers: -0.3022 + vowel_density: + mean_total_chars: -0.5536 + zipf: + mean_exponent: 0.0794 + mean_r_squared: 0.0102 + mean_total_tokens: -0.0142 + mean_vocab_size: -0.1936 + +name_is_abbreviation: + _doc: "Abbreviated names (`usr`, `cfg`, `mgr`) reduce readability." + _languages: [elixir, javascript, ruby] + _log_baseline: 9.2985 + brevity: + mean_sample_size: -0.1542 + casing_entropy: + mean_camel_case_count: 0.3184 + mean_entropy: 0.2713 + mean_snake_case_count: -0.4803 + compression: + mean_raw_bytes: 0.5303 + mean_redundancy: 0.1964 + mean_unique_line_ratio: 0.1217 + mean_zlib_bytes: 0.0699 + mean_zlib_ratio: 0.4576 + entropy: + mean_char_entropy: -0.0398 + mean_char_normalized: -0.0573 + mean_token_entropy: -0.0375 + mean_token_max_entropy: -0.0330 + mean_total_tokens: -0.1093 + mean_vocab_size: -0.1542 + halstead: + mean_N1_total_operators: -0.1081 + mean_N2_total_operands: -0.1080 + mean_difficulty: 0.2026 + mean_effort: 0.0309 + mean_estimated_bugs: -0.1545 + mean_length: -0.1081 + mean_n2_unique_operands: -0.2963 + mean_time_to_implement_seconds: 0.0309 + mean_vocabulary: -0.2193 + mean_volume: -0.1547 + heaps: + mean_beta: -0.1056 + mean_k: 0.2303 + mean_r_squared: -0.0265 + identifier_length_variance: + mean_max: 1.2862 + mean_mean: 1.3727 + mean_variance: 0.0294 + line_patterns: + mean_string_literal_ratio: -0.0949 + mean_unique_line_ratio: 0.1274 + magic_number_density: + mean_density: 0.3656 + mean_string_literal_ratio: -0.2174 + near_duplicate_blocks_file: + mean_near_dup_block_d8: -2.0000 + ngram: + mean_bigram_repeated_unique: -0.0607 + mean_bigram_total: -0.1094 + mean_bigram_unique: -0.0339 + mean_trigram_hapax_fraction: 0.0915 + mean_trigram_repeated_unique: -0.2019 + mean_trigram_repetition_rate: -0.0842 + mean_trigram_total: -0.1095 + mean_trigram_unique: -0.0336 + punctuation_density: + 
mean_arrow_density: 0.2295 + mean_bracket_nonalpha_prefix_count: -0.3052 + mean_bracket_nonalpha_suffix_count: -0.1134 + mean_colon_suffix_density: -0.4215 + mean_dot_count: -0.1179 + mean_exclamation_density: -0.4702 + mean_id_nonalpha_suffix_density: -0.0410 + mean_question_mark_density: -0.5810 + readability: + mean_avg_line_length: 0.5519 + mean_avg_tokens_per_line: -0.1093 + mean_fog_adapted: -0.0959 + separator_counts: + mean_dot_count: -0.1179 + mean_slash_count: -0.5591 + mean_underscore_count: 0.0690 + symbol_density: + mean_density: -0.6485 + mean_symbol_count: -0.1218 + vocabulary: + mean_mattr: -0.1900 + mean_raw_ttr: 0.1813 + mean_total_identifiers: -0.3611 + mean_unique_identifiers: -0.2161 + vowel_density: + mean_total_chars: 1.0156 + zipf: + mean_exponent: 0.0603 + mean_total_tokens: -0.1093 + mean_vocab_size: -0.1542 + +name_is_generic: + _doc: "Generic names (`data`, `result`, `tmp`, `val`, `obj`) convey no domain meaning." + _languages: [elixir, javascript, ruby] + _log_baseline: 43.6270 + branching: + mean_branch_count: 0.5193 + mean_branching_density: 0.3889 + mean_max_nesting_depth: -0.0599 + mean_non_blank_count: 0.0756 + brevity: + mean_sample_size: 0.2053 + casing_entropy: + mean_camel_case_count: 2.0000 + mean_entropy: 0.3582 + mean_snake_case_count: 0.0915 + compression: + mean_raw_bytes: 0.3477 + mean_redundancy: 0.0524 + mean_zlib_bytes: 0.2391 + mean_zlib_ratio: 0.1093 + entropy: + mean_char_entropy: 0.0197 + mean_char_max_entropy: 0.0088 + mean_char_normalized: 0.0110 + mean_token_entropy: 0.0372 + mean_token_max_entropy: 0.0450 + mean_token_normalized: -0.0077 + mean_total_tokens: 0.0784 + mean_vocab_size: 0.2053 + function_metrics: + mean_avg_function_lines: -0.1306 + mean_avg_param_count: -0.0443 + mean_function_count: 0.2694 + mean_max_function_lines: -0.2279 + halstead: + mean_N1_total_operators: 0.0995 + mean_N2_total_operands: 0.0454 + mean_difficulty: -0.2194 + mean_effort: -0.0846 + mean_estimated_bugs: 0.1286 + mean_length: 
0.0789 + mean_n1_unique_operators: 0.0328 + mean_n2_unique_operands: 0.2960 + mean_time_to_implement_seconds: -0.0846 + mean_vocabulary: 0.2199 + mean_volume: 0.1286 + heaps: + mean_beta: 0.1174 + mean_k: -0.2339 + mean_r_squared: 0.0145 + identifier_length_variance: + mean_max: 0.4477 + mean_mean: 0.5582 + mean_std_dev: 0.6755 + mean_variance: 1.3586 + indentation: + mean_blank_line_ratio: 0.0556 + mean_max_depth: -0.1451 + mean_mean_depth: -0.0760 + mean_variance: -0.2765 + line_patterns: + mean_blank_line_ratio: 0.0556 + mean_max_nesting_depth: -0.0599 + mean_string_literal_ratio: -0.0386 + mean_unique_line_ratio: 0.0182 + magic_number_density: + mean_density: -0.0624 + mean_string_literal_ratio: -0.1451 + near_duplicate_blocks_file: + mean_block_count: 0.1243 + mean_near_dup_block_d0: 0.9543 + mean_near_dup_block_d6: -0.3521 + mean_near_dup_block_d7: 0.6021 + mean_near_dup_block_d8: 0.0644 + mean_sub_block_count: 0.1831 + ngram: + mean_bigram_hapax_fraction: 0.1528 + mean_bigram_repeated_unique: -0.1344 + mean_bigram_repetition_rate: -0.1251 + mean_bigram_total: 0.0786 + mean_bigram_unique: 0.1718 + mean_trigram_hapax_fraction: 0.1086 + mean_trigram_repeated_unique: -0.2389 + mean_trigram_repetition_rate: -0.2091 + mean_trigram_total: 0.0787 + mean_trigram_unique: 0.1550 + punctuation_density: + mean_arrow_density: -0.1087 + mean_bracket_nonalpha_prefix_count: 0.2766 + mean_bracket_number_pair_count: 0.2499 + mean_colon_suffix_density: -0.0752 + mean_exclamation_density: -0.2923 + mean_id_nonalpha_suffix_density: -0.0340 + mean_question_mark_density: 0.1584 + readability: + mean_avg_line_length: 0.2897 + mean_avg_sub_words_per_id: 0.2590 + mean_flesch_adapted: -0.2843 + mean_fog_adapted: 0.5030 + mean_total_lines: 0.0756 + separator_counts: + mean_slash_count: 0.9542 + mean_underscore_count: 1.9344 + symbol_density: + mean_density: -0.2642 + mean_distinct_symbol_types: 0.0252 + mean_symbol_count: 0.0858 + vocabulary: + mean_mattr: 0.1932 + mean_raw_ttr: 0.1681 
+ mean_total_identifiers: 0.2205 + mean_unique_identifiers: 0.3862 + vowel_density: + mean_total_chars: 0.7766 + zipf: + mean_exponent: -0.0977 + mean_r_squared: 0.0316 + mean_total_tokens: 0.0784 + mean_vocab_size: 0.2053 + +name_is_number_like: + _doc: "Number-suffixed names (`var1`, `thing2`) signal a missing abstraction." + _languages: [elixir, javascript, ruby] + _log_baseline: 4.1505 + brevity: + mean_sample_size: -0.0262 + casing_entropy: + mean_camel_case_count: 0.6902 + mean_entropy: -0.4687 + mean_other_count: -2.0000 + mean_snake_case_count: 0.1969 + compression: + mean_raw_bytes: 0.1098 + mean_redundancy: 0.0379 + mean_zlib_bytes: 0.0415 + mean_zlib_ratio: 0.0696 + entropy: + mean_char_entropy: -0.0074 + mean_char_max_entropy: -0.0070 + mean_token_entropy: -0.0056 + mean_token_max_entropy: -0.0054 + mean_total_tokens: -0.0075 + mean_vocab_size: -0.0262 + halstead: + mean_N1_total_operators: -0.0036 + mean_N2_total_operands: -0.0096 + mean_difficulty: -0.0139 + mean_effort: -0.0244 + mean_estimated_bugs: -0.0103 + mean_length: -0.0059 + mean_n1_unique_operators: -0.0253 + mean_n2_unique_operands: -0.0217 + mean_time_to_implement_seconds: -0.0244 + mean_vocabulary: -0.0224 + mean_volume: -0.0102 + heaps: + mean_beta: 0.0225 + mean_k: -0.1085 + mean_r_squared: 0.0046 + identifier_length_variance: + mean_max: 0.0623 + mean_mean: 0.2335 + mean_std_dev: 0.2269 + mean_variance: 0.4543 + line_patterns: + mean_string_literal_ratio: 0.0201 + ngram: + mean_bigram_repeated_unique: -0.0072 + mean_bigram_repetition_rate: 0.0064 + mean_bigram_total: -0.0075 + mean_bigram_unique: -0.0124 + mean_trigram_repeated_unique: -0.0135 + mean_trigram_repetition_rate: 0.0035 + mean_trigram_total: -0.0075 + mean_trigram_unique: -0.0093 + punctuation_density: + mean_colon_suffix_density: 0.0087 + mean_question_mark_density: -0.1007 + readability: + mean_avg_line_length: 0.1147 + mean_avg_sub_words_per_id: 0.1104 + mean_avg_tokens_per_line: -0.0075 + mean_flesch_adapted: -0.1154 + 
mean_fog_adapted: 0.0448 + separator_counts: + mean_hyphen_count: -0.5988 + mean_underscore_count: 0.6819 + symbol_density: + mean_density: -0.1135 + mean_distinct_symbol_types: -0.0272 + mean_symbol_count: -0.0042 + vocabulary: + mean_mattr: -0.0033 + mean_total_identifiers: -0.0235 + mean_unique_identifiers: -0.0258 + vowel_density: + mean_total_chars: 0.2085 + zipf: + mean_exponent: 0.0060 + mean_total_tokens: -0.0075 + mean_vocab_size: -0.0262 + +name_is_single_letter: + _doc: "Single-letter names outside loop indices are too opaque." + _languages: [elixir, javascript, ruby] + _log_baseline: 30.8986 + branching: + mean_branching_density: -0.0445 + mean_non_blank_count: 0.0426 + brevity: + mean_sample_size: 0.2360 + casing_entropy: + mean_camel_case_count: 1.9409 + mean_entropy: 0.3197 + mean_snake_case_count: -0.1073 + comment_structure: + mean_comment_line_ratio: -0.0655 + compression: + mean_raw_bytes: 0.6122 + mean_redundancy: 0.1536 + mean_unique_line_ratio: 0.0952 + mean_zlib_bytes: 0.3372 + mean_zlib_ratio: 0.2802 + entropy: + mean_char_entropy: 0.0250 + mean_char_max_entropy: 0.0106 + mean_char_normalized: 0.0143 + mean_token_entropy: 0.0808 + mean_token_max_entropy: 0.0529 + mean_token_normalized: 0.0277 + mean_total_tokens: -0.0290 + mean_vocab_size: 0.2360 + function_metrics: + mean_avg_function_lines: 0.0364 + halstead: + mean_N1_total_operators: -0.0299 + mean_N2_total_operands: -0.0195 + mean_difficulty: -0.5303 + mean_effort: -0.4968 + mean_estimated_bugs: 0.0311 + mean_length: -0.0257 + mean_n1_unique_operators: -0.0930 + mean_n2_unique_operands: 0.4254 + mean_time_to_implement_seconds: -0.4968 + mean_vocabulary: 0.2528 + mean_volume: 0.0311 + heaps: + mean_beta: 0.2456 + mean_k: -0.4826 + mean_r_squared: 0.0606 + identifier_length_variance: + mean_mean: 1.2618 + mean_variance: 0.0143 + indentation: + mean_blank_line_ratio: -0.0401 + mean_mean_depth: 0.0273 + mean_variance: 0.0329 + line_patterns: + mean_blank_line_ratio: -0.0401 + 
mean_unique_line_ratio: 0.0990 + magic_number_density: + mean_density: 0.0296 + ngram: + mean_bigram_hapax_fraction: 0.2403 + mean_bigram_repeated_unique: -0.2404 + mean_bigram_repetition_rate: -0.2658 + mean_bigram_total: -0.0291 + mean_bigram_unique: 0.2611 + mean_trigram_hapax_fraction: 0.1247 + mean_trigram_repeated_unique: -0.4172 + mean_trigram_repetition_rate: -0.3206 + mean_trigram_total: -0.0291 + mean_trigram_unique: 0.1202 + punctuation_density: + mean_colon_suffix_density: 0.0303 + mean_id_nonalpha_suffix_density: -0.0315 + mean_question_mark_density: -0.8899 + readability: + mean_avg_line_length: 0.6061 + mean_avg_sub_words_per_id: 0.2456 + mean_avg_tokens_per_line: -0.0753 + mean_flesch_adapted: -0.2349 + mean_fog_adapted: -0.0666 + mean_total_lines: 0.0431 + separator_counts: + mean_hyphen_count: -0.1345 + mean_underscore_count: 2.0000 + symbol_density: + mean_density: -0.6407 + mean_symbol_count: -0.0254 + vocabulary: + mean_mattr: 0.4875 + mean_raw_ttr: 0.4010 + mean_total_identifiers: 0.0481 + mean_unique_identifiers: 0.4452 + vowel_density: + mean_total_chars: 1.2917 + zipf: + mean_exponent: -0.2098 + mean_r_squared: 0.1010 + mean_total_tokens: -0.0290 + mean_vocab_size: 0.2360 + +name_is_too_long: + _doc: "Names longer than ~30 characters harm readability." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: -10.5110 + branching: + mean_branch_count: 0.0340 + mean_branching_density: 0.0916 + mean_max_nesting_depth: 0.0484 + mean_non_blank_count: -0.0724 + brevity: + mean_sample_size: -0.0167 + casing_entropy: + mean_camel_case_count: -0.1082 + mean_entropy: 0.0194 + mean_other_count: 0.0922 + mean_pascal_case_count: 0.0340 + mean_snake_case_count: 0.1095 + comment_structure: + mean_comment_line_ratio: 0.1321 + compression: + mean_raw_bytes: -0.2235 + mean_redundancy: -0.0299 + mean_unique_line_ratio: -0.0154 + mean_zlib_bytes: -0.1618 + mean_zlib_ratio: -0.0643 + entropy: + mean_token_entropy: -0.0111 + mean_token_max_entropy: -0.0038 + mean_token_normalized: -0.0072 + mean_total_tokens: 0.0756 + mean_vocab_size: -0.0167 + function_metrics: + mean_avg_function_lines: -0.1811 + mean_avg_param_count: -0.0267 + mean_function_count: 0.1054 + mean_max_function_lines: -0.1862 + halstead: + mean_N1_total_operators: 0.0815 + mean_N2_total_operands: 0.0706 + mean_difficulty: 0.1104 + mean_effort: 0.1960 + mean_estimated_bugs: 0.0734 + mean_length: 0.0774 + mean_n1_unique_operators: 0.0062 + mean_n2_unique_operands: -0.0320 + mean_time_to_implement_seconds: 0.1960 + mean_vocabulary: -0.0217 + mean_volume: 0.0734 + heaps: + mean_beta: -0.0480 + mean_k: 0.1004 + mean_r_squared: -0.0095 + identifier_length_variance: + mean_max: -0.4664 + mean_mean: -0.4056 + mean_std_dev: -0.5951 + mean_variance: -1.1923 + indentation: + mean_blank_line_ratio: 0.0206 + mean_max_depth: -0.2280 + mean_mean_depth: -0.1355 + mean_variance: -0.2997 + line_patterns: + mean_blank_line_ratio: 0.0206 + mean_max_nesting_depth: 0.0484 + mean_string_literal_ratio: -0.0763 + mean_unique_line_ratio: -0.0145 + magic_number_density: + mean_density: -0.0708 + mean_string_literal_ratio: -0.1025 + near_duplicate_blocks_file: + mean_block_count: 0.0593 + mean_near_dup_block_d0: 0.3891 + mean_near_dup_block_d4: 0.6367 + mean_near_dup_block_d6: 0.5046 + 
mean_near_dup_block_d7: -0.5046 + mean_near_dup_block_d8: 0.9550 + mean_sub_block_count: 0.1005 + ngram: + mean_bigram_hapax_fraction: -0.0613 + mean_bigram_repeated_unique: 0.1532 + mean_bigram_repetition_rate: 0.0826 + mean_bigram_total: 0.0758 + mean_bigram_unique: 0.0140 + mean_trigram_hapax_fraction: -0.0412 + mean_trigram_repeated_unique: 0.2154 + mean_trigram_repetition_rate: 0.1235 + mean_trigram_total: 0.0759 + mean_trigram_unique: 0.0351 + punctuation_density: + mean_arrow_density: 0.1321 + mean_bracket_nonalpha_prefix_count: 0.0708 + mean_bracket_nonalpha_suffix_count: 0.0511 + mean_colon_suffix_density: -0.0806 + mean_dot_count: 0.0613 + mean_exclamation_density: 0.3183 + mean_id_nonalpha_suffix_density: 0.0149 + mean_question_mark_density: 0.1862 + readability: + mean_avg_line_length: -0.1589 + mean_avg_sub_words_per_id: -0.3392 + mean_avg_tokens_per_line: 0.1430 + mean_flesch_adapted: 2.0000 + mean_fog_adapted: -0.3969 + mean_total_lines: -0.0733 + separator_counts: + mean_dot_count: 0.0613 + mean_hyphen_count: 0.1862 + mean_slash_count: 0.1025 + mean_underscore_count: -0.5989 + symbol_density: + mean_density: 0.2971 + mean_distinct_symbol_types: 0.0446 + mean_symbol_count: 0.0831 + vocabulary: + mean_mattr: -0.0939 + mean_raw_ttr: -0.1041 + mean_total_identifiers: 0.0610 + mean_unique_identifiers: -0.0445 + vowel_density: + mean_total_chars: -0.3409 + zipf: + mean_exponent: 0.0555 + mean_total_tokens: 0.0756 + mean_vocab_size: -0.0167 + +name_is_too_short: + _doc: "Names shorter than 3 characters (outside loops) are too opaque." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: -3.8620 + branching: + mean_branch_count: -0.2327 + mean_branching_density: -0.2381 + brevity: + mean_sample_size: -0.1256 + casing_entropy: + mean_camel_case_count: -0.0450 + mean_entropy: -0.4018 + mean_other_count: -2.0000 + mean_snake_case_count: -0.1480 + comment_structure: + mean_comment_line_ratio: -0.0092 + compression: + mean_raw_bytes: 0.2713 + mean_redundancy: 0.1160 + mean_zlib_bytes: 0.0526 + mean_zlib_ratio: 0.2197 + entropy: + mean_char_entropy: -0.0115 + mean_char_max_entropy: 0.0191 + mean_char_normalized: -0.0304 + mean_token_entropy: -0.0467 + mean_token_max_entropy: -0.0267 + mean_token_normalized: -0.0201 + mean_total_tokens: -0.0256 + mean_vocab_size: -0.1256 + halstead: + mean_N1_total_operators: -0.0225 + mean_N2_total_operands: -0.0374 + mean_difficulty: 0.0242 + mean_effort: -0.0350 + mean_estimated_bugs: -0.0584 + mean_length: -0.0283 + mean_n1_unique_operators: -0.0974 + mean_n2_unique_operands: -0.1584 + mean_time_to_implement_seconds: -0.0350 + mean_vocabulary: -0.1401 + mean_volume: -0.0584 + heaps: + mean_k: -0.1166 + mean_r_squared: 0.0306 + identifier_length_variance: + mean_mean: 0.6923 + mean_std_dev: -0.2499 + mean_variance: -0.5009 + indentation: + mean_variance: 0.0168 + line_patterns: + mean_string_literal_ratio: 0.0229 + near_duplicate_blocks_file: + mean_near_dup_block_d6: -1.2621 + ngram: + mean_bigram_hapax_fraction: 0.0137 + mean_bigram_repeated_unique: -0.0944 + mean_bigram_repetition_rate: 0.0110 + mean_bigram_total: -0.0256 + mean_bigram_unique: -0.0691 + mean_trigram_repeated_unique: -0.0446 + mean_trigram_total: -0.0256 + mean_trigram_unique: -0.0318 + punctuation_density: + mean_colon_suffix_density: -0.2391 + mean_id_nonalpha_suffix_density: -0.0101 + mean_question_mark_density: -0.2722 + readability: + mean_avg_line_length: 0.2778 + mean_avg_tokens_per_line: -0.0329 + mean_fog_adapted: -0.0263 + separator_counts: + mean_hyphen_count: -0.5797 + 
mean_underscore_count: -0.1641 + symbol_density: + mean_density: -0.2806 + mean_distinct_symbol_types: 0.0241 + mean_symbol_count: -0.0137 + vocabulary: + mean_mattr: -0.2556 + mean_raw_ttr: -0.1188 + mean_total_identifiers: -0.1971 + mean_unique_identifiers: -0.3128 + vowel_density: + mean_total_chars: 0.4916 + zipf: + mean_exponent: 0.0521 + mean_total_tokens: -0.0256 + mean_vocab_size: -0.1256 + +negated_boolean_name: + _doc: "Negated boolean names (`isNotValid`, `notActive`) are harder to reason about." + _languages: [elixir, javascript, ruby] + _log_baseline: -6.4001 + brevity: + mean_sample_size: -0.0998 + casing_entropy: + mean_camel_case_count: -0.1117 + compression: + mean_raw_bytes: -0.0414 + mean_zlib_bytes: -0.0643 + mean_zlib_ratio: 0.0231 + entropy: + mean_token_max_entropy: -0.0196 + mean_vocab_size: -0.0998 + halstead: + mean_difficulty: 0.0956 + mean_effort: 0.0772 + mean_estimated_bugs: -0.0162 + mean_n2_unique_operands: -0.1082 + mean_time_to_implement_seconds: 0.0772 + mean_vocabulary: -0.0818 + mean_volume: -0.0162 + heaps: + mean_beta: 0.0357 + mean_k: -0.2055 + identifier_length_variance: + mean_max: -0.0454 + mean_mean: -0.1116 + mean_std_dev: -0.2685 + mean_variance: -0.5427 + line_patterns: + mean_string_literal_ratio: 0.0321 + magic_number_density: + mean_string_literal_ratio: 0.0648 + near_duplicate_blocks_file: + mean_near_dup_block_d4: -2.0000 + mean_near_dup_block_d5: 2.0000 + mean_sub_block_count: -0.0546 + ngram: + mean_bigram_hapax_fraction: -0.0303 + mean_bigram_repeated_unique: 0.0479 + mean_bigram_repetition_rate: 0.0366 + mean_bigram_unique: -0.0275 + mean_trigram_hapax_fraction: -0.0202 + mean_trigram_repeated_unique: 0.0439 + mean_trigram_repetition_rate: 0.0732 + mean_trigram_unique: -0.0366 + punctuation_density: + mean_bracket_nonalpha_prefix_count: -0.2405 + mean_bracket_nonalpha_suffix_count: -0.0403 + mean_colon_suffix_density: -0.1292 + mean_exclamation_density: -0.4070 + mean_id_nonalpha_suffix_density: -0.0280 + 
readability: + mean_avg_line_length: -0.0419 + mean_avg_sub_words_per_id: -0.1284 + mean_flesch_adapted: 0.1760 + mean_fog_adapted: -0.1206 + separator_counts: + mean_underscore_count: -0.4507 + symbol_density: + mean_density: 0.0375 + vocabulary: + mean_mattr: -0.0261 + mean_raw_ttr: -0.1352 + mean_unique_identifiers: -0.1462 + vowel_density: + mean_total_chars: -0.1238 + zipf: + mean_exponent: 0.0151 + mean_vocab_size: -0.0998 + +no_hungarian_notation: + _doc: "Hungarian notation prefixes (`strName`, `bFlag`) add noise without type safety." + _languages: [elixir, javascript, ruby] + _log_baseline: -8.4371 + brevity: + mean_sample_size: -0.0295 + casing_entropy: + mean_camel_case_count: -1.8340 + mean_entropy: -0.2028 + mean_pascal_case_count: 0.0217 + mean_screaming_snake_density: 0.0473 + mean_snake_case_count: 0.2648 + compression: + mean_raw_bytes: -0.1404 + mean_redundancy: -0.0554 + mean_zlib_bytes: -0.0494 + mean_zlib_ratio: -0.0857 + entropy: + mean_char_entropy: -0.0226 + mean_char_normalized: -0.0276 + mean_total_tokens: 0.0139 + mean_vocab_size: -0.0295 + function_metrics: + mean_avg_function_lines: 1.1030 + mean_function_count: 0.0489 + mean_max_function_lines: 0.6027 + halstead: + mean_difficulty: 0.0972 + mean_effort: 0.0546 + mean_estimated_bugs: -0.0179 + mean_n2_unique_operands: -0.1112 + mean_time_to_implement_seconds: 0.0546 + mean_vocabulary: -0.0811 + mean_volume: -0.0178 + heaps: + mean_k: 0.0266 + identifier_length_variance: + mean_max: -0.0632 + mean_mean: -0.2501 + mean_std_dev: 0.0436 + mean_variance: 0.0825 + indentation: + mean_blank_line_ratio: -0.0083 + line_patterns: + mean_blank_line_ratio: -0.0083 + mean_string_literal_ratio: -0.2464 + magic_number_density: + mean_density: -0.0242 + mean_string_literal_ratio: -0.3014 + near_duplicate_blocks_file: + mean_near_dup_block_d4: -2.0000 + mean_near_dup_block_d7: -0.4150 + mean_sub_block_count: -0.0133 + ngram: + mean_bigram_hapax_fraction: 0.0082 + mean_bigram_total: 0.0140 + 
mean_trigram_hapax_fraction: 0.0091 + mean_trigram_repeated_unique: -0.0105 + mean_trigram_repetition_rate: -0.0222 + mean_trigram_total: 0.0140 + mean_trigram_unique: 0.0176 + punctuation_density: + mean_arrow_density: -0.0515 + mean_bracket_nonalpha_suffix_count: -0.0310 + mean_colon_suffix_density: -0.1246 + mean_dot_count: 0.0112 + mean_question_mark_density: 0.1699 + readability: + mean_avg_line_length: -0.1457 + mean_avg_sub_words_per_id: -0.3755 + mean_avg_tokens_per_line: 0.0141 + mean_flesch_adapted: 0.4848 + mean_fog_adapted: -0.5445 + separator_counts: + mean_dot_count: 0.0112 + mean_underscore_count: -1.6136 + symbol_density: + mean_density: 0.1326 + mean_distinct_symbol_types: 0.0704 + mean_symbol_count: 0.0074 + vocabulary: + mean_mattr: -0.0445 + mean_raw_ttr: -0.0346 + mean_total_identifiers: -0.0810 + mean_unique_identifiers: -0.1508 + vowel_density: + mean_total_chars: -0.3322 + zipf: + mean_exponent: 0.0415 + mean_total_tokens: 0.0139 + mean_vocab_size: -0.0295 + +screaming_snake_for_constants: + _doc: "Module-level constants should use SCREAMING_SNAKE_CASE." 
+ _languages: [elixir, javascript, ruby] + _log_baseline: -4.4685 + branching: + mean_branching_density: 0.0176 + mean_non_blank_count: -0.0180 + brevity: + mean_sample_size: -0.0136 + casing_entropy: + mean_camel_case_count: 0.0302 + mean_entropy: 0.0261 + mean_macro_case_count: 1.9913 + mean_pascal_case_count: -0.1674 + mean_screaming_snake_density: 2.0000 + comment_structure: + mean_comment_line_ratio: 0.0267 + compression: + mean_raw_bytes: -0.0086 + mean_redundancy: -0.0090 + mean_unique_line_ratio: -0.0048 + mean_zlib_bytes: 0.0081 + mean_zlib_ratio: -0.0168 + entropy: + mean_char_entropy: 0.0305 + mean_char_max_entropy: 0.0092 + mean_char_normalized: 0.0213 + mean_total_tokens: -0.0066 + mean_vocab_size: -0.0136 + halstead: + mean_N1_total_operators: -0.0036 + mean_N2_total_operands: -0.0117 + mean_difficulty: 0.0034 + mean_effort: -0.0052 + mean_estimated_bugs: -0.0088 + mean_length: -0.0067 + mean_n2_unique_operands: -0.0163 + mean_time_to_implement_seconds: -0.0052 + mean_vocabulary: -0.0115 + mean_volume: -0.0088 + identifier_length_variance: + mean_mean: 0.0048 + mean_std_dev: 0.0244 + mean_variance: 0.0482 + indentation: + mean_blank_line_ratio: 0.0078 + mean_mean_depth: 0.0069 + mean_variance: 0.0048 + line_patterns: + mean_blank_line_ratio: 0.0078 + mean_string_literal_ratio: 0.0089 + mean_unique_line_ratio: -0.0059 + magic_number_density: + mean_density: 0.0057 + mean_string_literal_ratio: 0.0090 + ngram: + mean_bigram_hapax_fraction: -0.0085 + mean_bigram_repetition_rate: 0.0069 + mean_bigram_total: -0.0066 + mean_bigram_unique: -0.0131 + mean_trigram_hapax_fraction: -0.0045 + mean_trigram_repetition_rate: 0.0072 + mean_trigram_total: -0.0066 + mean_trigram_unique: -0.0101 + punctuation_density: + mean_arrow_density: 0.0071 + mean_colon_suffix_density: 0.0103 + mean_id_nonalpha_suffix_density: 0.0078 + readability: + mean_avg_line_length: 0.0087 + mean_avg_sub_words_per_id: -0.0090 + mean_avg_tokens_per_line: 0.0102 + mean_flesch_adapted: 0.0095 + 
mean_fog_adapted: -0.0082 + mean_total_lines: -0.0182 + separator_counts: + mean_underscore_count: 0.3971 + symbol_density: + mean_symbol_count: -0.0036 + vocabulary: + mean_mattr: -0.0037 + mean_raw_ttr: -0.0055 + mean_total_identifiers: -0.0157 + mean_unique_identifiers: -0.0212 + vowel_density: + mean_total_chars: -0.0111 + zipf: + mean_exponent: 0.0038 + mean_total_tokens: -0.0066 + mean_vocab_size: -0.0136 + diff --git a/scripts/run.sh b/scripts/run.sh index 98042053..4cc0918f 100755 --- a/scripts/run.sh +++ b/scripts/run.sh @@ -47,17 +47,20 @@ esac # --- Build CLI arguments --- ARGS=("$INPUT_COMMAND" "$INPUT_PATH") CAPTURE_STDOUT=false +COMMENT_MODE=false case "$INPUT_COMMAND" in health-report) - ARGS+=("--output" "$OUTPUT_FILE") ARGS+=("--detail" "$INPUT_DETAIL") ARGS+=("--top" "$INPUT_TOP") if [[ -n "$INPUT_CONFIG" ]]; then ARGS+=("--config" "$INPUT_CONFIG") fi if [[ "${INPUT_COMMENT:-false}" == "true" ]]; then - ARGS+=("--format" "github") + ARGS+=("--comment") + COMMENT_MODE=true + else + ARGS+=("--output" "$OUTPUT_FILE") fi ;; compare) @@ -117,6 +120,77 @@ else "$CODEQA" "${ARGS[@]}" fi +# --- Post multi-part PR comments (health-report with comment mode) --- +if [[ "$COMMENT_MODE" == "true" ]]; then + TMPDIR="${TMPDIR:-/tmp}" + PART_COUNT_FILE="${TMPDIR}/codeqa-part-count.txt" + + if [[ ! -f "$PART_COUNT_FILE" ]]; then + echo "::error::Part count file not found at ${PART_COUNT_FILE}" + exit 1 + fi + + PART_COUNT=$(cat "$PART_COUNT_FILE") + echo "Posting ${PART_COUNT} comment parts..." + + # GitHub API settings + API_URL="${GITHUB_API_URL:-https://api.github.com}" + REPO="${GITHUB_REPOSITORY}" + PR_NUMBER="${PR_NUMBER:-}" + + if [[ -z "$PR_NUMBER" ]]; then + echo "::error::PR_NUMBER not set. Cannot post PR comments." + exit 1 + fi + + for i in $(seq 1 "$PART_COUNT"); do + PART_FILE="${TMPDIR}/codeqa-part-${i}.md" + SENTINEL="" + + if [[ ! 
-f "$PART_FILE" ]]; then + echo "::warning::Part file ${PART_FILE} not found, skipping" + continue + fi + + BODY=$(cat "$PART_FILE") + + # Search for existing comment with this sentinel + echo "Searching for existing comment with sentinel: ${SENTINEL}" + COMMENTS_JSON=$(curl -fsSL \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "${API_URL}/repos/${REPO}/issues/${PR_NUMBER}/comments?per_page=100" 2>/dev/null || echo "[]") + + # Find comment ID containing the sentinel + COMMENT_ID=$(echo "$COMMENTS_JSON" | jq -r --arg sentinel "$SENTINEL" \ + '.[] | select(.body | contains($sentinel)) | .id' | head -1) + + # Prepare JSON payload + PAYLOAD=$(jq -n --arg body "$BODY" '{"body": $body}') + + if [[ -n "$COMMENT_ID" && "$COMMENT_ID" != "null" ]]; then + echo "Updating existing comment ${COMMENT_ID} for part ${i}..." + curl -fsSL -X PATCH \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "${API_URL}/repos/${REPO}/issues/comments/${COMMENT_ID}" \ + -d "$PAYLOAD" > /dev/null + else + echo "Creating new comment for part ${i}..." 
+ curl -fsSL -X POST \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + "${API_URL}/repos/${REPO}/issues/${PR_NUMBER}/comments" \ + -d "$PAYLOAD" > /dev/null + fi + done + + echo "All ${PART_COUNT} comment parts posted successfully" + + # Use part 1 as the main output file for grade extraction + OUTPUT_FILE="${TMPDIR}/codeqa-part-1.md" +fi + # --- Extract grade (health-report only) --- GRADE="" if [[ "$INPUT_COMMAND" == "health-report" && -f "$OUTPUT_FILE" ]]; then diff --git a/test/codeqa/analysis/behavior_config_server_test.exs b/test/codeqa/analysis/behavior_config_server_test.exs new file mode 100644 index 00000000..ebcc31bb --- /dev/null +++ b/test/codeqa/analysis/behavior_config_server_test.exs @@ -0,0 +1,75 @@ +defmodule CodeQA.Analysis.BehaviorConfigServerTest do + use ExUnit.Case, async: true + + alias CodeQA.Analysis.BehaviorConfigServer + + setup do + {:ok, pid} = BehaviorConfigServer.start_link() + {:ok, pid: pid} + end + + test "get_all_behaviors/1 returns a non-empty map of categories", %{pid: pid} do + behaviors = BehaviorConfigServer.get_all_behaviors(pid) + assert is_map(behaviors) + assert map_size(behaviors) > 0 + + Enum.each(behaviors, fn {category, list} -> + assert is_binary(category) + assert is_list(list) + assert list != [] + + Enum.each(list, fn {behavior, data} -> + assert is_binary(behavior) + assert is_map(data) + end) + end) + end + + test "get_all_behaviors/1 matches YamlElixir direct reads", %{pid: pid} do + behaviors = BehaviorConfigServer.get_all_behaviors(pid) + yaml_dir = "priv/combined_metrics" + + {:ok, files} = File.ls(yaml_dir) + + Enum.each(files |> Enum.filter(&String.ends_with?(&1, ".yml")), fn yml_file -> + category = String.trim_trailing(yml_file, ".yml") + {:ok, data} = YamlElixir.read_from_file(Path.join(yaml_dir, yml_file)) + + expected_behaviors = + data |> Enum.filter(fn {_k, v} -> is_map(v) end) |> Enum.map(&elem(&1, 0)) + + server_behaviors = Map.get(behaviors, 
category, []) |> Enum.map(&elem(&1, 0)) + assert Enum.sort(expected_behaviors) == Enum.sort(server_behaviors) + end) + end + + test "get_scalars/3 returns a map of {group, key} => scalar", %{pid: pid} do + behaviors = BehaviorConfigServer.get_all_behaviors(pid) + {category, [{behavior, _data} | _]} = Enum.at(behaviors, 0) + + scalars = BehaviorConfigServer.get_scalars(pid, category, behavior) + assert is_map(scalars) + + Enum.each(scalars, fn {{group, key}, scalar} -> + assert is_binary(group) + assert is_binary(key) + assert is_float(scalar) + end) + end + + test "get_scalars/3 returns empty map for unknown behavior", %{pid: pid} do + assert BehaviorConfigServer.get_scalars(pid, "nonexistent", "also_nonexistent") == %{} + end + + test "get_log_baseline/3 returns a float", %{pid: pid} do + behaviors = BehaviorConfigServer.get_all_behaviors(pid) + {category, [{behavior, _data} | _]} = Enum.at(behaviors, 0) + + baseline = BehaviorConfigServer.get_log_baseline(pid, category, behavior) + assert is_float(baseline) + end + + test "get_log_baseline/3 returns 0.0 for unknown behavior", %{pid: pid} do + assert BehaviorConfigServer.get_log_baseline(pid, "nonexistent", "also_nonexistent") == 0.0 + end +end diff --git a/test/codeqa/analysis/file_context_server_test.exs b/test/codeqa/analysis/file_context_server_test.exs new file mode 100644 index 00000000..660bd9a3 --- /dev/null +++ b/test/codeqa/analysis/file_context_server_test.exs @@ -0,0 +1,38 @@ +defmodule CodeQA.Analysis.FileContextServerTest do + use ExUnit.Case, async: true + + alias CodeQA.Analysis.FileContextServer + alias CodeQA.Engine.{FileContext, Pipeline} + + setup do + {:ok, pid} = FileContextServer.start_link() + {:ok, pid: pid} + end + + test "get/2 returns a Pipeline.FileContext", %{pid: pid} do + content = "defmodule Foo do\n def bar, do: :ok\nend\n" + ctx = FileContextServer.get(pid, content) + assert %FileContext{} = ctx + assert is_binary(ctx.content) + end + + test "get/2 returns identical struct on 
second call without rebuilding", %{pid: pid} do + content = "defmodule Foo do\n def bar, do: :ok\nend\n" + ctx1 = FileContextServer.get(pid, content) + ctx2 = FileContextServer.get(pid, content) + assert ctx1 == ctx2 + end + + test "get/2 with different content returns different results", %{pid: pid} do + ctx_a = FileContextServer.get(pid, "defmodule A do\nend\n") + ctx_b = FileContextServer.get(pid, "defmodule B do\n def foo, do: 1\nend\n") + assert ctx_a != ctx_b + end + + test "get/2 matches Pipeline.build_file_context/1 directly", %{pid: pid} do + content = "x = 1\ny = 2\n" + expected = Pipeline.build_file_context(content) + result = FileContextServer.get(pid, content) + assert result == expected + end +end diff --git a/test/codeqa/analysis/file_metrics_server_test.exs b/test/codeqa/analysis/file_metrics_server_test.exs new file mode 100644 index 00000000..b68f4b37 --- /dev/null +++ b/test/codeqa/analysis/file_metrics_server_test.exs @@ -0,0 +1,92 @@ +defmodule CodeQA.Analysis.FileMetricsServerTest do + use ExUnit.Case, async: true + + alias CodeQA.Analysis.FileMetricsServer + alias CodeQA.Engine.Analyzer + + defp build_registry do + Analyzer.build_registry() + end + + setup do + {:ok, pid} = FileMetricsServer.start_link() + {:ok, pid: pid} + end + + describe "populate/3 and get_by_path/2" do + test "returns pre-populated baseline metrics for a path", %{pid: pid} do + content = "defmodule A do\n def foo, do: 1\nend\n" + + pipeline_result = %{ + "files" => %{ + "lib/a.ex" => %{"metrics" => %{"halstead" => %{"tokens" => 5.0}}} + } + } + + files_map = %{"lib/a.ex" => content} + :ok = FileMetricsServer.populate(pid, pipeline_result, files_map) + + metrics = FileMetricsServer.get_by_path(pid, "lib/a.ex") + assert metrics == %{"halstead" => %{"tokens" => 5.0}} + end + + test "returns nil for unknown path", %{pid: pid} do + :ok = FileMetricsServer.populate(pid, %{"files" => %{}}, %{}) + assert FileMetricsServer.get_by_path(pid, "nonexistent.ex") == nil + end + end + + 
describe "get_for_content/3" do + test "computes and caches metrics on first call", %{pid: pid} do + registry = build_registry() + content = "defmodule A do\n def foo, do: 1\nend\n" + + metrics = FileMetricsServer.get_for_content(pid, registry, content) + assert is_map(metrics) + assert map_size(metrics) > 0 + end + + test "returns identical result on second call (cache hit)", %{pid: pid} do + registry = build_registry() + content = "defmodule A do\n def foo, do: 1\nend\n" + + m1 = FileMetricsServer.get_for_content(pid, registry, content) + m2 = FileMetricsServer.get_for_content(pid, registry, content) + assert m1 == m2 + end + + test "different content returns different metrics", %{pid: pid} do + registry = build_registry() + ma = FileMetricsServer.get_for_content(pid, registry, "x = 1\n") + + mb = + FileMetricsServer.get_for_content( + pid, + registry, + String.duplicate("def foo(a, b), do: a + b\n", 20) + ) + + assert ma != mb + end + + test "populate cross-indexes hash so get_for_content hits cache", %{pid: pid} do + registry = build_registry() + content = "defmodule A do\n def foo, do: 1\nend\n" + + pipeline_result = %{ + "files" => %{ + "lib/a.ex" => %{ + "metrics" => %{"halstead" => %{"tokens" => 99.0}} + } + } + } + + files_map = %{"lib/a.ex" => content} + :ok = FileMetricsServer.populate(pid, pipeline_result, files_map) + + # Should hit the hash-keyed cache entry seeded from pipeline_result + metrics = FileMetricsServer.get_for_content(pid, registry, content) + assert metrics == %{"halstead" => %{"tokens" => 99.0}} + end + end +end diff --git a/test/codeqa/ast/classification/node_classifier_test.exs b/test/codeqa/ast/classification/node_classifier_test.exs new file mode 100644 index 00000000..990a35d1 --- /dev/null +++ b/test/codeqa/ast/classification/node_classifier_test.exs @@ -0,0 +1,312 @@ +defmodule CodeQA.AST.NodeClassifierTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeClassifier + alias CodeQA.AST.Enrichment.Node + 
alias CodeQA.AST.Lexing.Token + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + + alias CodeQA.AST.Nodes.{ + AttributeNode, + CodeNode, + DocNode, + FunctionNode, + ImportNode, + ModuleNode, + TestNode + } + + alias CodeQA.Languages.Code.Native.Go + alias CodeQA.Languages.Code.Native.Rust + alias CodeQA.Languages.Code.Scripting.Python + alias CodeQA.Languages.Code.Scripting.Ruby + alias CodeQA.Languages.Code.Vm.CSharp + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + alias CodeQA.Languages.Code.Vm.Java + alias CodeQA.Languages.Code.Web.JavaScript + alias CodeQA.Languages.Code.Web.TypeScript + alias CodeQA.Languages.Unknown + + defp classify_first(code, opts \\ []) do + lang_mod = opts[:language_module] || Unknown + + [block | _] = + code + |> TokenNormalizer.normalize_structural() + |> Parser.detect_blocks(lang_mod) + + NodeClassifier.classify(block, lang_mod) + end + + defp node_with_tokens(tokens) do + %Node{ + tokens: tokens, + line_count: 1, + children: [] + } + end + + describe "classify/1 — function detection" do + test "def → FunctionNode" do + assert %FunctionNode{} = + classify_first("def foo(x), do: x + 1", language_module: ElixirLang) + end + + test "defp → FunctionNode" do + assert %FunctionNode{} = classify_first("defp bar(x), do: x", language_module: ElixirLang) + end + + test "defmacro → FunctionNode" do + assert %FunctionNode{} = + classify_first("defmacro my_macro(x), do: x", language_module: ElixirLang) + end + + test "function keyword → FunctionNode" do + assert %FunctionNode{} = + classify_first("function foo(x) {\n return x\n}", language_module: JavaScript) + end + + test "func keyword → FunctionNode" do + assert %FunctionNode{} = + classify_first("func Foo(x int) int {\n return x\n}", language_module: Go) + end + + test "fn keyword → FunctionNode" do + assert %FunctionNode{} = + classify_first("fn main() {\n println!(\"hello\")\n}", language_module: Rust) + end + end + + describe "classify/1 — module 
detection" do + test "defmodule → ModuleNode" do + assert %ModuleNode{} = + classify_first("defmodule Foo do\n :ok\nend", language_module: ElixirLang) + end + + test "class → ModuleNode" do + assert %ModuleNode{} = classify_first("class Foo:\n pass", language_module: Python) + end + + test "module → ModuleNode" do + assert %ModuleNode{} = + classify_first("module Foo\n def bar; end\nend", language_module: Ruby) + end + + test "interface → ModuleNode" do + assert %ModuleNode{} = + classify_first("interface Foo {\n bar(): void\n}", language_module: TypeScript) + end + + test "struct → ModuleNode" do + assert %ModuleNode{} = + classify_first("struct Point {\n x: f64,\n y: f64,\n}", language_module: Rust) + end + end + + describe "classify/1 — import detection" do + test "import → ImportNode" do + assert %ImportNode{} = classify_first("import Foo", language_module: ElixirLang) + end + + test "alias → ImportNode" do + assert %ImportNode{} = classify_first("alias Foo.Bar", language_module: ElixirLang) + end + + test "use → ImportNode" do + assert %ImportNode{} = + classify_first("use ExUnit.Case, async: true", language_module: ElixirLang) + end + + test "require → ImportNode" do + assert %ImportNode{} = classify_first("require Logger", language_module: ElixirLang) + end + + test "from keyword → ImportNode" do + assert %ImportNode{} = classify_first("from os import path", language_module: Python) + end + end + + describe "classify/1 — test detection" do + test "test macro → TestNode" do + assert %TestNode{} = + classify_first(~s(test "something" do\n :ok\nend), language_module: ElixirLang) + end + + test "describe → TestNode" do + assert %TestNode{} = + classify_first(~s(describe "some context" do\n :ok\nend), + language_module: ElixirLang + ) + end + + test "it → TestNode" do + code = "it \"behaves correctly\" do\n :ok\nend" + assert %TestNode{} = classify_first(code, language_module: JavaScript) + end + end + + describe "classify/1 — doc detection" do + test " token → 
DocNode" do + # A standalone triple-quoted string starts directly with the token + assert %DocNode{} = classify_first(~s("""\nSome doc.\n""")) + end + + test "direct token in node → DocNode" do + doc_token = %Token{kind: "", content: ~s("""), line: 1, col: 0} + nl = %Token{kind: "", content: "\n", line: 2, col: 0} + node = node_with_tokens([doc_token, nl]) + assert %DocNode{} = NodeClassifier.classify(node, Unknown) + end + end + + describe "classify/1 — attribute detection" do + test "@spec → AttributeNode with kind: :typespec" do + result = classify_first("@spec foo(integer()) :: :ok", language_module: ElixirLang) + assert %AttributeNode{kind: :typespec} = result + end + + test "@type → AttributeNode with kind: :typespec" do + result = classify_first("@type user_id :: integer()", language_module: ElixirLang) + assert %AttributeNode{kind: :typespec} = result + end + + test "@typep → AttributeNode with kind: :typespec" do + result = classify_first("@typep internal :: atom()", language_module: ElixirLang) + assert %AttributeNode{kind: :typespec} = result + end + + test "@callback → AttributeNode with kind: :typespec" do + result = + classify_first("@callback fetch(term()) :: {:ok, term()}", language_module: ElixirLang) + + assert %AttributeNode{kind: :typespec} = result + end + + test "@enforce_keys → AttributeNode with kind: nil" do + result = classify_first("@enforce_keys [:name, :age]", language_module: ElixirLang) + assert %AttributeNode{kind: nil} = result + end + + test "all Elixir typespec attributes are recognized" do + for attr <- ~w[spec type typep opaque callback macrocallback] do + result = classify_first("@#{attr} foo :: bar", language_module: ElixirLang) + + assert %AttributeNode{kind: :typespec} = result, + "expected AttributeNode(kind: :typespec) for @#{attr}" + end + end + end + + describe "classify/1 — code fallback" do + test "unrecognized token → CodeNode" do + assert %CodeNode{} = classify_first("x = 1 + 2") + end + + test "empty-like node with 
only whitespace tokens → CodeNode" do + nl = %Token{kind: "", content: "\n", line: 1, col: 0} + node = node_with_tokens([nl]) + + assert %CodeNode{} = + NodeClassifier.classify(node, Unknown) + end + end + + describe "classify/1 — ambiguity resolution" do + test "test beats function (test is not defp-style)" do + # 'test' is in TestSignal; FunctionSignal does not include 'test' + result = classify_first(~s(test "foo" do\n :ok\nend), language_module: ElixirLang) + assert %TestNode{} = result + end + + test "@inside code body at indent > 0 does not make block :attribute" do + code = "def foo do\n @cache true\n :ok\nend" + # FunctionSignal sees 'def' at indent 0 → :function wins + # AttributeSignal sees '@cache' but at indent 2, not 0 → no vote + result = classify_first(code, language_module: ElixirLang) + assert %FunctionNode{} = result + end + end + + describe "classify/1 — field preservation" do + test "preserves tokens, line_count, children, start/end_line" do + tokens = + "def foo, do: :ok" + |> TokenNormalizer.normalize_structural() + + [node] = Parser.detect_blocks(tokens, ElixirLang) + result = NodeClassifier.classify(node, ElixirLang) + + assert result.tokens == node.tokens + assert result.line_count == node.line_count + assert result.children == node.children + assert result.start_line == node.start_line + assert result.end_line == node.end_line + end + end + + describe "classify/3 — sub-block parent context" do + test "alias-list sub-block classifies as :import when parent_context contains alias keyword" do + # Simulates a multi-line `alias Foo.{Bar, Baz}` where the BracketSignal + # has split off the `{Bar, Baz}` sub-block, leaving `alias` in the parent. + code = """ + alias Foo.{ + Bar, + Baz + } + """ + + tokens = TokenNormalizer.normalize_structural(code) + [parent] = Parser.detect_blocks(tokens, ElixirLang) + [sub_block] = parent.children + + # Premise: in isolation, the sub-block is :code (no alias keyword visible). 
+ assert %CodeNode{} = NodeClassifier.classify(sub_block, ElixirLang) + + # With parent context (the parent's tokens that come BEFORE the sub-block), + # the classifier should see the `alias` keyword and vote :import. + parent_context = parent_tokens_before(parent, sub_block) + + assert %ImportNode{} = NodeClassifier.classify(sub_block, ElixirLang, parent_context) + end + + test "attribute-list sub-block classifies as :attribute when parent_context contains @name" do + code = """ + @all_signals [ + :a, + :b + ] + """ + + tokens = TokenNormalizer.normalize_structural(code) + [parent] = Parser.detect_blocks(tokens, ElixirLang) + [sub_block] = parent.children + + assert %CodeNode{} = NodeClassifier.classify(sub_block, ElixirLang) + + parent_context = parent_tokens_before(parent, sub_block) + + assert %AttributeNode{} = NodeClassifier.classify(sub_block, ElixirLang, parent_context) + end + + test "classify/3 with nil parent_context behaves identically to classify/2" do + code = "def foo, do: :ok" + [block] = code |> TokenNormalizer.normalize_structural() |> Parser.detect_blocks(ElixirLang) + + assert NodeClassifier.classify(block, ElixirLang) == + NodeClassifier.classify(block, ElixirLang, nil) + end + end + + # Returns the parent's tokens that come strictly before the sub-block's first token. + # Look-back is bounded to the current source line (everything since the last newline). 
+ defp parent_tokens_before(parent, sub_block) do + sub_first = List.first(sub_block.tokens) + + parent.tokens + |> Enum.take_while(fn t -> t != sub_first end) + |> Enum.reverse() + |> Enum.take_while(fn t -> t.kind != :"" end) + |> Enum.reverse() + end +end diff --git a/test/codeqa/ast/classification/node_protocol_test.exs b/test/codeqa/ast/classification/node_protocol_test.exs new file mode 100644 index 00000000..5e79a00d --- /dev/null +++ b/test/codeqa/ast/classification/node_protocol_test.exs @@ -0,0 +1,113 @@ +defmodule CodeQA.AST.NodeProtocolTest.FakeNode do + defstruct [:tokens, :line_count, :children, :start_line, :end_line, :label] + + defimpl CodeQA.AST.Classification.NodeProtocol do + alias CodeQA.AST.Classification.NodeProtocol + + def tokens(n), do: n.tokens + def line_count(n), do: n.line_count + def children(n), do: n.children + def start_line(n), do: n.start_line + def end_line(n), do: n.end_line + def label(n), do: n.label + + def flat_tokens(n) do + if Enum.empty?(n.children), + do: n.tokens, + else: Enum.flat_map(n.children, &NodeProtocol.flat_tokens/1) + end + end +end + +defmodule CodeQA.AST.NodeProtocolTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.NodeProtocolTest.FakeNode + + @node %FakeNode{ + tokens: [:a, :b], + line_count: 3, + children: [], + start_line: 1, + end_line: 3, + label: "foo.ex:1" + } + + test "tokens/1" do + assert NodeProtocol.tokens(@node) == [:a, :b] + end + + test "line_count/1" do + assert NodeProtocol.line_count(@node) == 3 + end + + test "children/1" do + assert NodeProtocol.children(@node) == [] + end + + test "start_line/1" do + assert NodeProtocol.start_line(@node) == 1 + end + + test "end_line/1" do + assert NodeProtocol.end_line(@node) == 3 + end + + test "label/1" do + assert NodeProtocol.label(@node) == "foo.ex:1" + end + + describe "flat_tokens/1" do + test "leaf node returns own tokens" do + leaf = %Node{tokens: [:a, 
:b], line_count: 1, children: []} + assert NodeProtocol.flat_tokens(leaf) == [:a, :b] + end + + test "non-leaf node returns flattened descendant tokens" do + child_a = %Node{tokens: [:a], line_count: 1, children: []} + child_b = %Node{tokens: [:b, :c], line_count: 1, children: []} + parent = %Node{tokens: [:x], line_count: 2, children: [child_a, child_b]} + assert NodeProtocol.flat_tokens(parent) == [:a, :b, :c] + end + + test "deeply nested node returns all leaf tokens" do + leaf = %Node{tokens: [:z], line_count: 1, children: []} + mid = %Node{tokens: [:y], line_count: 1, children: [leaf]} + root = %Node{tokens: [:x], line_count: 2, children: [mid]} + assert NodeProtocol.flat_tokens(root) == [:z] + end + end + + describe "Node implements NodeProtocol" do + setup do + node = %Node{ + tokens: [:x, :y], + line_count: 3, + children: [], + start_line: 1, + end_line: 3, + label: "f.ex:1" + } + + %{node: node} + end + + test "tokens/1", %{node: node} do + assert NodeProtocol.tokens(node) == [:x, :y] + end + + test "children/1", %{node: node} do + assert NodeProtocol.children(node) == [] + end + + test "start_line/1", %{node: node} do + assert NodeProtocol.start_line(node) == 1 + end + + test "label/1", %{node: node} do + assert NodeProtocol.label(node) == "f.ex:1" + end + end +end diff --git a/test/codeqa/ast/classification/node_type_detector_test.exs b/test/codeqa/ast/classification/node_type_detector_test.exs new file mode 100644 index 00000000..f4c97530 --- /dev/null +++ b/test/codeqa/ast/classification/node_type_detector_test.exs @@ -0,0 +1,147 @@ +defmodule CodeQA.AST.Classification.NodeTypeDetectorTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Classification.NodeTypeDetector + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.Token + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Nodes.{AttributeNode, CodeNode, DocNode, FunctionNode} + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + alias 
CodeQA.Languages.Unknown + + defp detect_types(code, lang_mod \\ ElixirLang) do + code + |> TokenNormalizer.normalize_structural() + |> Parser.detect_blocks(lang_mod) + |> NodeTypeDetector.detect_types(lang_mod) + end + + describe "detect_types/1" do + test "block with gets type :doc" do + code = ~s(@moduledoc """\nSome doc.\n""") + [block] = detect_types(code) + assert is_struct(block, DocNode) + end + + test "block with @spec gets type :typespec" do + code = "@spec fetch_user(integer()) :: {:ok, term()}" + [block] = detect_types(code) + assert is_struct(block, AttributeNode) + assert block.kind == :typespec + end + + test "block with @type gets type :typespec" do + code = "@type user_id :: integer()" + [block] = detect_types(code) + assert is_struct(block, AttributeNode) + assert block.kind == :typespec + end + + test "block starting with def gets type :function" do + code = "def foo(x), do: x + 1" + [block] = detect_types(code) + assert is_struct(block, FunctionNode) + end + + test "@ attribute inside function body does not make block :attribute" do + # FunctionSignal sees 'def' first → :function wins + # AttributeSignal sees '@cache' but at indent > 0 → no vote + code = "def foo do\n @cache true\n :ok\nend" + blocks = detect_types(code) + + code_block = + Enum.find(blocks, fn b -> + Enum.any?(b.tokens, &(&1.kind == "" and &1.content == "def")) + end) + + assert is_struct(code_block, FunctionNode) + end + + test "returns same number of blocks as input" do + code = "@spec foo() :: :ok\n\n\ndef foo, do: :ok" + blocks = detect_types(code) + assert length(blocks) == 2 + end + + test "all @typespec_attributes are recognized" do + for attr <- ~w[spec type typep opaque callback macrocallback] do + code = "@#{attr} foo :: bar" + [block] = detect_types(code) + + assert is_struct(block, AttributeNode) and block.kind == :typespec, + "expected AttributeNode with kind: :typespec for @#{attr}" + end + end + + test "empty list returns empty list" do + assert [] == 
NodeTypeDetector.detect_types([], Unknown) + end + end + + describe "detect_types/1 — typed struct output" do + test "returns DocNode for doc blocks" do + doc_token = %Token{kind: "", content: ~s("""), line: 1, col: 0} + nl = %Token{kind: "", content: "\n", line: 2, col: 0} + + node = %Node{ + tokens: [doc_token, nl], + line_count: 2, + children: [], + start_line: 1, + end_line: 2 + } + + [result] = + NodeTypeDetector.detect_types( + [node], + ElixirLang + ) + + assert is_struct(result, DocNode) + end + + test "returns AttributeNode for typespec blocks" do + at = %Token{kind: "@", content: "@", line: 1, col: 0} + spec = %Token{kind: "", content: "spec", line: 1, col: 1} + nl = %Token{kind: "", content: "\n", line: 1, col: 5} + + node = %Node{ + tokens: [at, spec, nl], + line_count: 1, + children: [], + start_line: 1, + end_line: 1 + } + + [result] = + NodeTypeDetector.detect_types( + [node], + ElixirLang + ) + + assert is_struct(result, AttributeNode) + assert result.kind == :typespec + end + + test "returns CodeNode for unclassified blocks" do + id = %Token{kind: "", content: "foo", line: 1, col: 0} + nl = %Token{kind: "", content: "\n", line: 1, col: 3} + + node = %Node{ + tokens: [id, nl], + line_count: 1, + children: [], + start_line: 1, + end_line: 1 + } + + [result] = + NodeTypeDetector.detect_types( + [node], + ElixirLang + ) + + assert is_struct(result, CodeNode) + end + end +end diff --git a/test/codeqa/ast/classification/typed_node_kind_test.exs b/test/codeqa/ast/classification/typed_node_kind_test.exs new file mode 100644 index 00000000..84149cd6 --- /dev/null +++ b/test/codeqa/ast/classification/typed_node_kind_test.exs @@ -0,0 +1,25 @@ +defmodule CodeQA.AST.Classification.TypedNodeKindTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.TypedNodeKind + + alias CodeQA.AST.Nodes.{ + AttributeNode, + CodeNode, + DocNode, + FunctionNode, + ImportNode, + ModuleNode, + TestNode + } + + test "maps each typed node struct to its kind atom" 
do + assert TypedNodeKind.of(%DocNode{}) == :doc + assert TypedNodeKind.of(%AttributeNode{}) == :attribute + assert TypedNodeKind.of(%FunctionNode{}) == :function + assert TypedNodeKind.of(%ModuleNode{}) == :module + assert TypedNodeKind.of(%ImportNode{}) == :import + assert TypedNodeKind.of(%TestNode{}) == :test + assert TypedNodeKind.of(%CodeNode{}) == :code + end +end diff --git a/test/codeqa/ast/enrichment/compound_node_assertions_languages_test.exs b/test/codeqa/ast/enrichment/compound_node_assertions_languages_test.exs new file mode 100644 index 00000000..3a6adbb6 --- /dev/null +++ b/test/codeqa/ast/enrichment/compound_node_assertions_languages_test.exs @@ -0,0 +1,136 @@ +defmodule CodeQA.AST.Enrichment.CompoundNodeAssertionsLanguagesTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Classification.NodeTypeDetector + alias CodeQA.AST.Enrichment.CompoundNode + alias CodeQA.AST.Enrichment.CompoundNodeBuilder + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Languages.Unknown + + Module.register_attribute(__MODULE__, :fixture, accumulate: true, persist: false) + + # Elixir fixtures + use Test.Fixtures.Elixir.Calculator + use Test.Fixtures.Elixir.EventBus + use Test.Fixtures.Elixir.RateLimiter + + # Python fixtures + use Test.Fixtures.Python.Calculator + use Test.Fixtures.Python.CsvPipeline + use Test.Fixtures.Python.ConfigParser + + # JavaScript fixtures + use Test.Fixtures.JavaScript.Calculator + use Test.Fixtures.JavaScript.FormValidator + use Test.Fixtures.JavaScript.ShoppingCart + + # Go fixtures + use Test.Fixtures.Go.Calculator + use Test.Fixtures.Go.HttpMiddleware + use Test.Fixtures.Go.CliParser + + # Rust fixtures + use Test.Fixtures.Rust.Calculator + use Test.Fixtures.Rust.Tokenizer + use Test.Fixtures.Rust.RingBuffer + + # Ruby fixtures + use Test.Fixtures.Ruby.Calculator + use Test.Fixtures.Ruby.OrmLite + use 
Test.Fixtures.Ruby.MarkdownRenderer + + # TypeScript fixtures + use Test.Fixtures.TypeScript.UserProfileStore + use Test.Fixtures.TypeScript.EventEmitter + use Test.Fixtures.TypeScript.DependencyInjection + + # Java fixtures + use Test.Fixtures.Java.BuilderPattern + use Test.Fixtures.Java.RepositoryPattern + use Test.Fixtures.Java.StrategyPattern + + # C# fixtures + use Test.Fixtures.CSharp.LinqPipeline + use Test.Fixtures.CSharp.AsyncTaskManager + use Test.Fixtures.CSharp.PluginSystem + + # Swift fixtures + use Test.Fixtures.Swift.ResultType + use Test.Fixtures.Swift.CombineStream + use Test.Fixtures.Swift.ActorModel + + # Kotlin fixtures + use Test.Fixtures.Kotlin.SealedState + use Test.Fixtures.Kotlin.CoroutineFlow + use Test.Fixtures.Kotlin.ExtensionLibrary + + # C++ fixtures + use Test.Fixtures.Cpp.SmartPointer + use Test.Fixtures.Cpp.TemplateContainer + use Test.Fixtures.Cpp.ObserverPattern + + # Scala fixtures + use Test.Fixtures.Scala.CaseClassAlgebra + use Test.Fixtures.Scala.TypeclassPattern + use Test.Fixtures.Scala.ActorMessages + + # Dart fixtures + use Test.Fixtures.Dart.WidgetState + use Test.Fixtures.Dart.FuturesAsync + use Test.Fixtures.Dart.MixinComposition + + # Zig fixtures + use Test.Fixtures.Zig.AllocatorInterface + use Test.Fixtures.Zig.TaggedUnion + use Test.Fixtures.Zig.IteratorProtocol + + # Lua fixtures + use Test.Fixtures.Lua.ClassSystem + use Test.Fixtures.Lua.EventSystem + use Test.Fixtures.Lua.StateMachine + + # Generate tests for fixtures with block_assertions + for {language, code, block_assertions} <- @fixture, block_assertion <- block_assertions do + test "[#{language}] #{block_assertion.description}" do + compounds = compound_nodes(unquote(code)) + none_of = Map.get(unquote(Macro.escape(block_assertion)), :none_of, []) + all_of = unquote(Macro.escape(block_assertion)).all_of + + assert Enum.any?(compounds, fn compound -> + tokens = all_tokens(compound) + compound_satisfies?(tokens, all_of, none_of) + end), + "No compound node 
found matching: #{unquote(block_assertion.description)}" + end + end + + defp compound_nodes(code) do + code + |> TokenNormalizer.normalize_structural() + |> Parser.detect_blocks(Unknown) + |> NodeTypeDetector.detect_types(Unknown) + |> CompoundNodeBuilder.build() + end + + defp all_tokens(%CompoundNode{docs: docs, typespecs: typespecs, code: code}) do + (docs ++ typespecs ++ code) + |> Enum.flat_map(&node_tokens/1) + end + + defp node_tokens(node) do + NodeProtocol.tokens(node) + end + + defp matches?({:exact, field, value}, token), do: Map.get(token, field) == value + + defp matches?({:partial, field, value}, token), + do: String.contains?(Map.get(token, field, ""), value) + + defp compound_satisfies?(tokens, all_of, none_of) do + Enum.all?(all_of, fn matcher -> Enum.any?(tokens, &matches?(matcher, &1)) end) and + Enum.all?(none_of, fn matcher -> not Enum.any?(tokens, &matches?(matcher, &1)) end) + end +end diff --git a/test/codeqa/ast/enrichment/compound_node_builder_test.exs b/test/codeqa/ast/enrichment/compound_node_builder_test.exs new file mode 100644 index 00000000..00a10065 --- /dev/null +++ b/test/codeqa/ast/enrichment/compound_node_builder_test.exs @@ -0,0 +1,136 @@ +defmodule CodeQA.AST.Enrichment.CompoundNodeBuilderTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeTypeDetector + alias CodeQA.AST.Enrichment.CompoundNode + alias CodeQA.AST.Enrichment.CompoundNodeBuilder + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Nodes.{AttributeNode, CodeNode, DocNode} + alias CodeQA.AST.Parsing.Parser + + defp build(code) do + lang_mod = CodeQA.Languages.Code.Vm.Elixir + opts = [language_module: lang_mod] + + code + |> TokenNormalizer.normalize_structural() + |> Parser.detect_blocks(lang_mod) + |> NodeTypeDetector.detect_types(lang_mod) + |> CompoundNodeBuilder.build() + end + + describe "build/1" do + test "returns CompoundNode structs" do + [compound | _] = build("def foo, do: :ok") + assert %CompoundNode{} = compound + 
end + + test "bare code block wraps in compound with empty docs and typespecs" do + [compound] = build("def foo, do: :ok") + assert compound.docs == [] + assert compound.typespecs == [] + assert length(compound.code) == 1 + end + + test "@doc block attaches to following code block" do + code = ~s(@doc """\nSome doc.\n"""\ndef foo, do: :ok) + [compound] = build(code) + assert length(compound.docs) == 1 + assert length(compound.code) == 1 + end + + test "@spec block attaches to following code block" do + code = "@spec foo() :: :ok\ndef foo, do: :ok" + [compound] = build(code) + assert length(compound.typespecs) == 1 + assert length(compound.code) == 1 + end + + test "consecutive code clauses accumulate in same compound" do + code = "def foo(:a), do: 1\ndef foo(:b), do: 2\ndef foo(_), do: 3" + [compound] = build(code) + assert length(compound.code) == 3 + end + + test "doc after code starts a new compound" do + code = ~s(def foo do\n :ok\nend\n\n\n@doc """\nSome doc.\n"""\ndef bar, do: :ok) + compounds = build(code) + assert length(compounds) == 2 + [first, second] = compounds + assert first.docs == [] + assert length(second.docs) == 1 + end + + test "two blank lines between code blocks starts a new compound" do + code = "def foo, do: :ok\n\n\ndef bar, do: :ok" + compounds = build(code) + assert length(compounds) == 2 + end + + test "single blank line between code blocks does NOT start a new compound" do + code = "def foo(:a), do: 1\n\ndef foo(:b), do: 2" + [compound] = build(code) + assert length(compound.code) == 2 + end + + test "start_line is set from first non-whitespace token" do + [compound] = build("def foo, do: :ok") + assert is_integer(compound.start_line) + assert compound.start_line >= 1 + end + + test "start_col is set from first non-whitespace token" do + [compound] = build("def foo, do: :ok") + assert is_integer(compound.start_col) + end + + test "typespec block before any code attaches to compound (no flush)" do + code = "@spec foo() :: :ok\ndef foo, 
do: :ok" + [compound] = build(code) + assert length(compound.typespecs) == 1 + assert length(compound.code) == 1 + end + + test "end_line is set from last non-whitespace token" do + [compound] = build("def foo, do: :ok") + assert is_integer(compound.end_line) + end + + test "end_col is set from last non-whitespace token" do + [compound] = build("def foo, do: :ok") + assert is_integer(compound.end_col) + end + + test "empty list returns empty list" do + assert [] == CompoundNodeBuilder.build([]) + end + end + + describe "build/1 with typed node structs" do + test "routes DocNode to docs bucket" do + doc = %DocNode{tokens: [:d], line_count: 1, children: [], start_line: 1, end_line: 1} + code = %CodeNode{tokens: [:c], line_count: 2, children: [], start_line: 2, end_line: 3} + + [compound] = CompoundNodeBuilder.build([doc, code]) + assert length(compound.docs) == 1 + assert is_struct(hd(compound.docs), DocNode) + end + + test "routes AttributeNode to typespecs bucket" do + attr = %AttributeNode{ + tokens: [:a], + line_count: 1, + children: [], + start_line: 1, + end_line: 1, + kind: :typespec + } + + code = %CodeNode{tokens: [:c], line_count: 2, children: [], start_line: 2, end_line: 3} + + [compound] = CompoundNodeBuilder.build([attr, code]) + assert length(compound.typespecs) == 1 + assert is_struct(hd(compound.typespecs), AttributeNode) + end + end +end diff --git a/test/codeqa/ast/enrichment/node_analyzer_test.exs b/test/codeqa/ast/enrichment/node_analyzer_test.exs new file mode 100644 index 00000000..6f3e4398 --- /dev/null +++ b/test/codeqa/ast/enrichment/node_analyzer_test.exs @@ -0,0 +1,62 @@ +defmodule CodeQA.AST.Enrichment.NodeAnalyzerTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Enrichment.NodeAnalyzer + alias CodeQA.AST.Lexing.TokenNormalizer + + defp tokenize(code), do: TokenNormalizer.normalize_structural(code) + defp bound(code), do: code |> tokenize() |> NodeAnalyzer.bound_variables() + + describe "bound_variables/1" do + test "simple 
assignment binds the LHS identifier" do + assert "user" in bound("user = Repo.get!(id)") + end + + test "assignment RHS identifiers are NOT bound" do + result = bound("user = Repo.get!(id)") + refute "repo" in result + refute "id" in result + end + + test "with-clause binding (<-) binds the LHS identifier" do + assert "user" in bound("{:ok, user} <- fetch_user(id)") + end + + test "multiple assignments in a block are all bound" do + code = "a = foo()\nb = bar()\nc = baz()" + result = bound(code) + assert "a" in result + assert "b" in result + assert "c" in result + end + + test "compound LHS: only the immediately before = is bound" do + # `x.field = val` — `x` is not re-bound; skip non-simple LHS + result = bound("result = compute(x)") + assert "result" in result + end + + test "== operator does not create a binding" do + result = bound("x == y") + refute "x" in result + refute "y" in result + end + + test "=> fat arrow does not create a binding" do + result = bound("key => value") + refute "key" in result + end + + test "=~ regex match does not create a binding" do + result = bound("str =~ pattern") + refute "str" in result + end + + test "returns MapSet" do + assert %MapSet{} = bound("x = 1") + end + + test "empty token list returns empty MapSet" do + assert MapSet.new() == NodeAnalyzer.bound_variables([]) + end + end +end diff --git a/test/codeqa/ast/lexing/string_token_test.exs b/test/codeqa/ast/lexing/string_token_test.exs new file mode 100644 index 00000000..0a99e9e0 --- /dev/null +++ b/test/codeqa/ast/lexing/string_token_test.exs @@ -0,0 +1,195 @@ +defmodule CodeQA.AST.StringTokenTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.StringToken + alias CodeQA.AST.Lexing.TokenNormalizer + + describe "StringToken struct" do + test "has kind, content, line, col, interpolations, multiline, and quotes fields" do + tok = %StringToken{ + kind: "", + content: ~s("hello"), + line: 1, + col: 0, + interpolations: nil + } + + assert tok.kind == "" + assert 
tok.content == ~s("hello") + assert tok.line == 1 + assert tok.col == 0 + assert tok.interpolations == nil + assert tok.multiline == false + assert tok.quotes == :double + end + + test "interpolations defaults to nil" do + tok = %StringToken{kind: "", content: ~s("hello")} + assert tok.interpolations == nil + end + + test "multiline defaults to false" do + tok = %StringToken{kind: "", content: ~s("hello")} + assert tok.multiline == false + end + + test "quotes defaults to :double" do + tok = %StringToken{kind: "", content: ~s("hello")} + assert tok.quotes == :double + end + + test "multiline triple-quote struct" do + tok = %StringToken{kind: "", content: ~s("""), multiline: true, quotes: :double} + assert tok.multiline == true + assert tok.quotes == :double + end + end + + describe "TokenNormalizer emits StringToken for strings" do + test "plain string emits a StringToken" do + [tok] = + TokenNormalizer.normalize_structural(~s("hello")) + |> Enum.filter(&(&1.kind == "")) + + assert %StringToken{} = tok + end + + test "plain string StringToken has nil interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~s("hello")) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == nil + end + + test "Elixir/Ruby interpolated string emits a StringToken" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello #{name}"|) + |> Enum.filter(&(&1.kind == "")) + + assert %StringToken{} = tok + end + + test "JS/TS backtick interpolated string emits a StringToken" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello ${name}`|) + |> Enum.filter(&(&1.kind == "")) + + assert %StringToken{} = tok + end + + test "Kotlin/Dart/Scala interpolated string emits a StringToken" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello ${name}"|) + |> Enum.filter(&(&1.kind == "")) + + assert %StringToken{} = tok + end + + test "Swift interpolated string emits a StringToken" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello \(name)"|) + |> 
Enum.filter(&(&1.kind == "")) + + assert %StringToken{} = tok + end + + test "plain backtick string emits a StringToken" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello`|) + |> Enum.filter(&(&1.kind == "")) + + assert %StringToken{} = tok + end + + test "non-string tokens are still plain Token structs" do + tokens = TokenNormalizer.normalize_structural("foo = 42") + id = Enum.find(tokens, &(&1.kind == "")) + refute match?(%StringToken{}, id) + end + end + + describe "quotes field" do + test "double-quoted string has quotes :double" do + [tok] = + TokenNormalizer.normalize_structural(~s("hello")) + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :double + end + + test "single-quoted string has quotes :single" do + [tok] = + TokenNormalizer.normalize_structural("'hello'") + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :single + end + + test "backtick string has quotes :backtick" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello`|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :backtick + end + + test "backtick interpolated string has quotes :backtick" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello ${name}`|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :backtick + end + + test "Elixir interpolated string has quotes :double" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello #{name}"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :double + end + end + + describe "multiline field" do + test "regular string has multiline false" do + [tok] = + TokenNormalizer.normalize_structural(~s("hello")) + |> Enum.filter(&(&1.kind == "")) + + assert tok.multiline == false + end + + test "double triple-quote token has multiline true" do + [tok | _] = + TokenNormalizer.normalize_structural(~s("""\nhello\n""")) + |> Enum.filter(&(&1.kind == "")) + + assert tok.multiline == true + end + + test "single triple-quote token has multiline true" do + [tok | _] = + 
TokenNormalizer.normalize_structural("'''\nhello\n'''") + |> Enum.filter(&(&1.kind == "")) + + assert tok.multiline == true + end + + test "triple-quote token quotes :double for \"\"\"" do + [tok | _] = + TokenNormalizer.normalize_structural(~s("""\nhello\n""")) + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :double + end + + test "triple-quote token quotes :single for '''" do + [tok | _] = + TokenNormalizer.normalize_structural("'''\nhello\n'''") + |> Enum.filter(&(&1.kind == "")) + + assert tok.quotes == :single + end + end +end diff --git a/test/codeqa/ast/lexing/token_normalizer_test.exs b/test/codeqa/ast/lexing/token_normalizer_test.exs new file mode 100644 index 00000000..19a886ae --- /dev/null +++ b/test/codeqa/ast/lexing/token_normalizer_test.exs @@ -0,0 +1,332 @@ +defmodule CodeQA.AST.TokenNormalizerTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Lexing.StringToken + alias CodeQA.AST.Lexing.Token + alias CodeQA.AST.Lexing.TokenNormalizer + + defp kinds(tokens), do: Enum.map(tokens, & &1.kind) + + describe "normalize_structural/1" do + test "emits between lines" do + result = TokenNormalizer.normalize_structural("a\nb") + assert "" in kinds(result) + end + + test "two blank lines produce two or more consecutive tokens" do + result = TokenNormalizer.normalize_structural("a\n\nb") + + nl_runs = + result + |> Enum.chunk_by(&(&1.kind == "")) + |> Enum.filter(fn [h | _] -> h.kind == "" end) + |> Enum.map(&length/1) + + assert Enum.any?(nl_runs, &(&1 >= 2)) + end + + test "emits one token per 2 leading spaces" do + result = TokenNormalizer.normalize_structural(" foo") + assert Enum.count(result, &(&1.kind == "")) == 2 + end + + test "emits one token per tab" do + result = TokenNormalizer.normalize_structural("\t\tfoo") + assert Enum.count(result, &(&1.kind == "")) == 2 + end + + test "normalizes identifiers to " do + result = TokenNormalizer.normalize_structural("foo bar") + assert kinds(result) == ["", ""] + end + + test "normalizes 
numbers to " do + result = TokenNormalizer.normalize_structural("x = 42") + assert "" in kinds(result) + end + + test "empty string returns empty list" do + assert TokenNormalizer.normalize_structural("") == [] + end + + test "single leading space produces zero tokens (below threshold)" do + result = TokenNormalizer.normalize_structural(" foo") + assert not Enum.any?(result, &(&1.kind == "")) + end + + test "punctuation tokens like ( and : survive as individual tokens" do + result = TokenNormalizer.normalize_structural("foo(x):") + assert "(" in kinds(result) + assert ")" in kinds(result) + assert ":" in kinds(result) + end + + test "tokens carry line numbers" do + result = TokenNormalizer.normalize_structural("foo\nbar") + lines = Enum.map(result, & &1.line) + assert 1 in lines + assert 2 in lines + end + + test "tokens carry col offsets" do + result = TokenNormalizer.normalize_structural("foo") + [tok] = result + assert tok.col == 0 + end + + test "identifier token preserves original content" do + result = TokenNormalizer.normalize_structural("myVar") + [tok] = result + assert tok.kind == "" + assert tok.content == "myVar" + end + + test "keyword content is preserved (not normalized away)" do + result = TokenNormalizer.normalize_structural("def foo") + contents = Enum.map(result, & &1.content) + assert "def" in contents + end + + test "string token content is the original literal" do + result = TokenNormalizer.normalize_structural(~s("hello")) + tok = Enum.find(result, &(&1.kind == "")) + assert tok.content == ~s("hello") + end + + # multi-char operator tests + + test ">= is a single token" do + result = TokenNormalizer.normalize_structural("x >= y") + assert ">=" in kinds(result) + refute ">" in kinds(result) + end + + test "<= is a single token" do + result = TokenNormalizer.normalize_structural("x <= y") + assert "<=" in kinds(result) + refute "<" in kinds(result) + end + + test "== is a single token" do + result = TokenNormalizer.normalize_structural("x == 
y") + assert "==" in kinds(result) + end + + test "!= is a single token" do + result = TokenNormalizer.normalize_structural("x != y") + assert "!=" in kinds(result) + refute "!" in kinds(result) + end + + test "=== is a single token (not == + =)" do + result = TokenNormalizer.normalize_structural("x === y") + assert "===" in kinds(result) + refute "==" in kinds(result) + end + + test "!== is a single token" do + result = TokenNormalizer.normalize_structural("x !== y") + assert "!==" in kinds(result) + refute "!=" in kinds(result) + end + + test "|> is a single token (Elixir pipe)" do + result = TokenNormalizer.normalize_structural("x |> f") + assert "|>" in kinds(result) + refute "|" in kinds(result) + end + + test "<> is a single token (Elixir concat)" do + result = TokenNormalizer.normalize_structural(~s("a" <> "b")) + assert "<>" in kinds(result) + end + + test "<- is a single token (Elixir/Go arrow)" do + result = TokenNormalizer.normalize_structural("x <- y") + assert "<-" in kinds(result) + refute "<" in kinds(result) + end + + test "-> is a single token" do + result = TokenNormalizer.normalize_structural("x -> y") + assert "->" in kinds(result) + refute "-" in kinds(result) + end + + test "=> is a single token (fat arrow)" do + result = TokenNormalizer.normalize_structural("k => v") + assert "=>" in kinds(result) + end + + test "=~ is a single token (regex match)" do + result = TokenNormalizer.normalize_structural("x =~ y") + assert "=~" in kinds(result) + end + + test "&& is a single token" do + result = TokenNormalizer.normalize_structural("a && b") + assert "&&" in kinds(result) + refute "&" in kinds(result) + end + + test "|| is a single token" do + result = TokenNormalizer.normalize_structural("a || b") + assert "||" in kinds(result) + refute "|" in kinds(result) + end + + test ":: is a single token" do + result = TokenNormalizer.normalize_structural("Foo::Bar") + assert "::" in kinds(result) + refute ":" in kinds(result) + end + + test ".. 
is a single token" do + result = TokenNormalizer.normalize_structural("1..10") + assert ".." in kinds(result) + end + + test "... is a single token (not .. + .)" do + result = TokenNormalizer.normalize_structural("1...10") + assert "..." in kinds(result) + refute ".." in kinds(result) + end + + test "multi-char operator value equals content (no normalization)" do + result = TokenNormalizer.normalize_structural("x >= y") + tok = Enum.find(result, &(&1.kind == ">=")) + assert tok.content == ">=" + end + end + + describe "interpolated string tokens are normalised to " do + test "Elixir/Ruby #{} emits with interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello #{name}"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["name"] + end + + test "JS/TS backtick with \${} emits with interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello ${name}`|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["name"] + end + + test "JS/TS backtick static content has interpolation stripped" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello ${name} world`|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.content == "`hello world`" + end + + test "JS/TS backtick two interpolations are both captured" do + [tok] = + TokenNormalizer.normalize_structural(~S|`${a} and ${b}`|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["a", "b"] + end + + test "plain backtick string without interpolation emits with nil interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~S|`hello world`|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == nil + end + + test "Kotlin/Dart/Scala \${} emits with interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello ${name}"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["name"] + end + + test "Kotlin/Dart/Scala static content has interpolation stripped" do + [tok] = + 
TokenNormalizer.normalize_structural(~S|"hello ${name} world"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.content == ~S|"hello world"| + end + + test "Kotlin/Dart/Scala two interpolations are both captured" do + [tok] = + TokenNormalizer.normalize_structural(~S|"${a} and ${b}"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["a", "b"] + end + + test "Swift \\(...) emits with interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello \(name)"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["name"] + end + + test "Swift static content has interpolation stripped" do + [tok] = + TokenNormalizer.normalize_structural(~S|"hello \(name) world"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.content == ~S|"hello world"| + end + + test "Swift two interpolations are both captured" do + [tok] = + TokenNormalizer.normalize_structural(~S|"\(a) and \(b)"|) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == ["a", "b"] + end + + test "plain double-quoted string has nil interpolations" do + [tok] = + TokenNormalizer.normalize_structural(~s("hello")) + |> Enum.filter(&(&1.kind == "")) + + assert tok.interpolations == nil + end + end + + describe " token" do + test "triple double-quotes emits a StringToken with kind " do + tokens = TokenNormalizer.normalize_structural(~s(""")) + + assert [%StringToken{kind: "", content: ~s("""), multiline: true, quotes: :double}] = + tokens + end + + test "triple single-quotes emits a StringToken with kind " do + tokens = TokenNormalizer.normalize_structural("'''") + + assert [%StringToken{kind: "", content: "'''", multiline: true, quotes: :single}] = + tokens + end + + test "triple-quote is not consumed as empty string + bare quote" do + tokens = TokenNormalizer.normalize_structural(~s(""")) + refute Enum.any?(tokens, &(&1.kind == "")) + end + + test "content between triple-quotes is tokenized normally" do + code = ~s("""\nhello world\n""") + tokens = 
TokenNormalizer.normalize_structural(code) + trip_count = Enum.count(tokens, &(&1.kind == "")) + assert trip_count == 2 + assert Enum.any?(tokens, &(&1.kind == "" and &1.content == "hello")) + end + + test "regular double-quoted string still works" do + tokens = TokenNormalizer.normalize_structural(~s("hello")) + assert [%StringToken{kind: ""}] = tokens + end + end +end diff --git a/test/codeqa/ast/lexing/token_protocol_test.exs b/test/codeqa/ast/lexing/token_protocol_test.exs new file mode 100644 index 00000000..340d94a9 --- /dev/null +++ b/test/codeqa/ast/lexing/token_protocol_test.exs @@ -0,0 +1,142 @@ +defmodule CodeQA.AST.Lexing.TokenProtocolTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.StringToken + alias CodeQA.AST.Lexing.Token + alias CodeQA.AST.Lexing.TokenProtocol + + describe "Token implementation" do + setup do + {:ok, token: %Token{kind: "", content: "foo", line: 3, col: 7}} + end + + test "kind/1", %{token: t} do + assert TokenProtocol.kind(t) == "" + end + + test "content/1", %{token: t} do + assert TokenProtocol.content(t) == "foo" + end + + test "line/1", %{token: t} do + assert TokenProtocol.line(t) == 3 + end + + test "col/1", %{token: t} do + assert TokenProtocol.col(t) == 7 + end + + test "nil location fields are preserved" do + t = %Token{kind: "", content: "\n", line: nil, col: nil} + assert TokenProtocol.line(t) == nil + assert TokenProtocol.col(t) == nil + end + end + + describe "StringToken implementation" do + setup do + {:ok, + token: %StringToken{ + kind: "", + content: "\"hello\"", + line: 10, + col: 2, + interpolations: nil + }} + end + + test "kind/1", %{token: t} do + assert TokenProtocol.kind(t) == "" + end + + test "content/1", %{token: t} do + assert TokenProtocol.content(t) == "\"hello\"" + end + + test "line/1", %{token: t} do + assert TokenProtocol.line(t) == 10 + end + + test "col/1", %{token: t} do + assert TokenProtocol.col(t) == 2 + end + + test "works with interpolated string token" do + t = 
%StringToken{ + kind: "", + content: "\"\#{x}\"", + line: 5, + col: 0, + interpolations: ["x"] + } + + assert TokenProtocol.kind(t) == "" + assert TokenProtocol.content(t) == "\"\#{x}\"" + end + end + + describe "StringToken (multiline) via protocol" do + setup do + {:ok, + token: %StringToken{ + kind: "", + content: ~s("""), + line: 2, + col: 0, + multiline: true, + quotes: :double + }} + end + + test "kind/1", %{token: t} do + assert TokenProtocol.kind(t) == "" + end + + test "content/1", %{token: t} do + assert TokenProtocol.content(t) == ~s(""") + end + + test "line/1", %{token: t} do + assert TokenProtocol.line(t) == 2 + end + + test "col/1", %{token: t} do + assert TokenProtocol.col(t) == 0 + end + + test "single-quote variant" do + t = %StringToken{ + kind: "", + content: "'''", + line: 5, + col: 0, + multiline: true, + quotes: :single + } + + assert TokenProtocol.kind(t) == "" + assert t.quotes == :single + end + end + + describe "polymorphic use" do + test "mixed token list can be processed uniformly" do + tokens = [ + %Token{kind: "", content: "x", line: 1, col: 0}, + %StringToken{kind: "", content: "\"hi\"", line: 1, col: 4}, + %StringToken{ + kind: "", + content: ~s("""), + line: 2, + col: 0, + multiline: true, + quotes: :double + }, + %Token{kind: "", content: "\n", line: 2, col: 3} + ] + + kinds = Enum.map(tokens, &TokenProtocol.kind/1) + assert kinds == ["", "", "", ""] + end + end +end diff --git a/test/codeqa/ast/nodes/code_node_test.exs b/test/codeqa/ast/nodes/code_node_test.exs new file mode 100644 index 00000000..20082f0c --- /dev/null +++ b/test/codeqa/ast/nodes/code_node_test.exs @@ -0,0 +1,55 @@ +defmodule CodeQA.AST.Nodes.CodeNodeTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Nodes.{CodeNode, DocNode} + + @tokens [:a, :b, :c] + + describe "CodeNode" do + setup do + node = %CodeNode{ + tokens: @tokens, + line_count: 2, + children: [], + start_line: 1, + end_line: 2, + label: "f.ex:1" 
+ } + + %{node: node} + end + + test "implements NodeProtocol", %{node: node} do + assert NodeProtocol.tokens(node) == @tokens + assert NodeProtocol.line_count(node) == 2 + assert NodeProtocol.children(node) == [] + assert NodeProtocol.start_line(node) == 1 + assert NodeProtocol.end_line(node) == 2 + assert NodeProtocol.label(node) == "f.ex:1" + end + + test "all common fields default to nil except children" do + node = %CodeNode{tokens: [], line_count: 0, children: []} + assert NodeProtocol.start_line(node) == nil + assert NodeProtocol.end_line(node) == nil + assert NodeProtocol.label(node) == nil + end + end + + describe "DocNode" do + test "implements NodeProtocol" do + node = %DocNode{ + tokens: @tokens, + line_count: 1, + children: [], + start_line: 5, + end_line: 5, + label: nil + } + + assert NodeProtocol.tokens(node) == @tokens + assert NodeProtocol.children(node) == [] + end + end +end diff --git a/test/codeqa/ast/nodes/function_node_test.exs b/test/codeqa/ast/nodes/function_node_test.exs new file mode 100644 index 00000000..a1770bce --- /dev/null +++ b/test/codeqa/ast/nodes/function_node_test.exs @@ -0,0 +1,68 @@ +defmodule CodeQA.AST.Nodes.FunctionNodeTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Nodes.{FunctionNode, ModuleNode} + + describe "FunctionNode" do + setup do + node = %FunctionNode{ + tokens: [:a], + line_count: 5, + children: [], + start_line: 10, + end_line: 14, + label: "foo.ex:10", + name: "calculate", + arity: 2, + visibility: :public + } + + %{node: node} + end + + test "implements NodeProtocol", %{node: node} do + assert NodeProtocol.tokens(node) == [:a] + assert NodeProtocol.line_count(node) == 5 + assert NodeProtocol.start_line(node) == 10 + end + + test "specific fields are accessible", %{node: node} do + assert node.name == "calculate" + assert node.arity == 2 + assert node.visibility == :public + end + + test "specific fields default to nil" do + node = 
%FunctionNode{tokens: [], line_count: 0, children: []} + assert node.name == nil + assert node.arity == nil + assert node.visibility == nil + end + end + + describe "ModuleNode" do + test "implements NodeProtocol" do + node = %ModuleNode{ + tokens: [:m], + line_count: 20, + children: [], + start_line: 1, + end_line: 20, + label: nil, + name: "MyApp.Foo", + kind: :module + } + + assert NodeProtocol.tokens(node) == [:m] + assert node.name == "MyApp.Foo" + assert node.kind == :module + end + + test "specific fields default to nil" do + node = %ModuleNode{tokens: [], line_count: 0, children: []} + assert node.name == nil + assert node.kind == nil + end + end +end diff --git a/test/codeqa/ast/nodes/import_node_test.exs b/test/codeqa/ast/nodes/import_node_test.exs new file mode 100644 index 00000000..53c4a989 --- /dev/null +++ b/test/codeqa/ast/nodes/import_node_test.exs @@ -0,0 +1,74 @@ +defmodule CodeQA.AST.Nodes.ImportNodeTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Classification.NodeProtocol + alias CodeQA.AST.Nodes.{AttributeNode, ImportNode, TestNode} + + describe "ImportNode" do + test "implements NodeProtocol" do + node = %ImportNode{ + tokens: [:i], + line_count: 1, + children: [], + start_line: 3, + end_line: 3, + label: nil, + target: "MyApp.Repo" + } + + assert NodeProtocol.tokens(node) == [:i] + assert node.target == "MyApp.Repo" + end + + test "target defaults to nil" do + node = %ImportNode{tokens: [], line_count: 0, children: []} + assert node.target == nil + end + end + + describe "AttributeNode" do + test "implements NodeProtocol" do + node = %AttributeNode{ + tokens: [:a], + line_count: 1, + children: [], + start_line: 2, + end_line: 2, + label: nil, + name: "moduledoc", + kind: :annotation + } + + assert NodeProtocol.tokens(node) == [:a] + assert node.name == "moduledoc" + assert node.kind == :annotation + end + + test "supports :typespec kind" do + node = %AttributeNode{tokens: [], line_count: 0, children: [], kind: :typespec} + 
assert node.kind == :typespec + end + end + + describe "TestNode" do + test "implements NodeProtocol" do + node = %TestNode{ + tokens: [:t], + line_count: 4, + children: [], + start_line: 10, + end_line: 13, + label: nil, + description: "returns the sum" + } + + assert NodeProtocol.tokens(node) == [:t] + assert node.description == "returns the sum" + end + + test "description defaults to nil" do + node = %TestNode{tokens: [], line_count: 0, children: []} + assert node.description == nil + end + end +end diff --git a/test/codeqa/ast/parsing/parser_languages_test.exs b/test/codeqa/ast/parsing/parser_languages_test.exs new file mode 100644 index 00000000..5526d10b --- /dev/null +++ b/test/codeqa/ast/parsing/parser_languages_test.exs @@ -0,0 +1,168 @@ +defmodule CodeQA.AST.Parsing.ParserLanguagesTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Language + alias CodeQA.Languages.Unknown + + Module.register_attribute(__MODULE__, :fixture, accumulate: true, persist: false) + + # Elixir fixtures + use Test.Fixtures.Elixir.Calculator + use Test.Fixtures.Elixir.EventBus + use Test.Fixtures.Elixir.RateLimiter + + # Python fixtures + use Test.Fixtures.Python.Calculator + use Test.Fixtures.Python.CsvPipeline + use Test.Fixtures.Python.ConfigParser + + # JavaScript fixtures + use Test.Fixtures.JavaScript.Calculator + use Test.Fixtures.JavaScript.FormValidator + use Test.Fixtures.JavaScript.ShoppingCart + + # Go fixtures + use Test.Fixtures.Go.Calculator + use Test.Fixtures.Go.HttpMiddleware + use Test.Fixtures.Go.CliParser + + # Rust fixtures + use Test.Fixtures.Rust.Calculator + use Test.Fixtures.Rust.Tokenizer + use Test.Fixtures.Rust.RingBuffer + + # Ruby fixtures + use Test.Fixtures.Ruby.Calculator + use Test.Fixtures.Ruby.OrmLite + use Test.Fixtures.Ruby.MarkdownRenderer + + # TypeScript fixtures + use Test.Fixtures.TypeScript.UserProfileStore + use Test.Fixtures.TypeScript.EventEmitter + use 
Test.Fixtures.TypeScript.DependencyInjection + + # Java fixtures + use Test.Fixtures.Java.BuilderPattern + use Test.Fixtures.Java.RepositoryPattern + use Test.Fixtures.Java.StrategyPattern + + # C# fixtures + use Test.Fixtures.CSharp.LinqPipeline + use Test.Fixtures.CSharp.AsyncTaskManager + use Test.Fixtures.CSharp.PluginSystem + + # Swift fixtures + use Test.Fixtures.Swift.ResultType + use Test.Fixtures.Swift.CombineStream + use Test.Fixtures.Swift.ActorModel + + # Kotlin fixtures + use Test.Fixtures.Kotlin.SealedState + use Test.Fixtures.Kotlin.CoroutineFlow + use Test.Fixtures.Kotlin.ExtensionLibrary + + # C++ fixtures + use Test.Fixtures.Cpp.SmartPointer + use Test.Fixtures.Cpp.TemplateContainer + use Test.Fixtures.Cpp.ObserverPattern + + # Scala fixtures + use Test.Fixtures.Scala.CaseClassAlgebra + use Test.Fixtures.Scala.TypeclassPattern + use Test.Fixtures.Scala.ActorMessages + + # Dart fixtures + use Test.Fixtures.Dart.WidgetState + use Test.Fixtures.Dart.FuturesAsync + use Test.Fixtures.Dart.MixinComposition + + # Zig fixtures + use Test.Fixtures.Zig.AllocatorInterface + use Test.Fixtures.Zig.TaggedUnion + use Test.Fixtures.Zig.IteratorProtocol + + # Lua fixtures + use Test.Fixtures.Lua.ClassSystem + use Test.Fixtures.Lua.EventSystem + use Test.Fixtures.Lua.StateMachine + + # Note: accumulate: true prepends, so Enum.at(0) is the LAST registered fixture. + # All @code values use 0 leading spaces, so @indentation_level will always be 0 + # and the normalization branch below is never taken. 
+ @indentation_level @fixture + |> Enum.at(0) + |> elem(1) + |> String.split("\n") + |> List.first() + |> then(&Regex.run(~r/^\s*/, &1)) + |> List.first() + |> String.length() + + @normalized_fixtures for {language, code, block_assertions} <- @fixture, + do: + {language, + if @indentation_level > 0 do + code + |> String.split("\n") + |> Enum.map_join( + "\n", + &String.replace_leading( + &1, + String.duplicate(" ", @indentation_level), + "" + ) + ) + else + code + end, block_assertions} + + defp blocks(code, lang_mod \\ Unknown) do + code + |> TokenNormalizer.normalize_structural() + |> Parser.detect_blocks(lang_mod) + end + + defp children(code, lang_mod \\ Unknown) do + code + |> TokenNormalizer.normalize_structural() + |> Parser.detect_blocks(lang_mod) + |> Enum.flat_map(& &1.children) + end + + describe "blocks/2" do + for {language, code, _block_assertions} <- @normalized_fixtures do + lang_name = language |> String.split() |> hd() + lang_mod = Language.find(lang_name) + + test "detects at least 3 blocks for #{language} code" do + lang_mod = unquote(lang_mod) + result = blocks(unquote(code), lang_mod) + + if unquote(lang_mod) == Unknown do + assert result != [] + else + assert length(result) >= 3 + end + end + + test "detects at least 3 sub-blocks for #{language} code" do + lang_mod = unquote(lang_mod) + result = children(unquote(code), lang_mod) + + if unquote(lang_mod) == Unknown do + assert is_list(result) + else + assert length(result) >= 3 + end + end + + test "detects less sub-blocks than line-numbers for #{language} code" do + lang_mod = unquote(lang_mod) + result = children(unquote(code), lang_mod) + assert length(result) < length(String.split(unquote(code), "\n")) + end + end + end +end diff --git a/test/codeqa/ast/parsing/parser_test.exs b/test/codeqa/ast/parsing/parser_test.exs new file mode 100644 index 00000000..51ead52e --- /dev/null +++ b/test/codeqa/ast/parsing/parser_test.exs @@ -0,0 +1,188 @@ +defmodule CodeQA.AST.Parsing.ParserTest do + use 
ExUnit.Case, async: true + alias CodeQA.AST.Enrichment.Node + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.Languages.Code.Scripting.Python + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + alias CodeQA.Languages.Unknown + + defp tokenize(code), do: TokenNormalizer.normalize_structural(code) + + describe "detect_blocks/2" do + test "single block for file with no blank lines" do + tokens = tokenize("def foo\n x = 1\nend\n") + blocks = Parser.detect_blocks(tokens, ElixirLang) + assert length(blocks) == 1 + end + + test "splits into two blocks at blank line" do + tokens = tokenize("def foo\n x\nend\n\n\ndef bar\n y\nend\n") + blocks = Parser.detect_blocks(tokens, ElixirLang) + assert length(blocks) == 2 + end + + test "each block has correct line_count" do + tokens = tokenize("def foo\n x\nend\n\n\ndef bar\n y\nend\n") + [b1, b2] = Parser.detect_blocks(tokens, ElixirLang) + assert b1.line_count >= 3 + assert b2.line_count >= 3 + end + + test "empty input returns empty list" do + assert Parser.detect_blocks([], Unknown) == [] + end + + test "detects bracket sub-blocks" do + tokens = tokenize("foo(a, b)\nbar(c)\n") + [block] = Parser.detect_blocks(tokens, Unknown) + assert block.children != [] + end + + test "detects colon-indent sub-blocks for python language hint" do + tokens = tokenize("def foo:\n return 1\n") + [block] = Parser.detect_blocks(tokens, Python) + assert block.children != [] + end + + test "fewer sub-blocks without python hint than with it (colon rule not applied)" do + tokens = tokenize("def foo:\n return 1\n") + without_hint = Parser.detect_blocks(tokens, Unknown) + with_hint = Parser.detect_blocks(tokens, Python) + count_without = without_hint |> Enum.map(&length(&1.children)) |> Enum.sum() + count_with = with_hint |> Enum.map(&length(&1.children)) |> Enum.sum() + assert count_with >= count_without + end + + test "block has children_count accessible via Node.children_count/1" do + tokens = 
tokenize("foo(a)\nbar(b)\n") + [block] = Parser.detect_blocks(tokens, Unknown) + assert Node.children_count(block) == length(block.children) + end + end + + describe "recursive sub-block nesting" do + test "nested bracket calls produce a multi-level sub-block tree" do + # def foo(bar(x, y), baz) — the arg list contains another call with its own args + tokens = tokenize("def foo(bar(x, y), baz)\n result\nend\n") + [block] = Parser.detect_blocks(tokens, Unknown) + + # depth 1 — the outer argument list + args = + Enum.find(block.children, fn b -> + Enum.any?(b.tokens, &(&1.content == "bar")) + end) + + assert args != nil, "expected an arg-list sub-block containing 'bar'" + + # depth 2 — the inner call (x, y) inside bar(...) + inner = + Enum.find(args.children, fn b -> + Enum.any?(b.tokens, &(&1.content == "x")) + end) + + assert inner != nil, "expected a sub-block for the inner call (x, y)" + + # depth 3 — (x, y) is a leaf: no further bracket structure inside + assert inner.children == [] + end + + test "triply nested brackets produce three levels of sub-blocks" do + tokens = tokenize("def outer(inner(deep(value)))\n :ok\nend\n") + [block] = Parser.detect_blocks(tokens, Unknown) + + # depth 1: (inner(deep(value))) + d1 = + Enum.find(block.children, fn b -> + Enum.any?(b.tokens, &(&1.content == "inner")) + end) + + assert d1 != nil + + # depth 2: (deep(value)) + d2 = + Enum.find(d1.children, fn b -> + Enum.any?(b.tokens, &(&1.content == "deep")) + end) + + assert d2 != nil + + # depth 3: (value) — leaf + d3 = + Enum.find(d2.children, fn b -> + Enum.any?(b.tokens, &(&1.content == "value")) + end) + + assert d3 != nil + assert d3.children == [] + end + end + + describe "triple-quote protection" do + test "blank lines inside a heredoc do not create a new block" do + code = """ + before + + + \""" + Some doc. + + More doc. 
+ \""" + + after + """ + + tokens = TokenNormalizer.normalize_structural(code) + blocks = Parser.detect_blocks(tokens, Unknown) + # The heredoc (including its blank line) should be ONE block, not split + heredoc_block = + Enum.find(blocks, fn b -> + Enum.any?(b.tokens, &(&1.kind == "")) + end) + + assert heredoc_block != nil + # Ensure no split happened inside — the heredoc block contains both "Some" and "More" + contents = Enum.filter(heredoc_block.tokens, &(&1.kind == "")) + names = Enum.map(contents, & &1.content) + assert "Some" in names + assert "More" in names + end + + test "content before and after a heredoc becomes separate blocks" do + code = """ + def foo do + :ok + end + + + \""" + doc here + \""" + + + def bar do + :ok + end + """ + + tokens = TokenNormalizer.normalize_structural(code) + blocks = Parser.detect_blocks(tokens, Unknown) + # Expect exactly 3 blocks: code-before, heredoc, code-after + assert length(blocks) == 3 + assert Enum.any?(Enum.at(blocks, 0).tokens, &(&1.content == "foo")) + assert Enum.any?(Enum.at(blocks, 1).tokens, &(&1.kind == "")) + assert Enum.any?(Enum.at(blocks, 2).tokens, &(&1.content == "bar")) + end + end + + describe "language_from_path/1" do + test "returns :python for .py files" do + assert Parser.language_from_path("lib/foo.py") == :python + end + + test "returns :unknown for unknown extensions" do + assert Parser.language_from_path("lib/foo.xyz") == :unknown + end + end +end diff --git a/test/codeqa/ast/parsing/signal_registry_test.exs b/test/codeqa/ast/parsing/signal_registry_test.exs new file mode 100644 index 00000000..f0c07887 --- /dev/null +++ b/test/codeqa/ast/parsing/signal_registry_test.exs @@ -0,0 +1,33 @@ +defmodule CodeQA.AST.Parsing.SignalRegistryTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Parsing.SignalRegistry + + test "new/0 returns empty registry" do + r = SignalRegistry.new() + assert r.structural == [] + assert r.classification == [] + end + + test "register_structural/2 appends signal" 
do + alias CodeQA.AST.Signals.Structural.BlankLineSignal + r = SignalRegistry.new() |> SignalRegistry.register_structural(%BlankLineSignal{}) + assert length(r.structural) == 1 + end + + test "register_classification/2 appends signal" do + alias CodeQA.AST.Signals.Classification.FunctionSignal + r = SignalRegistry.new() |> SignalRegistry.register_classification(%FunctionSignal{}) + assert length(r.classification) == 1 + end + + test "default/0 includes all built-in signals" do + r = SignalRegistry.default() + assert length(r.structural) >= 4 + assert length(r.classification) >= 6 + end + + test "default/0 has exactly 10 classification signals" do + r = SignalRegistry.default() + assert length(r.classification) == 10 + end +end diff --git a/test/codeqa/ast/parsing/signal_stream_test.exs b/test/codeqa/ast/parsing/signal_stream_test.exs new file mode 100644 index 00000000..69cfcaf2 --- /dev/null +++ b/test/codeqa/ast/parsing/signal_stream_test.exs @@ -0,0 +1,43 @@ +defmodule CodeQA.AST.SignalStreamTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.Token + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.Support.CounterSignal + + defp tok(kind, content), do: %Token{kind: kind, content: content, line: 1, col: 0} + + test "returns one emission list per signal" do + tokens = [tok("", "foo"), tok("", "\n"), tok("", "bar")] + results = SignalStream.run(tokens, [%CounterSignal{}], []) + assert length(results) == 1 + end + + test "emissions list contains all emitted values from the signal" do + tokens = [tok("", "foo"), tok("", "\n"), tok("", "bar")] + + [ + [ + {CodeQA.Support.CounterSignal, :test, :id_seen, 0}, + {CodeQA.Support.CounterSignal, :test, :id_seen, 2} + ] + ] = + SignalStream.run(tokens, [%CounterSignal{}], []) + end + + test "non-emitting tokens produce no entries" do + tokens = [tok("", "\n"), tok("", "\n")] + [[]] = SignalStream.run(tokens, [%CounterSignal{}], []) + end + + test "multiple signals run independently" do + tokens = [tok("", 
"x")] + results = SignalStream.run(tokens, [%CounterSignal{}, %CounterSignal{}], []) + assert length(results) == 2 + end + + test "empty token stream returns empty emissions per signal" do + results = SignalStream.run([], [%CounterSignal{}], []) + assert results == [[]] + end +end diff --git a/test/codeqa/ast/parsing/signal_test.exs b/test/codeqa/ast/parsing/signal_test.exs new file mode 100644 index 00000000..47d72ad6 --- /dev/null +++ b/test/codeqa/ast/parsing/signal_test.exs @@ -0,0 +1,56 @@ +defmodule CodeQA.AST.SignalTest do + use ExUnit.Case, async: true + + defmodule TestSignal do + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: TestSignal + def group(_), do: :split + def init(_, _opts), do: %{count: 0} + + def emit(_, _token, state) do + new_state = %{state | count: state.count + 1} + {MapSet.new([{:tick, state.count}]), new_state} + end + end + end + + defmodule SilentSignal do + defstruct [] + + defimpl CodeQA.AST.Parsing.Signal do + def source(_), do: SilentSignal + def group(_), do: :split + def init(_, _), do: %{} + def emit(_, _token, state), do: {MapSet.new(), state} + end + end + + alias CodeQA.AST.Parsing.Signal + + test "source returns the implementing module" do + assert Signal.source(%TestSignal{}) == TestSignal + end + + test "group returns the signal's group atom" do + assert Signal.group(%TestSignal{}) == :split + end + + test "init returns initial state" do + assert Signal.init(%TestSignal{}, []) == %{count: 0} + end + + test "emit returns {MapSet of {name, value} pairs, new_state}" do + token = %CodeQA.AST.Lexing.Token{kind: "", content: "foo", line: 1, col: 0} + {emissions, new_state} = Signal.emit(%TestSignal{}, token, %{count: 0}) + assert MapSet.member?(emissions, {:tick, 0}) + assert new_state == %{count: 1} + end + + test "emit may return empty MapSet for no emission" do + token = %CodeQA.AST.Lexing.Token{kind: "", content: "\n", line: 1, col: 0} + {emissions, _state} = Signal.emit(%SilentSignal{}, token, 
%{}) + assert MapSet.size(emissions) == 0 + end +end diff --git a/test/codeqa/ast/signals/classification/comment_density_signal_test.exs b/test/codeqa/ast/signals/classification/comment_density_signal_test.exs new file mode 100644 index 00000000..374b191a --- /dev/null +++ b/test/codeqa/ast/signals/classification/comment_density_signal_test.exs @@ -0,0 +1,46 @@ +defmodule CodeQA.AST.Signals.Classification.CommentDensitySignalTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Classification.CommentDensitySignal + alias CodeQA.Languages.Code.Scripting.Python + alias CodeQA.Languages.Unknown + + defp run(tokens, lang_mod \\ Unknown), + do: SignalStream.run(tokens, [%CommentDensitySignal{}], lang_mod) |> List.flatten() + + defp t(content, kind \\ ""), do: %{kind: kind, content: content, line: 1, col: 0} + defp nl, do: %{kind: "", content: "\n", line: 1, col: 0} + defp on_line(tokens, line), do: Enum.map(tokens, &%{&1 | line: line}) + + test "votes comment when >60% of lines start with #" do + tokens = + on_line([t("#"), t("license")], 1) ++ + [nl()] ++ + on_line([t("#"), t("copyright")], 2) ++ + [nl()] ++ + on_line([t("#"), t("author")], 3) ++ + [nl()] ++ + on_line([t("def"), t("foo")], 4) + + emissions = run(tokens, Python) + assert [{CommentDensitySignal, :classification, :comment_vote, _}] = emissions + end + + test "does not vote when comment density is low" do + tokens = + on_line([t("def"), t("foo")], 1) ++ + [nl()] ++ + on_line([t("#"), t("note")], 2) + + assert run(tokens, Python) == [] + end + + test "does not vote when no comment_prefixes provided" do + tokens = + on_line([t("#"), t("comment")], 1) ++ + [nl()] ++ + on_line([t("#"), t("comment")], 2) + + assert run(tokens, Unknown) == [] + end +end diff --git a/test/codeqa/ast/signals/classification/config_signal_test.exs b/test/codeqa/ast/signals/classification/config_signal_test.exs new file mode 100644 index 00000000..da510c2b --- /dev/null +++ 
b/test/codeqa/ast/signals/classification/config_signal_test.exs @@ -0,0 +1,28 @@ +defmodule CodeQA.AST.Signals.Classification.ConfigSignalTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Classification.ConfigSignal + + defp run(tokens), do: SignalStream.run(tokens, [%ConfigSignal{}], []) |> List.flatten() + defp t(content, kind \\ ""), do: %{kind: kind, content: content, line: 1, col: 0} + + test "emits config_vote for 'config' keyword at indent 0" do + emissions = run([t("config"), t(":app"), t(","), t("key:"), t("val")]) + assert [{ConfigSignal, :classification, :config_vote, 3}] = emissions + end + + test "emits config_vote for 'configure' keyword" do + emissions = run([t("configure")]) + assert [{ConfigSignal, :classification, :config_vote, 3}] = emissions + end + + test "does not emit when indented" do + emissions = run([t("", ""), t("config")]) + assert emissions == [] + end + + test "does not emit for 'config' inside brackets" do + tokens = [t("(", "("), t("config"), t(")", ")")] + assert run(tokens) == [] + end +end diff --git a/test/codeqa/ast/signals/classification/data_signal_test.exs b/test/codeqa/ast/signals/classification/data_signal_test.exs new file mode 100644 index 00000000..852067bc --- /dev/null +++ b/test/codeqa/ast/signals/classification/data_signal_test.exs @@ -0,0 +1,28 @@ +defmodule CodeQA.AST.Signals.Classification.DataSignalTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Classification.DataSignal + + defp run(tokens), do: SignalStream.run(tokens, [%DataSignal{}], []) |> List.flatten() + + defp t(content, kind), do: %{kind: kind, content: content, line: 1, col: 0} + defp str(v), do: t(v, "") + defp num(v), do: t(v, "") + defp id(v), do: t(v, "") + + test "votes data for high-literal token stream" do + tokens = [str("foo"), str("bar"), num("1"), num("2"), id("key")] + emissions = run(tokens) + assert [{DataSignal, 
:classification, :data_vote, _}] = emissions + end + + test "does not vote when control-flow keyword present" do + tokens = [str("foo"), id("if"), str("bar")] + assert run(tokens) == [] + end + + test "does not vote when literal ratio is low" do + tokens = [id("foo"), id("bar"), id("baz"), str("one")] + assert run(tokens) == [] + end +end diff --git a/test/codeqa/ast/signals/classification/type_signal_test.exs b/test/codeqa/ast/signals/classification/type_signal_test.exs new file mode 100644 index 00000000..aa400d38 --- /dev/null +++ b/test/codeqa/ast/signals/classification/type_signal_test.exs @@ -0,0 +1,40 @@ +defmodule CodeQA.AST.Signals.Classification.TypeSignalTest do + use ExUnit.Case, async: true + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Classification.TypeSignal + + defp run(tokens), do: SignalStream.run(tokens, [%TypeSignal{}], []) |> List.flatten() + + defp t(content, kind \\ ""), do: %{kind: kind, content: content, line: 1, col: 0} + + test "emits type_vote weight 3 for @type at indent 0" do + emissions = run([t("@", "@"), t("type"), t("t"), t("::"), t("integer")]) + assert [{TypeSignal, :classification, :type_vote, 3}] = emissions + end + + test "emits type_vote for @typep" do + emissions = run([t("@", "@"), t("typep"), t("t"), t("::")]) + assert [{TypeSignal, :classification, :type_vote, 3}] = emissions + end + + test "emits type_vote for @opaque" do + emissions = run([t("@", "@"), t("opaque"), t("t"), t("::")]) + assert [{TypeSignal, :classification, :type_vote, 3}] = emissions + end + + test "does not emit for @spec" do + emissions = run([t("@", "@"), t("spec"), t("foo"), t("()")]) + assert emissions == [] + end + + test "does not emit for @type inside indented block" do + emissions = run([t("", ""), t("@", "@"), t("type"), t("t")]) + assert emissions == [] + end + + test "emits at most one vote" do + tokens = [t("@", "@"), t("type"), t("a"), t("", ""), t("@", "@"), t("typep"), t("b")] + emissions = run(tokens) + assert 
length(emissions) == 1 + end +end diff --git a/test/codeqa/ast/signals/structural/access_modifier_signal_test.exs b/test/codeqa/ast/signals/structural/access_modifier_signal_test.exs new file mode 100644 index 00000000..2a863526 --- /dev/null +++ b/test/codeqa/ast/signals/structural/access_modifier_signal_test.exs @@ -0,0 +1,49 @@ +defmodule CodeQA.AST.Signals.Structural.AccessModifierSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.AccessModifierSignal + alias CodeQA.Languages.Code.Vm.Java + + defp split_values(code, lang_mod) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%AccessModifierSignal{}], lang_mod) + for {_src, :split, :access_modifier_split, v} <- emissions, do: v + end + + test "no split for first modifier (seen_content == false)" do + assert split_values("public void foo() {}\n", Java) == [] + end + + test "emits split at second public modifier after content" do + splits = split_values("public void foo() {}\npublic void bar() {}\n", Java) + assert length(splits) == 1 + end + + test "emits split at private modifier after content" do + splits = split_values("public void foo() {}\nprivate void bar() {}\n", Java) + assert length(splits) == 1 + end + + test "does not split when modifier is inside brackets" do + splits = split_values("public void foo(private int x) {}\n", Java) + assert splits == [] + end + + test "does not split on identifier that matches modifier but is not at line start" do + splits = split_values("public void foo() {}\nfoo.public.bar()\n", Java) + assert splits == [] + end + + test "works at indent > 0 (unlike KeywordSignal)" do + # Two indented public declarations, no enclosing brackets — should split + splits = split_values(" public void foo() {}\n public void bar() {}\n", Java) + assert length(splits) == 1 + end + + test "group is 
:split" do + assert Signal.group(%AccessModifierSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/assignment_function_signal_test.exs b/test/codeqa/ast/signals/structural/assignment_function_signal_test.exs new file mode 100644 index 00000000..bd76abf1 --- /dev/null +++ b/test/codeqa/ast/signals/structural/assignment_function_signal_test.exs @@ -0,0 +1,84 @@ +defmodule CodeQA.AST.Signals.Structural.AssignmentFunctionSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.AssignmentFunctionSignal + + defp split_indices(code) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%AssignmentFunctionSignal{}], []) + for {_src, :split, :assignment_function_split, v} <- emissions, do: v + end + + test "emits split for identifier = function() pattern (second in file)" do + code = """ + const first = function() {} + const foo = function() {} + """ + + splits = split_indices(code) + assert length(splits) == 1 + end + + test "emits split for arrow function pattern: bar = () => {}" do + code = """ + const first = function() {} + const bar = () => {} + """ + + splits = split_indices(code) + assert length(splits) == 1 + end + + test "emits split for async function pattern: baz = async function() {}" do + code = """ + const first = function() {} + const baz = async function() {} + """ + + splits = split_indices(code) + assert length(splits) == 1 + end + + test "does NOT emit for the first assignment in file (seen_content == false)" do + code = "const foo = function() {}\n" + splits = split_indices(code) + assert splits == [] + end + + test "does NOT emit for plain assignment: x = 1" do + code = """ + const first = function() {} + x = 1 + """ + + splits = split_indices(code) + assert splits == [] + end + + test "does NOT emit when identifier is indented (indent > 
0)" do + code = """ + const first = function() {} + foo = function() {} + """ + + splits = split_indices(code) + assert splits == [] + end + + test "emits split for module.exports = function() pattern" do + code = """ + const first = function() {} + module.exports = function() {} + """ + + splits = split_indices(code) + assert length(splits) == 1 + end + + test "group/1 returns :split" do + assert Signal.group(%AssignmentFunctionSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/blank_line_signal_test.exs b/test/codeqa/ast/signals/structural/blank_line_signal_test.exs new file mode 100644 index 00000000..4e7d9d27 --- /dev/null +++ b/test/codeqa/ast/signals/structural/blank_line_signal_test.exs @@ -0,0 +1,36 @@ +defmodule CodeQA.AST.Signals.Structural.BlankLineSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.BlankLineSignal + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + + defp split_values(code, lang_mod) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%BlankLineSignal{}], lang_mod) + for {_src, :split, :blank_split, v} <- emissions, do: v + end + + test "no splits for single block" do + assert split_values("def foo\n x\nend\n", ElixirLang) == [] + end + + test "emits split after blank line following block-end token" do + splits = split_values("def foo\n x\nend\n\n\ndef bar\n y\nend\n", ElixirLang) + assert length(splits) == 1 + end + + test "no split when blank line does not follow block-end token" do + assert split_values("x = 1\n\n\ny = 2\n", ElixirLang) == [] + end + + test "group is :split" do + assert Signal.group(%BlankLineSignal{}) == :split + end + + test "source is BlankLineSignal" do + assert Signal.source(%BlankLineSignal{}) == BlankLineSignal + end +end diff --git 
a/test/codeqa/ast/signals/structural/bracket_signal_test.exs b/test/codeqa/ast/signals/structural/bracket_signal_test.exs new file mode 100644 index 00000000..611474bd --- /dev/null +++ b/test/codeqa/ast/signals/structural/bracket_signal_test.exs @@ -0,0 +1,43 @@ +defmodule CodeQA.AST.Signals.Structural.BracketSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.BracketSignal + + defp enclosure_values(code) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%BracketSignal{}], []) + for {_src, :enclosure, :bracket_enclosure, v} <- emissions, do: v + end + + test "no enclosures for code without brackets" do + assert enclosure_values("foo\n") == [] + end + + test "emits enclosure for a single bracketed expression" do + enclosures = enclosure_values("foo(a, b)\n") + assert length(enclosures) == 1 + end + + test "emits only outermost enclosure for nested brackets" do + enclosures = enclosure_values("foo(bar(x))\n") + assert length(enclosures) == 1 + end + + test "enclosure value is {start_idx, end_idx} tuple" do + [{start, stop}] = enclosure_values("foo(a)\n") + assert is_integer(start) + assert is_integer(stop) + assert stop > start + end + + test "mismatched closing bracket is silently skipped" do + assert enclosure_values("foo)\n") == [] + end + + test "group is :enclosure" do + assert Signal.group(%BracketSignal{}) == :enclosure + end +end diff --git a/test/codeqa/ast/signals/structural/branch_split_signal_test.exs b/test/codeqa/ast/signals/structural/branch_split_signal_test.exs new file mode 100644 index 00000000..320390c9 --- /dev/null +++ b/test/codeqa/ast/signals/structural/branch_split_signal_test.exs @@ -0,0 +1,93 @@ +defmodule CodeQA.AST.Signals.Structural.BranchSplitSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias 
CodeQA.AST.Parsing.{Signal, SignalStream} + alias CodeQA.AST.Signals.Structural.BranchSplitSignal + alias CodeQA.Languages.Code.Scripting.PHP + alias CodeQA.Languages.Code.Scripting.Python + alias CodeQA.Languages.Code.Scripting.Ruby + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + alias CodeQA.Languages.Code.Vm.Java + + defp split_values(code, lang_mod) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%BranchSplitSignal{}], lang_mod) + for {_src, :branch_split, :branch_split, v} <- emissions, do: v + end + + test "group is :branch_split" do + assert Signal.group(%BranchSplitSignal{}) == :branch_split + end + + test "no split for code with no branch keywords" do + assert split_values("x = 1\ny = 2\n", ElixirLang) == [] + end + + test "emits split at else after seen content" do + splits = split_values("if x do\n :a\nelse\n :b\nend\n", ElixirLang) + assert length(splits) == 1 + end + + test "emits split at elif" do + splits = split_values("if x:\n pass\nelif y:\n pass\n", Python) + assert length(splits) == 1 + end + + test "emits split at multiple branch keywords" do + splits = split_values("if x do\n :a\nelsif y\n :b\nelse\n :c\nend\n", Ruby) + assert length(splits) == 2 + end + + test "does not emit at first keyword (no seen_content yet)" do + splits = split_values("if x do\n :a\nend\n", ElixirLang) + assert splits == [] + end + + test "does not emit when keyword is inside brackets" do + splits = split_values("foo(if x do 1 else 2 end)\n", ElixirLang) + assert splits == [] + end + + test "emits split at rescue" do + splits = split_values("try do\n :ok\nrescue\n _ -> :error\nend\n", ElixirLang) + assert length(splits) == 1 + end + + test "emits split at cond branch" do + splits = split_values("x = 1\ncond do\n x -> :a\nend\n", ElixirLang) + assert length(splits) == 1 + end + + test "emits split at except (Python)" do + splits = split_values("try:\n pass\nexcept ValueError:\n pass\n", Python) + assert 
length(splits) == 1 + end + + test "emits split at ensure (Elixir)" do + splits = + split_values( + "try do\n :ok\nrescue\n _ -> :error\nensure\n cleanup()\nend\n", + ElixirLang + ) + + assert length(splits) == 2 + end + + test "emits split at elseif (PHP)" do + splits = split_values("if x then\n :a\nelseif y then\n :b\nend\n", PHP) + assert length(splits) == 1 + end + + test "emits split at case label (switch body)" do + splits = + split_values("switch x\n case 1:\n :a\n case 2:\n :b\nend\n", Java) + + assert splits != [] + end + + test "emits split at when keyword" do + splits = split_values("x = 1\nwhen x > 0 do\n :pos\nend\n", ElixirLang) + assert length(splits) == 1 + end +end diff --git a/test/codeqa/ast/signals/structural/colon_indent_signal_test.exs b/test/codeqa/ast/signals/structural/colon_indent_signal_test.exs new file mode 100644 index 00000000..7ff96a0c --- /dev/null +++ b/test/codeqa/ast/signals/structural/colon_indent_signal_test.exs @@ -0,0 +1,30 @@ +defmodule CodeQA.AST.Signals.Structural.ColonIndentSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.ColonIndentSignal + alias CodeQA.Languages.Code.Scripting.Python + alias CodeQA.Languages.Unknown + + defp enclosure_values(code, lang_mod \\ Python) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%ColonIndentSignal{}], lang_mod) + for {_src, :enclosure, :colon_indent_enclosure, v} <- emissions, do: v + end + + test "no enclosures for non-python language" do + assert enclosure_values("def foo:\n return 1\n", Unknown) == + [] + end + + test "emits enclosure for colon-indented block in python" do + enclosures = enclosure_values("def foo:\n return 1\n") + assert enclosures != [] + end + + test "group is :enclosure" do + assert Signal.group(%ColonIndentSignal{}) == :enclosure + end +end diff --git 
a/test/codeqa/ast/signals/structural/comment_divider_signal_test.exs b/test/codeqa/ast/signals/structural/comment_divider_signal_test.exs new file mode 100644 index 00000000..29762cb1 --- /dev/null +++ b/test/codeqa/ast/signals/structural/comment_divider_signal_test.exs @@ -0,0 +1,52 @@ +defmodule CodeQA.AST.Signals.Structural.CommentDividerSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.CommentDividerSignal + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + alias CodeQA.Languages.Code.Vm.Java + alias CodeQA.Languages.Data.Sql + + defp split_values(code, lang_mod) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%CommentDividerSignal{}], lang_mod) + for {_src, :split, :comment_divider_split, v} <- emissions, do: v + end + + test "no split for first divider comment (seen_content == false at start of file)" do + assert split_values("# ---\n", ElixirLang) == [] + end + + test "emits split at # --- after prior content" do + splits = split_values("x = 1\n# ---\ny = 2\n", ElixirLang) + assert length(splits) == 1 + end + + test "emits split at // === after prior content" do + splits = split_values("x = 1\n// ===\ny = 2\n", Java) + assert length(splits) == 1 + end + + test "emits split at -- --- after prior content (SQL style)" do + splits = split_values("x = 1\n-- ---\ny = 2\n", Sql) + assert length(splits) == 1 + end + + test "does NOT emit for # followed by identifier (real comment)" do + assert split_values("x = 1\n# This is a real comment\n", ElixirLang) == [] + end + + test "does NOT emit when # is not at line start" do + assert split_values("x = 1\nx # ---\n", ElixirLang) == [] + end + + test "does NOT emit for indented divider comment (inside a block)" do + assert split_values("x = 1\n # ---\n", ElixirLang) == [] + end + + test "group is :split" do + assert 
Signal.group(%CommentDividerSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/decorator_signal_test.exs b/test/codeqa/ast/signals/structural/decorator_signal_test.exs new file mode 100644 index 00000000..6a5bb108 --- /dev/null +++ b/test/codeqa/ast/signals/structural/decorator_signal_test.exs @@ -0,0 +1,47 @@ +defmodule CodeQA.AST.Signals.Structural.DecoratorSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.DecoratorSignal + + defp split_values(code) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%DecoratorSignal{}], []) + for {_src, :split, :decorator_split, v} <- emissions, do: v + end + + test "no split for first @ (seen_content == false at start of file)" do + assert split_values("@decorator\ndef foo() {}\n") == [] + end + + test "emits split at second @decorator after content" do + splits = split_values("@decorator\ndef foo() {}\n@decorator\ndef bar() {}\n") + assert length(splits) == 1 + end + + test "does not emit when @ is inside brackets" do + splits = split_values("@decorator\ndef foo(@param x) {}\n") + assert splits == [] + end + + test "does not emit when @ is not at line start (mid-expression)" do + splits = split_values("@decorator\ndef foo() { x@y }\n") + assert splits == [] + end + + test "emits split for Rust #[ pattern at line start after content" do + splits = split_values("#[derive(Debug)]\nstruct Foo {}\n#[derive(Clone)]\nstruct Bar {}\n") + assert length(splits) == 1 + end + + test "does not emit for # at line start when next token is not [" do + splits = split_values("@decorator\ndef foo() {}\n# comment\ndef bar() {}\n") + assert splits == [] + end + + test "group is :split" do + assert Signal.group(%DecoratorSignal{}) == :split + end +end diff --git 
a/test/codeqa/ast/signals/structural/dedent_to_zero_signal_test.exs b/test/codeqa/ast/signals/structural/dedent_to_zero_signal_test.exs new file mode 100644 index 00000000..ddf8702d --- /dev/null +++ b/test/codeqa/ast/signals/structural/dedent_to_zero_signal_test.exs @@ -0,0 +1,55 @@ +defmodule CodeQA.AST.Signals.Structural.DedentToZeroSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.DedentToZeroSignal + + defp split_count(code) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%DedentToZeroSignal{}], []) + length(for {_src, :split, :dedent_split, _v} <- emissions, do: true) + end + + test "no split in a single flat block (no indentation change)" do + code = "foo\nbar\nbaz\n" + assert split_count(code) == 0 + end + + test "emits split when first token of a new line at indent 0 after indented content" do + code = "def foo:\n return 1\ndef bar:\n" + assert split_count(code) == 1 + end + + test "does NOT emit when returning to indent 0 from same-level content (no prior indent)" do + code = "foo\nbar\n" + assert split_count(code) == 0 + end + + test "does NOT emit at the very start of file (seen_content == false)" do + code = "foo\n bar\n" + # The very first line has no prior indent, so no split should fire + assert split_count(code) == 0 + end + + test "handles multiple indented blocks with splits" do + code = "foo:\n x = 1\nbar:\n y = 2\nbaz:\n" + # split at "bar" and "baz" + assert split_count(code) == 2 + end + + test "does NOT split if current line also has indent (both lines indented)" do + code = "foo:\n x = 1\n y = 2\n" + assert split_count(code) == 0 + end + + test "emits split when a blank line separates an indented block from a new block at indent 0" do + code = "def foo:\n return 1\n\ndef bar:\n" + assert split_count(code) == 1 + end + + test "group/1 returns 
:split" do + assert Signal.group(%DedentToZeroSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/doc_comment_lead_signal_test.exs b/test/codeqa/ast/signals/structural/doc_comment_lead_signal_test.exs new file mode 100644 index 00000000..da269e8b --- /dev/null +++ b/test/codeqa/ast/signals/structural/doc_comment_lead_signal_test.exs @@ -0,0 +1,44 @@ +defmodule CodeQA.AST.Signals.Structural.DocCommentLeadSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.DocCommentLeadSignal + + defp split_values(code) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%DocCommentLeadSignal{}], []) + for {_src, :split, :doc_comment_split, v} <- emissions, do: v + end + + test "no split for first /// (seen_content == false at start of file)" do + assert split_values("/// doc\n") == [] + end + + test "emits split at /// after prior content (Rust/C# doc comment)" do + splits = split_values("fn foo() {}\n/// doc\n") + assert length(splits) == 1 + end + + test "emits split at /** after prior content (Java/JS JSDoc)" do + splits = split_values("function foo() {}\n/**\n * doc\n */\n") + assert length(splits) == 1 + end + + test "does NOT emit for // followed by identifier (regular line comment)" do + assert split_values("x = 1\n// regular comment\n") == [] + end + + test "does NOT emit for // that is not at line start" do + assert split_values("x = 1\nx // doc\n") == [] + end + + test "does NOT emit for / at line start when next is not *" do + assert split_values("x = 1\n/ something\n") == [] + end + + test "group is :split" do + assert Signal.group(%DocCommentLeadSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/keyword_signal_test.exs b/test/codeqa/ast/signals/structural/keyword_signal_test.exs new file mode 100644 index 
00000000..b269c408 --- /dev/null +++ b/test/codeqa/ast/signals/structural/keyword_signal_test.exs @@ -0,0 +1,38 @@ +defmodule CodeQA.AST.Signals.Structural.KeywordSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.KeywordSignal + alias CodeQA.Languages.Code.Vm.Elixir, as: ElixirLang + + defp split_values(code, lang_mod) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%KeywordSignal{}], lang_mod) + for {_src, :split, :keyword_split, v} <- emissions, do: v + end + + test "no split for single def" do + assert split_values("def foo\n x\nend\n", ElixirLang) == [] + end + + test "emits split at second def keyword at depth 0 indent 0" do + splits = split_values("def foo\n x\nend\ndef bar\n y\nend\n", ElixirLang) + assert length(splits) == 1 + end + + test "does not split on def inside a module (indented)" do + splits = split_values("defmodule Foo do\n def foo, do: 1\nend\n", ElixirLang) + assert splits == [] + end + + test "does not split on keyword inside brackets" do + splits = split_values("foo(def, bar)\n", ElixirLang) + assert splits == [] + end + + test "group is :split" do + assert Signal.group(%KeywordSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/sql_block_signal_test.exs b/test/codeqa/ast/signals/structural/sql_block_signal_test.exs new file mode 100644 index 00000000..5f89598a --- /dev/null +++ b/test/codeqa/ast/signals/structural/sql_block_signal_test.exs @@ -0,0 +1,60 @@ +defmodule CodeQA.AST.Signals.Structural.SQLBlockSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.SQLBlockSignal + alias CodeQA.Languages.Data.Sql + + defp split_values(code) do + tokens = 
TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%SQLBlockSignal{}], Sql) + for {_src, :split, :sql_block_split, v} <- emissions, do: v + end + + test "no split for the first statement (seen_content == false)" do + assert split_values("CREATE TABLE users (id INT);\n") == [] + end + + test "emits split at second CREATE TABLE DDL statement" do + code = "CREATE TABLE users (id INT);\nCREATE TABLE orders (id INT);\n" + splits = split_values(code) + assert length(splits) == 1 + end + + test "emits split at SELECT when a query follows other content" do + code = "CREATE TABLE users (id INT);\nSELECT id FROM users;\n" + splits = split_values(code) + assert length(splits) == 1 + end + + test "emits split at lowercase create (case-insensitive match)" do + code = "create table users (id INT);\ncreate table orders (id INT);\n" + splits = split_values(code) + assert length(splits) == 1 + end + + test "emits split at INSERT after prior content" do + code = "CREATE TABLE users (id INT);\nINSERT INTO users VALUES (1);\n" + splits = split_values(code) + assert length(splits) == 1 + end + + test "does NOT emit for SQL keyword mid-statement (not at line start)" do + # FROM is not at line start; only SELECT is, but it's the first statement + code = "SELECT id FROM users;\n" + splits = split_values(code) + assert splits == [] + end + + test "does NOT emit for non-SQL identifier at line start" do + code = "CREATE TABLE users (id INT);\nusername VARCHAR(255);\n" + splits = split_values(code) + assert splits == [] + end + + test "group/1 returns :split" do + assert Signal.group(%SQLBlockSignal{}) == :split + end +end diff --git a/test/codeqa/ast/signals/structural/triple_quote_signal_test.exs b/test/codeqa/ast/signals/structural/triple_quote_signal_test.exs new file mode 100644 index 00000000..2b840bd1 --- /dev/null +++ b/test/codeqa/ast/signals/structural/triple_quote_signal_test.exs @@ -0,0 +1,35 @@ +defmodule 
CodeQA.AST.Signals.Structural.TripleQuoteSignalTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Signal + alias CodeQA.AST.Parsing.SignalStream + alias CodeQA.AST.Signals.Structural.TripleQuoteSignal + + defp split_values(code) do + tokens = TokenNormalizer.normalize_structural(code) + [emissions] = SignalStream.run(tokens, [%TripleQuoteSignal{}], []) + for {_src, :split, :triple_split, v} <- emissions, do: v + end + + test "no splits for plain code" do + assert split_values("def foo\n :ok\nend\n") == [] + end + + test "emits two splits for a complete heredoc" do + code = "\"\"\"\nhello\n\"\"\"\n" + splits = split_values(code) + assert length(splits) == 2 + end + + test "emits one split for unclosed heredoc (mismatch tolerance)" do + # single token with no closing pair + code = "\"\"\"\nhello\n" + splits = split_values(code) + assert length(splits) == 1 + end + + test "group is :split" do + assert Signal.group(%TripleQuoteSignal{}) == :split + end +end diff --git a/test/codeqa/block_impact/codebase_impact_test.exs b/test/codeqa/block_impact/codebase_impact_test.exs new file mode 100644 index 00000000..55ef4b44 --- /dev/null +++ b/test/codeqa/block_impact/codebase_impact_test.exs @@ -0,0 +1,60 @@ +defmodule CodeQA.BlockImpact.CodebaseImpactTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.BlockImpact.CodebaseImpact + alias CodeQA.Engine.Analyzer + alias CodeQA.Languages.Unknown + + @content_a """ + defmodule A do + def foo do + x = 1 + 2 + y = x * 3 + x + y + end + + def bar do + :ok + end + end + """ + + @content_b """ + defmodule B do + def baz, do: :baz + end + """ + + defp files_map, do: %{"lib/a.ex" => @content_a, "lib/b.ex" => @content_b} + + defp first_block(content) do + tokens = TokenNormalizer.normalize_structural(content) + [first | _] = Parser.detect_blocks(tokens, Unknown) + first + end + + describe "compute/4" do 
+ test "returns a codebase aggregate map" do + node = first_block(@content_a) + result = CodebaseImpact.compute("lib/a.ex", @content_a, node, files_map()) + assert is_map(result) + # Should have at least one group with mean_ keys + all_keys = result |> Map.values() |> Enum.flat_map(&Map.keys/1) + assert Enum.any?(all_keys, &String.starts_with?(&1, "mean_")) + end + + test "produces a different aggregate than the baseline when a large node is removed" do + node = first_block(@content_a) + + if length(node.tokens) >= 10 do + baseline = Analyzer.analyze_codebase_aggregate(files_map()) + without = CodebaseImpact.compute("lib/a.ex", @content_a, node, files_map()) + # Not necessarily different in all keys, but result is valid + assert is_map(without) + assert is_map(baseline) + end + end + end +end diff --git a/test/codeqa/block_impact/file_impact_test.exs b/test/codeqa/block_impact/file_impact_test.exs new file mode 100644 index 00000000..b44f0a9d --- /dev/null +++ b/test/codeqa/block_impact/file_impact_test.exs @@ -0,0 +1,64 @@ +defmodule CodeQA.BlockImpact.FileImpactTest do + use ExUnit.Case, async: true + + alias CodeQA.AST.Lexing.TokenNormalizer + alias CodeQA.AST.Parsing.Parser + alias CodeQA.BlockImpact.FileImpact + alias CodeQA.Languages.Unknown + + @fixture_content """ + defmodule MyModule do + def foo do + x = 1 + y = 2 + x + y + end + + def bar do + :ok + end + end + """ + + defp get_first_block(content) do + tokens = TokenNormalizer.normalize_structural(content) + [first | _] = Parser.detect_blocks(tokens, Unknown) + first + end + + describe "compute/2" do + test "returns a metrics map when node has >= 10 tokens" do + node = get_first_block(@fixture_content) + + if length(node.tokens) >= 10 do + result = FileImpact.compute(@fixture_content, node) + assert is_map(result) + assert map_size(result) > 0 + end + end + + test "returns nil for a node with fewer than 10 tokens" do + # Create a tiny node by parsing very short content + tiny_content = "x = 1" + tokens 
= TokenNormalizer.normalize_structural(tiny_content) + nodes = Parser.detect_blocks(tokens, Unknown) + # Find or construct a node with < 10 tokens + small_nodes = Enum.filter(nodes, fn n -> length(n.tokens) < 10 end) + + if small_nodes != [] do + node = List.first(small_nodes) + assert FileImpact.compute(tiny_content, node) == nil + end + end + + test "reconstructed content does not contain the removed node's first token line" do + tokens = TokenNormalizer.normalize_structural(@fixture_content) + [node | _] = Parser.detect_blocks(tokens, Unknown) + # Only test if node is large enough + if length(node.tokens) >= 10 do + result = FileImpact.compute(@fixture_content, node) + assert is_map(result) + end + end + end +end diff --git a/test/codeqa/block_impact/refactoring_potentials_test.exs b/test/codeqa/block_impact/refactoring_potentials_test.exs new file mode 100644 index 00000000..8593dfc3 --- /dev/null +++ b/test/codeqa/block_impact/refactoring_potentials_test.exs @@ -0,0 +1,202 @@ +defmodule CodeQA.BlockImpact.RefactoringPotentialsTest do + use ExUnit.Case, async: true + + alias CodeQA.BlockImpact.RefactoringPotentials + alias CodeQA.CombinedMetrics.FileScorer + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Engine.Analyzer + + defp file_cosines(fm) do + fm + |> FileScorer.file_to_aggregate() + |> SampleRunner.diagnose_aggregate(top: 99_999) + end + + describe "compute/5" do + test "returns a list of maps with category, behavior, cosine_delta" do + content = """ + defmodule Foo do + def bar(a, b, c) do + if a do + if b do + if c do + :nested + end + end + end + end + end + """ + + baseline_fm = Analyzer.analyze_file("lib/foo.ex", content) + simple = "defmodule Foo do\n def bar, do: :ok\nend\n" + without_fm = Analyzer.analyze_file("lib/foo.ex", simple) + + files = %{"lib/foo.ex" => content} + baseline_agg = Analyzer.analyze_codebase_aggregate(files) + without_agg = Analyzer.analyze_codebase_aggregate(%{"lib/foo.ex" => simple}) + + baseline_file_cosines = 
file_cosines(baseline_fm) + baseline_codebase_cosines = SampleRunner.diagnose_aggregate(baseline_agg, top: 99_999) + + result = + RefactoringPotentials.compute( + baseline_file_cosines, + without_fm, + baseline_codebase_cosines, + without_agg + ) + + assert is_list(result) + + Enum.each(result, fn item -> + assert Map.has_key?(item, "category") + assert Map.has_key?(item, "behavior") + assert Map.has_key?(item, "cosine_delta") + assert is_binary(item["category"]) + assert is_binary(item["behavior"]) + assert is_float(item["cosine_delta"]) + end) + end + + test "returns at most top N results (default 3)" do + content = "defmodule A do\n def foo, do: 1\nend\n" + fm = Analyzer.analyze_file("lib/a.ex", content) + agg = Analyzer.analyze_codebase_aggregate(%{"lib/a.ex" => content}) + + baseline_file_cosines = file_cosines(fm) + baseline_codebase_cosines = SampleRunner.diagnose_aggregate(agg, top: 99_999) + + result = + RefactoringPotentials.compute(baseline_file_cosines, fm, baseline_codebase_cosines, agg) + + assert length(result) <= 3 + end + + test "respects top: N option" do + content = "defmodule A do\n def foo, do: 1\nend\n" + fm = Analyzer.analyze_file("lib/a.ex", content) + agg = Analyzer.analyze_codebase_aggregate(%{"lib/a.ex" => content}) + + baseline_file_cosines = file_cosines(fm) + baseline_codebase_cosines = SampleRunner.diagnose_aggregate(agg, top: 99_999) + + result = + RefactoringPotentials.compute(baseline_file_cosines, fm, baseline_codebase_cosines, agg, + top: 5 + ) + + assert length(result) <= 5 + end + + test "results are sorted descending by cosine_delta" do + content = "defmodule A do\n def foo, do: 1\nend\n" + fm = Analyzer.analyze_file("lib/a.ex", content) + agg = Analyzer.analyze_codebase_aggregate(%{"lib/a.ex" => content}) + + baseline_file_cosines = file_cosines(fm) + baseline_codebase_cosines = SampleRunner.diagnose_aggregate(agg, top: 99_999) + + result = + RefactoringPotentials.compute(baseline_file_cosines, fm, baseline_codebase_cosines, 
agg, + top: 99 + ) + + deltas = Enum.map(result, & &1["cosine_delta"]) + assert deltas == Enum.sort(deltas, :desc) + end + + test "skips behaviors whose _excludes_block_types includes block_type" do + content = """ + defmodule Foo do + def bar(a, b, c) do + if a do + if b do + if c do + :nested + end + end + end + end + end + """ + + baseline_fm = Analyzer.analyze_file("lib/foo.ex", content) + simple = "defmodule Foo do\n def bar, do: :ok\nend\n" + without_fm = Analyzer.analyze_file("lib/foo.ex", simple) + + files = %{"lib/foo.ex" => content} + baseline_agg = Analyzer.analyze_codebase_aggregate(files) + without_agg = Analyzer.analyze_codebase_aggregate(%{"lib/foo.ex" => simple}) + + baseline_file_cosines = file_cosines(baseline_fm) + baseline_codebase_cosines = SampleRunner.diagnose_aggregate(baseline_agg, top: 99_999) + + behavior_map = %{ + "function_design" => [ + {"cyclomatic_complexity_under_10", %{"_excludes_block_types" => ["module"]}} + ] + } + + result_unfiltered = + RefactoringPotentials.compute( + baseline_file_cosines, + without_fm, + baseline_codebase_cosines, + without_agg, + top: 99_999 + ) + + result_module = + RefactoringPotentials.compute( + baseline_file_cosines, + without_fm, + baseline_codebase_cosines, + without_agg, + top: 99_999, + block_type: :module, + behavior_map: behavior_map + ) + + result_function = + RefactoringPotentials.compute( + baseline_file_cosines, + without_fm, + baseline_codebase_cosines, + without_agg, + top: 99_999, + block_type: :function, + behavior_map: behavior_map + ) + + excluded_present? 
= fn result -> + Enum.any?(result, fn p -> + p["category"] == "function_design" and p["behavior"] == "cyclomatic_complexity_under_10" + end) + end + + assert excluded_present?.(result_unfiltered), + "test premise: excluded behavior must appear when no filter is set" + + refute excluded_present?.(result_module), + "behavior should be filtered out for :module block" + + assert excluded_present?.(result_function), + "behavior should remain for :function block (not in negative list)" + end + + test "no block_type option means no filtering (backwards compat)" do + content = "defmodule A do\n def foo, do: 1\nend\n" + fm = Analyzer.analyze_file("lib/a.ex", content) + agg = Analyzer.analyze_codebase_aggregate(%{"lib/a.ex" => content}) + + baseline_file_cosines = file_cosines(fm) + baseline_codebase_cosines = SampleRunner.diagnose_aggregate(agg, top: 99_999) + + result = + RefactoringPotentials.compute(baseline_file_cosines, fm, baseline_codebase_cosines, agg) + + assert is_list(result) + end + end +end diff --git a/test/codeqa/block_impact_analyzer_test.exs b/test/codeqa/block_impact_analyzer_test.exs new file mode 100644 index 00000000..e793f088 --- /dev/null +++ b/test/codeqa/block_impact_analyzer_test.exs @@ -0,0 +1,108 @@ +defmodule CodeQA.BlockImpactAnalyzerTest do + # async: false because the orchestrator uses Task.async_stream internally + use ExUnit.Case, async: false + + alias CodeQA.BlockImpactAnalyzer + alias CodeQA.Engine.Analyzer + + @fixture_content """ + defmodule MyModule do + def foo do + x = 1 + y = 2 + x + y + end + + def bar do + :ok + end + end + """ + + describe "analyze/3" do + test "adds 'nodes' key to each file entry in the pipeline result" do + files = %{"lib/my_module.ex" => @fixture_content} + pipeline_result = Analyzer.analyze_codebase(files) + + result = BlockImpactAnalyzer.analyze(pipeline_result, files) + + assert Map.has_key?(result, "files") + assert Map.has_key?(result["files"], "lib/my_module.ex") + file_data = 
result["files"]["lib/my_module.ex"] + assert Map.has_key?(file_data, "nodes") + assert is_list(file_data["nodes"]) + end + + test "each node has required fields" do + files = %{"lib/my_module.ex" => @fixture_content} + pipeline_result = Analyzer.analyze_codebase(files) + result = BlockImpactAnalyzer.analyze(pipeline_result, files) + + nodes = result["files"]["lib/my_module.ex"]["nodes"] + + Enum.each(nodes, fn node -> + assert Map.has_key?(node, "start_line") + assert Map.has_key?(node, "end_line") + assert Map.has_key?(node, "column_start") + assert Map.has_key?(node, "char_length") + assert Map.has_key?(node, "type") + assert Map.has_key?(node, "token_count") + assert Map.has_key?(node, "refactoring_potentials") + assert Map.has_key?(node, "children") + assert is_list(node["refactoring_potentials"]) + assert is_list(node["children"]) + end) + end + + test "nodes are sorted by start_line ascending" do + files = %{"lib/my_module.ex" => @fixture_content} + pipeline_result = Analyzer.analyze_codebase(files) + result = BlockImpactAnalyzer.analyze(pipeline_result, files) + + nodes = result["files"]["lib/my_module.ex"]["nodes"] + start_lines = Enum.map(nodes, & &1["start_line"]) + assert start_lines == Enum.sort(start_lines) + end + + test "preserves existing 'codebase' key in pipeline result" do + files = %{"lib/my_module.ex" => @fixture_content} + pipeline_result = Analyzer.analyze_codebase(files) + result = BlockImpactAnalyzer.analyze(pipeline_result, files) + + assert Map.has_key?(result, "codebase") + assert result["codebase"] == pipeline_result["codebase"] + end + + test "nodes_top option limits refactoring_potentials per node" do + files = %{"lib/my_module.ex" => @fixture_content} + pipeline_result = Analyzer.analyze_codebase(files) + result = BlockImpactAnalyzer.analyze(pipeline_result, files, nodes_top: 1) + + nodes = result["files"]["lib/my_module.ex"]["nodes"] + + Enum.each(nodes, fn node -> + assert length(node["refactoring_potentials"]) <= 1 + end) + end + 
+ test "node['type'] reflects classified block kind, not the always-:code default" do + content = """ + defmodule Foo do + @moduledoc "hi" + + def bar, do: :ok + end + """ + + files = %{"lib/foo.ex" => content} + pipeline_result = Analyzer.analyze_codebase(files) + result = BlockImpactAnalyzer.analyze(pipeline_result, files) + + nodes = result["files"]["lib/foo.ex"]["nodes"] + types = nodes |> Enum.map(& &1["type"]) |> Enum.uniq() + + assert "module" in types or "function" in types or "doc" in types, + "expected real classification, got only: #{inspect(types)}" + end + end +end diff --git a/test/codeqa/block_matcher_test.exs b/test/codeqa/block_matcher_test.exs new file mode 100644 index 00000000..da55a099 --- /dev/null +++ b/test/codeqa/block_matcher_test.exs @@ -0,0 +1,37 @@ +defmodule Test.NodeMatcherTest do + use ExUnit.Case, async: true + + alias Test.NodeMatcher + + describe "exact/2" do + test "returns tagged tuple for :content field" do + assert {:exact, :content, "add"} = NodeMatcher.exact(:content, "add") + end + + test "returns tagged tuple for :value field" do + assert {:exact, :value, "identifier"} = NodeMatcher.exact(:value, "identifier") + end + + test "raises FunctionClauseError for unsupported field" do + assert_raise FunctionClauseError, fn -> + NodeMatcher.exact(:type, "something") + end + end + end + + describe "partial/2" do + test "returns tagged tuple for :content field" do + assert {:partial, :content, "@doc"} = NodeMatcher.partial(:content, "@doc") + end + + test "returns tagged tuple for :value field" do + assert {:partial, :value, "doc"} = NodeMatcher.partial(:value, "doc") + end + + test "raises FunctionClauseError for unsupported field" do + assert_raise FunctionClauseError, fn -> + NodeMatcher.partial(:type, "something") + end + end + end +end diff --git a/test/codeqa/cli_compare_test.exs b/test/codeqa/cli_compare_test.exs deleted file mode 100644 index f43578af..00000000 --- a/test/codeqa/cli_compare_test.exs +++ /dev/null @@ -1,85 
+0,0 @@ -defmodule CodeQA.CLI.CompareTest do - use ExUnit.Case, async: true - - @moduletag :tmp_dir - - setup %{tmp_dir: tmp_dir} do - # Initialize a git repo with one source file and one non-source file - System.cmd("git", ["init"], cd: tmp_dir) - System.cmd("git", ["config", "user.email", "test@test.com"], cd: tmp_dir) - System.cmd("git", ["config", "user.name", "Test"], cd: tmp_dir) - - File.mkdir_p!(Path.join(tmp_dir, "lib")) - File.write!(Path.join(tmp_dir, "lib/app.ex"), "defmodule App do\nend") - System.cmd("git", ["add", "."], cd: tmp_dir) - System.cmd("git", ["commit", "-m", "initial"], cd: tmp_dir) - - %{repo: tmp_dir} - end - - describe "compare with github format" do - test "file changes section shows actual file count when source files changed", %{repo: repo} do - File.write!(Path.join(repo, "lib/app.ex"), """ - defmodule App do - def hello, do: :world - def goodbye, do: :world - end - """) - - System.cmd("git", ["add", "."], cd: repo) - System.cmd("git", ["commit", "-m", "update app"], cd: repo) - - stdout = - ExUnit.CaptureIO.capture_io(fn -> - ExUnit.CaptureIO.capture_io(:stderr, fn -> - CodeQA.CLI.main(["compare", repo, "--base-ref", "HEAD~1", "--format", "github"]) - end) - end) - - assert stdout =~ "File changes — 1 modified" - refute stdout =~ "File changes — no changes" - end - end - - describe "compare with no source file changes" do - test "exits 0 when only non-source files changed", %{repo: repo} do - # Create a branch, change only a .md file (not a source file) - File.write!(Path.join(repo, "README.md"), "# Hello") - System.cmd("git", ["add", "."], cd: repo) - System.cmd("git", ["commit", "-m", "add readme"], cd: repo) - - # compare should succeed (not crash) when no source files changed - {base_ref, head_ref} = {"HEAD~1", "HEAD"} - - changes = CodeQA.Git.changed_files(repo, base_ref, head_ref) - assert changes == [], "expected no source file changes, got: #{inspect(changes)}" - - # Verify the CLI handles this gracefully by calling main - 
# Capture stderr to verify the message - output = - ExUnit.CaptureIO.capture_io(:stderr, fn -> - CodeQA.CLI.main(["compare", repo, "--base-ref", base_ref, "--changes-only", "--format", "json"]) - end) - - assert output =~ "No source files changed" - end - - test "outputs valid JSON with empty comparison", %{repo: repo} do - # Change only a non-source file - File.write!(Path.join(repo, "README.md"), "# Hello") - System.cmd("git", ["add", "."], cd: repo) - System.cmd("git", ["commit", "-m", "add readme"], cd: repo) - - # Capture stdout (the JSON output) to verify it's valid - stdout = - ExUnit.CaptureIO.capture_io(fn -> - ExUnit.CaptureIO.capture_io(:stderr, fn -> - CodeQA.CLI.main(["compare", repo, "--base-ref", "HEAD~1", "--changes-only", "--format", "json"]) - end) - end) - - assert {:ok, result} = Jason.decode(stdout) - assert result["metadata"]["total_files_compared"] == 0 - end - end -end diff --git a/test/codeqa/cli_test.exs b/test/codeqa/cli_test.exs index 2f2b51f6..9abd9911 100644 --- a/test/codeqa/cli_test.exs +++ b/test/codeqa/cli_test.exs @@ -1,11 +1,17 @@ defmodule CodeQA.CLITest do - use ExUnit.Case, async: true + use ExUnit.Case, async: false - @moduletag :tmp_dir - - setup %{tmp_dir: tmp_dir} do + setup do + CodeQA.Config.reset() + tmp_dir = Path.join(System.tmp_dir!(), "codeqa_test_#{System.unique_integer([:positive])}") File.mkdir_p!(Path.join(tmp_dir, "lib")) File.write!(Path.join(tmp_dir, "lib/app.ex"), "defmodule App do\nend\n") + + on_exit(fn -> + CodeQA.Config.reset() + File.rm_rf!(tmp_dir) + end) + %{dir: tmp_dir} end @@ -19,23 +25,20 @@ defmodule CodeQA.CLITest do - ignored/** """) - output = - ExUnit.CaptureIO.capture_io(:stderr, fn -> - CodeQA.CLI.main(["analyze", dir]) - end) + json = CodeQA.CLI.main(["analyze", dir, "--show-files"]) + report = Jason.decode!(json) - # The ignored file should not be counted - refute output =~ "secret.ex" - assert output =~ "Analyzing 1 files" + # total_files == 1 proves the ignored file was excluded (setup 
has exactly 2 files) + assert report["metadata"]["total_files"] == 1 + # file paths confirm secret.ex is absent + refute Map.has_key?(report["files"], Path.join(dir, "ignored/secret.ex")) end test "works normally when .codeqa.yml is absent", %{dir: dir} do - output = - ExUnit.CaptureIO.capture_io(:stderr, fn -> - CodeQA.CLI.main(["analyze", dir]) - end) + json = CodeQA.CLI.main(["analyze", dir]) + report = Jason.decode!(json) - assert output =~ "Analyzing 1 files" + assert report["metadata"]["total_files"] == 1 end test "config file and --ignore-paths are merged additively", %{dir: dir} do @@ -49,13 +52,11 @@ defmodule CodeQA.CLITest do - ignored_by_config/** """) - output = - ExUnit.CaptureIO.capture_io(:stderr, fn -> - CodeQA.CLI.main(["analyze", dir, "--ignore-paths", "ignored_by_flag/**"]) - end) + json = CodeQA.CLI.main(["analyze", dir, "--ignore-paths", "ignored_by_flag/**"]) + report = Jason.decode!(json) - # Only lib/app.ex should be analyzed - assert output =~ "Analyzing 1 files" + # Only lib/app.ex should be analyzed — both ignore sources must apply + assert report["metadata"]["total_files"] == 1 end end end diff --git a/test/codeqa/collector_test.exs b/test/codeqa/collector_test.exs index 0a2a3f5c..f2aeb599 100644 --- a/test/codeqa/collector_test.exs +++ b/test/codeqa/collector_test.exs @@ -1,7 +1,12 @@ defmodule CodeQA.CollectorTest do - use ExUnit.Case, async: true + use ExUnit.Case, async: false - alias CodeQA.Collector + alias CodeQA.Engine.Collector + + setup do + CodeQA.Config.reset() + on_exit(&CodeQA.Config.reset/0) + end describe "ignored?/2" do test "matches simple wildcard pattern" do @@ -93,7 +98,7 @@ defmodule CodeQA.CollectorTest do %{path: "lib/bar.ex", status: "modified"} ] - result = Collector.reject_ignored(items, ["test/*"], & &1.path) + result = Collector.reject_ignored(items, & &1.path, ["test/*"]) assert length(result) == 1 assert hd(result).path == "lib/bar.ex" @@ -101,7 +106,32 @@ defmodule CodeQA.CollectorTest do test "empty 
patterns returns list unchanged" do items = [%{path: "test/foo.ex"}] - assert Collector.reject_ignored(items, [], & &1.path) == items + assert Collector.reject_ignored(items, & &1.path, []) == items + end + end + + describe "collect_files/2 respects .gitignore" do + setup do + tmp_dir = + Path.join(System.tmp_dir!(), "codeqa_git_collector_#{System.unique_integer([:positive])}") + + File.mkdir_p!(Path.join(tmp_dir, "lib")) + System.cmd("git", ["init"], cd: tmp_dir) + System.cmd("git", ["config", "user.email", "test@test.com"], cd: tmp_dir) + System.cmd("git", ["config", "user.name", "Test"], cd: tmp_dir) + File.write!(Path.join(tmp_dir, "lib/app.ex"), "defmodule App do\nend") + File.write!(Path.join(tmp_dir, "lib/generated.ex"), "defmodule Gen do\nend") + File.write!(Path.join(tmp_dir, ".gitignore"), "lib/generated.ex\n") + + on_exit(fn -> File.rm_rf!(tmp_dir) end) + + %{tmp_dir: tmp_dir} + end + + test "excludes files listed in .gitignore", %{tmp_dir: tmp_dir} do + files = Collector.collect_files(tmp_dir) + assert Map.has_key?(files, "lib/app.ex") + refute Map.has_key?(files, "lib/generated.ex") end end @@ -125,9 +155,21 @@ defmodule CodeQA.CollectorTest do end test "with ignore patterns excludes matching files", %{tmp_dir: tmp_dir} do - files = Collector.collect_files(tmp_dir, ignore_patterns: ["test/*"]) + files = Collector.collect_files(tmp_dir, ["test/*"]) assert Map.has_key?(files, "lib/app.ex") refute Map.has_key?(files, "test/app_test.exs") end + + test "respects ignore_paths from .codeqa.yml", %{tmp_dir: tmp_dir} do + File.mkdir_p!(Path.join(tmp_dir, "generated")) + File.write!(Path.join(tmp_dir, "generated/schema.ex"), "defmodule Schema do\nend") + File.write!(Path.join(tmp_dir, ".codeqa.yml"), "ignore_paths:\n - generated/**\n") + + CodeQA.Config.load(tmp_dir) + files = Collector.collect_files(tmp_dir) + + assert Map.has_key?(files, "lib/app.ex") + refute Map.has_key?(files, "generated/schema.ex") + end end end diff --git 
a/test/codeqa/combined_metrics/file_scorer_test.exs b/test/codeqa/combined_metrics/file_scorer_test.exs new file mode 100644 index 00000000..55ef9334 --- /dev/null +++ b/test/codeqa/combined_metrics/file_scorer_test.exs @@ -0,0 +1,295 @@ +defmodule CodeQA.CombinedMetrics.FileScorerTest do + use ExUnit.Case, async: true + + alias CodeQA.CombinedMetrics.FileScorer + + describe "file_to_aggregate/1" do + test "prefixes each key with mean_" do + input = %{"halstead" => %{"tokens" => 42.0, "effort" => 100.5}} + + assert FileScorer.file_to_aggregate(input) == %{ + "halstead" => %{"mean_tokens" => 42.0, "mean_effort" => 100.5} + } + end + + test "handles multiple groups" do + input = %{ + "halstead" => %{"tokens" => 10.0}, + "branching" => %{"branching_density" => 0.5} + } + + result = FileScorer.file_to_aggregate(input) + + assert result == %{ + "halstead" => %{"mean_tokens" => 10.0}, + "branching" => %{"mean_branching_density" => 0.5} + } + end + + test "returns empty map for empty input" do + assert FileScorer.file_to_aggregate(%{}) == %{} + end + + test "preserves values unchanged" do + input = %{"entropy" => %{"normalized_entropy" => 0.87}} + result = FileScorer.file_to_aggregate(input) + assert get_in(result, ["entropy", "mean_normalized_entropy"]) == 0.87 + end + end + + describe "worst_files_per_behavior/2" do + test "returns a map with string keys in category.behavior format" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map, combined_top: 2) + + assert is_map(result) + + for {key, entries} <- result do + assert is_binary(key) + assert String.contains?(key, ".") + assert is_list(entries) + end + end + + test "each entry has file and cosine keys" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map, combined_top: 2) + + for {_key, entries} <- result do + for entry <- entries do + assert Map.has_key?(entry, :file) + assert Map.has_key?(entry, :cosine) + assert is_binary(entry.file) + 
assert is_float(entry.cosine) + end + end + end + + test "respects combined_top limit" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map, combined_top: 1) + + for {_key, entries} <- result do + assert length(entries) <= 1 + end + end + + test "entries are sorted ascending by cosine (most negative first)" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map, combined_top: 99) + + for {_key, entries} <- result do + cosines = Enum.map(entries, & &1.cosine) + assert cosines == Enum.sort(cosines) + end + end + + test "skips files with empty metrics" do + files_map = %{ + "lib/empty.ex" => %{"metrics" => %{}, "lines" => 10}, + "lib/nokey.ex" => %{"lines" => 5} + } + + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result do + file_paths = Enum.map(entries, & &1.file) + refute "lib/empty.ex" in file_paths + refute "lib/nokey.ex" in file_paths + end + end + + test "uses default combined_top of 2" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result do + assert length(entries) <= 2 + end + end + + test "each entry has top_metrics key" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert Map.has_key?(entry, :top_metrics), "missing :top_metrics in #{inspect(entry)}" + end + end + + test "top_metrics is a list" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert is_list(entry.top_metrics) + end + end + + test "top_metrics is [] (not nil) when all contributions are zero" do + # Single file with no variation — cosines will be near 0 + files_map = %{ + "lib/zero.ex" => %{ + "metrics" => %{ + "halstead" => %{"tokens" => 0.0} + }, + "lines" => 1, + "bytes" => 5 + } + } + + result = 
FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert entry.top_metrics == [] + end + end + + test "each entry has top_nodes key" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert Map.has_key?(entry, :top_nodes), "missing :top_nodes in #{inspect(entry)}" + end + end + + test "top_nodes is [] when file_data has no nodes key" do + files_map = build_files_map() + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert entry.top_nodes == [] + end + end + + test "top_nodes is [] when file_data nodes is nil" do + files_map = + build_files_map() + |> Map.new(fn {path, data} -> {path, Map.put(data, "nodes", nil)} end) + + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert entry.top_nodes == [] + end + end + + test "top_nodes is [] when file_data nodes is []" do + files_map = + build_files_map() + |> Map.new(fn {path, data} -> {path, Map.put(data, "nodes", [])} end) + + result = FileScorer.worst_files_per_behavior(files_map) + + for {_key, entries} <- result, entry <- entries do + assert entry.top_nodes == [] + end + end + end + + describe "worst_files_per_behavior/2 language filtering" do + test "does not include rust-only behaviors when scoring an elixir file" do + fake_metrics = %{"halstead" => %{"tokens" => 100.0, "difficulty" => 5.0}} + files_map = %{"lib/foo.ex" => %{"metrics" => fake_metrics}} + + results = FileScorer.worst_files_per_behavior(files_map) + + # Any behavior that only applies to rust should not have this .ex file in results + rust_only_keys = + Enum.filter(results, fn {key, entries} -> + [cat, beh] = String.split(key, ".", parts: 2) + yaml_path = "priv/combined_metrics/#{cat}.yml" + + case YamlElixir.read_from_file(yaml_path) do + {:ok, data} -> + langs = 
get_in(data, [beh, "_languages"]) || [] + langs != [] and "elixir" not in langs and entries != [] + + _ -> + false + end + end) + + assert rust_only_keys == [] + end + end + + # Build a realistic files_map using a real project file so diagnose_aggregate + # has real metric values to work with. We use a small fixed map rather than + # running the full analyzer to keep tests fast. + defp build_files_map do + %{ + "lib/example_a.ex" => %{ + "metrics" => %{ + "halstead" => %{ + "tokens" => 80.0, + "vocabulary" => 30.0, + "volume" => 400.0, + "difficulty" => 12.0, + "effort" => 4800.0, + "bugs" => 0.1 + }, + "branching" => %{ + "branching_density" => 0.3 + }, + "entropy" => %{ + "normalized_entropy" => 0.75 + }, + "function_metrics" => %{ + "avg_function_length" => 20.0, + "max_function_length" => 40.0, + "function_count" => 5.0, + "avg_params" => 2.0, + "max_params" => 4.0 + }, + "readability" => %{ + "readability_score" => 0.6 + }, + "indentation" => %{ + "avg_indent_level" => 2.0, + "max_indent_level" => 4.0, + "indent_variance" => 0.5 + } + }, + "lines" => 100, + "bytes" => 2048 + }, + "lib/example_b.ex" => %{ + "metrics" => %{ + "halstead" => %{ + "tokens" => 200.0, + "vocabulary" => 60.0, + "volume" => 1200.0, + "difficulty" => 30.0, + "effort" => 36_000.0, + "bugs" => 0.4 + }, + "branching" => %{ + "branching_density" => 0.7 + }, + "entropy" => %{ + "normalized_entropy" => 0.9 + }, + "function_metrics" => %{ + "avg_function_length" => 50.0, + "max_function_length" => 120.0, + "function_count" => 15.0, + "avg_params" => 4.0, + "max_params" => 8.0 + }, + "readability" => %{ + "readability_score" => 0.3 + }, + "indentation" => %{ + "avg_indent_level" => 4.0, + "max_indent_level" => 8.0, + "indent_variance" => 2.0 + } + }, + "lines" => 300, + "bytes" => 8192 + } + } + end +end diff --git a/test/codeqa/combined_metrics/sample_runner_test.exs b/test/codeqa/combined_metrics/sample_runner_test.exs new file mode 100644 index 00000000..692c306a --- /dev/null +++ 
b/test/codeqa/combined_metrics/sample_runner_test.exs @@ -0,0 +1,148 @@ +defmodule CodeQA.CombinedMetrics.SampleRunnerTest do + use ExUnit.Case + + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.HealthReport.Grader + + setup_all do + results = SampleRunner.run(category: "variable_naming", verbose: true) + %{results: results} + end + + describe "apply_languages/1" do + test "returns one entry per requested category" do + stats = SampleRunner.apply_languages(category: "variable_naming") + assert length(stats) == 1 + [entry] = stats + assert entry.category == "variable_naming" + assert is_integer(entry.behaviors_with_languages) + end + + test "writes _languages to behaviors that have samples" do + SampleRunner.apply_languages(category: "variable_naming") + {:ok, data} = YamlElixir.read_from_file("priv/combined_metrics/variable_naming.yml") + langs = get_in(data, ["name_is_generic", "_languages"]) + assert is_list(langs) + assert langs != [] + assert Enum.all?(langs, &is_binary/1) + end + + test "behaviors without sample dirs get no _languages key" do + SampleRunner.apply_languages(category: "variable_naming") + {:ok, data} = YamlElixir.read_from_file("priv/combined_metrics/variable_naming.yml") + + Enum.each(data, fn {_behavior, groups} -> + if is_map(groups) do + case Map.get(groups, "_languages") do + nil -> :ok + langs -> assert is_list(langs) and langs != [] + end + end + end) + end + + test "only includes languages with both good and bad samples" do + # uses code_smells which has single-language behaviors + SampleRunner.apply_languages(category: "code_smells") + {:ok, data} = YamlElixir.read_from_file("priv/combined_metrics/code_smells.yml") + + # no_dead_code_after_return has only .ex samples + langs = get_in(data, ["no_dead_code_after_return", "_languages"]) + assert langs == ["elixir"] + end + end + + describe "diagnose_aggregate/2 language option" do + test "accepts :language option 
without crashing" do + # minimal aggregate — behavior will be scored but most will have no scalars + agg = %{} + result = SampleRunner.diagnose_aggregate(agg, top: 5, language: "elixir") + assert is_list(result) + end + + test "accepts :languages option without crashing" do + agg = %{} + result = SampleRunner.diagnose_aggregate(agg, top: 5, languages: ["elixir", "rust"]) + assert is_list(result) + end + + # NOTE: This test uses `<=` intentionally. Before Task 7 + `mix compile --force`, + # all behaviors have empty `_languages` in the compiled cache, so no filtering + # occurs and all three counts are equal. The `<=` assertion passes in both + # pre- and post-Task-7 states. + test "with language option returns subset of unfiltered results" do + agg = + "priv/combined_metrics/samples/variable_naming/name_is_generic/bad" + |> Collector.collect_files() + |> Analyzer.analyze_codebase() + |> get_in(["codebase", "aggregate"]) + + all = SampleRunner.diagnose_aggregate(agg, top: 999) + elixir_only = SampleRunner.diagnose_aggregate(agg, top: 999, language: "elixir") + rust_only = SampleRunner.diagnose_aggregate(agg, top: 999, language: "rust") + + # Filtered sets are subsets (or equal, pre-Task-7) of unfiltered + assert length(elixir_only) <= length(all) + assert length(rust_only) <= length(all) + end + end + + describe "score_aggregate/2 language filtering" do + test "accepts :languages option without crashing" do + result = SampleRunner.score_aggregate(%{}, languages: ["elixir"]) + assert is_list(result) + assert Enum.all?(result, &Map.has_key?(&1, :behaviors)) + end + + test "with languages option returns fewer behaviors than unfiltered" do + agg = + "priv/combined_metrics/samples/variable_naming/name_is_generic/bad" + |> Collector.collect_files() + |> Analyzer.analyze_codebase() + |> get_in(["codebase", "aggregate"]) + + all_count = SampleRunner.score_aggregate(agg) |> Enum.flat_map(& &1.behaviors) |> length() + + elixir_count = + SampleRunner.score_aggregate(agg, 
languages: ["elixir"]) + |> Enum.flat_map(& &1.behaviors) + |> length() + + # elixir-only project sees fewer or equal behaviors + assert elixir_count <= all_count + end + end + + describe "grade_cosine_categories/3" do + test "returns a list for empty input" do + result = Grader.grade_cosine_categories(%{}, %{}) + assert is_list(result) + end + end + + describe "run/1" do + test "returns a list of results with required keys", %{results: results} do + assert is_list(results) + assert results != [] + result = hd(results) + assert Map.has_key?(result, :bad_score) + assert Map.has_key?(result, :good_score) + assert Map.has_key?(result, :ratio) + assert Map.has_key?(result, :direction_ok) + end + + test "name_is_generic result has good_score > bad_score", %{results: results} do + generic = Enum.find(results, &(&1.behavior == "name_is_generic")) + assert generic != nil + assert generic.good_score > generic.bad_score + end + + test "verbose: true populates metric_detail", %{results: results} do + [result | _] = results + assert is_list(result.metric_detail) + # only populated when behavior has scalars configured + end + end +end diff --git a/test/codeqa/combined_metrics/scorer_test.exs b/test/codeqa/combined_metrics/scorer_test.exs new file mode 100644 index 00000000..42713dd6 --- /dev/null +++ b/test/codeqa/combined_metrics/scorer_test.exs @@ -0,0 +1,29 @@ +defmodule CodeQA.CombinedMetrics.ScorerTest do + use ExUnit.Case, async: true + + alias CodeQA.CombinedMetrics.Scorer + + describe "referenced_file_metric_names/0" do + test "returns a MapSet" do + assert %MapSet{} = Scorer.referenced_file_metric_names() + end + + test "contains heavy hitters that obviously appear in YAMLs" do + set = Scorer.referenced_file_metric_names() + + for name <- ~w[halstead ngram entropy branching readability] do + assert MapSet.member?(set, name), + "expected #{name} in referenced file metric names" + end + end + + test "excludes meta keys (anything starting with _)" do + set = 
Scorer.referenced_file_metric_names() + + for name <- set do + refute String.starts_with?(name, "_"), + "meta key leaked into referenced metrics: #{inspect(name)}" + end + end + end +end diff --git a/test/codeqa/config_test.exs b/test/codeqa/config_test.exs new file mode 100644 index 00000000..fced9036 --- /dev/null +++ b/test/codeqa/config_test.exs @@ -0,0 +1,104 @@ +defmodule CodeQA.ConfigTest do + use ExUnit.Case, async: false + + alias CodeQA.Config + + setup do + Config.reset() + on_exit(&Config.reset/0) + end + + describe "load/1 and accessors" do + test "returns defaults when no .codeqa.yml exists" do + dir = System.tmp_dir!() + Config.load(dir) + + assert Config.ignore_paths() == [] + assert Config.combined_top() == 2 + assert Config.cosine_significance_threshold() == 0.15 + assert Config.near_duplicate_blocks_opts() == [] + assert is_map(Config.impact_map()) + assert Map.get(Config.impact_map(), "complexity") == 5 + end + + test "loads ignore_paths from .codeqa.yml" do + dir = + tmp_dir_with_config(""" + ignore_paths: + - priv/** + - docs/** + """) + + Config.load(dir) + + assert Config.ignore_paths() == ["priv/**", "docs/**"] + end + + test "loads impact overrides" do + dir = + tmp_dir_with_config(""" + impact: + complexity: 10 + documentation: 3 + """) + + Config.load(dir) + + assert Config.impact_map()["complexity"] == 10 + assert Config.impact_map()["documentation"] == 3 + assert Config.impact_map()["function_design"] == 4 + end + + test "loads combined_top" do + dir = tmp_dir_with_config("combined_top: 5\n") + Config.load(dir) + assert Config.combined_top() == 5 + end + + test "loads cosine_significance_threshold" do + dir = tmp_dir_with_config("cosine_significance_threshold: 0.25\n") + Config.load(dir) + assert Config.cosine_significance_threshold() == 0.25 + end + + test "loads near_duplicate_blocks opts" do + dir = + tmp_dir_with_config(""" + near_duplicate_blocks: + max_pairs_per_bucket: 25 + """) + + Config.load(dir) + + assert 
Config.near_duplicate_blocks_opts() == [max_pairs_per_bucket: 25] + end + + test "caches: second load/1 call is a no-op" do + dir1 = tmp_dir_with_config("combined_top: 7\n") + dir2 = tmp_dir_with_config("combined_top: 3\n") + + Config.load(dir1) + Config.load(dir2) + + assert Config.combined_top() == 7 + end + + test "reset/0 clears cache so load/1 works again" do + dir1 = tmp_dir_with_config("combined_top: 7\n") + dir2 = tmp_dir_with_config("combined_top: 3\n") + + Config.load(dir1) + Config.reset() + Config.load(dir2) + + assert Config.combined_top() == 3 + end + end + + defp tmp_dir_with_config(yaml) do + dir = Path.join(System.tmp_dir!(), "codeqa_config_test_#{System.unique_integer([:positive])}") + File.mkdir_p!(dir) + File.write!(Path.join(dir, ".codeqa.yml"), yaml) + dir + end +end diff --git a/test/codeqa/diagnostics_test.exs b/test/codeqa/diagnostics_test.exs new file mode 100644 index 00000000..4e5db617 --- /dev/null +++ b/test/codeqa/diagnostics_test.exs @@ -0,0 +1,48 @@ +defmodule CodeQA.DiagnosticsTest do + use ExUnit.Case, async: true + + @small_path Path.expand("../../lib/codeqa/health_report/formatter", __DIR__) + + describe "run/1 aggregate mode" do + test "plain format output structure" do + output = CodeQA.Diagnostics.run(path: @small_path, mode: :aggregate, top: 5, format: :plain) + + assert output =~ "## Diagnose: aggregate" + assert output =~ "| Behavior | Cosine | Score |" + assert output =~ "###" + end + + test "json format returns valid JSON with issues and categories keys" do + output = CodeQA.Diagnostics.run(path: @small_path, mode: :aggregate, top: 5, format: :json) + + decoded = Jason.decode!(output) + assert Map.has_key?(decoded, "issues") + assert Map.has_key?(decoded, "categories") + end + end + + describe "run/1 per-file mode" do + @tag timeout: 120_000 + test "runs without error on a small directory" do + output = CodeQA.Diagnostics.run(path: @small_path, mode: :per_file, top: 3, format: :plain) + + assert output =~ "## Diagnose: 
per-file" + end + + @tag timeout: 120_000 + test "output contains per-file table header" do + output = CodeQA.Diagnostics.run(path: @small_path, mode: :per_file, top: 3, format: :plain) + + assert output =~ "| File | Behavior | Cosine | Score |" + end + + @tag timeout: 120_000 + test "json format returns valid JSON with files key" do + output = CodeQA.Diagnostics.run(path: @small_path, mode: :per_file, top: 3, format: :json) + + decoded = Jason.decode!(output) + assert Map.has_key?(decoded, "files") + assert is_list(decoded["files"]) + end + end +end diff --git a/test/codeqa/engine/analyzer_test.exs b/test/codeqa/engine/analyzer_test.exs new file mode 100644 index 00000000..38886c6c --- /dev/null +++ b/test/codeqa/engine/analyzer_test.exs @@ -0,0 +1,99 @@ +defmodule CodeQA.Engine.AnalyzerTest do + use ExUnit.Case, async: true + + alias CodeQA.Engine.Analyzer + + describe "analyze_file/2" do + test "returns a metrics map with group keys" do + content = "defmodule Foo do\n def bar, do: :ok\nend\n" + result = Analyzer.analyze_file("lib/foo.ex", content) + assert is_map(result) + assert map_size(result) > 0 + # Each value should be a map of metric keys to numbers + Enum.each(result, fn {_group, keys} -> + assert is_map(keys) + end) + end + end + + describe "analyze_codebase_aggregate/2" do + test "returns aggregate map with mean_ keys" do + files = %{ + "lib/a.ex" => "defmodule A do\n def foo, do: :a\nend\n", + "lib/b.ex" => "defmodule B do\n def bar, do: :b\nend\n" + } + + agg = Analyzer.analyze_codebase_aggregate(files) + assert is_map(agg) + # At least one group should have mean_ keys + Enum.each(agg, fn {_group, keys} -> + Enum.each(keys, fn {key, val} -> + assert String.starts_with?(key, "mean_") or String.starts_with?(key, "std_") or + String.starts_with?(key, "min_") or String.starts_with?(key, "max_") + + assert is_float(val) or is_integer(val) + end) + end) + end + + test "does not run codebase metrics (returns quickly for large input)" do + # Just assert it 
returns without error for a reasonable input + files = %{"lib/foo.ex" => "defmodule Foo do\n def bar, do: 1\nend\n"} + agg = Analyzer.analyze_codebase_aggregate(files) + assert is_map(agg) + end + end + + describe "analyze_file_for_loo_partial/3" do + @sample """ + defmodule Foo do + def bar do + x = 1 + y = 2 + x + y + end + end + """ + + test "result matches analyze_file_for_loo/2 for referenced metrics" do + baseline = Analyzer.analyze_file_for_loo("lib/foo.ex", @sample) + partial = Analyzer.analyze_file_for_loo_partial("lib/foo.ex", @sample, baseline) + referenced = CodeQA.CombinedMetrics.Scorer.referenced_file_metric_names() + + for name <- referenced, Map.has_key?(baseline, name) do + assert Map.get(partial, name) == Map.get(baseline, name), + "referenced metric #{name} diverges in partial" + end + end + + test "non-referenced metrics are inherited verbatim from baseline" do + baseline = Analyzer.analyze_file_for_loo("lib/foo.ex", @sample) + sentinel = %{"sentinel_key" => 99.0} + + tampered_baseline = + Enum.reduce(baseline, %{}, fn {name, _val}, acc -> + if name in CodeQA.CombinedMetrics.Scorer.referenced_file_metric_names() do + Map.put(acc, name, baseline[name]) + else + Map.put(acc, name, sentinel) + end + end) + + partial = + Analyzer.analyze_file_for_loo_partial("lib/foo.ex", @sample, tampered_baseline) + + for {name, value} <- partial, + name not in CodeQA.CombinedMetrics.Scorer.referenced_file_metric_names() do + assert value == sentinel, + "non-referenced metric #{name} was recomputed instead of inherited" + end + end + + test "result has same set of metric names as analyze_file_for_loo/2" do + baseline = Analyzer.analyze_file_for_loo("lib/foo.ex", @sample) + partial = Analyzer.analyze_file_for_loo_partial("lib/foo.ex", @sample, baseline) + + assert MapSet.new(Map.keys(partial)) == MapSet.new(Map.keys(baseline)) + end + end +end diff --git a/test/codeqa/formatter_test.exs b/test/codeqa/formatter_test.exs deleted file mode 100644 index 
ccefca53..00000000 --- a/test/codeqa/formatter_test.exs +++ /dev/null @@ -1,149 +0,0 @@ -defmodule CodeQA.FormatterTest do - use ExUnit.Case, async: true - - alias CodeQA.Formatter - - @sample_comparison %{ - "metadata" => %{ - "total_files_compared" => 1, - "summary" => "1 modified", - "base_ref" => "abc123", - "head_ref" => "HEAD" - }, - "files" => %{ - "lib/foo.ex" => %{ - "status" => "modified", - "base" => %{ - "metrics" => %{"halstead" => %{"volume" => 1000.0}}, - "lines" => 100, - "bytes" => 3000 - }, - "head" => %{ - "metrics" => %{"halstead" => %{"volume" => 800.0}}, - "lines" => 95, - "bytes" => 2800 - }, - "delta" => %{ - "metrics" => %{"halstead" => %{"volume" => -200.0}}, - "lines" => -5, - "bytes" => -200 - } - } - }, - "codebase" => %{ - "base" => %{ - "aggregate" => %{ - "readability" => %{ - "mean_flesch_adapted" => 65.0, - "mean_fog_adapted" => 8.0, - "mean_avg_tokens_per_line" => 7.0, - "mean_avg_line_length" => 45.0 - }, - "halstead" => %{ - "mean_difficulty" => 15.0, - "mean_effort" => 8000.0, - "mean_volume" => 500.0, - "mean_estimated_bugs" => 0.2 - } - } - }, - "head" => %{ - "aggregate" => %{ - "readability" => %{ - "mean_flesch_adapted" => 75.0, - "mean_fog_adapted" => 7.0, - "mean_avg_tokens_per_line" => 6.0, - "mean_avg_line_length" => 42.0 - }, - "halstead" => %{ - "mean_difficulty" => 12.0, - "mean_effort" => 6000.0, - "mean_volume" => 400.0, - "mean_estimated_bugs" => 0.15 - } - } - }, - "delta" => %{ - "aggregate" => %{ - "readability" => %{ - "mean_flesch_adapted" => 10.0, - "mean_fog_adapted" => -1.0, - "mean_avg_tokens_per_line" => -1.0, - "mean_avg_line_length" => -3.0 - }, - "halstead" => %{ - "mean_difficulty" => -3.0, - "mean_effort" => -2000.0, - "mean_volume" => -100.0, - "mean_estimated_bugs" => -0.05 - } - } - } - } - } - - describe "format_github/1" do - test "includes mermaid chart of head scores" do - result = Formatter.format_github(@sample_comparison) - assert result =~ "```mermaid" - assert result =~ "xychart-beta" - 
assert result =~ "bar [" - end - - test "includes progress bars with base → head" do - result = Formatter.format_github(@sample_comparison) - assert result =~ "→" - end - - test "includes grade emoji" do - result = Formatter.format_github(@sample_comparison) - assert result =~ "🟢" or result =~ "🟡" or result =~ "🟠" or result =~ "🔴" - end - - test "wraps file details in collapsible section" do - result = Formatter.format_github(@sample_comparison) - assert result =~ "
" - assert result =~ "
" - end - - test "shows no changes message when zero files compared" do - comparison = put_in(@sample_comparison, ["metadata", "total_files_compared"], 0) - result = Formatter.format_github(comparison) - assert result =~ "No file changes detected" - end - - test "shows 🟢 in aggregate delta for improving high-is-better metric" do - # flesch_adapted is good: :high, delta +10.0 → improvement - result = Formatter.format_github(@sample_comparison) - assert result =~ "🟢 +10.00" - end - - test "file changes section shows actual file counts, not 'no changes'" do - result = Formatter.format_github(@sample_comparison) - assert result =~ "File changes — 1 modified" - refute result =~ "File changes — no changes" - end - - test "file changes section reflects metric directions from codebase data" do - result = Formatter.format_github(@sample_comparison) - # halstead.mean_volume drops 100/500 = 20% → "decreased"; readability rises 10/65 ≈ 15% → "increased slightly" - refute result =~ "File changes — 1 modified — all metrics stable" - end - - test "shows 🔴 in aggregate delta for worsening low-is-better metric" do - # halstead.volume is good: :low, delta +300 → regression - worsening = - put_in( - @sample_comparison, - ["codebase", "head", "aggregate", "halstead", "mean_volume"], - 800.0 - ) - |> put_in(["codebase", "delta", "aggregate"], %{ - "halstead" => %{"mean_volume" => 300.0} - }) - - result = Formatter.format_github(worsening) - assert result =~ "🔴 +300.00" - end - end -end diff --git a/test/codeqa/git_test.exs b/test/codeqa/git_test.exs new file mode 100644 index 00000000..f1a800cb --- /dev/null +++ b/test/codeqa/git_test.exs @@ -0,0 +1,297 @@ +defmodule CodeQA.GitTest do + use ExUnit.Case, async: true + + alias CodeQA.Git + + describe "gitignored_files/2" do + test "returns files that are gitignored" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, ".gitignore"), "*.secret\n") + File.write!(Path.join(repo, "config.secret"), "password=123") + 
File.write!(Path.join(repo, "app.ex"), "defmodule App do end") + + ignored = Git.gitignored_files(repo, ["config.secret", "app.ex"]) + + assert ignored == MapSet.new(["config.secret"]) + end) + end + + test "returns empty set when no files are gitignored" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, ".gitignore"), "*.secret\n") + File.write!(Path.join(repo, "app.ex"), "defmodule App do end") + + ignored = Git.gitignored_files(repo, ["app.ex"]) + + assert ignored == MapSet.new() + end) + end + + test "handles empty file list" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, ".gitignore"), "*.secret\n") + + ignored = Git.gitignored_files(repo, []) + + assert ignored == MapSet.new() + end) + end + + test "respects nested .gitignore files" do + in_tmp_git_repo(fn repo -> + File.mkdir_p!(Path.join(repo, "subdir")) + File.write!(Path.join(repo, "subdir/.gitignore"), "local.ex\n") + File.write!(Path.join(repo, "subdir/local.ex"), "# local") + File.write!(Path.join(repo, "subdir/other.ex"), "# other") + + ignored = Git.gitignored_files(repo, ["subdir/local.ex", "subdir/other.ex"]) + + assert ignored == MapSet.new(["subdir/local.ex"]) + end) + end + + test "handles more than 1000 paths without ARG_MAX issues" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, ".gitignore"), "ignored.ex\n") + + paths = Enum.map(1..1200, fn i -> "file_#{i}.ex" end) ++ ["ignored.ex"] + + ignored = Git.gitignored_files(repo, paths) + + assert ignored == MapSet.new(["ignored.ex"]) + end) + end + + test "filters files inside a gitignored directory" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, ".gitignore"), "/docs/\n") + + ignored = + Git.gitignored_files(repo, [ + "docs/readme.md", + "docs/guide/intro.md", + "lib/app.ex" + ]) + + assert ignored == MapSet.new(["docs/readme.md", "docs/guide/intro.md"]) + end) + end + + test "filters gitignored-pattern files even when already tracked by git" do + in_tmp_git_repo(fn repo -> + 
File.mkdir_p!(Path.join(repo, "docs")) + File.mkdir_p!(Path.join(repo, "lib")) + File.write!(Path.join(repo, "docs/readme.md"), "# Docs") + File.write!(Path.join(repo, "lib/app.ex"), "defmodule App do end") + + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + File.write!(Path.join(repo, ".gitignore"), "/docs/\n") + + ignored = Git.gitignored_files(repo, ["docs/readme.md", "lib/app.ex"]) + + assert ignored == MapSet.new(["docs/readme.md"]) + end) + end + end + + describe "diff_line_ranges/3" do + test "parses single-line hunks" do + in_tmp_git_repo(fn repo -> + # Create initial commit + File.write!(Path.join(repo, "foo.ex"), "line1\nline2\nline3\n") + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + # Modify a single line + File.write!(Path.join(repo, "foo.ex"), "line1\nmodified\nline3\n") + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "change"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + assert Map.has_key?(ranges, "foo.ex") + assert {2, 2} in ranges["foo.ex"] + end) + end + + test "parses multi-line hunks" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, "foo.ex"), "a\nb\nc\nd\ne\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + # Replace lines 2-4 + File.write!(Path.join(repo, "foo.ex"), "a\nX\nY\nZ\ne\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "change"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + assert Map.has_key?(ranges, "foo.ex") + assert {2, 4} in ranges["foo.ex"] + end) + end + + test "handles multiple hunks in same file" do + in_tmp_git_repo(fn repo -> + lines = Enum.map_join(1..20, "\n", &"line#{&1}") + File.write!(Path.join(repo, "foo.ex"), lines <> "\n") + {_, 0} = System.cmd("git", 
["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + # Change line 2 and line 15 + new_lines = + 1..20 + |> Enum.map(fn + 2 -> "changed2" + 15 -> "changed15" + n -> "line#{n}" + end) + |> Enum.join("\n") + + File.write!(Path.join(repo, "foo.ex"), new_lines <> "\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "change"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + assert Map.has_key?(ranges, "foo.ex") + assert length(ranges["foo.ex"]) == 2 + assert {2, 2} in ranges["foo.ex"] + assert {15, 15} in ranges["foo.ex"] + end) + end + + test "handles multiple files" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, "a.ex"), "a1\na2\n") + File.write!(Path.join(repo, "b.ex"), "b1\nb2\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + File.write!(Path.join(repo, "a.ex"), "a1\nchanged\n") + File.write!(Path.join(repo, "b.ex"), "b1\nchanged\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "change"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + assert {2, 2} in ranges["a.ex"] + assert {2, 2} in ranges["b.ex"] + end) + end + + test "handles added lines (insertion)" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, "foo.ex"), "a\nb\n") + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + # Insert new line between a and b + File.write!(Path.join(repo, "foo.ex"), "a\nnew\nb\n") + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "insert"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + assert Map.has_key?(ranges, "foo.ex") + # Line 2 is the new line + assert {2, 2} in ranges["foo.ex"] + end) + end + + test "handles deleted lines (no new lines)" do + 
in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, "foo.ex"), "a\nb\nc\n") + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + # Delete line b + File.write!(Path.join(repo, "foo.ex"), "a\nc\n") + System.cmd("git", ["add", "."], cd: repo) + System.cmd("git", ["commit", "-m", "delete"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + # File should either not be in ranges or have empty list (deletion only) + ranges_for_file = Map.get(ranges, "foo.ex", []) + # No new lines were added, so no ranges pointing to new content + assert ranges_for_file == [] or not Map.has_key?(ranges, "foo.ex") + end) + end + + test "returns empty map when no diff" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, "foo.ex"), "content\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD", "HEAD") + + assert ranges == %{} + end) + end + + test "handles new file (no base version)" do + in_tmp_git_repo(fn repo -> + File.write!(Path.join(repo, "existing.ex"), "existing\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + File.write!(Path.join(repo, "new.ex"), "line1\nline2\nline3\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "add new file"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + assert Map.has_key?(ranges, "new.ex") + assert {1, 3} in ranges["new.ex"] + end) + end + + test "returns ranges in ascending order" do + in_tmp_git_repo(fn repo -> + lines = Enum.map_join(1..20, "\n", &"line#{&1}") + File.write!(Path.join(repo, "foo.ex"), lines <> "\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "initial"], cd: repo) + + # Change lines 2, 10, and 18 + 
new_lines = + 1..20 + |> Enum.map(fn + 2 -> "changed2" + 10 -> "changed10" + 18 -> "changed18" + n -> "line#{n}" + end) + |> Enum.join("\n") + + File.write!(Path.join(repo, "foo.ex"), new_lines <> "\n") + {_, 0} = System.cmd("git", ["add", "."], cd: repo) + {_, 0} = System.cmd("git", ["commit", "-m", "change"], cd: repo) + + {:ok, ranges} = Git.diff_line_ranges(repo, "HEAD~1", "HEAD") + + # Ranges should be in ascending order by start line + assert ranges["foo.ex"] == [{2, 2}, {10, 10}, {18, 18}] + end) + end + end + + defp in_tmp_git_repo(fun) do + tmp = Path.join(System.tmp_dir!(), "codeqa_git_test_#{:rand.uniform(999_999)}") + File.mkdir_p!(tmp) + System.cmd("git", ["init"], cd: tmp) + System.cmd("git", ["config", "user.email", "test@test.com"], cd: tmp) + System.cmd("git", ["config", "user.name", "Test"], cd: tmp) + + try do + fun.(tmp) + after + File.rm_rf!(tmp) + end + end +end diff --git a/test/codeqa/health_report/behavior_labels_test.exs b/test/codeqa/health_report/behavior_labels_test.exs new file mode 100644 index 00000000..0992488e --- /dev/null +++ b/test/codeqa/health_report/behavior_labels_test.exs @@ -0,0 +1,42 @@ +defmodule CodeQA.HealthReport.BehaviorLabelsTest do + use ExUnit.Case, async: true + + alias CodeQA.HealthReport.BehaviorLabels + + describe "label/2" do + test "returns human-readable label for known behavior" do + assert BehaviorLabels.label("function_design", "no_boolean_parameter") == + "Boolean parameter increases coupling" + end + + test "falls back to humanized behavior name for unknown" do + assert BehaviorLabels.label("unknown_cat", "some_weird_behavior") == + "Some Weird Behavior" + end + + test "labels the report's most common false-positive behaviors" do + assert BehaviorLabels.label("code_smells", "no_debug_print_statements") == + "Debug print left in code" + + assert BehaviorLabels.label("scope_and_assignment", "used_only_once") == + "Variable used only once" + + assert BehaviorLabels.label("consistency", 
"consistent_error_return_shape") == + "Mixed error-return shapes" + end + end + + describe "action/2" do + test "returns action for known behavior" do + assert is_binary(BehaviorLabels.action("function_design", "no_boolean_parameter")) + end + + test "falls back to fix_hint from YAML when no hardcoded action" do + assert is_binary(BehaviorLabels.action("naming_conventions", "filename_matches_module")) + end + + test "returns generic action for completely unknown behavior" do + assert BehaviorLabels.action("unknown", "unknown") == "Review this code block" + end + end +end diff --git a/test/codeqa/health_report/categories_test.exs b/test/codeqa/health_report/categories_test.exs new file mode 100644 index 00000000..0912d46b --- /dev/null +++ b/test/codeqa/health_report/categories_test.exs @@ -0,0 +1,71 @@ +defmodule CodeQA.HealthReport.CategoriesTest do + use ExUnit.Case + + alias CodeQA.HealthReport.Categories + + describe "defaults/0" do + test "all metrics have fix_hint field" do + categories = Categories.defaults() + + metrics = Enum.flat_map(categories, & &1.metrics) + + Enum.each(metrics, fn metric -> + assert Map.has_key?(metric, :fix_hint), + "Metric #{metric.name} missing :fix_hint field" + + assert is_binary(metric.fix_hint), + "Metric #{metric.name} :fix_hint must be a string" + + assert String.length(metric.fix_hint) > 0, + "Metric #{metric.name} :fix_hint cannot be empty" + end) + end + + test "all categories have expected keys" do + categories = Categories.defaults() + + Enum.each(categories, fn category -> + assert Map.has_key?(category, :key) + assert Map.has_key?(category, :name) + assert Map.has_key?(category, :metrics) + end) + end + + test "all metrics have required threshold keys" do + categories = Categories.defaults() + + metrics = Enum.flat_map(categories, & &1.metrics) + + Enum.each(metrics, fn metric -> + assert Map.has_key?(metric, :name) + assert Map.has_key?(metric, :source) + assert Map.has_key?(metric, :weight) + assert 
Map.has_key?(metric, :good) + assert Map.has_key?(metric, :thresholds) + assert Map.has_key?(metric, :fix_hint) + end) + end + + test "fix_hint is accessible via Map.get" do + categories = Categories.defaults() + + metrics = Enum.flat_map(categories, & &1.metrics) + + Enum.each(metrics, fn metric -> + hint = Map.get(metric, :fix_hint) + assert is_binary(hint) + assert String.length(hint) > 0 + end) + end + + test "has exactly 24 metrics across 6 categories" do + categories = Categories.defaults() + + assert length(categories) == 6 + + metrics = Enum.flat_map(categories, & &1.metrics) + + assert length(metrics) == 24 + end + end +end diff --git a/test/codeqa/health_report/config_test.exs b/test/codeqa/health_report/config_test.exs new file mode 100644 index 00000000..da4d6a12 --- /dev/null +++ b/test/codeqa/health_report/config_test.exs @@ -0,0 +1,111 @@ +defmodule CodeQA.HealthReport.ConfigTest do + use ExUnit.Case, async: true + + alias CodeQA.HealthReport.Config + + @default_impact %{ + "complexity" => 5, + "file_structure" => 4, + "function_design" => 4, + "code_smells" => 3, + "naming_conventions" => 2, + "error_handling" => 2, + "consistency" => 2, + "documentation" => 1, + "testing" => 1 + } + + describe "load/1 with nil" do + test "returns default impact map" do + result = Config.load(nil) + assert result.impact_map == @default_impact + end + + test "returns combined_top of 2" do + result = Config.load(nil) + assert result.combined_top == 2 + end + + test "returns categories and grade_scale" do + result = Config.load(nil) + assert is_list(result.categories) + assert is_list(result.grade_scale) + end + end + + describe "load/1 with YAML path" do + defp write_temp_yaml(content) do + path = Path.join(System.tmp_dir!(), "test_config_#{System.unique_integer()}.yml") + File.write!(path, content) + on_exit(fn -> File.rm(path) end) + path + end + + test "user impact values override defaults, defaults fill gaps" do + path = + write_temp_yaml(""" + impact: + 
complexity: 10 + testing: 9 + """) + + result = Config.load(path) + + assert result.impact_map["complexity"] == 10 + assert result.impact_map["testing"] == 9 + # Default values for keys not overridden + assert result.impact_map["file_structure"] == 4 + assert result.impact_map["function_design"] == 4 + assert result.impact_map["code_smells"] == 3 + assert result.impact_map["naming_conventions"] == 2 + assert result.impact_map["error_handling"] == 2 + assert result.impact_map["consistency"] == 2 + assert result.impact_map["documentation"] == 1 + end + + test "reads combined_top from YAML" do + path = + write_temp_yaml(""" + combined_top: 5 + """) + + result = Config.load(path) + assert result.combined_top == 5 + end + + test "defaults to combined_top: 2 when absent from YAML" do + path = + write_temp_yaml(""" + categories: {} + """) + + result = Config.load(path) + assert result.combined_top == 2 + end + + test "defaults to full default impact map when impact absent from YAML" do + path = + write_temp_yaml(""" + categories: {} + """) + + result = Config.load(path) + assert result.impact_map == @default_impact + end + + test "returns categories and grade_scale alongside impact fields" do + path = + write_temp_yaml(""" + impact: + complexity: 5 + combined_top: 3 + """) + + result = Config.load(path) + assert is_list(result.categories) + assert is_list(result.grade_scale) + assert is_map(result.impact_map) + assert is_integer(result.combined_top) + end + end +end diff --git a/test/codeqa/health_report/delta_test.exs b/test/codeqa/health_report/delta_test.exs new file mode 100644 index 00000000..6932e0c0 --- /dev/null +++ b/test/codeqa/health_report/delta_test.exs @@ -0,0 +1,61 @@ +defmodule CodeQA.HealthReport.DeltaTest do + use ExUnit.Case, async: true + + alias CodeQA.HealthReport.Delta + + defp make_results(aggregate) do + %{"codebase" => %{"aggregate" => aggregate}} + end + + test "returns base, head, and delta aggregates" do + base = make_results(%{"entropy" => 
%{"mean_value" => 5.0}}) + head = make_results(%{"entropy" => %{"mean_value" => 6.0}}) + + result = Delta.compute(base, head) + + assert result.base.aggregate == %{"entropy" => %{"mean_value" => 5.0}} + assert result.head.aggregate == %{"entropy" => %{"mean_value" => 6.0}} + assert result.delta.aggregate == %{"entropy" => %{"mean_value" => 1.0}} + end + + test "rounds delta to 4 decimal places" do + base = make_results(%{"entropy" => %{"mean_value" => 1.0}}) + head = make_results(%{"entropy" => %{"mean_value" => 4.3333}}) + + result = Delta.compute(base, head) + assert result.delta.aggregate["entropy"]["mean_value"] == 3.3333 + end + + test "handles missing base codebase gracefully" do + base = %{} + head = make_results(%{"entropy" => %{"mean_value" => 6.0}}) + + result = Delta.compute(base, head) + assert result.delta.aggregate == %{} + end + + test "handles missing head codebase gracefully" do + base = make_results(%{"entropy" => %{"mean_value" => 5.0}}) + head = %{} + + result = Delta.compute(base, head) + assert result.delta.aggregate == %{} + end + + test "skips non-numeric metric keys" do + base = make_results(%{"entropy" => %{"mean_value" => 5.0, "label" => "x"}}) + head = make_results(%{"entropy" => %{"mean_value" => 6.0, "label" => "y"}}) + + result = Delta.compute(base, head) + refute Map.has_key?(result.delta.aggregate["entropy"], "label") + assert result.delta.aggregate["entropy"]["mean_value"] == 1.0 + end + + test "metric key present only in head produces no delta entry" do + base = make_results(%{"entropy" => %{"mean_value" => 5.0}}) + head = make_results(%{"entropy" => %{"mean_value" => 6.0, "new_metric" => 3.0}}) + + result = Delta.compute(base, head) + refute Map.has_key?(result.delta.aggregate["entropy"], "new_metric") + end +end diff --git a/test/codeqa/health_report/formatter_test.exs b/test/codeqa/health_report/formatter_test.exs index 1fddec3f..8518d945 100644 --- a/test/codeqa/health_report/formatter_test.exs +++ 
b/test/codeqa/health_report/formatter_test.exs @@ -9,24 +9,49 @@ defmodule CodeQA.HealthReport.FormatterTest do overall_grade: "B+", categories: [ %{ + type: :threshold, name: "Readability", key: :readability, score: 100, grade: "A", + impact: 3, summary: "Excellent", metric_scores: [ - %{name: "flesch_adapted", source: "readability", weight: 0.4, good: :high, value: 102.5, score: 100} + %{ + name: "flesch_adapted", + source: "readability", + weight: 0.4, + good: :high, + value: 102.5, + score: 100 + } ], worst_offenders: [ - %{path: "lib/foo.ex", score: 75, grade: "B+", lines: 120, bytes: 3840, - metric_scores: [%{name: "flesch_adapted", source: "readability", good: :high, value: 65.0, score: 75}]} + %{ + path: "lib/foo.ex", + score: 75, + grade: "B+", + lines: 120, + bytes: 3840, + metric_scores: [ + %{ + name: "flesch_adapted", + source: "readability", + good: :high, + value: 65.0, + score: 75 + } + ] + } ] }, %{ + type: :threshold, name: "Complexity", key: :complexity, score: 35, grade: "D", + impact: 5, summary: "Critical — requires attention", metric_scores: [ %{name: "difficulty", source: "halstead", weight: 0.35, value: 24.01, score: 65} @@ -36,6 +61,98 @@ defmodule CodeQA.HealthReport.FormatterTest do ] } + @cosine_category %{ + type: :cosine, + key: "function_design", + name: "Function Design", + score: 64, + grade: "C", + impact: 1, + behaviors: [ + %{ + behavior: "no_boolean_parameter", + cosine: 0.12, + score: 56, + grade: "C", + worst_offenders: [ + %{file: "lib/foo/bar.ex", cosine: -0.71} + ] + }, + %{ + behavior: "single_responsibility", + cosine: 0.45, + score: 78, + grade: "B+", + worst_offenders: [] + } + ] + } + + @enriched_cosine_category %{ + type: :cosine, + key: "function_design", + name: "Function Design", + score: 64, + grade: "C", + impact: 1, + behaviors: [ + %{ + behavior: "no_boolean_parameter", + cosine: -0.65, + score: 42, + grade: "D+", + worst_offenders: [ + %{ + file: "lib/codeqa/formatter.ex", + cosine: -0.65, + top_metrics: [ + 
%{metric: "branching.mean_depth", contribution: -4.10}, + %{metric: "halstead.effort", contribution: -3.22} + ], + top_nodes: [ + %{"start_line" => 89, "type" => "block"}, + %{"start_line" => 134, "type" => "block"} + ] + } + ] + } + ] + } + + @enriched_threshold_category %{ + type: :threshold, + name: "Complexity", + key: :complexity, + score: 32, + grade: "F", + impact: 5, + summary: "Critical", + metric_scores: [ + %{name: "difficulty", source: "halstead", weight: 0.35, good: :low, value: 39.0, score: 32} + ], + worst_offenders: [ + %{ + path: "lib/foo.ex", + score: 32, + grade: "F", + lines: 491, + bytes: 15_872, + metric_scores: [ + %{name: "difficulty", source: "halstead", good: :low, value: 99.0, score: 0} + ], + top_nodes: [ + %{"start_line" => 201, "type" => "block"}, + %{"start_line" => 312, "type" => "block"} + ] + } + ] + } + + @report_with_cosine %{ + @sample_report + | categories: @sample_report.categories ++ [@cosine_category] + } + describe "format_markdown/3 with :plain format" do test "produces header with # Code Health Report" do result = Formatter.format_markdown(@sample_report, :default, :plain) @@ -53,29 +170,179 @@ defmodule CodeQA.HealthReport.FormatterTest do assert result =~ "## Overall: B+" end - test "includes category table" do + test "includes cosine legend" do result = Formatter.format_markdown(@sample_report, :default, :plain) - assert result =~ "| Readability | A | 100 | Excellent |" - assert result =~ "| Complexity | D | 35 |" + assert result =~ "cosine similarity" + assert result =~ "anti-pattern detected" end - test "includes worst offenders section" do + test "includes category table with Impact column" do result = Formatter.format_markdown(@sample_report, :default, :plain) - assert result =~ "### Worst Offenders" - refute result =~ "lib/
`foo.ex`" - assert result =~ "`lib/foo.ex`" - assert result =~ "120 lines · 3.8 KB" - assert result =~ "↑ flesch_adapted=65.00 (avg: 102.50)" - refute result =~ "↑ flesch_adapted=65.00, " + assert result =~ "| Category | Grade | Score | Impact | Summary |" + assert result =~ "| Readability | A | 100 | 3 | Excellent |" + assert result =~ "| Complexity | D | 35 | 5 |" end test "summary detail omits category sections" do result = Formatter.format_markdown(@sample_report, :summary, :plain) - refute result =~ "### Worst Offenders" refute result =~ "Codebase averages" end end + describe "format_markdown/3 plain with cosine category" do + test "renders cosine category header" do + result = Formatter.format_markdown(@report_with_cosine, :default, :plain) + assert result =~ "## Function Design — C" + end + + test "renders cosine behavior table" do + result = Formatter.format_markdown(@report_with_cosine, :default, :plain) + assert result =~ "| Behavior | Cosine | Score | Grade |" + assert result =~ "| no_boolean_parameter | 0.12 | 56 | C |" + assert result =~ "| single_responsibility | 0.45 | 78 | B+ |" + end + + test "cosine category impact shown in overall table" do + result = Formatter.format_markdown(@report_with_cosine, :default, :plain) + assert result =~ "| Function Design | C | 64 | 1 |" + end + end + + describe "plain formatter: PR summary section" do + @sample_report_with_pr Map.put(@sample_report, :pr_summary, %{ + base_score: 85, + head_score: 77, + score_delta: -8, + base_grade: "B+", + head_grade: "C+", + blocks_flagged: 6, + files_changed: 3, + files_added: 1, + files_modified: 2 + }) + + test "renders PR summary line when pr_summary present" do + result = Formatter.format_markdown(@sample_report_with_pr, :default, :plain) + assert result =~ "B+" + assert result =~ "C+" + assert result =~ "-8" + assert result =~ "6" + assert result =~ "1 added" + assert result =~ "2 modified" + end + + test "omits PR summary when pr_summary is nil" do + result = 
Formatter.format_markdown(@sample_report, :default, :plain) + refute result =~ "Score:" + end + end + + describe "plain formatter: delta section" do + @delta %{ + base: %{ + aggregate: %{ + "readability" => %{"mean_flesch_adapted" => 65.0}, + "halstead" => %{"mean_difficulty" => 12.0} + } + }, + head: %{ + aggregate: %{ + "readability" => %{"mean_flesch_adapted" => 61.0}, + "halstead" => %{"mean_difficulty" => 15.0} + } + } + } + + @sample_report_with_delta Map.put(@sample_report, :codebase_delta, @delta) + + test "renders metric changes table when codebase_delta present" do + result = Formatter.format_markdown(@sample_report_with_delta, :default, :plain) + assert result =~ "Metric Changes" + assert result =~ "Readability" + assert result =~ "65.00" + assert result =~ "61.00" + end + + test "omits delta section when codebase_delta is nil" do + result = Formatter.format_markdown(@sample_report, :default, :plain) + refute result =~ "Metric Changes" + end + end + + describe "plain formatter: block section" do + @block_potential %{ + category: "function_design", + behavior: "cyclomatic_complexity_under_10", + cosine_delta: 0.41, + severity: :critical, + fix_hint: "Reduce branching" + } + + @top_blocks [ + %{ + path: "lib/foo.ex", + status: "modified", + start_line: 42, + end_line: 67, + type: "code", + token_count: 84, + source: "def foo do\n :bar\nend", + language: "elixir", + potentials: [@block_potential] + } + ] + + @sample_report_with_blocks Map.put(@sample_report, :top_blocks, @top_blocks) + + test "renders block verdict header" do + result = Formatter.format_markdown(@sample_report_with_blocks, :default, :plain) + assert result =~ "review required" + assert result =~ "🔴" + end + + test "renders file path with status" do + result = Formatter.format_markdown(@sample_report_with_blocks, :default, :plain) + assert result =~ "lib/foo.ex" + assert result =~ "modified" + end + + test "renders block location and type" do + result = 
Formatter.format_markdown(@sample_report_with_blocks, :default, :plain) + assert result =~ "42-67" + assert result =~ "84 tokens" + end + + test "renders severity icon and behavior" do + result = Formatter.format_markdown(@sample_report_with_blocks, :default, :plain) + assert result =~ "🔴" + assert result =~ "CRITICAL" + assert result =~ "cyclomatic_complexity_under_10" + assert result =~ "0.41" + end + + test "renders fix hint" do + result = Formatter.format_markdown(@sample_report_with_blocks, :default, :plain) + assert result =~ "Reduce branching" + end + + test "renders source code" do + result = Formatter.format_markdown(@sample_report_with_blocks, :default, :plain) + assert result =~ "def foo do" + assert result =~ ":bar" + end + + test "shows green verdict when top_blocks is empty" do + report = Map.put(@sample_report, :top_blocks, []) + result = Formatter.format_markdown(report, :default, :plain) + assert result =~ "No block-level issues detected" + end + + test "shows green verdict when top_blocks key absent" do + result = Formatter.format_markdown(@sample_report, :default, :plain) + refute result =~ "review required" + end + end + describe "format_markdown/3 defaults to :plain" do test "two-arity call matches plain output" do plain = Formatter.format_markdown(@sample_report, :default, :plain) @@ -91,6 +358,12 @@ defmodule CodeQA.HealthReport.FormatterTest do assert result =~ "(79/100)" end + test "includes cosine legend" do + result = Formatter.format_markdown(@sample_report, :default, :github) + assert result =~ "cosine similarity" + assert result =~ "anti-pattern detected" + end + test "includes mermaid chart" do result = Formatter.format_markdown(@sample_report, :default, :github) assert result =~ "```mermaid" @@ -130,6 +403,19 @@ defmodule CodeQA.HealthReport.FormatterTest do end end + describe "format_markdown/3 github with cosine category" do + test "wraps cosine category in details/summary block" do + result = 
Formatter.format_markdown(@report_with_cosine, :default, :github) + assert result =~ "🟠 Function Design — C (64/100)" + end + + test "renders cosine behaviors table inside details" do + result = Formatter.format_markdown(@report_with_cosine, :default, :github) + assert result =~ "| Behavior | Cosine | Score | Grade |" + assert result =~ "| no_boolean_parameter | 0.12 | 56 | C |" + end + end + describe "format_markdown/4 with :github format and chart: false" do test "omits mermaid chart when chart option is false" do result = Formatter.format_markdown(@sample_report, :default, :github, chart: false) @@ -137,4 +423,228 @@ defmodule CodeQA.HealthReport.FormatterTest do assert result =~ "████" end end + + describe "github formatter: block section" do + @block_potential %{ + category: "function_design", + behavior: "cyclomatic_complexity_under_10", + cosine_delta: 0.41, + severity: :critical, + fix_hint: "Reduce branching" + } + + @top_blocks_gh [ + %{ + path: "lib/foo.ex", + status: "modified", + start_line: 42, + end_line: 67, + type: "code", + token_count: 84, + source: "def foo do\n :bar\nend", + language: "elixir", + potentials: [@block_potential] + } + ] + + @report_with_blocks_gh Map.put(@sample_report, :top_blocks, @top_blocks_gh) + + test "renders block section with verdict and details per block" do + result = Formatter.format_markdown(@report_with_blocks_gh, :default, :github) + assert result =~ "review required" + assert result =~ "
" + assert result =~ "lib/foo.ex" + end + + test "renders severity and fix hint" do + result = Formatter.format_markdown(@report_with_blocks_gh, :default, :github) + assert result =~ "🔴" + assert result =~ "cyclomatic_complexity_under_10" + assert result =~ "Reduce branching" + end + + test "renders source code in collapsed block" do + result = Formatter.format_markdown(@report_with_blocks_gh, :default, :github) + assert result =~ "```elixir" + assert result =~ "def foo do" + end + end + + describe "github formatter: PR summary and delta" do + @pr_summary_gh %{ + base_score: 85, + head_score: 77, + score_delta: -8, + base_grade: "B+", + head_grade: "C+", + blocks_flagged: 6, + files_changed: 3, + files_added: 1, + files_modified: 2 + } + + @delta_gh %{ + base: %{aggregate: %{"readability" => %{"mean_flesch_adapted" => 65.0}}}, + head: %{aggregate: %{"readability" => %{"mean_flesch_adapted" => 61.0}}} + } + + test "renders PR summary" do + report = @sample_report |> Map.put(:pr_summary, @pr_summary_gh) + result = Formatter.format_markdown(report, :default, :github) + assert result =~ "B+" + assert result =~ "C+" + assert result =~ "-8" + end + + test "renders delta section" do + report = @sample_report |> Map.put(:codebase_delta, @delta_gh) + result = Formatter.format_markdown(report, :default, :github) + assert result =~ "Metric Changes" + assert result =~ "65.00" + assert result =~ "61.00" + end + end + + describe "render_parts/2" do + test "returns at least 3 parts" do + parts = Formatter.render_parts(@sample_report) + assert length(parts) >= 3 + end + + test "each part ends with sentinel comment" do + parts = Formatter.render_parts(@sample_report) + + Enum.with_index(parts, 1) + |> Enum.each(fn {part, n} -> + assert part =~ "" + end) + end + + test "part 1 contains header and grade" do + [part_1 | _] = Formatter.render_parts(@sample_report) + assert part_1 =~ "Code Health: B+" + assert part_1 =~ "(79/100)" + end + + test "part 1 contains mermaid chart by 
default" do + [part_1 | _] = Formatter.render_parts(@sample_report) + assert part_1 =~ "```mermaid" + end + + test "part 1 contains progress bars" do + [part_1 | _] = Formatter.render_parts(@sample_report) + assert part_1 =~ "████" + end + + test "part 2 contains category details" do + [_, part_2 | _] = Formatter.render_parts(@sample_report) + assert part_2 =~ "
" + assert part_2 =~ "Readability" + end + + test "part 3 shows green verdict when no blocks" do + [_, _, part_3 | _] = Formatter.render_parts(@sample_report) + assert part_3 =~ "No block-level issues detected" + end + + test "part 3 contains verdict and blocks when present" do + report = Map.put(@sample_report, :top_blocks, @top_blocks_gh) + [_, _, part_3 | _] = Formatter.render_parts(report) + assert part_3 =~ "lib/foo.ex" + assert part_3 =~ "review required" + end + end + + describe "Github.render_parts_3/2" do + alias CodeQA.HealthReport.Formatter.Github + + test "returns single part with blocks (top 10 limit means no slicing needed)" do + blocks = + Enum.map(1..10, fn i -> + %{ + path: "lib/file_#{i}.ex", + status: "modified", + start_line: 10, + end_line: 30, + type: "function", + token_count: 150, + source: "def foo, do: :bar", + language: "elixir", + potentials: [ + %{ + category: "function_design", + behavior: "single_responsibility", + cosine_delta: 0.35, + severity: :high, + fix_hint: "Consider extracting helper function" + } + ] + } + end) + + report = Map.put(@sample_report, :top_blocks, blocks) + parts = Github.render_parts_3(report) + + # With top 10 blocks, should be a single part + assert length(parts) == 1 + end + + test "part ends with sentinel" do + blocks = [ + %{ + path: "lib/foo.ex", + status: nil, + start_line: 1, + end_line: 10, + type: "code", + token_count: 50, + source: "def foo, do: :bar", + language: "elixir", + potentials: [ + %{ + category: "function_design", + behavior: "single_responsibility", + cosine_delta: 0.35, + severity: :high, + fix_hint: nil + } + ] + } + ] + + report = Map.put(@sample_report, :top_blocks, blocks) + [part] = Github.render_parts_3(report) + assert part =~ "" + end + + test "renders source code in fenced block" do + blocks = [ + %{ + path: "lib/foo.ex", + status: nil, + start_line: 1, + end_line: 10, + type: "code", + token_count: 50, + source: "def hello do\n :world\nend", + language: "elixir", + potentials: 
[ + %{ + category: "function_design", + behavior: "single_responsibility", + cosine_delta: 0.35, + severity: :high, + fix_hint: nil + } + ] + } + ] + + report = Map.put(@sample_report, :top_blocks, blocks) + [part] = Github.render_parts_3(report) + assert part =~ "```elixir" + assert part =~ "def hello do" + assert part =~ ":world" + end + end end diff --git a/test/codeqa/health_report/grader_test.exs b/test/codeqa/health_report/grader_test.exs new file mode 100644 index 00000000..6f9ea544 --- /dev/null +++ b/test/codeqa/health_report/grader_test.exs @@ -0,0 +1,495 @@ +defmodule CodeQA.HealthReport.GraderTest do + use ExUnit.Case, async: true + + alias CodeQA.CombinedMetrics.SampleRunner + alias CodeQA.Engine.Analyzer + alias CodeQA.Engine.Collector + alias CodeQA.HealthReport.Grader + + @default_scale CodeQA.HealthReport.Categories.default_grade_scale() + + # ----------------------------------------------------------------------- + # score_cosine/1 + # ----------------------------------------------------------------------- + + describe "score_cosine/1" do + test "cosine 1.0 maps to 100" do + assert Grader.score_cosine(1.0) == 100 + end + + test "cosine -1.0 maps to 0" do + assert Grader.score_cosine(-1.0) == 0 + end + + test "cosine 0.5 (lower bound of top band) maps to 90" do + assert Grader.score_cosine(0.5) == 90 + end + + test "cosine 0.2 (lower bound of second band) maps to 70" do + assert Grader.score_cosine(0.2) == 70 + end + + test "cosine 0.0 (lower bound of third band) maps to 50" do + assert Grader.score_cosine(0.0) == 50 + end + + test "cosine -0.3 (lower bound of fourth band) maps to 30" do + assert Grader.score_cosine(-0.3) == 30 + end + + test "interpolation in [0.0, 0.2) band: cosine 0.12 → 62" do + # ratio = 0.12 / 0.2 = 0.6, score = 50 + 0.6 * 20 = 62 + assert Grader.score_cosine(0.12) == 62 + end + + test "interpolation in [0.2, 0.5) band: cosine 0.35 → 80" do + # ratio = (0.35 - 0.2) / (0.5 - 0.2) = 0.15/0.3 = 0.5, score = 70 + 0.5 * 20 = 80 + 
assert Grader.score_cosine(0.35) == 80 + end + + test "interpolation in [0.5, 1.0] band: cosine 0.75 → 95" do + # ratio = (0.75 - 0.5) / (1.0 - 0.5) = 0.25/0.5 = 0.5, score = 90 + 0.5 * 10 = 95 + assert Grader.score_cosine(0.75) == 95 + end + + test "interpolation in [-0.3, 0.0) band: cosine -0.15 → 40" do + # ratio = (-0.15 - (-0.3)) / (0.0 - (-0.3)) = 0.15/0.3 = 0.5, score = 30 + 0.5 * 20 = 40 + assert Grader.score_cosine(-0.15) == 40 + end + + test "interpolation in [-1.0, -0.3) band: cosine -0.65 → 15" do + # ratio = (-0.65 - (-1.0)) / (-0.3 - (-1.0)) = 0.35/0.7 = 0.5, score = 0 + 0.5 * 30 = 15 + assert Grader.score_cosine(-0.65) == 15 + end + + test "result is always an integer" do + for cosine <- [-1.0, -0.5, 0.0, 0.1, 0.3, 0.6, 1.0] do + assert is_integer(Grader.score_cosine(cosine)), + "expected integer for cosine #{cosine}" + end + end + + test "result is always in [0, 100]" do + for cosine <- [-1.0, -0.9, -0.3, 0.0, 0.2, 0.5, 1.0] do + score = Grader.score_cosine(cosine) + + assert score >= 0 and score <= 100, + "score #{score} out of range for cosine #{cosine}" + end + end + end + + # ----------------------------------------------------------------------- + # overall_score/3 (including backward compat as /2) + # ----------------------------------------------------------------------- + + describe "overall_score/3" do + test "empty list returns {0, 'F'}" do + assert Grader.overall_score([], @default_scale) == {0, "F"} + end + + test "equal weights produces arithmetic mean (backward compat /2)" do + categories = [ + %{key: :readability, score: 80}, + %{key: :complexity, score: 60} + ] + + {score, _grade} = Grader.overall_score(categories, @default_scale) + assert score == 70 + end + + test "weighted average applies impact_map correctly" do + categories = [ + %{key: :readability, score: 80}, + %{key: :complexity, score: 60} + ] + + # readability has weight 3, complexity has weight 1 + # weighted = (80*3 + 60*1) / 4 = 300/4 = 75 + impact_map = %{"readability" 
=> 3, "complexity" => 1} + {score, _grade} = Grader.overall_score(categories, @default_scale, impact_map) + assert score == 75 + end + + test "missing keys in impact_map default to 1" do + categories = [ + %{key: :readability, score: 80}, + %{key: :complexity, score: 60} + ] + + # Only readability in map with weight 2; complexity defaults to 1 + # weighted = (80*2 + 60*1) / 3 = 220/3 ≈ 73 + impact_map = %{"readability" => 2} + {score, _grade} = Grader.overall_score(categories, @default_scale, impact_map) + assert score == 73 + end + + test "backward compat: /2 call with empty impact_map equals arithmetic mean" do + categories = [ + %{key: :readability, score: 90}, + %{key: :complexity, score: 70}, + %{key: :naming, score: 50} + ] + + {score_two, grade_two} = Grader.overall_score(categories, @default_scale) + {score_three, grade_three} = Grader.overall_score(categories, @default_scale, %{}) + + assert score_two == score_three + assert grade_two == grade_three + end + + test "returns grade string along with integer score" do + categories = [%{key: :readability, score: 100}] + {score, grade} = Grader.overall_score(categories, @default_scale) + assert is_integer(score) + assert is_binary(grade) + end + + test "atom keys are converted to strings for impact_map lookup" do + categories = [ + %{key: :function_design, score: 60}, + %{key: :variable_naming, score: 40} + ] + + impact_map = %{"function_design" => 2, "variable_naming" => 1} + {score, _} = Grader.overall_score(categories, @default_scale, impact_map) + # (60*2 + 40*1) / 3 = 160/3 ≈ 53 + assert score == 53 + end + end + + # Shared cosines_by_category for grade_cosine_categories/3 tests — computed once for the module. 
+ setup_all do + files = Collector.collect_files("lib", []) + result = Analyzer.analyze_codebase(files) + aggregate = get_in(result, ["codebase", "aggregate"]) + all_cosines = SampleRunner.diagnose_aggregate(aggregate, top: 99_999) + cosines_by_category = Enum.group_by(all_cosines, & &1.category) + {:ok, cosines_by_category: cosines_by_category} + end + + # ----------------------------------------------------------------------- + # grade_cosine_categories/3 + # ----------------------------------------------------------------------- + + describe "grade_cosine_categories/3" do + test "returns a list", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + assert is_list(result) + end + + test "each entry has required top-level keys", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + + for cat <- result do + assert Map.has_key?(cat, :type), "missing :type in #{inspect(cat)}" + assert Map.has_key?(cat, :key), "missing :key" + assert Map.has_key?(cat, :name), "missing :name" + assert Map.has_key?(cat, :score), "missing :score" + assert Map.has_key?(cat, :grade), "missing :grade" + assert Map.has_key?(cat, :behaviors), "missing :behaviors" + end + end + + test "type is :cosine for every entry", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + for cat <- result, do: assert(cat.type == :cosine) + end + + test "scores are integers in [0, 100]", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + + for cat <- result do + assert is_integer(cat.score), "score not integer in #{cat.key}" + assert cat.score >= 0 and cat.score <= 100 + end + end + + test "grade is a string", %{cosines_by_category: cosines_by_category} do + result = 
Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + for cat <- result, do: assert(is_binary(cat.grade)) + end + + test "impact key is absent (HealthReport.generate/2 is responsible for embedding impact)", %{ + cosines_by_category: cosines_by_category + } do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + for cat <- result, do: refute(Map.has_key?(cat, :impact)) + end + + test "name is humanized from key", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + + for cat <- result do + # name must be a non-empty string, words capitalized + assert is_binary(cat.name) + assert String.length(cat.name) > 0 + # key should be a string (category slug) + assert is_binary(cat.key) + end + end + + test "each behavior entry has required keys", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + + for cat <- result, b <- cat.behaviors do + assert Map.has_key?(b, :behavior) + assert Map.has_key?(b, :cosine) + assert Map.has_key?(b, :score) + assert Map.has_key?(b, :grade) + assert Map.has_key?(b, :worst_offenders) + end + end + + test "behavior scores are integers in [0, 100]", %{cosines_by_category: cosines_by_category} do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + + for cat <- result, b <- cat.behaviors do + assert is_integer(b.score) + assert b.score >= 0 and b.score <= 100 + end + end + + test "worst_offenders uses worst_files lookup", %{cosines_by_category: cosines_by_category} do + sentinel = [%{file: "lib/sentinel.ex", cosine: -0.99}] + # Get one real behavior key to inject into worst_files + [first_cat | _] = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + first_behavior = hd(first_cat.behaviors) + lookup_key = "#{first_cat.key}.#{first_behavior.behavior}" + + worst_files = %{lookup_key 
=> sentinel} + result = Grader.grade_cosine_categories(cosines_by_category, worst_files, @default_scale) + + found_cat = Enum.find(result, &(&1.key == first_cat.key)) + found_behavior = Enum.find(found_cat.behaviors, &(&1.behavior == first_behavior.behavior)) + assert found_behavior.worst_offenders == sentinel + end + + test "top_metrics and top_nodes pass through unmodified", %{ + cosines_by_category: cosines_by_category + } do + sentinel = [ + %{ + file: "lib/sentinel.ex", + cosine: -0.99, + top_metrics: [%{metric: "foo.bar", contribution: -1.5}], + top_nodes: [%{"start_line" => 42, "type" => "block"}] + } + ] + + [first_cat | _] = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + first_behavior = hd(first_cat.behaviors) + lookup_key = "#{first_cat.key}.#{first_behavior.behavior}" + + worst_files = %{lookup_key => sentinel} + result = Grader.grade_cosine_categories(cosines_by_category, worst_files, @default_scale) + + found_cat = Enum.find(result, &(&1.key == first_cat.key)) + found_behavior = Enum.find(found_cat.behaviors, &(&1.behavior == first_behavior.behavior)) + assert found_behavior.worst_offenders == sentinel + end + + test "worst_offenders defaults to [] when key absent", %{ + cosines_by_category: cosines_by_category + } do + result = Grader.grade_cosine_categories(cosines_by_category, %{}, @default_scale) + for cat <- result, b <- cat.behaviors, do: assert(b.worst_offenders == []) + end + end + + # ----------------------------------------------------------------------- + # worst_offenders/4 — top_nodes + # ----------------------------------------------------------------------- + + describe "worst_offenders/4 top_nodes" do + @category %{ + key: :function_design, + name: "Function Design", + metrics: [ + %{ + source: "halstead", + name: "tokens", + weight: 1.0, + good: :low, + thresholds: %{a: 10, b: 20, c: 30, d: 40} + } + ] + } + + test "returns top_nodes: [] when file_data has no nodes key" do + files = %{ + "lib/foo.ex" => %{ 
+ "metrics" => %{"halstead" => %{"tokens" => 50.0}}, + "lines" => 10, + "bytes" => 100 + } + } + + result = Grader.worst_offenders(@category, files, 5) + [entry | _] = result + assert entry.top_nodes == [] + end + + test "returns top_nodes: [] when file_data nodes is nil" do + files = %{ + "lib/foo.ex" => %{ + "metrics" => %{"halstead" => %{"tokens" => 50.0}}, + "nodes" => nil, + "lines" => 10, + "bytes" => 100 + } + } + + result = Grader.worst_offenders(@category, files, 5) + [entry | _] = result + assert entry.top_nodes == [] + end + + test "returns top_nodes: [] when file_data nodes is []" do + files = %{ + "lib/foo.ex" => %{ + "metrics" => %{"halstead" => %{"tokens" => 50.0}}, + "nodes" => [], + "lines" => 10, + "bytes" => 100 + } + } + + result = Grader.worst_offenders(@category, files, 5) + [entry | _] = result + assert entry.top_nodes == [] + end + + test "returns top 3 nodes ranked by refactoring_potentials descending" do + nodes = [ + %{ + "start_line" => 1, + "column_start" => 0, + "char_length" => 50, + "type" => "function", + "token_count" => 20, + "refactoring_potentials" => [ + %{"category" => "function_design", "behavior" => "x", "cosine_delta" => 0.5} + ], + "children" => [] + }, + %{ + "start_line" => 10, + "column_start" => 0, + "char_length" => 100, + "type" => "function", + "token_count" => 40, + "refactoring_potentials" => [ + %{"category" => "function_design", "behavior" => "x", "cosine_delta" => 0.9}, + %{"category" => "naming", "behavior" => "y", "cosine_delta" => 0.4} + ], + "children" => [] + }, + %{ + "start_line" => 20, + "column_start" => 0, + "char_length" => 30, + "type" => "function", + "token_count" => 10, + "refactoring_potentials" => [ + %{"category" => "function_design", "behavior" => "z", "cosine_delta" => 0.2} + ], + "children" => [] + }, + %{ + "start_line" => 30, + "column_start" => 0, + "char_length" => 10, + "type" => "function", + "token_count" => 5, + "refactoring_potentials" => [], + "children" => [] + } + ] + + files = 
%{ + "lib/foo.ex" => %{ + "metrics" => %{"halstead" => %{"tokens" => 50.0}}, + "nodes" => nodes, + "lines" => 40, + "bytes" => 400 + } + } + + result = Grader.worst_offenders(@category, files, 5) + [entry | _] = result + + assert length(entry.top_nodes) == 3 + # The node with highest sum of cosine_delta comes first (0.9+0.4=1.3) + [first | _] = entry.top_nodes + assert first["start_line"] == 10 + end + + test "parent+child overlap: only parent is included when both rank top 3" do + child_node = %{ + "start_line" => 11, + "column_start" => 2, + "char_length" => 30, + "type" => "function", + "token_count" => 10, + "refactoring_potentials" => [ + %{"category" => "function_design", "behavior" => "x", "cosine_delta" => 0.8} + ], + "children" => [] + } + + nodes = [ + %{ + "start_line" => 10, + "column_start" => 0, + "char_length" => 100, + "type" => "function", + "token_count" => 40, + "refactoring_potentials" => [ + %{"category" => "function_design", "behavior" => "x", "cosine_delta" => 0.9} + ], + "children" => [child_node] + }, + %{ + "start_line" => 20, + "column_start" => 0, + "char_length" => 50, + "type" => "function", + "token_count" => 20, + "refactoring_potentials" => [ + %{"category" => "naming", "behavior" => "y", "cosine_delta" => 0.5} + ], + "children" => [] + }, + %{ + "start_line" => 30, + "column_start" => 0, + "char_length" => 30, + "type" => "function", + "token_count" => 10, + "refactoring_potentials" => [ + %{"category" => "naming", "behavior" => "z", "cosine_delta" => 0.3} + ], + "children" => [] + } + ] + + files = %{ + "lib/foo.ex" => %{ + "metrics" => %{"halstead" => %{"tokens" => 50.0}}, + "nodes" => nodes, + "lines" => 40, + "bytes" => 400 + } + } + + result = Grader.worst_offenders(@category, files, 5) + [entry | _] = result + + # child_node is not top-level, so only top-level nodes are considered + assert length(entry.top_nodes) == 3 + start_lines = Enum.map(entry.top_nodes, & &1["start_line"]) + refute 11 in start_lines + end + end +end 
diff --git a/test/codeqa/health_report/top_blocks_test.exs b/test/codeqa/health_report/top_blocks_test.exs new file mode 100644 index 00000000..4bef28b2 --- /dev/null +++ b/test/codeqa/health_report/top_blocks_test.exs @@ -0,0 +1,471 @@ +defmodule CodeQA.HealthReport.TopBlocksTest do + use ExUnit.Case, async: true + + alias CodeQA.Git.ChangedFile + alias CodeQA.HealthReport.TopBlocks + + # A node with cosine_delta 0.60 — will be :critical when codebase_cosine = 0.0 (gap=1.0, ratio=0.60) + defp make_node(cosine_delta, token_count \\ 20) do + %{ + "start_line" => 1, + "end_line" => 10, + "type" => "code", + "token_count" => token_count, + "refactoring_potentials" => [ + %{ + "category" => "function_design", + "behavior" => "cyclomatic_complexity_under_10", + "cosine_delta" => cosine_delta + } + ], + "children" => [] + } + end + + defp make_results(nodes) do + %{"files" => %{"lib/foo.ex" => %{"nodes" => nodes}}, "metadata" => %{"path" => "/tmp"}} + end + + defp lookup(cosine \\ 0.0) do + %{{"function_design", "cyclomatic_complexity_under_10"} => cosine} + end + + describe "severity classification" do + test ":critical when severity_ratio > 0.50" do + # gap = max(0.01, 1.0 - 0.0) = 1.0, ratio = 0.60 / 1.0 = 0.60 > 0.50 + [block] = TopBlocks.build(make_results([make_node(0.60)]), [], lookup()) + assert hd(block.potentials).severity == :critical + end + + test ":high when severity_ratio > 0.25 and <= 0.50" do + # ratio = 0.30 / 1.0 = 0.30 + [block] = TopBlocks.build(make_results([make_node(0.30)]), [], lookup()) + assert hd(block.potentials).severity == :high + end + + test ":medium when severity_ratio > 0.10 and <= 0.25" do + # ratio = 0.15 / 1.0 = 0.15 + [block] = TopBlocks.build(make_results([make_node(0.15)]), [], lookup()) + assert hd(block.potentials).severity == :medium + end + + test "filtered when severity_ratio <= 0.10" do + # ratio = 0.05 / 1.0 = 0.05 — block should not appear + assert TopBlocks.build(make_results([make_node(0.05)]), [], lookup()) == [] + end 
+ + test "gap floor prevents division by zero when codebase_cosine = 1.0" do + # gap = max(0.01, 1.0 - 1.0) = 0.01, ratio = 0.02 / 0.01 = 2.0 → :critical + [block] = TopBlocks.build(make_results([make_node(0.02)]), [], lookup(1.0)) + assert hd(block.potentials).severity == :critical + end + + test "gap handles negative codebase_cosine" do + # codebase_cosine = -0.5, gap = max(0.01, 1.0 - (-0.5)) = 1.5 + # ratio = 0.60 / 1.5 = 0.40 → :high + [block] = TopBlocks.build(make_results([make_node(0.60)]), [], lookup(-0.5)) + assert hd(block.potentials).severity == :high + end + + test "unknown behavior defaults codebase_cosine to 0.0" do + lookup_empty = %{} + # gap = 1.0, ratio = 0.60 → :critical + [block] = TopBlocks.build(make_results([make_node(0.60)]), [], lookup_empty) + assert hd(block.potentials).severity == :critical + end + end + + describe "changed_files filtering" do + test "when changed_files is empty, shows all files" do + [block] = TopBlocks.build(make_results([make_node(0.60)]), [], lookup()) + assert block.path == "lib/foo.ex" + assert block.status == nil + end + + test "when changed_files given, only shows matching files" do + changed = [%ChangedFile{path: "lib/other.ex", status: "added"}] + assert TopBlocks.build(make_results([make_node(0.60)]), changed, lookup()) == [] + end + + test "status comes from ChangedFile struct" do + changed = [%ChangedFile{path: "lib/foo.ex", status: "modified"}] + [block] = TopBlocks.build(make_results([make_node(0.60)]), changed, lookup()) + assert block.status == "modified" + end + end + + describe "block filtering" do + test "blocks with token_count < 10 are excluded" do + assert TopBlocks.build(make_results([make_node(0.60, 9)]), [], lookup()) == [] + end + + test "blocks are ordered by highest cosine_delta descending" do + node_low = make_node(0.20) + node_high = put_in(make_node(0.60), ["start_line"], 20) + + results = %{ + "files" => %{"lib/foo.ex" => %{"nodes" => [node_low, node_high]}}, + "metadata" => %{"path" => 
"/tmp"} + } + + blocks = TopBlocks.build(results, [], lookup()) + deltas = Enum.map(blocks, fn b -> hd(b.potentials).cosine_delta end) + assert deltas == Enum.sort(deltas, :desc) + end + + test "children nodes are included" do + parent = %{ + "start_line" => 1, + "end_line" => 20, + "type" => "code", + "token_count" => 5, + "refactoring_potentials" => [], + "children" => [make_node(0.60)] + } + + blocks = TopBlocks.build(make_results([parent]), [], lookup()) + assert length(blocks) == 1 + end + end + + describe "fix hints" do + test "includes fix_hint string for known behavior" do + # naming_conventions/file_name_matches_primary_export has _fix_hint in YAML + node = %{ + "start_line" => 1, + "end_line" => 10, + "type" => "code", + "token_count" => 20, + "refactoring_potentials" => [ + %{ + "category" => "naming_conventions", + "behavior" => "file_name_matches_primary_export", + "cosine_delta" => 0.60 + } + ], + "children" => [] + } + + hint_lookup = %{{"naming_conventions", "file_name_matches_primary_export"} => 0.0} + [block] = TopBlocks.build(make_results([node]), [], hint_lookup) + potential = hd(block.potentials) + assert is_binary(potential.fix_hint) + end + + test "fix_hint is nil for unknown behavior" do + node = %{ + "start_line" => 1, + "end_line" => 10, + "type" => "code", + "token_count" => 20, + "refactoring_potentials" => [ + %{"category" => "unknown_cat", "behavior" => "unknown_beh", "cosine_delta" => 0.60} + ], + "children" => [] + } + + [block] = TopBlocks.build(make_results([node]), [], %{}) + assert hd(block.potentials).fix_hint == nil + end + end + + describe "source code extraction" do + test "includes source code when file exists" do + # Create a temp file + tmp_dir = System.tmp_dir!() + test_dir = Path.join(tmp_dir, "top_blocks_test_#{:rand.uniform(100_000)}") + File.mkdir_p!(test_dir) + file_path = Path.join(test_dir, "test.ex") + File.write!(file_path, "line 1\nline 2\nline 3\nline 4\nline 5") + + results = %{ + "files" => %{"test.ex" => 
%{"nodes" => [make_node(0.60) |> Map.put("end_line", 3)]}}, + "metadata" => %{"path" => test_dir} + } + + [block] = TopBlocks.build(results, [], lookup()) + assert block.source == "line 1\nline 2\nline 3" + assert block.language == "elixir" + + File.rm_rf!(test_dir) + end + + test "source is nil when file does not exist" do + results = %{ + "files" => %{"nonexistent.ex" => %{"nodes" => [make_node(0.60)]}}, + "metadata" => %{"path" => "/nonexistent/path"} + } + + [block] = TopBlocks.build(results, [], lookup()) + assert block.source == nil + end + end + + describe "top N limiting" do + test "returns at most 10 blocks" do + # Create 15 nodes, each 10 lines (within default 3-20 range) + nodes = + for i <- 1..15 do + make_node(0.60 + i * 0.01) + |> put_in(["start_line"], i * 20) + |> put_in(["end_line"], i * 20 + 9) + end + + results = %{ + "files" => %{"lib/foo.ex" => %{"nodes" => nodes}}, + "metadata" => %{"path" => "/tmp"} + } + + blocks = TopBlocks.build(results, [], lookup()) + assert length(blocks) == 10 + end + end + + describe "line range filtering" do + test "blocks outside line range are excluded" do + # 2-line block (below min of 3) + small_node = + make_node(0.60) + |> put_in(["start_line"], 1) + |> put_in(["end_line"], 2) + + # 25-line block (above max of 20) + large_node = + make_node(0.60) + |> put_in(["start_line"], 10) + |> put_in(["end_line"], 34) + + results = %{ + "files" => %{"lib/foo.ex" => %{"nodes" => [small_node, large_node]}}, + "metadata" => %{"path" => "/tmp"} + } + + blocks = TopBlocks.build(results, [], lookup()) + assert blocks == [] + end + + test "blocks within line range are included" do + # 10-line block (within 3-20 range) + node = + make_node(0.60) + |> put_in(["start_line"], 1) + |> put_in(["end_line"], 10) + + results = %{ + "files" => %{"lib/foo.ex" => %{"nodes" => [node]}}, + "metadata" => %{"path" => "/tmp"} + } + + blocks = TopBlocks.build(results, [], lookup()) + assert length(blocks) == 1 + end + + test "line range is 
configurable" do + # 2-line block + small_node = + make_node(0.60) + |> put_in(["start_line"], 1) + |> put_in(["end_line"], 2) + + results = %{ + "files" => %{"lib/foo.ex" => %{"nodes" => [small_node]}}, + "metadata" => %{"path" => "/tmp"} + } + + # Default range (3-20) excludes it + assert TopBlocks.build(results, [], lookup()) == [] + + # Custom range (1-5) includes it + blocks = TopBlocks.build(results, [], lookup(), block_min_lines: 1, block_max_lines: 5) + assert length(blocks) == 1 + end + end + + describe "diff_line_ranges filtering" do + test "when diff_line_ranges is empty map, shows all blocks" do + node = make_node(0.60) + [block] = TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: %{}) + assert block.path == "lib/foo.ex" + end + + test "when diff_line_ranges provided, only shows blocks overlapping diff" do + # Block at lines 1-10 + node = make_node(0.60) + + # Diff changes lines 5-7 (overlaps with block) + diff_ranges = %{"lib/foo.ex" => [{5, 7}]} + + [block] = + TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + + assert block.path == "lib/foo.ex" + end + + test "excludes blocks that don't overlap with diff" do + # Block at lines 1-10 + node = make_node(0.60) + + # Diff changes lines 50-55 (no overlap) + diff_ranges = %{"lib/foo.ex" => [{50, 55}]} + + blocks = TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + assert blocks == [] + end + + test "excludes blocks when file has no diff ranges" do + node = make_node(0.60) + + # Diff only has ranges for different file + diff_ranges = %{"lib/other.ex" => [{1, 10}]} + + blocks = TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + assert blocks == [] + end + + test "includes block with exact overlap" do + # Block at lines 5-15 + node = + make_node(0.60) + |> put_in(["start_line"], 5) + |> put_in(["end_line"], 15) + + # Diff changes exactly lines 5-15 + diff_ranges = %{"lib/foo.ex" => [{5, 
15}]} + + [block] = + TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + + assert block.start_line == 5 + assert block.end_line == 15 + end + + test "includes block with partial overlap at start" do + # Block at lines 10-20 + node = + make_node(0.60) + |> put_in(["start_line"], 10) + |> put_in(["end_line"], 20) + + # Diff changes lines 5-12 (overlaps start of block) + diff_ranges = %{"lib/foo.ex" => [{5, 12}]} + + [block] = + TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + + assert block.start_line == 10 + end + + test "includes block with partial overlap at end" do + # Block at lines 10-20 + node = + make_node(0.60) + |> put_in(["start_line"], 10) + |> put_in(["end_line"], 20) + + # Diff changes lines 18-25 (overlaps end of block) + diff_ranges = %{"lib/foo.ex" => [{18, 25}]} + + [block] = + TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + + assert block.end_line == 20 + end + + test "includes block when diff is entirely inside block" do + # Block at lines 1-10 + node = make_node(0.60) + + # Diff changes lines 3-5 (inside block) + diff_ranges = %{"lib/foo.ex" => [{3, 5}]} + + [block] = + TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + + assert block.path == "lib/foo.ex" + assert block.start_line == 1 + assert block.end_line == 10 + end + + test "works with multiple diff ranges for same file" do + # Block at lines 1-10 + node = make_node(0.60) + + # Diff changes lines 50-55 and 5-7 (second range overlaps) + diff_ranges = %{"lib/foo.ex" => [{50, 55}, {5, 7}]} + + [block] = + TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + + assert block.path == "lib/foo.ex" + assert block.start_line == 1 + assert block.end_line == 10 + end + + test "excludes adjacent but non-overlapping ranges" do + # Block at lines 1-10 + node = make_node(0.60) + + # Diff changes line 11 (adjacent but not overlapping) + 
diff_ranges = %{"lib/foo.ex" => [{11, 11}]} + + blocks = TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + assert blocks == [] + end + + test "excludes blocks when file has empty diff ranges list" do + node = make_node(0.60) + + # File is present but with empty ranges (e.g., only deletions) + diff_ranges = %{"lib/foo.ex" => []} + + blocks = TopBlocks.build(make_results([node]), [], lookup(), diff_line_ranges: diff_ranges) + assert blocks == [] + end + + test "single-line block overlapping single-line diff" do + # Single-line block at line 5 + node = + make_node(0.60) + |> put_in(["start_line"], 5) + |> put_in(["end_line"], 5) + + diff_ranges = %{"lib/foo.ex" => [{5, 5}]} + + # Need to adjust min_lines for this test since block is only 1 line + [block] = + TopBlocks.build(make_results([node]), [], lookup(), + diff_line_ranges: diff_ranges, + block_min_lines: 1 + ) + + assert block.start_line == 5 + assert block.end_line == 5 + end + + test "when both changed_files and diff_line_ranges provided, both filters apply" do + node = make_node(0.60) + changed = [%ChangedFile{path: "lib/foo.ex", status: "modified"}] + diff_ranges = %{"lib/foo.ex" => [{5, 7}]} + + [block] = + TopBlocks.build(make_results([node]), changed, lookup(), diff_line_ranges: diff_ranges) + + assert block.path == "lib/foo.ex" + assert block.status == "modified" + assert block.start_line == 1 + assert block.end_line == 10 + end + + test "changed_files filter applies before diff_line_ranges filter" do + node = make_node(0.60) + # File is in diff_ranges but not in changed_files + changed = [%ChangedFile{path: "lib/other.ex", status: "modified"}] + diff_ranges = %{"lib/foo.ex" => [{5, 7}]} + + blocks = + TopBlocks.build(make_results([node]), changed, lookup(), diff_line_ranges: diff_ranges) + + assert blocks == [] + end + end +end diff --git a/test/codeqa/health_report_test.exs b/test/codeqa/health_report_test.exs new file mode 100644 index 00000000..80f8575f --- 
/dev/null +++ b/test/codeqa/health_report_test.exs @@ -0,0 +1,131 @@ +defmodule CodeQA.HealthReportTest do + use ExUnit.Case, async: true + + alias CodeQA.BlockImpactAnalyzer + alias CodeQA.Engine.Analyzer + alias CodeQA.Git.ChangedFile + alias CodeQA.HealthReport + + describe "generate/2 output keys" do + @tag :slow + test "without base_results: pr_summary and codebase_delta are nil" do + files = %{"lib/foo.ex" => "defmodule Foo do\n def bar, do: :ok\nend\n"} + results = Analyzer.analyze_codebase(files) + results = BlockImpactAnalyzer.analyze(results, files) + + report = HealthReport.generate(results) + + assert report.pr_summary == nil + assert report.codebase_delta == nil + assert is_list(report.top_blocks) + assert Map.has_key?(report, :overall_score) + assert Map.has_key?(report, :overall_grade) + assert Map.has_key?(report, :categories) + assert Map.has_key?(report, :top_issues) + end + + @tag :slow + test "without base_results: top_blocks shows top 10 blocks by impact" do + files = %{"lib/foo.ex" => "defmodule Foo do\n def bar, do: :ok\nend\n"} + results = Analyzer.analyze_codebase(files) + results = BlockImpactAnalyzer.analyze(results, files) + + report = HealthReport.generate(results) + + # top_blocks is a flat list of blocks (may be empty if no blocks above threshold) + assert is_list(report.top_blocks) + + Enum.each(report.top_blocks, fn block -> + assert Map.has_key?(block, :path) + assert Map.has_key?(block, :status) + assert Map.has_key?(block, :potentials) + assert Map.has_key?(block, :source) + assert block.status == nil + end) + end + + @tag :slow + test "worst_offenders is always empty in categories" do + files = %{"lib/foo.ex" => "defmodule Foo do\n def bar, do: :ok\nend\n"} + results = Analyzer.analyze_codebase(files) + results = BlockImpactAnalyzer.analyze(results, files) + + report = HealthReport.generate(results) + + Enum.each(report.categories, fn cat -> + assert Map.get(cat, :worst_offenders, []) == [] + end) + end + end + + describe 
"generate/2 with base_results" do + @tag :slow + test "pr_summary is populated" do + files = %{"lib/foo.ex" => "defmodule Foo do\n def bar, do: :ok\nend\n"} + head_results = Analyzer.analyze_codebase(files) + head_results = BlockImpactAnalyzer.analyze(head_results, files) + base_results = Analyzer.analyze_codebase(files) + + changed = [%ChangedFile{path: "lib/foo.ex", status: "modified"}] + + report = + HealthReport.generate(head_results, + base_results: base_results, + changed_files: changed + ) + + assert %{ + base_score: base_score, + head_score: head_score, + score_delta: delta, + base_grade: _, + head_grade: _, + blocks_flagged: flagged, + files_changed: 1, + files_added: 0, + files_modified: 1 + } = report.pr_summary + + assert is_integer(base_score) + assert is_integer(head_score) + assert delta == head_score - base_score + assert is_integer(flagged) + end + + @tag :slow + test "codebase_delta is populated" do + files = %{"lib/foo.ex" => "defmodule Foo do\n def bar, do: :ok\nend\n"} + head_results = Analyzer.analyze_codebase(files) + head_results = BlockImpactAnalyzer.analyze(head_results, files) + base_results = Analyzer.analyze_codebase(files) + + report = HealthReport.generate(head_results, base_results: base_results) + + assert %{base: %{aggregate: _}, head: %{aggregate: _}, delta: %{aggregate: _}} = + report.codebase_delta + end + + @tag :slow + test "top_blocks scoped to changed_files" do + files = %{ + "lib/foo.ex" => "defmodule Foo do\n def bar, do: :ok\nend\n", + "lib/bar.ex" => "defmodule Bar do\n def baz, do: :ok\nend\n" + } + + head_results = Analyzer.analyze_codebase(files) + head_results = BlockImpactAnalyzer.analyze(head_results, files) + base_results = Analyzer.analyze_codebase(files) + + changed = [%ChangedFile{path: "lib/foo.ex", status: "modified"}] + + report = + HealthReport.generate(head_results, + base_results: base_results, + changed_files: changed + ) + + paths = Enum.map(report.top_blocks, & &1.path) + refute "lib/bar.ex" in paths + 
end + end +end diff --git a/test/codeqa/metrics/codebase/near_duplicate_blocks_codebase_test.exs b/test/codeqa/metrics/codebase/near_duplicate_blocks_codebase_test.exs new file mode 100644 index 00000000..1c797761 --- /dev/null +++ b/test/codeqa/metrics/codebase/near_duplicate_blocks_codebase_test.exs @@ -0,0 +1,98 @@ +defmodule CodeQA.Metrics.Codebase.NearDuplicateBlocksCodebaseTest do + use ExUnit.Case, async: true + alias CodeQA.Analysis.FileContextServer + alias CodeQA.Metrics.Codebase.NearDuplicateBlocksCodebase + + defp files(pairs), do: Map.new(pairs) + + defp with_pid(fun) do + {:ok, pid} = FileContextServer.start_link() + fun.(pid) + end + + describe "name/0" do + test "returns near_duplicate_blocks_codebase" do + assert NearDuplicateBlocksCodebase.name() == "near_duplicate_blocks_codebase" + end + end + + describe "analyze/2" do + test "returns all count keys d0..d8" do + with_pid(fn pid -> + result = + NearDuplicateBlocksCodebase.analyze(files([{"a.ex", "x = 1\n"}]), file_context_pid: pid) + + for d <- 0..8, do: assert(Map.has_key?(result, "near_dup_block_d#{d}")) + end) + end + + test "returns all pairs keys d0..d8" do + with_pid(fn pid -> + result = + NearDuplicateBlocksCodebase.analyze(files([{"a.ex", "x = 1\n"}]), file_context_pid: pid) + + for d <- 0..8, do: assert(Map.has_key?(result, "near_dup_block_d#{d}_pairs")) + end) + end + + test "zero counts for a single trivial file" do + with_pid(fn pid -> + result = + NearDuplicateBlocksCodebase.analyze(files([{"a.ex", "x = 1\n"}]), file_context_pid: pid) + + assert result["near_dup_block_d0"] == 0 + end) + end + + test "detects exact duplicate block across two files" do + block = "def foo\n x = 1\nend\n" + + with_pid(fn pid -> + result = + NearDuplicateBlocksCodebase.analyze( + files([{"a.ex", block}, {"b.ex", block}]), + file_context_pid: pid + ) + + assert result["near_dup_block_d0"] >= 1 + end) + end + + test "pair sources include file paths" do + block = "def foo\n x = 1\nend\n" + + with_pid(fn pid 
-> + result = + NearDuplicateBlocksCodebase.analyze( + files([{"a.ex", block}, {"b.ex", block}]), + file_context_pid: pid + ) + + all_pairs = result |> Map.values() |> Enum.filter(&is_list/1) |> List.flatten() + + if all_pairs != [] do + pair = hd(all_pairs) + assert Map.has_key?(pair, "source_a") + assert Map.has_key?(pair, "source_b") + end + end) + end + + test "pairs list is capped at max_pairs_per_bucket" do + block = "def foo\n x = 1\nend\n" + many_files = for i <- 1..5, do: {"file#{i}.ex", block} + + with_pid(fn pid -> + result = + NearDuplicateBlocksCodebase.analyze( + files(many_files), + file_context_pid: pid, + near_duplicate_blocks: [max_pairs_per_bucket: 2] + ) + + pairs_lists = result |> Map.values() |> Enum.filter(&is_list/1) + assert Enum.all?(pairs_lists, &(length(&1) <= 2)) + end) + end + end +end diff --git a/test/codeqa/metrics/codebase/similarity_test.exs b/test/codeqa/metrics/codebase/similarity_test.exs new file mode 100644 index 00000000..d20dbf13 --- /dev/null +++ b/test/codeqa/metrics/codebase/similarity_test.exs @@ -0,0 +1,79 @@ +defmodule CodeQA.Metrics.Codebase.SimilarityTest do + use ExUnit.Case, async: true + alias CodeQA.Metrics.Codebase.Similarity + + describe "name/0" do + test "returns similarity" do + assert Similarity.name() == "similarity" + end + end + + describe "analyze/2 with fewer than 2 files" do + test "empty codebase returns zero density" do + result = Similarity.analyze(%{}) + assert result["cross_file_density"] == 0.0 + end + + test "single file returns zero density" do + result = Similarity.analyze(%{"a.ex" => "x = 1"}) + assert result["cross_file_density"] == 0.0 + end + + test "fewer than 2 files returns empty ncd_pairs" do + result = Similarity.analyze(%{"a.ex" => "x = 1"}) + assert result["ncd_pairs"] == %{} + end + end + + describe "analyze/2 cross_file_density" do + test "returns a float between 0 and 2" do + files = %{"a.ex" => "def foo, do: 1", "b.ex" => "def bar, do: 2"} + result = Similarity.analyze(files) 
+ assert is_float(result["cross_file_density"]) + assert result["cross_file_density"] >= 0.0 + end + + test "identical files produce higher density than dissimilar files" do + content = String.duplicate("def foo do\n x = 1\nend\n", 20) + identical = %{"a.ex" => content, "b.ex" => content} + dissimilar = %{"a.ex" => content, "b.ex" => String.duplicate("zzz qqq rrr\n", 20)} + + assert Similarity.analyze(identical)["cross_file_density"] > + Similarity.analyze(dissimilar)["cross_file_density"] + end + + test "does not return ncd_pairs key by default" do + files = %{"a.ex" => "x = 1", "b.ex" => "y = 2"} + result = Similarity.analyze(files) + refute Map.has_key?(result, "ncd_pairs") + end + end + + describe "analyze/2 with show_ncd: true" do + test "returns ncd_pairs key" do + files = %{"a.ex" => "x = 1", "b.ex" => "y = 2"} + result = Similarity.analyze(files, show_ncd: true) + assert Map.has_key?(result, "ncd_pairs") + end + + test "identical files have ncd near 0" do + content = String.duplicate("def foo do\n x = 1\nend\n", 10) + files = %{"a.ex" => content, "b.ex" => content} + + result = Similarity.analyze(files, show_ncd: true, ncd_paths: ["a.ex"]) + pairs = result["ncd_pairs"] + + scores = pairs |> Map.values() |> List.flatten() |> Enum.map(& &1["score"]) + assert Enum.all?(scores, &(&1 < 0.2)) + end + + test "ncd_paths restricts which files are compared" do + files = %{"a.ex" => "x = 1", "b.ex" => "y = 2", "c.ex" => "z = 3"} + result = Similarity.analyze(files, show_ncd: true, ncd_paths: ["a.ex"]) + pairs = result["ncd_pairs"] + assert Map.has_key?(pairs, "a.ex") + refute Map.has_key?(pairs, "b.ex") + refute Map.has_key?(pairs, "c.ex") + end + end +end diff --git a/test/codeqa/metrics/file/bradford_test.exs b/test/codeqa/metrics/file/bradford_test.exs new file mode 100644 index 00000000..db948d9e --- /dev/null +++ b/test/codeqa/metrics/file/bradford_test.exs @@ -0,0 +1,122 @@ +defmodule CodeQA.Metrics.File.BradfordTest do + use ExUnit.Case, async: true + + alias 
CodeQA.Engine.Pipeline + alias CodeQA.Metrics.File.Bradford + + # Bradford zones are built by ranking lines densest-first, then walking down + # until each third of total tokens is accumulated: + # zone 1 (core) — fewest lines needed to reach 1/3 of all tokens + # zone 2 (middle) — fewest additional lines to reach 2/3 + # zone 3 (tail) — all remaining lines + # + # k1 = zone2_lines / zone1_lines — how many more lines the middle needs vs the core + # k2 = zone3_lines / zone2_lines — how many more lines the tail needs vs the middle + # k_ratio = k2 / k1 — > 1 means tail is more stretched; < 1 means core is extreme + + defp ctx(code), do: Pipeline.build_file_context(code) + defp result(code), do: Bradford.analyze(ctx(code)) + + describe "analyze/1 - edge cases" do + test "returns zeros for empty content" do + # can't form three meaningful zones with nothing + assert result("") == %{"k1" => 0.0, "k2" => 0.0, "k_ratio" => 0.0} + end + + test "returns zeros for a single line" do + # a single line cannot be split into three zones + assert result("a b c") == %{"k1" => 0.0, "k2" => 0.0, "k_ratio" => 0.0} + end + + test "returns zeros for two lines" do + # two lines still can't fill three zones + assert result("a b c\nd e f") == %{"k1" => 0.0, "k2" => 0.0, "k_ratio" => 0.0} + end + end + + describe "analyze/1 - uniform distribution" do + # 9 lines × 3 tokens = 27 total, third = 9 + # sorted counts: [3, 3, 3, 3, 3, 3, 3, 3, 3] + # zone 1: 3 lines (3+3+3 = 9 ≥ 9) + # zone 2: 3 lines (3+3+3 = 9 ≥ 9) + # zone 3: 3 lines remaining + # k1 = 3/3 = 1.0 — middle needs the same number of lines as the core + # k2 = 3/3 = 1.0 — tail needs the same number of lines as the middle + # k_ratio = 1.0 — perfectly symmetric: no zone is more stretched than another + test "uniform file has k = 1" do + code = Enum.map_join(1..9, "\n", fn _ -> "a b c" end) + assert result(code) == %{"k1" => 1.0, "k2" => 1.0, "k_ratio" => 1.0} + end + end + + describe "analyze/1 - Bradford concentration" do + # 1 
line with 10 tokens + 3 lines with 3 tokens + 9 lines with 1 token + # total = 28, third ≈ 9.333 + # sorted: [10, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1] (13 lines) + # zone 1: 1 line (10 ≥ 9.333) → n1 = 1 + # zone 2: 4 lines (3+3+3 = 9 < 9.333; +1 → 10 ≥ 9.333) → n2 = 4 + # zone 3: 8 lines remaining → n3 = 8 + # k1 = 4/1 = 4.0 — the middle needs 4× more lines than the dense core + # k2 = 8/4 = 2.0 — the tail needs 2× more lines than the middle + # k_ratio = 0.5 — the core-to-middle jump (4×) is bigger than middle-to-tail (2×), + # meaning extreme concentration is at the very top, not spread across zones + test "concentrated file produces k1=4.0, k2=2.0, k_ratio=0.5" do + dense = "a b c d e f g h i j" + medium = Enum.map_join(1..3, "\n", fn _ -> "a b c" end) + sparse = Enum.map_join(1..9, "\n", fn _ -> "a" end) + code = Enum.join([dense, medium, sparse], "\n") + + assert result(code) == %{ + # 1 dense line does the work of 4 middle lines — extreme core + "k1" => 4.0, + # 4 middle lines do the work of 8 tail lines — moderate long tail + "k2" => 2.0, + # k2 < k1: the core is more concentrated than the tail is sparse + "k_ratio" => 0.5 + } + end + + test "concentrated file has higher k1 than uniform" do + # k1 is the primary concentration signal: how many times more lines the + # middle zone needs compared to the core. A uniform file scores 1.0 here. 
+ uniform = Enum.map_join(1..9, "\n", fn _ -> "a b c" end) + + dense = "a b c d e f g h i j" + medium = Enum.map_join(1..3, "\n", fn _ -> "a b c" end) + sparse = Enum.map_join(1..9, "\n", fn _ -> "a" end) + concentrated = Enum.join([dense, medium, sparse], "\n") + + assert result(concentrated)["k1"] > result(uniform)["k1"] + end + + test "k_ratio < 1 when the core is more extreme than the tail" do + # k_ratio = k2 / k1 + # k_ratio < 1 → k2 < k1 → the core-to-middle multiplier exceeds the + # middle-to-tail multiplier: the spike is at + # the very top, not spread evenly down the rank list + # k_ratio > 1 → k2 > k1 → the tail is more stretched than the core jump, + # typical of many medium lines plus a huge sparse tail + code = + Enum.join( + [ + "a b c d e f g h i j", + "a b c", + "a b c", + "a b c", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a" + ], + "\n" + ) + + assert result(code)["k_ratio"] < 1.0 + end + end +end diff --git a/test/codeqa/metrics/branching_test.exs b/test/codeqa/metrics/file/branching_test.exs similarity index 79% rename from test/codeqa/metrics/branching_test.exs rename to test/codeqa/metrics/file/branching_test.exs index d7947800..c4749b3f 100644 --- a/test/codeqa/metrics/branching_test.exs +++ b/test/codeqa/metrics/file/branching_test.exs @@ -1,8 +1,8 @@ -defmodule CodeQA.Metrics.BranchingTest do +defmodule CodeQA.Metrics.File.BranchingTest do use ExUnit.Case, async: true - alias CodeQA.Metrics.Branching - alias CodeQA.Pipeline + alias CodeQA.Engine.Pipeline + alias CodeQA.Metrics.File.Branching defp ctx(code), do: Pipeline.build_file_context(code) defp density(code), do: Branching.analyze(ctx(code))["branching_density"] @@ -27,7 +27,9 @@ defmodule CodeQA.Metrics.BranchingTest do for keyword <- Branching.branching_keywords() |> MapSet.to_list() |> Enum.sort() do test "counts #{keyword} as a branching token" do code = "line_before\n#{unquote(keyword)} condition\nline_after" - assert density(code) > 0.0, "expected 
'#{unquote(keyword)}' to be counted as a branching token" + + assert density(code) > 0.0, + "expected '#{unquote(keyword)}' to be counted as a branching token" end end end diff --git a/test/codeqa/metrics/file/brevity_test.exs b/test/codeqa/metrics/file/brevity_test.exs new file mode 100644 index 00000000..4f65fa05 --- /dev/null +++ b/test/codeqa/metrics/file/brevity_test.exs @@ -0,0 +1,44 @@ +defmodule CodeQA.Metrics.File.BrevityTest do + use ExUnit.Case, async: true + + alias CodeQA.Engine.Pipeline + alias CodeQA.Metrics.File.Brevity + + defp ctx(code), do: Pipeline.build_file_context(code) + defp result(code), do: Brevity.analyze(ctx(code)) + + describe "analyze/1 - edge cases" do + test "returns zeros for empty content" do + assert result("") == %{"correlation" => 0.0, "slope" => 0.0, "sample_size" => 0} + end + + test "returns zeros for fewer than 3 unique tokens" do + assert result("a a b b")["correlation"] == 0.0 + assert result("a a b b")["slope"] == 0.0 + end + end + + describe "analyze/1 - brevity law" do + test "negative correlation when shorter tokens are more frequent" do + # x(len=1): 10×, to(len=2): 3×, longname(len=8): 1× + code = String.duplicate("x ", 10) <> String.duplicate("to ", 3) <> "longname" + assert result(code)["correlation"] < 0.0 + end + + test "positive correlation when longer tokens are more frequent" do + # longword(len=8): 4×, a(len=1): 1×, b(len=1): 1× + code = String.duplicate("longword ", 4) <> "a b" + assert result(code)["correlation"] > 0.0 + end + + test "sample_size reflects unique token count" do + code = "alpha beta gamma alpha beta" + assert result(code)["sample_size"] == 3 + end + + test "slope is negative when brevity law holds" do + code = String.duplicate("x ", 10) <> String.duplicate("to ", 3) <> "longname" + assert result(code)["slope"] < 0.0 + end + end +end diff --git a/test/codeqa/metrics/function_metrics_test.exs b/test/codeqa/metrics/file/function_metrics_test.exs similarity index 78% rename from 
test/codeqa/metrics/function_metrics_test.exs rename to test/codeqa/metrics/file/function_metrics_test.exs index caa1f6bf..7f05b906 100644 --- a/test/codeqa/metrics/function_metrics_test.exs +++ b/test/codeqa/metrics/file/function_metrics_test.exs @@ -1,8 +1,8 @@ -defmodule CodeQA.Metrics.FunctionMetricsTest do +defmodule CodeQA.Metrics.File.FunctionMetricsTest do use ExUnit.Case, async: true - alias CodeQA.Metrics.FunctionMetrics - alias CodeQA.Pipeline + alias CodeQA.Engine.Pipeline + alias CodeQA.Metrics.File.FunctionMetrics defp ctx(code), do: Pipeline.build_file_context(code) defp analyze(code), do: FunctionMetrics.analyze(ctx(code)) @@ -50,8 +50,10 @@ defmodule CodeQA.Metrics.FunctionMetricsTest do for keyword <- FunctionMetrics.func_keywords() do test "detects function starting with #{keyword}" do code = "#{unquote(keyword)} my_func(x) {\n return x\n}" - result = FunctionMetrics.analyze(CodeQA.Pipeline.build_file_context(code)) - assert result["avg_function_lines"] > 0, "expected '#{unquote(keyword)}' to be detected as function start" + result = FunctionMetrics.analyze(Pipeline.build_file_context(code)) + + assert result["avg_function_lines"] > 0, + "expected '#{unquote(keyword)}' to be detected as function start" end end end @@ -60,8 +62,10 @@ defmodule CodeQA.Metrics.FunctionMetricsTest do for modifier <- FunctionMetrics.access_modifiers() do test "detects method starting with #{modifier}" do code = "#{unquote(modifier)} void MyMethod() {\n return;\n}" - result = FunctionMetrics.analyze(CodeQA.Pipeline.build_file_context(code)) - assert result["avg_function_lines"] > 0, "expected '#{unquote(modifier)}' access modifier to trigger method detection" + result = FunctionMetrics.analyze(Pipeline.build_file_context(code)) + + assert result["avg_function_lines"] > 0, + "expected '#{unquote(modifier)}' access modifier to trigger method detection" end end end diff --git a/test/codeqa/metrics/file/near_duplicate_blocks_file_test.exs 
b/test/codeqa/metrics/file/near_duplicate_blocks_file_test.exs new file mode 100644 index 00000000..cb10540c --- /dev/null +++ b/test/codeqa/metrics/file/near_duplicate_blocks_file_test.exs @@ -0,0 +1,62 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocksFileTest do + use ExUnit.Case, async: true + alias CodeQA.Engine.Pipeline + alias CodeQA.Metrics.File.NearDuplicateBlocksFile + + defp ctx(code, path \\ "test.ex") do + Pipeline.build_file_context(code, path: path) + end + + describe "name/0" do + test "returns near_duplicate_blocks_file" do + assert NearDuplicateBlocksFile.name() == "near_duplicate_blocks_file" + end + end + + describe "keys/0" do + test "returns 11 keys: block_count, sub_block_count, and d0..d8" do + keys = NearDuplicateBlocksFile.keys() + assert length(keys) == 11 + assert "block_count" in keys + assert "sub_block_count" in keys + assert "near_dup_block_d0" in keys + assert "near_dup_block_d8" in keys + end + end + + describe "analyze/1 with nil blocks" do + test "returns zeroed map with all keys when blocks is nil" do + ctx = Pipeline.build_file_context("x = 1\n", skip_structural: true) + result = NearDuplicateBlocksFile.analyze(ctx) + assert Map.has_key?(result, "block_count") + assert Map.has_key?(result, "sub_block_count") + for d <- 0..8, do: assert(Map.has_key?(result, "near_dup_block_d#{d}")) + for {_key, value} <- result, do: assert(value == 0) + end + end + + describe "analyze/1" do + test "returns a map with all expected keys" do + result = NearDuplicateBlocksFile.analyze(ctx("x = 1\n")) + assert Map.has_key?(result, "block_count") + assert Map.has_key?(result, "sub_block_count") + for d <- 0..8, do: assert(Map.has_key?(result, "near_dup_block_d#{d}")) + end + + test "no _pairs keys in output" do + result = NearDuplicateBlocksFile.analyze(ctx("x = 1\n")) + refute Enum.any?(Map.keys(result), &String.ends_with?(&1, "_pairs")) + end + + test "detects exact duplicate blocks at d0" do + block = "def foo\n x = 1\nend\n" + result = 
NearDuplicateBlocksFile.analyze(ctx(block <> "\n\n" <> block)) + assert result["near_dup_block_d0"] >= 1 + end + + test "block_count is positive for non-trivial file" do + result = NearDuplicateBlocksFile.analyze(ctx("def foo\n x\nend\n")) + assert result["block_count"] >= 1 + end + end +end diff --git a/test/codeqa/metrics/file/near_duplicate_blocks_test.exs b/test/codeqa/metrics/file/near_duplicate_blocks_test.exs new file mode 100644 index 00000000..a65e201e --- /dev/null +++ b/test/codeqa/metrics/file/near_duplicate_blocks_test.exs @@ -0,0 +1,227 @@ +defmodule CodeQA.Metrics.File.NearDuplicateBlocksTest do + use ExUnit.Case, async: true + alias CodeQA.Metrics.File.NearDuplicateBlocks, as: NDB + + describe "token_edit_distance/2" do + test "identical sequences have distance 0" do + assert NDB.token_edit_distance(~w[a b c], ~w[a b c]) == 0 + end + + test "empty vs non-empty equals length of other" do + assert NDB.token_edit_distance([], ~w[a b c]) == 3 + assert NDB.token_edit_distance(~w[a b c], []) == 3 + end + + test "single substitution" do + assert NDB.token_edit_distance(~w[a b c], ~w[a x c]) == 1 + end + end + + describe "find_pairs/2 idf_max_freq option" do + defp make_block(tokens, label) do + %CodeQA.AST.Enrichment.Node{ + label: label, + tokens: Enum.map(tokens, &%{kind: &1}), + line_count: length(tokens), + children: [] + } + end + + test "exact duplicates are still detected when all bigrams are high-frequency" do + # 30 blocks all sharing bigram [end, nil] → pruned by IDF + # Two additional identical blocks → should still match via exact hash index (d0) + common = Enum.map(1..30, fn i -> make_block(~w[end nil common_#{i}], "file:#{i}") end) + dup = make_block(~w[end nil special unique_token], "dup:1") + dup2 = make_block(~w[end nil special unique_token], "dup:2") + + result = NDB.find_pairs(common ++ [dup, dup2], idf_max_freq: 0.05) + + assert result[0].count >= 1 + end + + test "near-duplicates are detected via non-pruned unique bigrams" do + # 50 
blocks all sharing [end, nil] → pruned + # Two near-duplicates sharing unique bigrams [nil, special], [special, alpha] → not pruned + common = Enum.map(1..50, fn i -> make_block(~w[end nil common_#{i}], "common:#{i}") end) + near_a = make_block(~w[end nil special alpha beta gamma], "near:1") + near_b = make_block(~w[end nil special alpha beta delta], "near:2") + + result = NDB.find_pairs(common ++ [near_a, near_b], idf_max_freq: 0.05) + + total = Map.values(result) |> Enum.map(& &1.count) |> Enum.sum() + assert total >= 1 + end + end + + describe "analyze_from_blocks/2 sub_block_count" do + test "sub_block_count equals sum of children counts across all blocks" do + child = make_block(["x"], "child:1") + + parent = %CodeQA.AST.Enrichment.Node{ + label: "a:1", + tokens: Enum.map(["def", "", "end"], &%{kind: &1}), + line_count: 3, + children: [child, child] + } + + solo = make_block(["y", "z", "w", "v", "u"], "b:1") + result = NDB.analyze_from_blocks([parent, solo], []) + assert result["sub_block_count"] == 2 + end + + test "sub_block_count is zero when no block has children" do + a = make_block(["x", "y", "z", "w", "v"], "a:1") + b = make_block(["x", "y", "z", "w", "Q"], "b:1") + result = NDB.analyze_from_blocks([a, b], []) + assert result["sub_block_count"] == 0 + end + end + + describe "canonical_values (via find_pairs)" do + test "blocks identical except for leading/trailing newline tokens are detected as d0 exact duplicates" do + core = ["def", "", "end"] + trimmed = make_block(core, "a:1") + with_nl = make_block([""] ++ core ++ [""], "b:1") + result = NDB.find_pairs([trimmed, with_nl], []) + assert Map.get(result, 0, %{count: 0}).count >= 1 + end + + test "blocks identical except for leading/trailing whitespace tokens are detected as d0 exact duplicates" do + core = ["def", "", "end"] + trimmed = make_block(core, "a:1") + with_ws = make_block([""] ++ core ++ [""], "b:1") + result = NDB.find_pairs([trimmed, with_ws], []) + assert Map.get(result, 0, %{count: 
0}).count >= 1 + end + end + + describe "find_pairs/2 near-boundary behavior" do + test "pair at exactly d8 boundary (50% edit distance) is detected" do + # 10 tokens each, 5 substitutions = exactly 50% edit distance → d8 + # First 5 tokens identical → 4 shared bigrams, passes shingle filter + a = ~w[a b c d e f g h i j] + b = ~w[a b c d e X Y Z W V] + result = NDB.find_pairs([make_block(a, "x:1"), make_block(b, "x:2")], []) + total = Map.values(result) |> Enum.map(& &1.count) |> Enum.sum() + assert total >= 1 + end + + test "pair just over d8 boundary (>50% edit distance) is not reported" do + # a: 10 tokens, b: 11 tokens — first 5 identical (4 shared bigrams, passes shingle), + # abs(10-11)=1 passes token-length guard, but edit distance = 6 (60%) → nil + a = ~w[a b c d e f g h i j] + b = ~w[a b c d e X Y Z W V U] + result = NDB.find_pairs([make_block(a, "x:1"), make_block(b, "x:2")], []) + total = Map.values(result) |> Enum.map(& &1.count) |> Enum.sum() + assert total == 0 + end + end + + describe "percent_bucket/2" do + test "returns 0 for edit distance 0" do + assert NDB.percent_bucket(0, 100) == 0 + end + + test "returns 1 for 1% difference (within 0–5%)" do + assert NDB.percent_bucket(1, 100) == 1 + end + + test "returns 1 for 5% difference (boundary)" do + assert NDB.percent_bucket(5, 100) == 1 + end + + test "returns 2 for 6% difference" do + assert NDB.percent_bucket(6, 100) == 2 + end + + test "returns 8 for 50% difference" do + assert NDB.percent_bucket(50, 100) == 8 + end + + test "returns nil for >50% difference" do + assert NDB.percent_bucket(51, 100) == nil + end + + test "returns nil when min_token_count is 0" do + assert NDB.percent_bucket(0, 0) == nil + end + + test "returns 7 for exactly 40% (d7 upper boundary)" do + assert NDB.percent_bucket(40, 100) == 7 + end + + test "returns 8 for 41% (just above d7 boundary, in d8)" do + assert NDB.percent_bucket(41, 100) == 8 + end + + test "returns 7 for mid-range d7 (35%)" do + assert 
NDB.percent_bucket(35, 100) == 7 + end + end + + describe "analyze/2" do + test "returns all expected count keys" do + result = NDB.analyze([{"a.ex", "x = 1\n"}], []) + + for d <- 0..8 do + assert Map.has_key?(result, "near_dup_block_d#{d}") + end + end + + test "returns block_count and sub_block_count" do + result = NDB.analyze([{"a.ex", "def foo\n x\nend\n"}], []) + assert Map.has_key?(result, "block_count") + assert Map.has_key?(result, "sub_block_count") + end + + test "block_count reflects detected blocks" do + code = "def foo\n x\nend\n\n\ndef bar\n y\nend\n" + result = NDB.analyze([{"a.ex", code}], []) + assert result["block_count"] >= 2 + end + + test "detects exact duplicate blocks at d0" do + # Two identical function-like blocks separated by blank lines + block = "def foo\n x = 1\nend\n" + result = NDB.analyze([{"a.ex", block <> "\n\n" <> block}], []) + assert result["near_dup_block_d0"] >= 1 + end + + test "detects near-duplicate blocks (single token difference)" do + block_a = "def foo\n x = 1\nend\n" + # one identifier differs + block_b = "def bar\n x = 1\nend\n" + result = NDB.analyze([{"a.ex", block_a <> "\n\n" <> block_b}], []) + near_dup_total = Enum.sum(for d <- 0..8, do: result["near_dup_block_d#{d}"]) + assert near_dup_total >= 1 + end + + test "cross-file detection: same block in two files" do + block = "def foo\n x = 1\nend\n" + result = NDB.analyze([{"a.ex", block}, {"b.ex", block}], []) + assert result["near_dup_block_d0"] >= 1 + end + + test "returns only count keys (no pairs keys)" do + result = NDB.analyze([{"a.ex", "x = 1\n"}], []) + refute Enum.any?(Map.keys(result), &String.ends_with?(&1, "_pairs")) + end + + test "find_pairs/2 with include_pairs option returns pair data" do + block = "def foo\n x = 1\nend\n" + result = NDB.analyze([{"a.ex", block <> "\n\n" <> block}], include_pairs: true) + pairs_keys = Map.keys(result) |> Enum.filter(&String.ends_with?(&1, "_pairs")) + assert pairs_keys != [] + end + + test "pair sources include 
file:line format" do + block = "def foo\n x = 1\nend\n" + result = NDB.analyze([{"a.ex", block <> "\n\n" <> block}], include_pairs: true) + pairs = result["near_dup_block_d0_pairs"] + assert pairs != [] + [first | _] = pairs + assert first["source_a"] =~ ~r/a\.ex:\d+/ + assert first["source_b"] =~ ~r/a\.ex:\d+/ + end + end +end diff --git a/test/codeqa/metrics/file/rfc_test.exs b/test/codeqa/metrics/file/rfc_test.exs new file mode 100644 index 00000000..19716f3a --- /dev/null +++ b/test/codeqa/metrics/file/rfc_test.exs @@ -0,0 +1,97 @@ +defmodule CodeQA.Metrics.File.RFCTest do + use ExUnit.Case, async: true + + alias CodeQA.Engine.Pipeline + alias CodeQA.Metrics.File.RFC + + defp ctx(code), do: Pipeline.build_file_context(code) + defp result(code), do: RFC.analyze(ctx(code)) + + describe "analyze/1" do + test "returns zero counts for empty content" do + r = result("") + assert r["rfc_count"] == 0 + assert r["rfc_density"] == 0.0 + end + + test "counts function definitions with no calls" do + code = """ + def foo do + 1 + end + """ + + r = result(code) + assert r["function_def_count"] == 1 + assert r["distinct_call_count"] == 0 + assert r["rfc_count"] == 1 + end + + test "counts distinct call targets" do + code = """ + def foo do + bar() + baz() + bar() + end + """ + + r = result(code) + # bar and baz are distinct call targets (bar appears twice but counts once) + assert r["distinct_call_count"] == 2 + assert r["function_def_count"] == 1 + assert r["rfc_count"] == 3 + end + + test "rfc_density is rfc_count normalized by line count" do + code = """ + def foo do + bar() + baz() + end + """ + + c = ctx(code) + r = RFC.analyze(c) + assert r["rfc_density"] == Float.round(r["rfc_count"] / c.line_count, 4) + end + + test "file with no functions and no calls returns all zeros" do + r = result("x = 1\ny = 2") + assert r["rfc_count"] == 0 + assert r["function_def_count"] == 0 + assert r["distinct_call_count"] == 0 + end + + test "file with only calls and no function 
definitions" do + code = "foo()\nbar()\nbaz()" + r = result(code) + assert r["function_def_count"] == 0 + assert r["distinct_call_count"] == 3 + assert r["rfc_count"] == 3 + end + + test "duplicate calls are deduplicated" do + code = "foo()\nfoo()\nfoo()" + r = result(code) + assert r["distinct_call_count"] == 1 + end + + test "multiple function definitions are counted" do + code = """ + def foo do + bar() + end + + def baz do + qux() + end + """ + + r = result(code) + assert r["function_def_count"] == 2 + assert r["distinct_call_count"] == 2 + assert r["rfc_count"] == 4 + end + end +end diff --git a/test/codeqa/metrics/file/separator_counts_test.exs b/test/codeqa/metrics/file/separator_counts_test.exs new file mode 100644 index 00000000..2ac2a063 --- /dev/null +++ b/test/codeqa/metrics/file/separator_counts_test.exs @@ -0,0 +1,54 @@ +defmodule CodeQA.Metrics.File.SeparatorCountsTest do + use ExUnit.Case, async: true + + alias CodeQA.Metrics.File.SeparatorCounts + + describe "name/0" do + test "returns separator_counts" do + assert SeparatorCounts.name() == "separator_counts" + end + end + + describe "keys/0" do + test "returns four count keys" do + assert SeparatorCounts.keys() == [ + "underscore_count", + "hyphen_count", + "slash_count", + "dot_count" + ] + end + end + + describe "analyze/1" do + test "counts separators in source code" do + content = "def my_func(a_b) do\n File.read(\"path/to/file.txt\")\nend" + + result = SeparatorCounts.analyze(%{content: content}) + + assert result["underscore_count"] == 2 + assert result["slash_count"] == 2 + assert result["dot_count"] == 2 + assert result["hyphen_count"] == 0 + end + + test "counts hyphens" do + content = "some-component {\n background-color: red;\n}" + + result = SeparatorCounts.analyze(%{content: content}) + + assert result["hyphen_count"] == 2 + end + + test "returns zeros for empty content" do + result = SeparatorCounts.analyze(%{content: ""}) + + assert result == %{ + "underscore_count" => 0, + 
"hyphen_count" => 0, + "slash_count" => 0, + "dot_count" => 0 + } + end + end +end diff --git a/test/fixtures/sample.ex b/test/fixtures/sample.ex index 16d90fc4..625d9cb2 100644 --- a/test/fixtures/sample.ex +++ b/test/fixtures/sample.ex @@ -1,4 +1,5 @@ defmodule Sample do + @moduledoc false def hello do IO.puts("Hello, world!") end diff --git a/test/support/counter_signal.ex b/test/support/counter_signal.ex new file mode 100644 index 00000000..7ffb5d81 --- /dev/null +++ b/test/support/counter_signal.ex @@ -0,0 +1,19 @@ +defmodule CodeQA.Support.CounterSignal do + @moduledoc false + defstruct [] +end + +defimpl CodeQA.AST.Parsing.Signal, for: CodeQA.Support.CounterSignal do + def source(_), do: CodeQA.Support.CounterSignal + def group(_), do: :test + def init(_, _), do: %{idx: 0} + + def emit(_, {_prev, token, _next}, %{idx: i} = state) do + emissions = + if token.kind == "", + do: MapSet.new([{:id_seen, i}]), + else: MapSet.new() + + {emissions, %{state | idx: i + 1}} + end +end diff --git a/test/support/fixtures/cpp/observer_pattern.ex b/test/support/fixtures/cpp/observer_pattern.ex new file mode 100644 index 00000000..b536d358 --- /dev/null +++ b/test/support/fixtures/cpp/observer_pattern.ex @@ -0,0 +1,71 @@ +defmodule Test.Fixtures.Cpp.ObserverPattern do + @moduledoc false + use Test.LanguageFixture, language: "cpp observer_pattern" + + @code ~S''' + #include + #include + + template + class Observer { + public: + virtual void onEvent(const Event& event) = 0; + + virtual ~Observer() = default; + }; + + template + class Subject { + std::vector*> observers; + + public: + void attach(Observer* observer) { observers.push_back(observer); } + + void detach(Observer* observer) { + observers.erase( + std::remove(observers.begin(), observers.end(), observer), + observers.end() + ); + } + + void notify(const Event& event) { + for (auto* obs : observers) obs->onEvent(event); + } + }; + + struct StockEvent { + std::string symbol; + double price; + double previousPrice; + + 
double change() const { return price - previousPrice; } + + double changePercent() const { return previousPrice > 0 ? change() / previousPrice * 100.0 : 0.0; } + }; + + class StockTicker : public Subject { + std::map prices; + + public: + void updatePrice(const std::string& symbol, double newPrice) { + double prev = prices.count(symbol) ? prices[symbol] : newPrice; + prices[symbol] = newPrice; + notify(StockEvent{symbol, newPrice, prev}); + } + + double getPrice(const std::string& symbol) const { + auto it = prices.find(symbol); + return it != prices.end() ? it->second : 0.0; + } + }; + + class AlertObserver : public Observer { + double threshold; + + public: + explicit AlertObserver(double threshold) : threshold(threshold) {} + + void onEvent(const StockEvent& event) override {} + }; + ''' +end diff --git a/test/support/fixtures/cpp/smart_pointer.ex b/test/support/fixtures/cpp/smart_pointer.ex new file mode 100644 index 00000000..6e91c9b9 --- /dev/null +++ b/test/support/fixtures/cpp/smart_pointer.ex @@ -0,0 +1,87 @@ +defmodule Test.Fixtures.Cpp.SmartPointer do + @moduledoc false + use Test.LanguageFixture, language: "cpp smart_pointer" + + @code ~S''' + #include + #include + + template + class UniquePtr { + T* ptr; + std::function deleter; + + public: + explicit UniquePtr(T* p = nullptr, std::function d = std::default_delete()) + : ptr(p), deleter(d) {} + + ~UniquePtr() { if (ptr) deleter(ptr); } + + UniquePtr(const UniquePtr&) = delete; + + UniquePtr& operator=(const UniquePtr&) = delete; + + UniquePtr(UniquePtr&& other) noexcept : ptr(other.ptr), deleter(std::move(other.deleter)) { other.ptr = nullptr; } + + UniquePtr& operator=(UniquePtr&& other) noexcept { + if (this != &other) { if (ptr) deleter(ptr); ptr = other.ptr; other.ptr = nullptr; } + return *this; + } + + T* get() const { return ptr; } + + T& operator*() const { return *ptr; } + + T* operator->() const { return ptr; } + + explicit operator bool() const { return ptr != nullptr; } + + T* release() { 
T* p = ptr; ptr = nullptr; return p; } + + void reset(T* p = nullptr) { if (ptr) deleter(ptr); ptr = p; } + }; + + template + struct SharedControl { + T* ptr; + int refCount; + + SharedControl(T* p) : ptr(p), refCount(1) {} + + ~SharedControl() { delete ptr; } + }; + + template + class SharedPtr { + SharedControl* ctrl; + + public: + explicit SharedPtr(T* p = nullptr) : ctrl(p ? new SharedControl(p) : nullptr) {} + + SharedPtr(const SharedPtr& other) : ctrl(other.ctrl) { if (ctrl) ++ctrl->refCount; } + + SharedPtr& operator=(const SharedPtr& other) { + if (this != &other) { release(); ctrl = other.ctrl; if (ctrl) ++ctrl->refCount; } + return *this; + } + + ~SharedPtr() { release(); } + + T* get() const { return ctrl ? ctrl->ptr : nullptr; } + + T& operator*() const { return *ctrl->ptr; } + + T* operator->() const { return ctrl->ptr; } + + int useCount() const { return ctrl ? ctrl->refCount : 0; } + + private: + void release() { if (ctrl && --ctrl->refCount == 0) { delete ctrl; ctrl = nullptr; } } + }; + + template + UniquePtr makeUnique(Args&&... 
args) { + return UniquePtr(new T(std::forward(args)...)); + } + ''' +end diff --git a/test/support/fixtures/cpp/template_container.ex b/test/support/fixtures/cpp/template_container.ex new file mode 100644 index 00000000..6ff7bdb1 --- /dev/null +++ b/test/support/fixtures/cpp/template_container.ex @@ -0,0 +1,90 @@ +defmodule Test.Fixtures.Cpp.TemplateContainer do + @moduledoc false + use Test.LanguageFixture, language: "cpp template_container" + + @code ~S''' + #include + + template + class Stack { + T* data; + int capacity; + int topIdx; + + public: + explicit Stack(int cap = 16) : capacity(cap), topIdx(-1) { data = new T[cap]; } + + ~Stack() { delete[] data; } + + Stack(const Stack&) = delete; + + Stack& operator=(const Stack&) = delete; + + void push(const T& value) { + if (topIdx + 1 >= capacity) throw std::overflow_error("Stack overflow"); + data[++topIdx] = value; + } + + T pop() { + if (empty()) throw std::underflow_error("Stack underflow"); + return data[topIdx--]; + } + + T& top() { + if (empty()) throw std::underflow_error("Stack is empty"); + return data[topIdx]; + } + + bool empty() const { return topIdx < 0; } + + int size() const { return topIdx + 1; } + + int maxCapacity() const { return capacity; } + }; + + template + class Queue { + T* data; + int capacity; + int head; + int tail; + int count; + + public: + explicit Queue(int cap = 16) : capacity(cap), head(0), tail(0), count(0) { data = new T[cap]; } + + ~Queue() { delete[] data; } + + void enqueue(const T& value) { + if (count >= capacity) throw std::overflow_error("Queue overflow"); + data[tail] = value; + tail = (tail + 1) % capacity; + ++count; + } + + T dequeue() { + if (empty()) throw std::underflow_error("Queue underflow"); + T value = data[head]; + head = (head + 1) % capacity; + --count; + return value; + } + + T& front() { if (empty()) throw std::underflow_error("Queue is empty"); return data[head]; } + + bool empty() const { return count == 0; } + + int size() const { return count; } + 
}; + + template + struct Pair { + T first; + T second; + + Pair(T a, T b) : first(a), second(b) {} + + bool operator==(const Pair& other) const { return first == other.first && second == other.second; } + }; + ''' +end diff --git a/test/support/fixtures/csharp/async_task_manager.ex b/test/support/fixtures/csharp/async_task_manager.ex new file mode 100644 index 00000000..6dd9db7f --- /dev/null +++ b/test/support/fixtures/csharp/async_task_manager.ex @@ -0,0 +1,67 @@ +defmodule Test.Fixtures.CSharp.AsyncTaskManager do + @moduledoc false + use Test.LanguageFixture, language: "csharp async_task_manager" + + @code ~S''' + // TaskManagement namespace — async task scheduling with bounded concurrency + using System.Threading.Tasks; + using System.Collections.Generic; + + interface ITaskScheduler + { + Task ScheduleAsync(System.Func work, System.Threading.CancellationToken ct); + Task ScheduleAsync(System.Func> work, System.Threading.CancellationToken ct); + } + + interface IWorkQueue + { + void Enqueue(System.Func work); + Task DrainAsync(System.Threading.CancellationToken ct); + int Count { get; } + } + + class BoundedTaskScheduler : ITaskScheduler + { + private readonly System.Threading.SemaphoreSlim semaphore; + + public BoundedTaskScheduler(int maxConcurrency) + { + semaphore = new System.Threading.SemaphoreSlim(maxConcurrency, maxConcurrency); + } + + public async Task ScheduleAsync(System.Func work, System.Threading.CancellationToken ct) + { + await semaphore.WaitAsync(ct); + try { await work(); } + finally { semaphore.Release(); } + } + + public async Task ScheduleAsync(System.Func> work, System.Threading.CancellationToken ct) + { + await semaphore.WaitAsync(ct); + try { return await work(); } + finally { semaphore.Release(); } + } + } + + class InMemoryWorkQueue : IWorkQueue + { + private readonly Queue> queue = new Queue>(); + + public void Enqueue(System.Func work) { queue.Enqueue(work); } + + public int Count => queue.Count; + + public async Task 
DrainAsync(System.Threading.CancellationToken ct) + { + while (queue.Count > 0 && !ct.IsCancellationRequested) + { + var work = queue.Dequeue(); + await work(); + } + } + } + + enum TaskState { Pending, Running, Completed, Failed, Cancelled } + ''' +end diff --git a/test/support/fixtures/csharp/linq_pipeline.ex b/test/support/fixtures/csharp/linq_pipeline.ex new file mode 100644 index 00000000..677c6eb9 --- /dev/null +++ b/test/support/fixtures/csharp/linq_pipeline.ex @@ -0,0 +1,71 @@ +defmodule Test.Fixtures.CSharp.LinqPipeline do + @moduledoc false + use Test.LanguageFixture, language: "csharp linq_pipeline" + + @code ~S''' + // DataPipeline namespace — LINQ-style transformation pipeline + using System.Collections.Generic; + using System.Linq; + + interface ITransform + { + IEnumerable Apply(IEnumerable input); + } + + interface IPipeline + { + IPipeline Pipe(ITransform transform); + IEnumerable Execute(); + } + + class FilterTransform : ITransform + { + private readonly System.Func predicate; + + public FilterTransform(System.Func predicate) + { + this.predicate = predicate; + } + + public IEnumerable Apply(IEnumerable input) + { + return input.Where(predicate); + } + } + + class MapTransform : ITransform + { + private readonly System.Func selector; + + public MapTransform(System.Func selector) + { + this.selector = selector; + } + + public IEnumerable Apply(IEnumerable input) + { + return input.Select(selector); + } + } + + class DataPipeline : IPipeline + { + private readonly IEnumerable source; + + public DataPipeline(IEnumerable source) + { + this.source = source; + } + + public IPipeline Pipe(ITransform transform) + { + return new DataPipeline(transform.Apply(source)); + } + + public IEnumerable Execute() + { + return source.ToList(); + } + } + ''' +end diff --git a/test/support/fixtures/csharp/plugin_system.ex b/test/support/fixtures/csharp/plugin_system.ex new file mode 100644 index 00000000..ebf7e7fd --- /dev/null +++ 
b/test/support/fixtures/csharp/plugin_system.ex @@ -0,0 +1,72 @@ +defmodule Test.Fixtures.CSharp.PluginSystem do + @moduledoc false + use Test.LanguageFixture, language: "csharp plugin_system" + + @code ~S''' + // PluginSystem namespace — plugin registry with lifecycle management + using System.Collections.Generic; + + interface IPlugin + { + string Name { get; } + string Version { get; } + void Initialize(IPluginContext context); + void Shutdown(); + } + + interface IPluginContext + { + void RegisterService(T service) where T : class; + T ResolveService() where T : class; + void Log(string message); + } + + interface IPluginRegistry + { + void Register(IPlugin plugin); + void Unregister(string name); + IPlugin Find(string name); + IEnumerable All(); + } + + class PluginContext : IPluginContext + { + private readonly Dictionary services = new Dictionary(); + + public void RegisterService(T service) where T : class { services[typeof(T)] = service; } + + public T ResolveService() where T : class + { + if (services.TryGetValue(typeof(T), out var svc)) return (T)svc; + throw new System.InvalidOperationException("Service not found: " + typeof(T).Name); + } + + public void Log(string message) { System.Console.WriteLine("[Plugin] " + message); } + } + + class PluginRegistry : IPluginRegistry + { + private readonly Dictionary plugins = new Dictionary(); + private readonly IPluginContext context; + + public PluginRegistry(IPluginContext context) { this.context = context; } + + public void Register(IPlugin plugin) + { + plugin.Initialize(context); + plugins[plugin.Name] = plugin; + } + + public void Unregister(string name) + { + if (plugins.TryGetValue(name, out var plugin)) { plugin.Shutdown(); plugins.Remove(name); } + } + + public IPlugin Find(string name) { plugins.TryGetValue(name, out var p); return p; } + + public IEnumerable All() { return plugins.Values; } + } + + enum PluginState { Unloaded, Initializing, Active, ShuttingDown } + ''' +end diff --git 
a/test/support/fixtures/dart/futures_async.ex b/test/support/fixtures/dart/futures_async.ex new file mode 100644 index 00000000..ff5317a6 --- /dev/null +++ b/test/support/fixtures/dart/futures_async.ex @@ -0,0 +1,78 @@ +defmodule Test.Fixtures.Dart.FuturesAsync do + @moduledoc false + use Test.LanguageFixture, language: "dart futures_async" + + @code ~S''' + abstract class AsyncTask { + Future execute(); + + void cancel(); + + bool get isCancelled; + } + + abstract class TaskScheduler { + Future schedule(AsyncTask task); + + Future> scheduleAll(List> tasks); + + void shutdown(); + } + + class RetryPolicy { + final int maxAttempts; + final Duration delay; + final double backoffMultiplier; + + const RetryPolicy({ + this.maxAttempts = 3, + this.delay = const Duration(milliseconds: 500), + this.backoffMultiplier = 2.0, + }); + + Duration delayForAttempt(int attempt) { + final ms = delay.inMilliseconds * (backoffMultiplier * attempt).ceil(); + return Duration(milliseconds: ms); + } + } + + class SimpleTaskScheduler implements TaskScheduler { + bool _shutdown = false; + final List> _pending = []; + + Future schedule(AsyncTask task) async { + if (_shutdown) throw StateError("Scheduler is shut down"); + final future = task.execute(); + _pending.add(future); + return future; + } + + Future> scheduleAll(List> tasks) { + return Future.wait(tasks.map((t) => schedule(t)).toList()); + } + + void shutdown() { + _shutdown = true; + _pending.clear(); + } + } + + enum TaskStatus { + pending, + running, + completed, + failed, + cancelled + } + + class TaskResult { + final T? value; + final Object? 
error; + final TaskStatus status; + + const TaskResult.success(this.value) : error = null, status = TaskStatus.completed; + + const TaskResult.failure(this.error) : value = null, status = TaskStatus.failed; + } + ''' +end diff --git a/test/support/fixtures/dart/mixin_composition.ex b/test/support/fixtures/dart/mixin_composition.ex new file mode 100644 index 00000000..05013b41 --- /dev/null +++ b/test/support/fixtures/dart/mixin_composition.ex @@ -0,0 +1,85 @@ +defmodule Test.Fixtures.Dart.MixinComposition do + @moduledoc false + use Test.LanguageFixture, language: "dart mixin_composition" + + @code ~S''' + abstract class Serializable { + Map toJson(); + + String toJsonString() { + final map = toJson(); + return map.entries.map((e) => '"${e.key}": "${e.value}"').join(', '); + } + } + + abstract class Validatable { + List validate(); + + bool get isValid => validate().isEmpty; + + void assertValid() { + final errors = validate(); + if (errors.isNotEmpty) throw ArgumentError(errors.join(', ')); + } + } + + abstract class Equatable { + List get props; + + bool equalsTo(Object other) { + if (identical(this, other)) return true; + if (other.runtimeType != runtimeType) return false; + final otherEquatable = other as Equatable; + for (int i = 0; i < props.length; i++) { + if (props[i] != otherEquatable.props[i]) return false; + } + return true; + } + } + + class Address extends Serializable implements Validatable { + final String street; + final String city; + final String country; + + Address({required this.street, required this.city, required this.country}); + + Map toJson() => {'street': street, 'city': city, 'country': country}; + + List validate() { + final errors = []; + if (street.isEmpty) errors.add('street is required'); + if (city.isEmpty) errors.add('city is required'); + if (country.isEmpty) errors.add('country is required'); + return errors; + } + + List get props => [street, city, country]; + } + + enum AddressType { + home, + work, + billing, + shipping + } 
+ + class Contact extends Serializable implements Validatable { + final String name; + final String email; + final Address address; + + Contact({required this.name, required this.email, required this.address}); + + Map toJson() => {'name': name, 'email': email, 'address': address.toJson()}; + + List validate() { + final errors = []; + if (name.isEmpty) errors.add('name is required'); + if (!email.contains('@')) errors.add('invalid email'); + errors.addAll(address.validate()); + return errors; + } + } + ''' +end diff --git a/test/support/fixtures/dart/widget_state.ex b/test/support/fixtures/dart/widget_state.ex new file mode 100644 index 00000000..d4a1b048 --- /dev/null +++ b/test/support/fixtures/dart/widget_state.ex @@ -0,0 +1,91 @@ +defmodule Test.Fixtures.Dart.WidgetState do + @moduledoc false + use Test.LanguageFixture, language: "dart widget_state" + + @code ~S''' + abstract class Widget { + String get key; + + Element createElement(); + } + + abstract class StatefulWidget extends Widget { + State createState(); + } + + abstract class State { + T widget; + + State(this.widget); + + void setState(void Function() fn) { + fn(); + markNeedsBuild(); + } + + void markNeedsBuild() {} + + Widget build(); + + void initState() {} + + void dispose() {} + } + + class Element { + Widget widget; + State? state; + + Element(this.widget); + + void mount() { + if (widget is StatefulWidget) { + state = (widget as StatefulWidget).createState(); + state!.initState(); + } + } + + void unmount() { + state?.dispose(); + } + } + + abstract class BuildContext { + Widget get widget; + + Element get element; + } + + enum WidgetLifecycle { + created, + mounted, + active, + inactive, + disposed + } + + class RenderObject { + double x = 0; + double y = 0; + double width = 0; + double height = 0; + bool needsLayout = true; + bool needsPaint = true; + RenderObject? 
parent; + List children = []; + + void layout() { + needsLayout = false; + } + + void paint() { + needsPaint = false; + } + + void addChild(RenderObject child) { + children.add(child); + child.parent = this; + } + } + ''' +end diff --git a/test/support/fixtures/elixir/calculator.ex b/test/support/fixtures/elixir/calculator.ex new file mode 100644 index 00000000..7657b88b --- /dev/null +++ b/test/support/fixtures/elixir/calculator.ex @@ -0,0 +1,125 @@ +defmodule Test.Fixtures.Elixir.Calculator do + @moduledoc false + use Test.LanguageFixture, language: "elixir calculator" + import Test.NodeMatcher + + @code ~S''' + defmodule Calculator.Behaviour do + @moduledoc "Contract for all calculator implementations." + @callback add(number, number) :: number + @callback subtract(number, number) :: number + @callback multiply(number, number) :: number + @callback divide(number, number) :: {:ok, float} | {:error, :division_by_zero} + end + + defprotocol Calculator.Displayable do + @doc "Converts a result to a human-readable string." + def display(value) + end + + defmodule Calculator.Basic do + @moduledoc "Basic arithmetic calculator." + @behaviour Calculator.Behaviour + + @doc "Adds two numbers." + @spec add(number, number) :: number + def add(a, b), do: a + b + + @doc "Subtracts b from a." + @spec subtract(number, number) :: number + def subtract(a, b), do: a - b + + @doc "Multiplies two numbers." + @spec multiply(number, number) :: number + def multiply(a, b), do: a * b + + @doc "Divides a by b, returns error for zero divisor." + @spec divide(number, number) :: {:ok, float} | {:error, :division_by_zero} + def divide(_a, 0), do: {:error, :division_by_zero} + def divide(a, b), do: {:ok, a / b} + + @doc "Absolute value of n." 
+ @spec abs_val(number) :: number + def abs_val(n) when n < 0, do: -n + def abs_val(n), do: n + end + + defimpl Calculator.Displayable, for: Integer do + def display(value), do: Integer.to_string(value) + end + + defimpl Calculator.Displayable, for: Float do + def display(value), do: :erlang.float_to_binary(value, [decimals: 4]) + end + + defmodule Calculator.Scientific do + @moduledoc "Scientific calculator with extended math operations." + @behaviour Calculator.Behaviour + + @doc "Adds two numbers." + @spec add(number, number) :: number + def add(a, b), do: a + b + + @doc "Subtracts b from a." + @spec subtract(number, number) :: number + def subtract(a, b), do: a - b + + @doc "Multiplies two numbers." + @spec multiply(number, number) :: number + def multiply(a, b), do: a * b + + @doc "Divides, returning an error on zero divisor." + @spec divide(number, number) :: {:ok, float} | {:error, :division_by_zero} + def divide(_a, 0), do: {:error, :division_by_zero} + def divide(a, b), do: {:ok, a / b} + + @doc "Raises a to the power of b." + @spec power(number, number) :: number + def power(a, b), do: :math.pow(a, b) + + @doc "Returns the square root or an error for negative input." + @spec sqrt(number) :: {:ok, float} | {:error, :negative_input} + def sqrt(n) when n < 0, do: {:error, :negative_input} + def sqrt(n), do: {:ok, :math.sqrt(n)} + + @doc "Natural logarithm, error for non-positive input." + @spec log(number) :: {:ok, float} | {:error, :non_positive_input} + def log(n) when n <= 0, do: {:error, :non_positive_input} + def log(n), do: {:ok, :math.log(n)} + + defp validate_positive(n) when n > 0, do: {:ok, n} + defp validate_positive(_n), do: {:error, :non_positive_input} + end + + defmodule Calculator.History do + @moduledoc "Tracks a history of calculator operations." + @type entry :: {atom, list} + @type t :: list + + @doc "Creates an empty history." + @spec new() :: t + def new(), do: [] + + @doc "Records an operation entry." 
+ @spec record(t, atom, list) :: t + def record(history, op, args) when is_list(args), do: [{op, args} | history] + + @doc "Returns the last n entries." + @spec last(t, non_neg_integer) :: t + def last(history, n \\ 5), do: Enum.take(history, n) + + @doc "Clears the history." + @spec clear(t) :: t + def clear(_history), do: [] + + defp format_entry({op, args}), do: "#{op}(#{Enum.join(args, ", ")})" + end + ''' + + @block_assertions [ + %{ + description: "a compound block containing add with doc and spec annotations", + all_of: [exact(:content, "add"), exact(:content, "doc"), exact(:content, "spec")] + } + ] +end diff --git a/test/support/fixtures/elixir/event_bus.ex b/test/support/fixtures/elixir/event_bus.ex new file mode 100644 index 00000000..e196e099 --- /dev/null +++ b/test/support/fixtures/elixir/event_bus.ex @@ -0,0 +1,71 @@ +defmodule Test.Fixtures.Elixir.EventBus do + @moduledoc false + use Test.LanguageFixture, language: "elixir event_bus" + + @code ~S''' + defmodule EventBus.Behaviour do + @moduledoc "Contract for event bus implementations." + @callback subscribe(topic :: String.t(), pid :: pid()) :: :ok | {:error, term()} + @callback unsubscribe(topic :: String.t(), pid :: pid()) :: :ok + @callback publish(topic :: String.t(), event :: term()) :: :ok + @callback topics() :: [String.t()] + end + + defprotocol EventBus.Serializable do + @doc "Encodes an event to a binary payload." + @spec encode(t()) :: binary() + def encode(event) + + @doc "Decodes a binary payload back to an event." + @spec decode(t(), binary()) :: term() + def decode(schema, payload) + end + + defmodule EventBus.Topic do + @moduledoc "Represents a named event topic with subscriber tracking." + @enforce_keys [:name] + defstruct [:name, subscribers: []] + + @doc "Creates a new topic." + @spec new(String.t()) :: t() + def new(name) when is_binary(name), do: %__MODULE__{name: name} + + @doc "Adds a subscriber pid to the topic." 
+ @spec add_subscriber(t(), pid()) :: t() + def add_subscriber(%__MODULE__{subscribers: subs} = topic, pid) do + %{topic | subscribers: [pid | subs]} + end + + @doc "Removes a subscriber pid from the topic." + @spec remove_subscriber(t(), pid()) :: t() + def remove_subscriber(%__MODULE__{subscribers: subs} = topic, pid) do + %{topic | subscribers: List.delete(subs, pid)} + end + + @doc "Returns all current subscribers." + @spec subscribers(t()) :: [pid()] + def subscribers(%__MODULE__{subscribers: subs}), do: subs + end + + defmodule EventBus.Dispatcher do + @moduledoc "Dispatches events to all topic subscribers." + + @doc "Broadcasts an event to every subscriber of the given topic." + @spec broadcast(EventBus.Topic.t(), term()) :: :ok + def broadcast(%EventBus.Topic{} = topic, event) do + topic + |> EventBus.Topic.subscribers() + |> Enum.each(&send(&1, {:event, topic.name, event})) + end + + @doc "Dispatches to subscribers matching a predicate." + @spec dispatch_filtered(EventBus.Topic.t(), term(), (pid() -> boolean())) :: :ok + def dispatch_filtered(%EventBus.Topic{} = topic, event, filter_fn) do + topic + |> EventBus.Topic.subscribers() + |> Enum.filter(filter_fn) + |> Enum.each(&send(&1, {:event, topic.name, event})) + end + end + ''' +end diff --git a/test/support/fixtures/elixir/rate_limiter.ex b/test/support/fixtures/elixir/rate_limiter.ex new file mode 100644 index 00000000..580a2b4b --- /dev/null +++ b/test/support/fixtures/elixir/rate_limiter.ex @@ -0,0 +1,85 @@ +defmodule Test.Fixtures.Elixir.RateLimiter do + @moduledoc false + use Test.LanguageFixture, language: "elixir rate_limiter" + + @code ~S''' + defmodule RateLimiter.Behaviour do + @moduledoc "Contract for rate limiter backends." 
+ @callback allow?(key :: term(), cost :: pos_integer()) :: boolean() + @callback reset(key :: term()) :: :ok + @callback stats(key :: term()) :: {:ok, map()} | {:error, :not_found} + end + + defmodule RateLimiter.Bucket do + @moduledoc "Token bucket state for a single rate-limited key." + @enforce_keys [:capacity, :tokens, :refill_rate] + defstruct [:capacity, :tokens, :refill_rate, last_refill: nil] + + @doc "Creates a new bucket with full capacity." + @spec new(pos_integer(), pos_integer()) :: t() + def new(capacity, refill_rate) when capacity > 0 and refill_rate > 0 do + %__MODULE__{capacity: capacity, tokens: capacity, refill_rate: refill_rate, last_refill: System.monotonic_time(:millisecond)} + end + + @doc "Consumes tokens from the bucket. Returns updated bucket or error." + @spec consume(t(), pos_integer()) :: {:ok, t()} | {:error, :rate_limited} + def consume(%__MODULE__{tokens: tokens} = bucket, cost) when tokens >= cost do + {:ok, %{bucket | tokens: tokens - cost}} + end + def consume(%__MODULE__{}, _cost), do: {:error, :rate_limited} + + @doc "Refills the bucket based on elapsed time." + @spec refill(t()) :: t() + def refill(%__MODULE__{tokens: t, capacity: cap, refill_rate: rate, last_refill: last} = bucket) do + now = System.monotonic_time(:millisecond) + elapsed_ms = now - last + new_tokens = min(cap, t + div(elapsed_ms * rate, 1000)) + %{bucket | tokens: new_tokens, last_refill: now} + end + end + + defmodule RateLimiter.Server do + @moduledoc "GenServer-backed rate limiter with configurable buckets." + @behaviour RateLimiter.Behaviour + use GenServer + + @doc "Starts the rate limiter server." 
+ @spec start_link(keyword()) :: GenServer.on_start() + def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__) + + @impl RateLimiter.Behaviour + @spec allow?(term(), pos_integer()) :: boolean() + def allow?(key, cost \\ 1), do: GenServer.call(__MODULE__, {:allow, key, cost}) + + @impl RateLimiter.Behaviour + @spec reset(term()) :: :ok + def reset(key), do: GenServer.cast(__MODULE__, {:reset, key}) + + @impl RateLimiter.Behaviour + @spec stats(term()) :: {:ok, map()} | {:error, :not_found} + def stats(key), do: GenServer.call(__MODULE__, {:stats, key}) + + @impl GenServer + def init(opts) do + capacity = Keyword.get(opts, :capacity, 100) + refill_rate = Keyword.get(opts, :refill_rate, 10) + {:ok, %{buckets: %{}, capacity: capacity, refill_rate: refill_rate}} + end + + @impl GenServer + def handle_call({:allow, key, cost}, _from, state) do + bucket = Map.get_lazy(state.buckets, key, fn -> RateLimiter.Bucket.new(state.capacity, state.refill_rate) end) + bucket = RateLimiter.Bucket.refill(bucket) + case RateLimiter.Bucket.consume(bucket, cost) do + {:ok, updated} -> {:reply, true, %{state | buckets: Map.put(state.buckets, key, updated)}} + {:error, :rate_limited} -> {:reply, false, %{state | buckets: Map.put(state.buckets, key, bucket)}} + end + end + + @impl GenServer + def handle_cast({:reset, key}, state), do: {:noreply, %{state | buckets: Map.delete(state.buckets, key)}} + + defp default_bucket(state), do: RateLimiter.Bucket.new(state.capacity, state.refill_rate) + end + ''' +end diff --git a/test/support/fixtures/go/calculator.ex b/test/support/fixtures/go/calculator.ex new file mode 100644 index 00000000..e55100a1 --- /dev/null +++ b/test/support/fixtures/go/calculator.ex @@ -0,0 +1,53 @@ +defmodule Test.Fixtures.Go.Calculator do + @moduledoc false + use Test.LanguageFixture, language: "go calculator" + + @code ~S''' + func Add(a, b float64) float64 { + return a + b + } + + func Subtract(a, b float64) float64 { + return a - b + } + + 
func Multiply(a, b float64) float64 { + return a * b + } + + func Divide(a, b float64) (float64, error) { + if b == 0 { + return 0, fmt.Errorf("division by zero") + } + return a / b, nil + } + + func Power(base, exp float64) float64 { + return math.Pow(base, exp) + } + + func Sqrt(n float64) (float64, error) { + if n < 0 { + return 0, fmt.Errorf("cannot take sqrt of negative number") + } + return math.Sqrt(n), nil + } + + func Abs(n float64) float64 { + if n < 0 { + return -n + } + return n + } + + func Clamp(n, min, max float64) float64 { + if n < min { + return min + } + if n > max { + return max + } + return n + } + ''' +end diff --git a/test/support/fixtures/go/cli_parser.ex b/test/support/fixtures/go/cli_parser.ex new file mode 100644 index 00000000..c97c14a1 --- /dev/null +++ b/test/support/fixtures/go/cli_parser.ex @@ -0,0 +1,77 @@ +defmodule Test.Fixtures.Go.CliParser do + @moduledoc false + use Test.LanguageFixture, language: "go cli_parser" + + @code ~S''' + type Flag struct { + Name string + Short string + Description string + Required bool + Value interface{} + } + + type Command struct { + Name string + Description string + flags []*Flag + subcommands []*Command + action func(args []string, flags map[string]interface{}) error + } + + func NewCommand(name, description string) *Command { + return &Command{Name: name, Description: description, flags: []*Flag{}, subcommands: []*Command{}} + } + + func (c *Command) AddFlag(name, short, description string, required bool) *Flag { + f := &Flag{Name: name, Short: short, Description: description, Required: required} + c.flags = append(c.flags, f) + return f + } + + func (c *Command) AddSubcommand(sub *Command) *Command { + c.subcommands = append(c.subcommands, sub) + return c + } + + func (c *Command) Action(fn func(args []string, flags map[string]interface{}) error) { + c.action = fn + } + + func (c *Command) Execute(args []string) error { + if len(args) > 0 { + for _, sub := range c.subcommands { + if sub.Name 
== args[0] { + return sub.Execute(args[1:]) + } + } + } + flags, remaining, err := c.parseFlags(args) + if err != nil { + return err + } + if c.action != nil { + return c.action(remaining, flags) + } + return nil + } + + func (c *Command) parseFlags(args []string) (map[string]interface{}, []string, error) { + result := make(map[string]interface{}) + remaining := []string{} + for i := 0; i < len(args); i++ { + arg := args[i] + if len(arg) > 2 && arg[:2] == "--" { + key := arg[2:] + if i+1 < len(args) { + result[key] = args[i+1] + i++ + } + } else { + remaining = append(remaining, arg) + } + } + return result, remaining, nil + } + ''' +end diff --git a/test/support/fixtures/go/http_middleware.ex b/test/support/fixtures/go/http_middleware.ex new file mode 100644 index 00000000..e759c854 --- /dev/null +++ b/test/support/fixtures/go/http_middleware.ex @@ -0,0 +1,86 @@ +defmodule Test.Fixtures.Go.HttpMiddleware do + @moduledoc false + use Test.LanguageFixture, language: "go http_middleware" + + @code ~S''' + type Handler func(w ResponseWriter, r *Request) + + type Middleware func(Handler) Handler + + type ResponseWriter interface { + Write([]byte) (int, error) + WriteHeader(statusCode int) + Header() map[string][]string + } + + type Request struct { + Method string + Path string + Headers map[string]string + Body []byte + } + + type Router struct { + routes map[string]Handler + middlewares []Middleware + } + + func NewRouter() *Router { + return &Router{routes: make(map[string]Handler), middlewares: []Middleware{}} + } + + func (r *Router) Use(m Middleware) { + r.middlewares = append(r.middlewares, m) + } + + func (r *Router) Handle(path string, h Handler) { + r.routes[path] = r.wrap(h) + } + + func (r *Router) ServeHTTP(w ResponseWriter, req *Request) { + h, ok := r.routes[req.Path] + if !ok { + w.WriteHeader(404) + return + } + h(w, req) + } + + func (r *Router) wrap(h Handler) Handler { + for i := len(r.middlewares) - 1; i >= 0; i-- { + h = r.middlewares[i](h) + } + 
return h + } + + func LoggingMiddleware(next Handler) Handler { + return func(w ResponseWriter, r *Request) { + next(w, r) + } + } + + func RecoveryMiddleware(next Handler) Handler { + return func(w ResponseWriter, r *Request) { + defer func() { + if rec := recover(); rec != nil { + w.WriteHeader(500) + } + }() + next(w, r) + } + } + + func AuthMiddleware(secret string) Middleware { + return func(next Handler) Handler { + return func(w ResponseWriter, r *Request) { + token, ok := r.Headers["Authorization"] + if !ok || token != secret { + w.WriteHeader(401) + return + } + next(w, r) + } + } + } + ''' +end diff --git a/test/support/fixtures/java/builder_pattern.ex b/test/support/fixtures/java/builder_pattern.ex new file mode 100644 index 00000000..15cd00d7 --- /dev/null +++ b/test/support/fixtures/java/builder_pattern.ex @@ -0,0 +1,81 @@ +defmodule Test.Fixtures.Java.BuilderPattern do + @moduledoc false + use Test.LanguageFixture, language: "java builder_pattern" + + @code ~S''' + interface Validatable { + boolean isValid(); + String validationError(); + } + + interface Buildable { + T build(); + } + + class Address implements Validatable { + private final String street; + private final String city; + private final String country; + private final String postalCode; + + private Address(Builder b) { + this.street = b.street; + this.city = b.city; + this.country = b.country; + this.postalCode = b.postalCode; + } + + public boolean isValid() { + return street != null && !street.isEmpty() && city != null && country != null; + } + + public String validationError() { + if (street == null || street.isEmpty()) return "street is required"; + if (city == null) return "city is required"; + return null; + } + + public String getStreet() { return street; } + + public String getCity() { return city; } + + public String getCountry() { return country; } + + public String getPostalCode() { return postalCode; } + + public static class Builder implements Buildable
{ + private String street; + private String city; + private String country; + private String postalCode; + + public Builder street(String street) { this.street = street; return this; } + + public Builder city(String city) { this.city = city; return this; } + + public Builder country(String country) { this.country = country; return this; } + + public Builder postalCode(String postalCode) { this.postalCode = postalCode; return this; } + + public Address build() { + Address a = new Address(this); + if (!a.isValid()) throw new IllegalStateException(a.validationError()); + return a; + } + } + } + + enum Country { + US("United States"), + DE("Germany"), + JP("Japan"), + BR("Brazil"); + + private final String displayName; + + Country(String displayName) { this.displayName = displayName; } + + public String getDisplayName() { return displayName; } + } + ''' +end diff --git a/test/support/fixtures/java/repository_pattern.ex b/test/support/fixtures/java/repository_pattern.ex new file mode 100644 index 00000000..487b5260 --- /dev/null +++ b/test/support/fixtures/java/repository_pattern.ex @@ -0,0 +1,76 @@ +defmodule Test.Fixtures.Java.RepositoryPattern do + @moduledoc false + use Test.LanguageFixture, language: "java repository_pattern" + + @code ~S''' + interface Entity { + ID getId(); + } + + interface Repository, ID> { + T findById(ID id); + java.util.List findAll(); + T save(T entity); + void delete(ID id); + boolean exists(ID id); + } + + interface UserRepository extends Repository { + java.util.Optional findByEmail(String email); + java.util.List findByRole(String role); + } + + class User implements Entity { + private Long id; + private String name; + private String email; + private String role; + + public User(Long id, String name, String email, String role) { + this.id = id; + this.name = name; + this.email = email; + this.role = role; + } + + public Long getId() { return id; } + + public String getName() { return name; } + + public String getEmail() { return email; 
} + + public String getRole() { return role; } + } + + class InMemoryUserRepository implements UserRepository { + private final java.util.Map store = new java.util.HashMap<>(); + private long nextId = 1L; + + public User findById(Long id) { return store.get(id); } + + public java.util.List findAll() { return new java.util.ArrayList<>(store.values()); } + + public User save(User user) { + if (user.getId() == null) { + User saved = new User(nextId++, user.getName(), user.getEmail(), user.getRole()); + store.put(saved.getId(), saved); + return saved; + } + store.put(user.getId(), user); + return user; + } + + public void delete(Long id) { store.remove(id); } + + public boolean exists(Long id) { return store.containsKey(id); } + + public java.util.Optional findByEmail(String email) { + return store.values().stream().filter(u -> u.getEmail().equals(email)).findFirst(); + } + + public java.util.List findByRole(String role) { + return store.values().stream().filter(u -> u.getRole().equals(role)).collect(java.util.stream.Collectors.toList()); + } + } + ''' +end diff --git a/test/support/fixtures/java/strategy_pattern.ex b/test/support/fixtures/java/strategy_pattern.ex new file mode 100644 index 00000000..0d129f17 --- /dev/null +++ b/test/support/fixtures/java/strategy_pattern.ex @@ -0,0 +1,79 @@ +defmodule Test.Fixtures.Java.StrategyPattern do + @moduledoc false + use Test.LanguageFixture, language: "java strategy_pattern" + + @code ~S''' + interface PaymentStrategy { + boolean validate(double amount); + String process(double amount, String currency); + String getName(); + } + + interface TransactionLogger { + void log(String strategy, double amount, String result); + } + + class CreditCardStrategy implements PaymentStrategy { + private final String cardNumber; + private final String expiry; + private final String cvv; + + public CreditCardStrategy(String cardNumber, String expiry, String cvv) { + this.cardNumber = cardNumber; + this.expiry = expiry; + this.cvv = cvv; + } 
+ + public boolean validate(double amount) { + return amount > 0 && cardNumber != null && cardNumber.length() == 16; + } + + public String process(double amount, String currency) { + return "Charged " + amount + " " + currency + " to card ending " + cardNumber.substring(12); + } + + public String getName() { return "credit_card"; } + } + + class BankTransferStrategy implements PaymentStrategy { + private final String accountNumber; + private final String routingNumber; + + public BankTransferStrategy(String accountNumber, String routingNumber) { + this.accountNumber = accountNumber; + this.routingNumber = routingNumber; + } + + public boolean validate(double amount) { return amount >= 1.0; } + + public String process(double amount, String currency) { + return "Transferred " + amount + " " + currency + " from account " + accountNumber; + } + + public String getName() { return "bank_transfer"; } + } + + class PaymentProcessor { + private PaymentStrategy strategy; + private final TransactionLogger logger; + + public PaymentProcessor(PaymentStrategy strategy, TransactionLogger logger) { + this.strategy = strategy; + this.logger = logger; + } + + public void setStrategy(PaymentStrategy strategy) { this.strategy = strategy; } + + public String pay(double amount, String currency) { + if (!strategy.validate(amount)) throw new IllegalArgumentException("Invalid payment"); + String result = strategy.process(amount, currency); + logger.log(strategy.getName(), amount, result); + return result; + } + } + + enum PaymentStatus { + PENDING, PROCESSING, COMPLETED, FAILED, REFUNDED + } + ''' +end diff --git a/test/support/fixtures/javascript/calculator.ex b/test/support/fixtures/javascript/calculator.ex new file mode 100644 index 00000000..b6d67a5b --- /dev/null +++ b/test/support/fixtures/javascript/calculator.ex @@ -0,0 +1,51 @@ +defmodule Test.Fixtures.JavaScript.Calculator do + @moduledoc false + use Test.LanguageFixture, language: "javascript calculator" + + @code ~S''' + 
function add(a, b) { + return a + b; + } + + function subtract(a, b) { + return a - b; + } + + function multiply(a, b) { + return a * b; + } + + function divide(a, b) { + if (b === 0) throw new Error("Cannot divide by zero"); + return a / b; + } + + function power(base, exp) { + return Math.pow(base, exp); + } + + function sqrt(n) { + if (n < 0) throw new Error("Cannot take sqrt of negative number"); + return Math.sqrt(n); + } + + function abs(n) { + return Math.abs(n); + } + + function clamp(n, min, max) { + return Math.min(Math.max(n, min), max); + } + + function roundTo(n, decimals) { + var factor = Math.pow(10, decimals); + return Math.round(n * factor) / factor; + } + + function average(values) { + if (values.length === 0) return 0; + var sum = values.reduce(function(acc, v) { return acc + v; }, 0); + return sum / values.length; + } + ''' +end diff --git a/test/support/fixtures/javascript/form_validator.ex b/test/support/fixtures/javascript/form_validator.ex new file mode 100644 index 00000000..017ed520 --- /dev/null +++ b/test/support/fixtures/javascript/form_validator.ex @@ -0,0 +1,134 @@ +defmodule Test.Fixtures.JavaScript.FormValidator do + @moduledoc false + use Test.LanguageFixture, language: "javascript form_validator" + + @code ~S''' + class ValidationError { + constructor(field, message) { + this.field = field; + this.message = message; + } + + toString() { + return `${this.field}: ${this.message}`; + } + } + + class ValidationResult { + constructor() { + this.errors = []; + } + + addError(field, message) { + this.errors.push(new ValidationError(field, message)); + return this; + } + + isValid() { + return this.errors.length === 0; + } + + getErrors(field) { + return this.errors.filter(function(e) { return e.field === field; }); + } + } + + class FieldValidator { + constructor(field, value) { + this.field = field; + this.value = value; + this._rules = []; + } + + required() { + this._rules.push(function(v) { + if (v === null || v === undefined || v 
=== "") { + return "is required"; + } + return null; + }); + return this; + } + + minLength(n) { + this._rules.push(function(v) { + if (typeof v === "string" && v.length < n) { + return "is too short (minimum " + n + " characters)"; + } + return null; + }); + return this; + } + + maxLength(n) { + this._rules.push(function(v) { + if (typeof v === "string" && v.length > n) { + return "is too long (maximum " + n + " characters)"; + } + return null; + }); + return this; + } + + matches(pattern, message) { + this._rules.push(function(v) { + if (typeof v === "string" && !pattern.test(v)) { + return message || "is invalid"; + } + return null; + }); + return this; + } + + validate() { + var errors = []; + for (var i = 0; i < this._rules.length; i++) { + var error = this._rules[i](this.value); + if (error !== null) { + errors.push(error); + } + } + return errors; + } + } + + class FormValidator { + constructor(data) { + this._data = data; + this._fields = []; + } + + field(name) { + var validator = new FieldValidator(name, this._data[name]); + this._fields.push(validator); + return validator; + } + + validate() { + var result = new ValidationResult(); + for (var i = 0; i < this._fields.length; i++) { + var f = this._fields[i]; + var errors = f.validate(); + for (var j = 0; j < errors.length; j++) { + result.addError(f.field, errors[j]); + } + } + return result; + } + } + + function validateEmail(value) { + var pattern = /^[^\s@]+@[^\s@]+\.[^\s@]+$/; + return pattern.test(value); + } + + function validateUrl(value) { + try { + new URL(value); + return true; + } catch (_) { + return false; + } + } + ''' +end diff --git a/test/support/fixtures/javascript/shopping_cart.ex b/test/support/fixtures/javascript/shopping_cart.ex new file mode 100644 index 00000000..e7d8600b --- /dev/null +++ b/test/support/fixtures/javascript/shopping_cart.ex @@ -0,0 +1,100 @@ +defmodule Test.Fixtures.JavaScript.ShoppingCart do + @moduledoc false + use Test.LanguageFixture, language: "javascript 
shopping_cart" + + @code ~S''' + class CartItem { + constructor(id, name, price, quantity) { + this.id = id; + this.name = name; + this.price = price; + this.quantity = quantity; + } + + get subtotal() { + return this.price * this.quantity; + } + + withQuantity(quantity) { + return new CartItem(this.id, this.name, this.price, quantity); + } + } + + class Discount { + constructor(code, type, value) { + this.code = code; + this.type = type; + this.value = value; + } + + apply(subtotal) { + if (this.type === "percent") { + return subtotal * (1 - this.value / 100); + } + if (this.type === "fixed") { + return Math.max(0, subtotal - this.value); + } + return subtotal; + } + } + + class ShoppingCart { + constructor() { + this._items = new Map(); + this._discount = null; + this._listeners = []; + } + + addItem(item) { + var existing = this._items.get(item.id); + if (existing) { + this._items.set(item.id, existing.withQuantity(existing.quantity + item.quantity)); + } else { + this._items.set(item.id, item); + } + this._emit("item:added", item); + return this; + } + + removeItem(id) { + this._items.delete(id); + this._emit("item:removed", { id: id }); + return this; + } + + applyDiscount(discount) { + this._discount = discount; + this._emit("discount:applied", discount); + return this; + } + + get subtotal() { + var total = 0; + this._items.forEach(function(item) { total += item.subtotal; }); + return total; + } + + get total() { + var sub = this.subtotal; + return this._discount ? 
this._discount.apply(sub) : sub; + } + + get itemCount() { + var count = 0; + this._items.forEach(function(item) { count += item.quantity; }); + return count; + } + + on(event, handler) { + this._listeners.push({ event: event, handler: handler }); + return this; + } + + _emit(event, data) { + this._listeners + .filter(function(l) { return l.event === event; }) + .forEach(function(l) { l.handler(data); }); + } + } + ''' +end diff --git a/test/support/fixtures/kotlin/coroutine_flow.ex b/test/support/fixtures/kotlin/coroutine_flow.ex new file mode 100644 index 00000000..efd8f80f --- /dev/null +++ b/test/support/fixtures/kotlin/coroutine_flow.ex @@ -0,0 +1,64 @@ +defmodule Test.Fixtures.Kotlin.CoroutineFlow do + @moduledoc false + use Test.LanguageFixture, language: "kotlin coroutine_flow" + + @code ~S''' + interface FlowCollector { + suspend fun emit(value: T) + } + + interface Flow { + suspend fun collect(collector: FlowCollector) + } + + interface Channel { + suspend fun send(value: T) + suspend fun receive(): T + fun close() + val isClosedForSend: Boolean + } + + class SimpleFlow(private val block: suspend FlowCollector.() -> Unit) : Flow { + override suspend fun collect(collector: FlowCollector) { + collector.block() + } + } + + class TransformFlow( + private val upstream: Flow, + private val transform: suspend (T) -> R + ) : Flow { + override suspend fun collect(collector: FlowCollector) { + upstream.collect(object : FlowCollector { + override suspend fun emit(value: T) { + collector.emit(transform(value)) + } + }) + } + } + + class FilterFlow( + private val upstream: Flow, + private val predicate: suspend (T) -> Boolean + ) : Flow { + override suspend fun collect(collector: FlowCollector) { + upstream.collect(object : FlowCollector { + override suspend fun emit(value: T) { + if (predicate(value)) collector.emit(value) + } + }) + } + } + + class BufferedChannel(private val capacity: Int) : Channel { + private val buffer: ArrayDeque = ArrayDeque() + override val 
isClosedForSend: Boolean get() = false + + override suspend fun send(value: T) { buffer.addLast(value) } + + override suspend fun receive(): T = buffer.removeFirst() + + override fun close() { buffer.clear() } + } + ''' +end diff --git a/test/support/fixtures/kotlin/extension_library.ex b/test/support/fixtures/kotlin/extension_library.ex new file mode 100644 index 00000000..8d0ad7ee --- /dev/null +++ b/test/support/fixtures/kotlin/extension_library.ex @@ -0,0 +1,55 @@ +defmodule Test.Fixtures.Kotlin.ExtensionLibrary do + @moduledoc false + use Test.LanguageFixture, language: "kotlin extension_library" + + @code ~S''' + interface StringValidator { + fun validate(value: String): Boolean + fun errorMessage(): String + } + + interface Transformer { + fun transform(value: T): R + } + + interface Pipeline { + fun pipe(step: Transformer): Pipeline + fun execute(input: T): T + } + + class EmailValidator : StringValidator { + override fun validate(value: String): Boolean = value.contains("@") && value.contains(".") + + override fun errorMessage(): String = "Invalid email format" + } + + class LengthValidator(private val min: Int, private val max: Int) : StringValidator { + override fun validate(value: String): Boolean = value.length in min..max + + override fun errorMessage(): String = "Length must be between $min and $max" + } + + class TrimTransformer : Transformer { + override fun transform(value: String): String = value.trim() + } + + class LowercaseTransformer : Transformer { + override fun transform(value: String): String = value.lowercase() + } + + class StringPipeline : Pipeline { + private val steps: MutableList> = mutableListOf() + + override fun pipe(step: Transformer): Pipeline { + steps.add(step) + return this + } + + override fun execute(input: String): String = steps.fold(input) { acc, step -> step.transform(acc) } + } + + enum class ValidationMode { + STRICT, LENIENT, DISABLED + } + ''' +end diff --git a/test/support/fixtures/kotlin/sealed_state.ex 
b/test/support/fixtures/kotlin/sealed_state.ex new file mode 100644 index 00000000..fd0b1fa7 --- /dev/null +++ b/test/support/fixtures/kotlin/sealed_state.ex @@ -0,0 +1,63 @@ +defmodule Test.Fixtures.Kotlin.SealedState do + @moduledoc false + use Test.LanguageFixture, language: "kotlin sealed_state" + + @code ~S''' + interface Action + + interface State + + interface Reducer { + fun reduce(state: S, action: A): S + } + + class ScreenState { + class Loading : ScreenState() + class Success(val data: List) : ScreenState() + class Error(val message: String, val cause: Throwable?) : ScreenState() + class Empty : ScreenState() + } + + class ScreenAction { + class Load : ScreenAction() + class LoadSuccess(val data: List) : ScreenAction() + class LoadError(val message: String, val cause: Throwable?) : ScreenAction() + class Refresh : ScreenAction() + class Clear : ScreenAction() + } + + class ScreenReducer : Reducer { + override fun reduce(state: ScreenState, action: ScreenAction): ScreenState { + return when (action) { + is ScreenAction.Load -> ScreenState.Loading() + is ScreenAction.LoadSuccess -> if (action.data.isEmpty()) ScreenState.Empty() else ScreenState.Success(action.data) + is ScreenAction.LoadError -> ScreenState.Error(action.message, action.cause) + is ScreenAction.Refresh -> ScreenState.Loading() + is ScreenAction.Clear -> ScreenState.Empty() + else -> state + } + } + } + + enum class LoadStrategy { + EAGER, LAZY, PREFETCH, BACKGROUND + } + + class StateStore(private val reducer: Reducer, initialState: S) { + private var state: S = initialState + private val listeners: MutableList<(S) -> Unit> = mutableListOf() + + fun getState(): S = state + + fun dispatch(action: A) { + state = reducer.reduce(state, action) + listeners.forEach { it(state) } + } + + fun subscribe(listener: (S) -> Unit): () -> Unit { + listeners.add(listener) + return { listeners.remove(listener) } + } + } + ''' +end diff --git a/test/support/fixtures/lua/class_system.ex 
b/test/support/fixtures/lua/class_system.ex new file mode 100644 index 00000000..d96769db --- /dev/null +++ b/test/support/fixtures/lua/class_system.ex @@ -0,0 +1,63 @@ +defmodule Test.Fixtures.Lua.ClassSystem do + @moduledoc false + use Test.LanguageFixture, language: "lua class_system" + + @code ~S''' + function class(parent) + local cls = {} + cls.__index = cls + if parent then + setmetatable(cls, { __index = parent }) + end + cls.new = function(...) + local instance = setmetatable({}, cls) + if instance.init then + instance:init(...) + end + return instance + end + cls.isInstanceOf = function(self, klass) + local mt = getmetatable(self) + while mt do + if mt == klass then return true end + mt = getmetatable(mt) + end + return false + end + return cls + end + + function mixin(target, source) + for key, value in pairs(source) do + if type(value) == "function" and not target[key] then + target[key] = value + end + end + return target + end + + function interface(...) + local methods = { ... } + return function(obj) + for _, method in ipairs(methods) do + if type(obj[method]) ~= "function" then + error("Missing method: " .. 
method) + end + end + return true + end + end + + function extend(parent, definition) + local cls = class(parent) + for k, v in pairs(definition) do + cls[k] = v + end + return cls + end + + function implements(obj, iface) + return pcall(iface, obj) + end + ''' +end diff --git a/test/support/fixtures/lua/event_system.ex b/test/support/fixtures/lua/event_system.ex new file mode 100644 index 00000000..4c50cf85 --- /dev/null +++ b/test/support/fixtures/lua/event_system.ex @@ -0,0 +1,76 @@ +defmodule Test.Fixtures.Lua.EventSystem do + @moduledoc false + use Test.LanguageFixture, language: "lua event_system" + + @code ~S''' + function EventEmitter() + local self = { listeners = {}, onceListeners = {} } + + function self:on(event, callback) + if not self.listeners[event] then + self.listeners[event] = {} + end + table.insert(self.listeners[event], callback) + return self + end + + function self:once(event, callback) + if not self.onceListeners[event] then + self.onceListeners[event] = {} + end + table.insert(self.onceListeners[event], callback) + return self + end + + function self:off(event, callback) + if self.listeners[event] then + for i, cb in ipairs(self.listeners[event]) do + if cb == callback then + table.remove(self.listeners[event], i) + return self + end + end + end + return self + end + + function self:emit(event, ...) + local listeners = self.listeners[event] or {} + for _, cb in ipairs(listeners) do + cb(...) + end + local onceListeners = self.onceListeners[event] or {} + self.onceListeners[event] = {} + for _, cb in ipairs(onceListeners) do + cb(...) + end + return self + end + + function self:removeAllListeners(event) + if event then + self.listeners[event] = nil + self.onceListeners[event] = nil + else + self.listeners = {} + self.onceListeners = {} + end + return self + end + + return self + end + + function pipe(emitter1, event, emitter2, targetEvent) + emitter1:on(event, function(...) + emitter2:emit(targetEvent or event, ...) 
+ end) + end + + function broadcast(emitters, event, ...) + for _, emitter in ipairs(emitters) do + emitter:emit(event, ...) + end + end + ''' +end diff --git a/test/support/fixtures/lua/state_machine.ex b/test/support/fixtures/lua/state_machine.ex new file mode 100644 index 00000000..cba47b8f --- /dev/null +++ b/test/support/fixtures/lua/state_machine.ex @@ -0,0 +1,75 @@ +defmodule Test.Fixtures.Lua.StateMachine do + @moduledoc false + use Test.LanguageFixture, language: "lua state_machine" + + @code ~S''' + function StateMachine(config) + local self = { + current = config.initial, + states = config.states or {}, + transitions = config.transitions or {}, + history = {}, + listeners = {}, + } + + function self:can(event) + local key = self.current .. ":" .. event + return self.transitions[key] ~= nil + end + + function self:transition(event, data) + local key = self.current .. ":" .. event + local target = self.transitions[key] + if not target then + error("No transition from '" .. self.current .. "' on event '" .. event .. "'") + end + local from = self.current + local stateConfig = self.states[from] or {} + if stateConfig.onExit then stateConfig.onExit(from, event, data) end + table.insert(self.history, { state = from, event = event }) + self.current = target + local targetConfig = self.states[target] or {} + if targetConfig.onEnter then targetConfig.onEnter(target, event, data) end + for _, cb in ipairs(self.listeners) do + cb(from, event, target, data) + end + return self + end + + function self:onTransition(callback) + table.insert(self.listeners, callback) + return self + end + + function self:getHistory() + return self.history + end + + function self:reset() + self.current = config.initial + self.history = {} + return self + end + + return self + end + + function buildTransitionTable(transitions) + local tbl = {} + for _, t in ipairs(transitions) do + local key = t.from .. ":" .. 
t.event + tbl[key] = t.to + end + return tbl + end + + function validateMachine(machine, requiredStates) + for _, state in ipairs(requiredStates) do + if not machine.states[state] then + return false, "Missing state: " .. state + end + end + return true, nil + end + ''' +end diff --git a/test/support/fixtures/python/calculator.ex b/test/support/fixtures/python/calculator.ex new file mode 100644 index 00000000..47c9029e --- /dev/null +++ b/test/support/fixtures/python/calculator.ex @@ -0,0 +1,83 @@ +defmodule Test.Fixtures.Python.Calculator do + @moduledoc false + use Test.LanguageFixture, language: "python calculator" + + @code ~S''' + class Calculator: + """A calculator supporting basic arithmetic operations.""" + + def add(self, a, b): + """Returns the sum of a and b.""" + return a + b + + def subtract(self, a, b): + """Returns a minus b.""" + return a - b + + def multiply(self, a, b): + """Returns the product of a and b.""" + return a * b + + def divide(self, a, b): + """Divides a by b. Raises for zero divisor.""" + if b == 0: + raise ValueError("Cannot divide by zero") + return a / b + + def power(self, base, exp): + """Returns base to the power of exp.""" + return base ** exp + + def sqrt(self, n): + """Returns the square root. Raises for negative input.""" + if n < 0: + raise ValueError("Cannot take sqrt of negative number") + return n ** 0.5 + + def abs_val(self, n): + """Returns the absolute value of n.""" + if n < 0: + return -n + return n + + + class ScientificCalculator(Calculator): + """Extended scientific calculator.""" + + def log(self, n, base=10): + """Returns log base of n. Raises for non-positive n.""" + if n <= 0: + raise ValueError("Logarithm undefined for non-positive values") + import math + return math.log(n, base) + + def factorial(self, n): + """Returns n factorial. 
Raises for negative n.""" + if n < 0: + raise ValueError("Factorial undefined for negative numbers") + if n == 0: + return 1 + result = 1 + for i in range(1, n + 1): + result *= i + return result + + + def add(a, b): + return a + b + + + def subtract(a, b): + return a - b + + + def multiply(a, b): + return a * b + + + def divide(a, b): + if b == 0: + raise ValueError("Cannot divide by zero") + return a / b + ''' +end diff --git a/test/support/fixtures/python/config_parser.ex b/test/support/fixtures/python/config_parser.ex new file mode 100644 index 00000000..a58516ef --- /dev/null +++ b/test/support/fixtures/python/config_parser.ex @@ -0,0 +1,89 @@ +defmodule Test.Fixtures.Python.ConfigParser do + @moduledoc false + use Test.LanguageFixture, language: "python config_parser" + + @code ~S''' + from dataclasses import dataclass, field + from typing import ClassVar, Optional + + + @dataclass + class DatabaseConfig: + """Database connection configuration.""" + + host: str = "localhost" + port: int = 5432 + name: str = "app" + pool_size: int = 10 + VALID_PORTS: ClassVar[range] = range(1, 65536) + + def __post_init__(self): + """Validates configuration after initialisation.""" + if self.port not in self.VALID_PORTS: + raise ValueError(f"Invalid port: {self.port}") + if not self.host: + raise ValueError("host must not be empty") + if self.pool_size < 1: + raise ValueError("pool_size must be at least 1") + + def url(self) -> str: + """Returns the database connection URL.""" + return f"postgres://{self.host}:{self.port}/{self.name}" + + + @dataclass + class LoggingConfig: + """Logging configuration.""" + + level: str = "info" + format: str = "text" + output: str = "stdout" + VALID_LEVELS: ClassVar[list] = ["debug", "info", "warning", "error"] + VALID_FORMATS: ClassVar[list] = ["text", "json"] + + def __post_init__(self): + """Validates level and format.""" + if self.level not in self.VALID_LEVELS: + raise ValueError(f"Invalid log level: {self.level}") + if self.format not in 
self.VALID_FORMATS: + raise ValueError(f"Invalid log format: {self.format}") + + + @dataclass + class AppConfig: + """Top-level application configuration.""" + + database: DatabaseConfig = field(default_factory=DatabaseConfig) + logging: LoggingConfig = field(default_factory=LoggingConfig) + debug: bool = False + version: str = "1.0.0" + + def is_production(self) -> bool: + """Returns True when debug mode is disabled.""" + return not self.debug + + @classmethod + def from_dict(cls, data: dict) -> "AppConfig": + """Builds an AppConfig from a plain dictionary.""" + db_data = data.get("database", {}) + log_data = data.get("logging", {}) + return cls( + database=DatabaseConfig(**db_data), + logging=LoggingConfig(**log_data), + debug=data.get("debug", False), + version=data.get("version", "1.0.0"), + ) + + @classmethod + def from_env(cls, prefix: str = "APP") -> "AppConfig": + """Builds an AppConfig from environment variables.""" + import os + return cls( + database=DatabaseConfig( + host=os.getenv(f"{prefix}_DB_HOST", "localhost"), + port=int(os.getenv(f"{prefix}_DB_PORT", "5432")), + ), + debug=os.getenv(f"{prefix}_DEBUG", "false").lower() == "true", + ) + ''' +end diff --git a/test/support/fixtures/python/csv_pipeline.ex b/test/support/fixtures/python/csv_pipeline.ex new file mode 100644 index 00000000..459acf88 --- /dev/null +++ b/test/support/fixtures/python/csv_pipeline.ex @@ -0,0 +1,95 @@ +defmodule Test.Fixtures.Python.CsvPipeline do + @moduledoc false + use Test.LanguageFixture, language: "python csv_pipeline" + + @code ~S''' + from dataclasses import dataclass, field + from typing import Iterator, Protocol + + + @dataclass + class CsvRow: + """Represents one row of parsed CSV data.""" + + fields: dict + line_number: int + + def get(self, key: str, default=None): + """Returns the value for key or default.""" + return self.fields.get(key, default) + + def keys(self) -> list: + """Returns all field names.""" + return list(self.fields.keys()) + + + class 
RowTransformer(Protocol): + """Protocol for CSV row transformation steps.""" + + def transform(self, row: CsvRow) -> CsvRow: + """Transforms a single row.""" + ... + + + @dataclass + class ColumnRenamer: + """Renames columns according to a mapping.""" + + mapping: dict = field(default_factory=dict) + + def transform(self, row: CsvRow) -> CsvRow: + """Applies column rename mapping to a row.""" + new_fields = {self.mapping.get(k, k): v for k, v in row.fields.items()} + return CsvRow(fields=new_fields, line_number=row.line_number) + + + @dataclass + class TypeCoercer: + """Coerces column values to specified types.""" + + types: dict = field(default_factory=dict) + + def transform(self, row: CsvRow) -> CsvRow: + """Coerces field values using the types mapping.""" + coerced = {} + for key, value in row.fields.items(): + target_type = self.types.get(key) + if target_type is not None: + try: + coerced[key] = target_type(value) + except (ValueError, TypeError): + coerced[key] = value + else: + coerced[key] = value + return CsvRow(fields=coerced, line_number=row.line_number) + + + class CsvPipeline: + """Streaming CSV pipeline with pluggable transformation steps.""" + + def __init__(self, path: str): + """Initialises the pipeline for the given CSV file path.""" + self._path = path + self._steps: list = [] + + def add_step(self, step: RowTransformer) -> "CsvPipeline": + """Adds a transformation step and returns self for chaining.""" + self._steps.append(step) + return self + + def run(self) -> Iterator[CsvRow]: + """Yields processed rows from the CSV file.""" + with open(self._path, "r", newline="") as fh: + import csv + reader = csv.DictReader(fh) + for line_number, raw in enumerate(reader, start=1): + row = CsvRow(fields=dict(raw), line_number=line_number) + for step in self._steps: + row = step.transform(row) + yield row + + def collect(self) -> list: + """Collects all processed rows into a list.""" + return list(self.run()) + ''' +end diff --git 
a/test/support/fixtures/ruby/calculator.ex b/test/support/fixtures/ruby/calculator.ex new file mode 100644 index 00000000..df469555 --- /dev/null +++ b/test/support/fixtures/ruby/calculator.ex @@ -0,0 +1,59 @@ +defmodule Test.Fixtures.Ruby.Calculator do + @moduledoc false + use Test.LanguageFixture, language: "ruby calculator" + + @code ~S''' + module Calculable + def abs_val(n) + n < 0 ? -n : n + end + + def clamp(n, min, max) + [[n, min].max, max].min + end + end + + class BasicCalculator + include Calculable + + def add(a, b) + a + b + end + + def subtract(a, b) + a - b + end + + def multiply(a, b) + a * b + end + + def divide(a, b) + raise ArgumentError, "Cannot divide by zero" if b.zero? + a.to_f / b + end + + def power(a, b) + a ** b + end + end + + class ScientificCalculator < BasicCalculator + def sqrt(n) + raise ArgumentError, "Cannot take sqrt of negative number" if n < 0 + Math.sqrt(n) + end + + def log(n, base = 10) + raise ArgumentError, "Logarithm undefined for non-positive values" if n <= 0 + Math.log(n) / Math.log(base) + end + + def factorial(n) + raise ArgumentError, "Factorial undefined for negative numbers" if n < 0 + return 1 if n == 0 + (1..n).reduce(1, :*) + end + end + ''' +end diff --git a/test/support/fixtures/ruby/markdown_renderer.ex b/test/support/fixtures/ruby/markdown_renderer.ex new file mode 100644 index 00000000..2e70d263 --- /dev/null +++ b/test/support/fixtures/ruby/markdown_renderer.ex @@ -0,0 +1,79 @@ +defmodule Test.Fixtures.Ruby.MarkdownRenderer do + @moduledoc false + use Test.LanguageFixture, language: "ruby markdown_renderer" + + @code ~S''' + module Markdown + Token = Struct.new(:type, :content, :level) + end + + module Markdown::Tokenizer + HEADING_RE = /^(#{1,6})\s+(.+)$/ + CODE_BLOCK_RE = /^```(\w*)$/ + BOLD_RE = /\*\*(.+?)\*\*/ + ITALIC_RE = /\*(.+?)\*/ + LINK_RE = /\[(.+?)\]\((.+?)\)/ + + def tokenize_line(line) + case line + when HEADING_RE + Markdown::Token.new(:heading, Regexp.last_match(2), 
Regexp.last_match(1).length) + when /^\s*[-*]\s+(.+)/ + Markdown::Token.new(:list_item, Regexp.last_match(1), 0) + when /^\s*$/ + Markdown::Token.new(:blank, "", 0) + else + Markdown::Token.new(:paragraph, line, 0) + end + end + + def inline_format(text) + text + .gsub(LINK_RE) { "#{Regexp.last_match(1)}" } + .gsub(BOLD_RE) { "#{Regexp.last_match(1)}" } + .gsub(ITALIC_RE) { "#{Regexp.last_match(1)}" } + end + end + + module Markdown::Renderer + include Markdown::Tokenizer + + def render_token(token) + case token.type + when :heading + "#{inline_format(token.content)}" + when :list_item + "
  • #{inline_format(token.content)}
  • " + when :paragraph + "

    #{inline_format(token.content)}

    " + when :blank + "" + end + end + + def render(markdown) + markdown.lines.map { |line| tokenize_line(line.chomp) }.map { |token| render_token(token) }.reject(&:empty?).join("\n") + end + end + + class Markdown::Document + include Markdown::Renderer + + def initialize(source) + @source = source + end + + def to_html + render(@source) + end + + def word_count + @source.split(/\s+/).length + end + + def heading_count + @source.lines.count { |l| l.match?(HEADING_RE) } + end + end + ''' +end diff --git a/test/support/fixtures/ruby/orm_lite.ex b/test/support/fixtures/ruby/orm_lite.ex new file mode 100644 index 00000000..672b668a --- /dev/null +++ b/test/support/fixtures/ruby/orm_lite.ex @@ -0,0 +1,106 @@ +defmodule Test.Fixtures.Ruby.OrmLite do + @moduledoc false + use Test.LanguageFixture, language: "ruby orm_lite" + + @code ~S''' + module OrmLite + module Persistence + def self.included(base) + base.extend(ClassMethods) + base.instance_variable_set(:@columns, []) + base.instance_variable_set(:@validations, []) + end + + module ClassMethods + def column(name, type = :string) + @columns << { name: name, type: type } + attr_accessor name + end + + def validates(name, **rules) + @validations << { name: name, rules: rules } + end + + def columns + @columns + end + + def validations + @validations + end + + def find(id) + new(id: id) + end + end + + def initialize(attrs = {}) + attrs.each do |key, value| + send(:"#{key}=", value) if respond_to?(:"#{key}=") + end + end + + def valid? + @errors = [] + self.class.validations.each do |v| + value = send(v[:name]) + @errors << "#{v[:name]} can't be blank" if v[:rules][:presence] && (value.nil? || value.to_s.empty?) + @errors << "#{v[:name]} is too short" if v[:rules][:min_length] && value.to_s.length < v[:rules][:min_length] + end + @errors.empty? + end + + def errors + @errors ||= [] + end + + def save + return false unless valid? 
+ true + end + end + + module Associations + def self.included(base) + base.extend(ClassMethods) + end + + module ClassMethods + def has_many(name) + define_method(name) do + [] + end + end + + def belongs_to(name) + attr_accessor :"#{name}_id" + define_method(name) do + nil + end + end + end + end + end + + class User + include OrmLite::Persistence + include OrmLite::Associations + column :name, :string + column :email, :string + column :age, :integer + has_many :posts + validates :name, presence: true, min_length: 2 + validates :email, presence: true + end + + class Post + include OrmLite::Persistence + include OrmLite::Associations + column :title, :string + column :body, :text + belongs_to :user + validates :title, presence: true + validates :body, presence: true + end + ''' +end diff --git a/test/support/fixtures/rust/calculator.ex b/test/support/fixtures/rust/calculator.ex new file mode 100644 index 00000000..a47df574 --- /dev/null +++ b/test/support/fixtures/rust/calculator.ex @@ -0,0 +1,70 @@ +defmodule Test.Fixtures.Rust.Calculator do + @moduledoc false + use Test.LanguageFixture, language: "rust calculator" + + @code ~S''' + trait Calculator { + fn add(&self, a: f64, b: f64) -> f64; + fn subtract(&self, a: f64, b: f64) -> f64; + fn multiply(&self, a: f64, b: f64) -> f64; + fn divide(&self, a: f64, b: f64) -> Option; + } + + struct BasicCalculator; + + impl Calculator for BasicCalculator { + fn add(&self, a: f64, b: f64) -> f64 { + a + b + } + + fn subtract(&self, a: f64, b: f64) -> f64 { + a - b + } + + fn multiply(&self, a: f64, b: f64) -> f64 { + a * b + } + + fn divide(&self, a: f64, b: f64) -> Option { + if b == 0.0 { return None; } + Some(a / b) + } + } + + impl BasicCalculator { + fn new() -> Self { + BasicCalculator + } + + fn power(&self, base: f64, exp: f64) -> f64 { + base.powf(exp) + } + + fn sqrt(&self, n: f64) -> Option { + if n < 0.0 { return None; } + Some(n.sqrt()) + } + + fn abs(&self, n: f64) -> f64 { + n.abs() + } + } + + fn add(a: f64, 
b: f64) -> f64 { + a + b + } + + fn subtract(a: f64, b: f64) -> f64 { + a - b + } + + fn multiply(a: f64, b: f64) -> f64 { + a * b + } + + fn divide(a: f64, b: f64) -> Option { + if b == 0.0 { return None; } + Some(a / b) + } + ''' +end diff --git a/test/support/fixtures/rust/ring_buffer.ex b/test/support/fixtures/rust/ring_buffer.ex new file mode 100644 index 00000000..eba5a762 --- /dev/null +++ b/test/support/fixtures/rust/ring_buffer.ex @@ -0,0 +1,86 @@ +defmodule Test.Fixtures.Rust.RingBuffer do + @moduledoc false + use Test.LanguageFixture, language: "rust ring_buffer" + + @code ~S''' + struct RingBuffer { + data: Vec>, + head: usize, + tail: usize, + len: usize, + capacity: usize, + } + + impl RingBuffer { + fn new(capacity: usize) -> Self { + let data = (0..capacity).map(|_| None).collect(); + RingBuffer { data, head: 0, tail: 0, len: 0, capacity } + } + + fn push(&mut self, value: T) -> bool { + if self.len == self.capacity { + return false; + } + self.data[self.tail] = Some(value); + self.tail = (self.tail + 1) % self.capacity; + self.len += 1; + true + } + + fn pop(&mut self) -> Option { + if self.len == 0 { + return None; + } + let value = self.data[self.head].take(); + self.head = (self.head + 1) % self.capacity; + self.len -= 1; + value + } + + fn peek(&self) -> Option<&T> { + if self.len == 0 { None } else { self.data[self.head].as_ref() } + } + + fn is_empty(&self) -> bool { + self.len == 0 + } + + fn is_full(&self) -> bool { + self.len == self.capacity + } + + fn len(&self) -> usize { + self.len + } + + fn capacity(&self) -> usize { + self.capacity + } + + fn clear(&mut self) { + for slot in self.data.iter_mut() { + *slot = None; + } + self.head = 0; + self.tail = 0; + self.len = 0; + } + } + + impl RingBuffer { + fn to_vec(&self) -> Vec { + (0..self.len) + .filter_map(|i| self.data[(self.head + i) % self.capacity].clone()) + .collect() + } + } + + fn fill_buffer(items: &[T], capacity: usize) -> RingBuffer { + let mut buf = 
RingBuffer::new(capacity); + for item in items { + buf.push(item.clone()); + } + buf + } + ''' +end diff --git a/test/support/fixtures/rust/tokenizer.ex b/test/support/fixtures/rust/tokenizer.ex new file mode 100644 index 00000000..09257745 --- /dev/null +++ b/test/support/fixtures/rust/tokenizer.ex @@ -0,0 +1,112 @@ +defmodule Test.Fixtures.Rust.Tokenizer do + @moduledoc false + use Test.LanguageFixture, language: "rust tokenizer" + + @code ~S''' + #[derive(Debug, PartialEq, Clone)] + enum TokenKind { + Number(f64), + Plus, + Minus, + Star, + Slash, + LParen, + RParen, + Eof, + } + + #[derive(Debug, Clone)] + struct Token { + kind: TokenKind, + lexeme: String, + line: usize, + } + + impl Token { + fn new(kind: TokenKind, lexeme: &str, line: usize) -> Self { + Token { kind, lexeme: lexeme.to_string(), line } + } + + fn is_operator(&self) -> bool { + matches!(self.kind, TokenKind::Plus | TokenKind::Minus | TokenKind::Star | TokenKind::Slash) + } + } + + impl std::fmt::Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}({})", self.kind, self.lexeme) + } + } + + struct Lexer { + source: Vec, + pos: usize, + line: usize, + } + + impl Lexer { + fn new(source: &str) -> Self { + Lexer { source: source.chars().collect(), pos: 0, line: 1 } + } + + fn peek(&self) -> Option { + self.source.get(self.pos).copied() + } + + fn advance(&mut self) -> Option { + let ch = self.source.get(self.pos).copied(); + self.pos += 1; + ch + } + + fn skip_whitespace(&mut self) { + while let Some(c) = self.peek() { + if c == '\n' { self.line += 1; self.pos += 1; } + else if c.is_whitespace() { self.pos += 1; } + else { break; } + } + } + + fn read_number(&mut self) -> Token { + let start = self.pos; + while let Some(c) = self.peek() { + if c.is_ascii_digit() || c == '.' 
{ self.pos += 1; } + else { break; } + } + let lexeme: String = self.source[start..self.pos].iter().collect(); + let value: f64 = lexeme.parse().unwrap_or(0.0); + Token::new(TokenKind::Number(value), &lexeme, self.line) + } + + fn next_token(&mut self) -> Token { + self.skip_whitespace(); + match self.advance() { + Some('+') => Token::new(TokenKind::Plus, "+", self.line), + Some('-') => Token::new(TokenKind::Minus, "-", self.line), + Some('*') => Token::new(TokenKind::Star, "*", self.line), + Some('/') => Token::new(TokenKind::Slash, "/", self.line), + Some('(') => Token::new(TokenKind::LParen, "(", self.line), + Some(')') => Token::new(TokenKind::RParen, ")", self.line), + Some(c) if c.is_ascii_digit() => { self.pos -= 1; self.read_number() } + None => Token::new(TokenKind::Eof, "", self.line), + _ => Token::new(TokenKind::Eof, "", self.line), + } + } + + fn tokenize(&mut self) -> Vec { + let mut tokens = Vec::new(); + loop { + let t = self.next_token(); + let done = t.kind == TokenKind::Eof; + tokens.push(t); + if done { break; } + } + tokens + } + } + + fn tokenize(source: &str) -> Vec { + Lexer::new(source).tokenize() + } + ''' +end diff --git a/test/support/fixtures/scala/actor_messages.ex b/test/support/fixtures/scala/actor_messages.ex new file mode 100644 index 00000000..73f150c3 --- /dev/null +++ b/test/support/fixtures/scala/actor_messages.ex @@ -0,0 +1,70 @@ +defmodule Test.Fixtures.Scala.ActorMessages do + @moduledoc false + use Test.LanguageFixture, language: "scala actor_messages" + + @code ~S''' + trait Message + + class Request(val id: String, val payload: Map[String, String]) extends Message + + class Response(val id: String, val status: Int, val body: String) extends Message + + class Broadcast(val topic: String, val data: String) extends Message + + class Shutdown(val reason: String) extends Message + + trait ActorState + + class Active(val processedCount: Int) extends ActorState + + class Paused(val since: Long, val reason: String) extends 
ActorState + + class Stopped(val at: Long) extends ActorState + + trait Behaviour { + def receive(message: Message, state: ActorState): (List[Message], ActorState) + + def onStart(): ActorState + + def onStop(state: ActorState): Unit + } + + class EchoBehaviour extends Behaviour { + def receive(message: Message, state: ActorState): (List[Message], ActorState) = + message match { + case req: Request => + val reply = new Response(req.id, 200, req.payload.mkString(",")) + val newState = state match { + case a: Active => new Active(a.processedCount + 1) + case other => other + } + (List(reply), newState) + case _: Shutdown => (List.empty, new Stopped(System.currentTimeMillis())) + case _ => (List.empty, state) + } + + def onStart(): ActorState = new Active(0) + + def onStop(state: ActorState): Unit = {} + } + + class Supervisor { + private var actors: Map[String, Behaviour] = Map.empty + private var states: Map[String, ActorState] = Map.empty + + def spawn(id: String, behaviour: Behaviour): Unit = { + actors = actors + (id -> behaviour) + states = states + (id -> behaviour.onStart()) + } + + def send(id: String, message: Message): List[Message] = + actors.get(id).map { b => + val (replies, newState) = b.receive(message, states(id)) + states = states + (id -> newState) + replies + }.getOrElse(List.empty) + + def stop(id: String): Unit = actors.get(id).foreach { b => b.onStop(states(id)); actors = actors - id } + } + ''' +end diff --git a/test/support/fixtures/scala/case_class_algebra.ex b/test/support/fixtures/scala/case_class_algebra.ex new file mode 100644 index 00000000..7a2f002f --- /dev/null +++ b/test/support/fixtures/scala/case_class_algebra.ex @@ -0,0 +1,73 @@ +defmodule Test.Fixtures.Scala.CaseClassAlgebra do + @moduledoc false + use Test.LanguageFixture, language: "scala case_class_algebra" + + @code ~S''' + trait Expr + + class Num(val value: Double) extends Expr + + class Add(val left: Expr, val right: Expr) extends Expr + + class Sub(val left: Expr, val 
right: Expr) extends Expr + + class Mul(val left: Expr, val right: Expr) extends Expr + + class Div(val left: Expr, val right: Expr) extends Expr + + class Neg(val expr: Expr) extends Expr + + trait EvalResult + + class EvalOk(val value: Double) extends EvalResult + + class EvalError(val message: String) extends EvalResult + + trait Evaluator { + def eval(expr: Expr): EvalResult + } + + class SafeEvaluator extends Evaluator { + def eval(expr: Expr): EvalResult = expr match { + case n: Num => new EvalOk(n.value) + case neg: Neg => eval(neg.expr) match { + case ok: EvalOk => new EvalOk(-ok.value) + case err => err + } + case add: Add => combine(add.left, add.right)(_ + _) + case sub: Sub => combine(sub.left, sub.right)(_ - _) + case mul: Mul => combine(mul.left, mul.right)(_ * _) + case div: Div => eval(div.right) match { + case ok: EvalOk if ok.value == 0.0 => new EvalError("Division by zero") + case ok: EvalOk => eval(div.left) match { + case lOk: EvalOk => new EvalOk(lOk.value / ok.value) + case err => err + } + case err => err + } + } + + private def combine(l: Expr, r: Expr)(op: (Double, Double) => Double): EvalResult = + (eval(l), eval(r)) match { + case (lv: EvalOk, rv: EvalOk) => new EvalOk(op(lv.value, rv.value)) + case (err: EvalError, _) => err + case (_, err: EvalError) => err + } + } + + trait Printer { + def print(expr: Expr): String + } + + class InfixPrinter extends Printer { + def print(expr: Expr): String = expr match { + case n: Num => n.value.toString + case neg: Neg => s"-${print(neg.expr)}" + case add: Add => s"(${print(add.left)} + ${print(add.right)})" + case sub: Sub => s"(${print(sub.left)} - ${print(sub.right)})" + case mul: Mul => s"(${print(mul.left)} * ${print(mul.right)})" + case div: Div => s"(${print(div.left)} / ${print(div.right)})" + } + } + ''' +end diff --git a/test/support/fixtures/scala/typeclass_pattern.ex b/test/support/fixtures/scala/typeclass_pattern.ex new file mode 100644 index 00000000..0a14c37e --- /dev/null +++ 
b/test/support/fixtures/scala/typeclass_pattern.ex @@ -0,0 +1,64 @@ +defmodule Test.Fixtures.Scala.TypeclassPattern do + @moduledoc false + use Test.LanguageFixture, language: "scala typeclass_pattern" + + @code ~S''' + trait Show[A] { + def show(value: A): String + } + + trait Eq[A] { + def eqv(a: A, b: A): Boolean + + def neqv(a: A, b: A): Boolean = !eqv(a, b) + } + + trait Ord[A] extends Eq[A] { + def compare(a: A, b: A): Int + + def lt(a: A, b: A): Boolean = compare(a, b) < 0 + + def lte(a: A, b: A): Boolean = compare(a, b) <= 0 + + def gt(a: A, b: A): Boolean = compare(a, b) > 0 + + def gte(a: A, b: A): Boolean = compare(a, b) >= 0 + + def eqv(a: A, b: A): Boolean = compare(a, b) == 0 + } + + trait Functor[F[_]] { + def map[A, B](fa: F[A])(f: A => B): F[B] + } + + class Identity[A](val value: A) + + class IdentityInstances { + val identityFunctor: Functor[Identity] = new Functor[Identity] { + def map[A, B](fa: Identity[A])(f: A => B): Identity[B] = new Identity(f(fa.value)) + } + + val identityShow: Show[Identity[String]] = new Show[Identity[String]] { + def show(value: Identity[String]): String = s"Identity(${value.value})" + } + } + + class ShowSyntax[A](value: A, ev: Show[A]) { + def show: String = ev.show(value) + } + + class OrdSyntax[A](value: A, ev: Ord[A]) { + def <(other: A): Boolean = ev.lt(value, other) + + def >(other: A): Boolean = ev.gt(value, other) + + def ===(other: A): Boolean = ev.eqv(value, other) + } + + trait Monoid[A] { + def empty: A + + def combine(a: A, b: A): A + } + ''' +end diff --git a/test/support/fixtures/swift/actor_model.ex b/test/support/fixtures/swift/actor_model.ex new file mode 100644 index 00000000..3e651744 --- /dev/null +++ b/test/support/fixtures/swift/actor_model.ex @@ -0,0 +1,81 @@ +defmodule Test.Fixtures.Swift.ActorModel do + @moduledoc false + use Test.LanguageFixture, language: "swift actor_model" + + @code ~S''' + enum ActorMessage { + case ping(replyTo: String) + case pong(from: String) + case shutdown + case 
updateState(key: String, value: String) + } + + protocol ActorBehaviour { + var id: String { get } + func receive(_ message: ActorMessage) -> [ActorMessage] + func preStart() + func postStop() + } + + struct ActorRef { + let id: String + private let mailbox: [ActorMessage] + + init(id: String) { + self.id = id + self.mailbox = [] + } + } + + class ActorSystem { + private var actors: [String: ActorBehaviour] = [:] + private var mailboxes: [String: [ActorMessage]] = [:] + + func spawn(id: String, behaviour: ActorBehaviour) { + actors[id] = behaviour + mailboxes[id] = [] + behaviour.preStart() + } + + func send(to id: String, message: ActorMessage) { + mailboxes[id, default: []].append(message) + } + + func process(actorId: String) { + guard let actor = actors[actorId] else { return } + let messages = mailboxes[actorId] ?? [] + mailboxes[actorId] = [] + for message in messages { + let replies = actor.receive(message) + for reply in replies { self.processReply(reply) } + } + } + + func stop(actorId: String) { + actors[actorId]?.postStop() + actors.removeValue(forKey: actorId) + mailboxes.removeValue(forKey: actorId) + } + + private func processReply(_ message: ActorMessage) {} + } + + struct StateActor: ActorBehaviour { + let id: String + private var state: [String: String] = [:] + + func receive(_ message: ActorMessage) -> [ActorMessage] { + switch message { + case .ping(let replyTo): return [.pong(from: id)] + case .updateState(let key, let value): return [] + case .shutdown: return [] + default: return [] + } + } + + func preStart() {} + + func postStop() {} + } + ''' +end diff --git a/test/support/fixtures/swift/combine_stream.ex b/test/support/fixtures/swift/combine_stream.ex new file mode 100644 index 00000000..1faf9e4e --- /dev/null +++ b/test/support/fixtures/swift/combine_stream.ex @@ -0,0 +1,66 @@ +defmodule Test.Fixtures.Swift.CombineStream do + @moduledoc false + use Test.LanguageFixture, language: "swift combine_stream" + + @code ~S''' + protocol Publisher 
{ + associatedtype Output + associatedtype Failure: Error + func subscribe(_ subscriber: AnySubscriber) + } + + protocol Subscriber { + associatedtype Input + associatedtype Failure: Error + func receive(_ input: Input) + func receiveCompletion(_ completion: Completion) + } + + enum Completion { + case finished + case failure(Failure) + } + + struct AnySubscriber { + private let receiveValue: (Input) -> Void + private let receiveCompletion: (Completion) -> Void + + init(receiveValue: @escaping (Input) -> Void, receiveCompletion: @escaping (Completion) -> Void) { + self.receiveValue = receiveValue + self.receiveCompletion = receiveCompletion + } + + func receive(_ input: Input) { receiveValue(input) } + + func receiveCompletion(_ completion: Completion) { self.receiveCompletion(completion) } + } + + struct Just: Publisher { + typealias Failure = Never + let value: Output + + func subscribe(_ subscriber: AnySubscriber) { + subscriber.receive(value) + subscriber.receiveCompletion(.finished) + } + } + + struct MapPublisher: Publisher { + typealias Failure = Upstream.Failure + let upstream: Upstream + let transform: (Upstream.Output) -> Output + + func subscribe(_ subscriber: AnySubscriber) { + let mapped = AnySubscriber( + receiveValue: { self.upstream.subscribe(AnySubscriber(receiveValue: { _ in }, receiveCompletion: { _ in })); subscriber.receive(self.transform($0)) }, + receiveCompletion: subscriber.receiveCompletion + ) + upstream.subscribe(mapped) + } + } + + func sink(receiveValue: @escaping (T) -> Void) -> AnySubscriber { + return AnySubscriber(receiveValue: receiveValue, receiveCompletion: { _ in }) + } + ''' +end diff --git a/test/support/fixtures/swift/result_type.ex b/test/support/fixtures/swift/result_type.ex new file mode 100644 index 00000000..5ce1179a --- /dev/null +++ b/test/support/fixtures/swift/result_type.ex @@ -0,0 +1,63 @@ +defmodule Test.Fixtures.Swift.ResultType do + @moduledoc false + use Test.LanguageFixture, language: "swift result_type" + + 
@code ~S''' + enum ValidationError: Error { + case empty(field: String) + case tooShort(field: String, minimum: Int) + case tooLong(field: String, maximum: Int) + case invalidFormat(field: String, pattern: String) + } + + enum ParseError: Error { + case invalidJSON + case missingField(String) + case typeMismatch(field: String, expected: String) + } + + struct Email { + let value: String + + static func parse(_ raw: String) -> Result { + guard !raw.isEmpty else { return .failure(.empty(field: "email")) } + guard raw.contains("@") else { return .failure(.invalidFormat(field: "email", pattern: "must contain @")) } + return .success(Email(value: raw.lowercased())) + } + } + + struct Username { + let value: String + + static func parse(_ raw: String) -> Result { + guard !raw.isEmpty else { return .failure(.empty(field: "username")) } + guard raw.count >= 3 else { return .failure(.tooShort(field: "username", minimum: 3)) } + guard raw.count <= 32 else { return .failure(.tooLong(field: "username", maximum: 32)) } + return .success(Username(value: raw)) + } + } + + struct UserRegistration { + let email: Email + let username: Username + + static func validate(email rawEmail: String, username rawUsername: String) -> Result { + switch Email.parse(rawEmail) { + case .failure(let e): return .failure(e) + case .success(let email): + switch Username.parse(rawUsername) { + case .failure(let e): return .failure(e) + case .success(let username): return .success(UserRegistration(email: email, username: username)) + } + } + } + } + + func mapResult(_ result: Result, _ transform: (T) -> U) -> Result { + switch result { + case .success(let value): return .success(transform(value)) + case .failure(let error): return .failure(error) + } + } + ''' +end diff --git a/test/support/fixtures/typescript/dependency_injection.ex b/test/support/fixtures/typescript/dependency_injection.ex new file mode 100644 index 00000000..38bb9b8a --- /dev/null +++ 
b/test/support/fixtures/typescript/dependency_injection.ex @@ -0,0 +1,66 @@ +defmodule Test.Fixtures.TypeScript.DependencyInjection do + @moduledoc false + use Test.LanguageFixture, language: "typescript dependency_injection" + + @code ~S''' + interface Token { + readonly name: string; + } + + interface Provider { + token: Token; + factory: (container: Container) => T; + singleton: boolean; + } + + interface Container { + register(provider: Provider): void; + resolve(token: Token): T; + has(token: Token): boolean; + } + + class DIContainer implements Container { + private providers: Map>; + private singletons: Map; + + constructor() { + this.providers = new Map(); + this.singletons = new Map(); + } + + register(provider: Provider): void { + this.providers.set(provider.token.name, provider as Provider); + } + + resolve(token: Token): T { + const provider = this.providers.get(token.name); + if (!provider) { + throw new Error("No provider registered for token: " + token.name); + } + if (provider.singleton) { + if (!this.singletons.has(token.name)) { + this.singletons.set(token.name, provider.factory(this)); + } + return this.singletons.get(token.name) as T; + } + return provider.factory(this) as T; + } + + has(token: Token): boolean { + return this.providers.has(token.name); + } + } + + function createToken(name: string): Token { + return { name }; + } + + function singleton(token: Token, factory: (c: Container) => T): Provider { + return { token, factory, singleton: true }; + } + + function transient(token: Token, factory: (c: Container) => T): Provider { + return { token, factory, singleton: false }; + } + ''' +end diff --git a/test/support/fixtures/typescript/event_emitter.ex b/test/support/fixtures/typescript/event_emitter.ex new file mode 100644 index 00000000..8f1fed31 --- /dev/null +++ b/test/support/fixtures/typescript/event_emitter.ex @@ -0,0 +1,68 @@ +defmodule Test.Fixtures.TypeScript.EventEmitter do + @moduledoc false + use Test.LanguageFixture, language: 
"typescript event_emitter" + + @code ~S''' + interface EventMap { + [event: string]: unknown; + } + + interface Listener { + callback: (data: T) => void; + once: boolean; + } + + class EventEmitter { + private listeners: Map>>; + + constructor() { + this.listeners = new Map(); + } + + on(event: K, callback: (data: T[K]) => void): this { + if (!this.listeners.has(event)) { + this.listeners.set(event, []); + } + this.listeners.get(event)!.push({ callback: callback as (data: unknown) => void, once: false }); + return this; + } + + once(event: K, callback: (data: T[K]) => void): this { + if (!this.listeners.has(event)) { + this.listeners.set(event, []); + } + this.listeners.get(event)!.push({ callback: callback as (data: unknown) => void, once: true }); + return this; + } + + off(event: K, callback: (data: T[K]) => void): this { + const list = this.listeners.get(event); + if (list) { + this.listeners.set(event, list.filter(function(l) { return l.callback !== callback; })); + } + return this; + } + + emit(event: K, data: T[K]): boolean { + const list = this.listeners.get(event); + if (!list || list.length === 0) return false; + list.forEach(function(listener) { listener.callback(data); }); + this.listeners.set(event, list.filter(function(l) { return !l.once; })); + return true; + } + + removeAllListeners(event?: keyof T): this { + if (event) { + this.listeners.delete(event); + } else { + this.listeners.clear(); + } + return this; + } + } + + function createEmitter(): EventEmitter { + return new EventEmitter(); + } + ''' +end diff --git a/test/support/fixtures/typescript/user_profile_store.ex b/test/support/fixtures/typescript/user_profile_store.ex new file mode 100644 index 00000000..2242e438 --- /dev/null +++ b/test/support/fixtures/typescript/user_profile_store.ex @@ -0,0 +1,72 @@ +defmodule Test.Fixtures.TypeScript.UserProfileStore do + @moduledoc false + use Test.LanguageFixture, language: "typescript user_profile_store" + + @code ~S''' + interface UserProfile { + 
id: string; + name: string; + email: string; + role: "admin" | "member" | "guest"; + } + + interface StoreState { + users: Record; + loading: boolean; + error: string | null; + } + + interface Action { + type: string; + payload?: unknown; + } + + class UserProfileStore { + private state: StoreState; + private subscribers: Array<(state: StoreState) => void>; + + constructor() { + this.state = { users: {}, loading: false, error: null }; + this.subscribers = []; + } + + getState(): StoreState { + return this.state; + } + + dispatch(action: Action): void { + this.state = this.reduce(this.state, action); + this.notify(); + } + + subscribe(listener: (state: StoreState) => void): () => void { + this.subscribers.push(listener); + return () => { + this.subscribers = this.subscribers.filter(function(s) { return s !== listener; }); + }; + } + + private reduce(state: StoreState, action: Action): StoreState { + switch (action.type) { + case "SET_LOADING": + return { ...state, loading: action.payload as boolean }; + case "SET_ERROR": + return { ...state, error: action.payload as string }; + case "UPSERT_USER": + const user = action.payload as UserProfile; + return { ...state, users: { ...state.users, [user.id]: user } }; + default: + return state; + } + } + + private notify(): void { + this.subscribers.forEach(function(listener) { listener(this.state); }.bind(this)); + } + } + + function createUserProfileStore(): UserProfileStore { + return new UserProfileStore(); + } + ''' +end diff --git a/test/support/fixtures/zig/allocator_interface.ex b/test/support/fixtures/zig/allocator_interface.ex new file mode 100644 index 00000000..f11d6800 --- /dev/null +++ b/test/support/fixtures/zig/allocator_interface.ex @@ -0,0 +1,72 @@ +defmodule Test.Fixtures.Zig.AllocatorInterface do + @moduledoc false + use Test.LanguageFixture, language: "zig allocator_interface" + + @code ~S''' + const Allocator = struct { + ptr: *anyopaque, + vtable: *const VTable, + + pub const VTable = struct { + alloc: 
*const fn (ctx: *anyopaque, len: usize, alignment: u8) ?[*]u8, + free: *const fn (ctx: *anyopaque, buf: [*]u8, len: usize) void, + resize: *const fn (ctx: *anyopaque, buf: [*]u8, old_len: usize, new_len: usize) bool, + }; + + pub fn alloc(self: Allocator, comptime T: type, n: usize) ![]T { + const ptr = self.vtable.alloc(self.ptr, @sizeOf(T) * n, @alignOf(T)) orelse return error.OutOfMemory; + return @as([*]T, @ptrCast(@alignCast(ptr)))[0..n]; + } + + pub fn free(self: Allocator, slice: anytype) void { + const T = @TypeOf(slice[0]); + self.vtable.free(self.ptr, @as([*]u8, @ptrCast(slice.ptr)), slice.len * @sizeOf(T)); + } + }; + + const ArenaAllocator = struct { + backing: Allocator, + buffer: []u8, + pos: usize, + + pub fn init(backing: Allocator, size: usize) !ArenaAllocator { + const buf = try backing.alloc(u8, size); + return ArenaAllocator{ .backing = backing, .buffer = buf, .pos = 0 }; + } + + pub fn deinit(self: *ArenaAllocator) void { + self.backing.free(self.buffer); + } + + pub fn alloc(self: *ArenaAllocator, comptime T: type, n: usize) ![]T { + const size = @sizeOf(T) * n; + if (self.pos + size > self.buffer.len) return error.OutOfMemory; + const slice = self.buffer[self.pos .. 
self.pos + size]; + self.pos += size; + return @as([*]T, @ptrCast(@alignCast(slice.ptr)))[0..n]; + } + + pub fn reset(self: *ArenaAllocator) void { + self.pos = 0; + } + }; + + const AllocError = error{ + OutOfMemory, + AlignmentError, + InvalidSize, + }; + + fn alignForward(addr: usize, alignment: usize) usize { + return (addr + alignment - 1) & ~(alignment - 1); + } + + fn isPowerOfTwo(n: usize) bool { + return n > 0 and (n & (n - 1)) == 0; + } + + fn sizeOf(comptime T: type) comptime_int { + return @sizeOf(T); + } + ''' +end diff --git a/test/support/fixtures/zig/iterator_protocol.ex b/test/support/fixtures/zig/iterator_protocol.ex new file mode 100644 index 00000000..52848ef3 --- /dev/null +++ b/test/support/fixtures/zig/iterator_protocol.ex @@ -0,0 +1,87 @@ +defmodule Test.Fixtures.Zig.IteratorProtocol do + @moduledoc false + use Test.LanguageFixture, language: "zig iterator_protocol" + + @code ~S''' + fn Iterator(comptime T: type) type { + return struct { + const Self = @This(); + pub const Item = T; + ptr: *anyopaque, + nextFn: *const fn (ptr: *anyopaque) ?T, + + pub fn next(self: *Self) ?T { + return self.nextFn(self.ptr); + } + + pub fn count(self: *Self) usize { + var n: usize = 0; + while (self.next() != null) n += 1; + return n; + } + + pub fn forEach(self: *Self, callback: fn (T) void) void { + while (self.next()) |item| callback(item); + } + }; + } + + fn RangeIterator(comptime T: type) type { + return struct { + current: T, + end: T, + step: T, + + pub fn init(start: T, end: T, step: T) @This() { + return .{ .current = start, .end = end, .step = step }; + } + + pub fn next(self: *@This()) ?T { + if (self.current >= self.end) return null; + const value = self.current; + self.current += self.step; + return value; + } + }; + } + + fn SliceIterator(comptime T: type) type { + return struct { + slice: []const T, + index: usize, + + pub fn init(slice: []const T) @This() { + return .{ .slice = slice, .index = 0 }; + } + + pub fn next(self: *@This()) ?T { + 
if (self.index >= self.slice.len) return null; + const item = self.slice[self.index]; + self.index += 1; + return item; + } + + pub fn reset(self: *@This()) void { + self.index = 0; + } + }; + } + + fn MapIterator(comptime In: type, comptime Out: type) type { + return struct { + inner: SliceIterator(In), + transform: *const fn (In) Out, + + pub fn next(self: *@This()) ?Out { + const item = self.inner.next() orelse return null; + return self.transform(item); + } + }; + } + + fn take(comptime T: type, iter: *SliceIterator(T), n: usize) []const T { + _ = n; + return iter.slice; + } + ''' +end diff --git a/test/support/fixtures/zig/tagged_union.ex b/test/support/fixtures/zig/tagged_union.ex new file mode 100644 index 00000000..fed8b31b --- /dev/null +++ b/test/support/fixtures/zig/tagged_union.ex @@ -0,0 +1,90 @@ +defmodule Test.Fixtures.Zig.TaggedUnion do + @moduledoc false + use Test.LanguageFixture, language: "zig tagged_union" + + @code ~S''' + const TokenKind = enum { + identifier, + integer, + float, + string_literal, + operator, + keyword, + comment, + eof, + }; + + const Token = struct { + kind: TokenKind, + start: usize, + end: usize, + line: u32, + column: u32, + + pub fn length(self: Token) usize { + return self.end - self.start; + } + + pub fn isLiteral(self: Token) bool { + return self.kind == .integer or self.kind == .float or self.kind == .string_literal; + } + }; + + const Value = union(enum) { + int: i64, + float: f64, + boolean: bool, + string: []const u8, + null_value: void, + + pub fn typeName(self: Value) []const u8 { + return switch (self) { + .int => "int", + .float => "float", + .boolean => "boolean", + .string => "string", + .null_value => "null", + }; + } + + pub fn isTruthy(self: Value) bool { + return switch (self) { + .int => |v| v != 0, + .float => |v| v != 0.0, + .boolean => |v| v, + .string => |v| v.len > 0, + .null_value => false, + }; + } + }; + + const ParseError = error{ + UnexpectedToken, + UnexpectedEof, + InvalidLiteral, + 
StackOverflow, + }; + + fn parseInteger(source: []const u8) !i64 { + var result: i64 = 0; + for (source) |ch| { + if (ch < '0' or ch > '9') return ParseError.InvalidLiteral; + result = result * 10 + @as(i64, ch - '0'); + } + return result; + } + + fn parseFloat(source: []const u8) !f64 { + var result: f64 = 0; + var decimal = false; + var scale: f64 = 1; + for (source) |ch| { + if (ch == '.') { decimal = true; continue; } + if (ch < '0' or ch > '9') return ParseError.InvalidLiteral; + if (decimal) { scale /= 10; result += @as(f64, ch - '0') * scale; } + else { result = result * 10 + @as(f64, ch - '0'); } + } + return result; + } + ''' +end diff --git a/test/support/language_fixture.ex b/test/support/language_fixture.ex new file mode 100644 index 00000000..b83b44a2 --- /dev/null +++ b/test/support/language_fixture.ex @@ -0,0 +1,61 @@ +defmodule Test.LanguageFixture do + @moduledoc """ + Macro for defining per-language, per-domain code fixtures. + + ## In a fixture module + + defmodule Test.Fixtures.Elixir.EventBus do + use Test.LanguageFixture, language: "elixir event bus" + + @code ~S''' + defmodule EventBus do + ... 
+ end + ''' + end + + ## In a test module + + defmodule MyTest do + Module.register_attribute(__MODULE__, :fixture, accumulate: true, persist: false) + use Test.Fixtures.Elixir.EventBus + use Test.Fixtures.Python.CsvPipeline + end + """ + + defmacro __using__(opts) do + language = Keyword.fetch!(opts, :language) + + quote do + @language unquote(language) + @before_compile Test.LanguageFixture + end + end + + defmacro __before_compile__(env) do + mod = env.module + code = Module.get_attribute(mod, :code) + language = Module.get_attribute(mod, :language) + block_assertions = Module.get_attribute(mod, :block_assertions) || [] + + unless code do + raise CompileError, + file: env.file, + line: env.line, + description: "#{mod} uses Test.LanguageFixture but @code is not set" + end + + quote do + defmacro __using__(_opts) do + fixture_language = unquote(language) + fixture_code = unquote(code) + fixture_block_assertions = unquote(Macro.escape(block_assertions)) + + quote do + @fixture {unquote(fixture_language), unquote(fixture_code), + unquote(Macro.escape(fixture_block_assertions))} + end + end + end + end +end diff --git a/test/support/node_matcher.ex b/test/support/node_matcher.ex new file mode 100644 index 00000000..679dc921 --- /dev/null +++ b/test/support/node_matcher.ex @@ -0,0 +1,17 @@ +defmodule Test.NodeMatcher do + @moduledoc """ + Helpers for asserting on tokens within `CompoundNode` structures. 
+ + Returns tagged tuples that can be matched against token fields: + + - `exact(:content, "add")` — token whose `content` equals `"add"` exactly + - `partial(:content, "@doc")` — token whose `content` contains `"@doc"` as a substring + - `:value` targets the normalized token value instead of raw source content + """ + + @spec exact(:content | :value, String.t()) :: {:exact, :content | :value, String.t()} + def exact(field, value) when field in [:content, :value], do: {:exact, field, value} + + @spec partial(:content | :value, String.t()) :: {:partial, :content | :value, String.t()} + def partial(field, value) when field in [:content, :value], do: {:partial, field, value} +end