diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..6245dae --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,15 @@ +coverage: + status: + project: + default: + target: 85% + threshold: 3% + patch: + default: + target: 70% + threshold: 5% + +comment: + layout: "header, diff" + behavior: default + require_changes: true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index cecffd8..0afb065 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "diffchunk" -version = "0.1.6" +version = "0.1.7" description = "MCP server for navigating large diff files with intelligent chunking" readme = "README.md" requires-python = ">=3.10" @@ -26,6 +26,7 @@ classifiers = [ dependencies = [ "click>=8.2.1", "mcp>=1.10.0", + "chardet>=4.0.0", ] [project.urls] diff --git a/src/chunker.py b/src/chunker.py index a397a69..74dd3a2 100644 --- a/src/chunker.py +++ b/src/chunker.py @@ -33,7 +33,7 @@ def chunk_diff( raise ValueError(f"Failed to parse diff: {e}") if not file_changes: - raise ValueError("No valid diff content found") + raise ValueError("Diff file parsed successfully but contains no changes") for files, content in file_changes: # Apply filters diff --git a/src/parser.py b/src/parser.py index 134ab44..cb0fa4d 100644 --- a/src/parser.py +++ b/src/parser.py @@ -123,29 +123,32 @@ def should_include_file( return True # Include by default if no patterns specified def _read_diff_file(self, file_path: str) -> List[str]: - """Read diff file with proper encoding handling.""" - # Try UTF-8 first (most common) - encodings = ["utf-8", "utf-8-sig", "cp1252", "latin-1"] + """Read diff file with encoding detection.""" + import chardet - for encoding in encodings: - try: - with open(file_path, "r", encoding=encoding) as f: - content = f.read() + # Detect encoding from sample + with open(file_path, "rb") as f: + sample = f.read(8192) + result = chardet.detect(sample) - # Strip BOM if present - if content.startswith("\ufeff"): - content = content[1:] + # Use detected encoding if confident, otherwise UTF-8 + encoding = ( + result.get("encoding") if result.get("confidence", 0) > 0.7 else "utf-8" + ) - lines = content.splitlines(keepends=True) - return lines + try: + with open(file_path, "r", encoding=encoding) as f: + content = f.read() + except UnicodeDecodeError: + # Fallback with error replacement + with open(file_path, "r", encoding="utf-8", errors="replace") as f: + content = f.read() - except (UnicodeDecodeError, IOError): - continue + # Strip BOM if present + if content.startswith("\ufeff"): + content = content[1:] - # If all encodings failed, raise clear error - raise ValueError( - f"Cannot read diff file {file_path}: unable to decode with any common encoding" - ) + return content.splitlines(keepends=True) def count_lines(self, content: str) -> int: """Count meaningful lines in diff content.""" diff --git a/tests/test_encodings.py b/tests/test_encodings.py new file mode 100644 index 0000000..20efd0c --- /dev/null +++ b/tests/test_encodings.py @@ -0,0 +1,60 @@ +"""Test encoding support for diff files.""" + +import pytest +from pathlib import Path + +from src.tools import DiffChunkTools + + +class TestEncodings: + """Test encoding detection and parsing.""" + + @pytest.fixture + def test_data_dir(self): + return Path(__file__).parent / "test_data" + + @pytest.fixture + def tools(self): + return DiffChunkTools() + + def test_encoding_support(self, tools, test_data_dir): + """Test that various encodings are supported.""" + test_files = [ + "minimal_working.diff", # UTF-8 + "minimal_windows.diff", # Windows line endings + "minimal_bom.diff", # UTF-8 BOM + "minimal_latin1.diff", # Latin-1 + ] + + for filename in test_files: + result = tools.load_diff(str(test_data_dir / filename)) + assert result["chunks"] > 0, f"{filename} should parse successfully" + assert result["files"] > 0, f"{filename} should contain files" + + def test_encoding_detection(self, tools, tmp_path): + """Test encoding detection with UTF-16.""" + # Create a minimal UTF-16 diff file + content = """diff --git a/test.txt b/test.txt +index 1234567..abcdefg 100644 +--- a/test.txt ++++ b/test.txt +@@ -1 +1 @@ +-old line ++new line +""" + utf16_file = tmp_path / "test_utf16.diff" + utf16_file.write_text(content, encoding="utf-16") + + result = tools.load_diff(str(utf16_file)) + assert result["chunks"] > 0 + assert result["files"] > 0 + + def test_empty_diff_error(self, tools, tmp_path): + """Test error message for empty diff files.""" + empty_file = tmp_path / "empty.diff" + empty_file.write_text("") + + with pytest.raises( + ValueError, match="Diff file parsed successfully but contains no changes" + ): + tools.load_diff(str(empty_file)) diff --git a/tests/test_windows_repro.py b/tests/test_windows_repro.py deleted file mode 100644 index 3be1f41..0000000 --- a/tests/test_windows_repro.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Test to reproduce Windows "No valid diff content found" issue.""" - -import pytest -from pathlib import Path - -from src.tools import DiffChunkTools - - -class TestWindowsEncoding: - """Test Windows encoding issues that cause 'No valid diff content found'.""" - - @pytest.fixture - def test_data_dir(self): - return Path(__file__).parent / "test_data" - - @pytest.fixture - def tools(self): - return DiffChunkTools() - - def test_encoding_scenarios_work_with_fix(self, tools, test_data_dir): - """Test that encoding scenarios work with the fix.""" - working_files = [ - "minimal_working.diff", # UTF-8 baseline - "minimal_windows.diff", # Windows \r\n line endings - "minimal_bom.diff", # UTF-8 BOM (now handled) - "minimal_latin1.diff", # Latin-1 encoding (now handled) - ] - - for filename in working_files: - result = tools.load_diff( - str(test_data_dir / filename), max_chunk_lines=1000 - ) - assert result["chunks"] > 0, f"{filename} should work with fix" - assert result["files"] > 0, f"{filename} should have files" diff --git a/uv.lock b/uv.lock index afef845..d931774 100644 --- a/uv.lock +++ b/uv.lock @@ -44,6 +44,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload-time = "2025-06-15T02:45:49.977Z" }, ] +[[package]] +name = "chardet" +version = "5.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618, upload-time = "2023-08-01T19:23:02.662Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385, upload-time = "2023-08-01T19:23:00.661Z" }, +] + [[package]] name = "click" version = "8.2.1" @@ -136,9 +145,10 @@ toml = [ [[package]] name = "diffchunk" -version = "0.1.6" +version = "0.1.7" source = { editable = "." } dependencies = [ + { name = "chardet" }, { name = "click" }, { name = "mcp" }, ] @@ -154,6 +164,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "chardet", specifier = ">=4.0.0" }, { name = "click", specifier = ">=8.2.1" }, { name = "mcp", specifier = ">=1.10.0" }, ]