diff --git a/pkg/huggingface-api/client.go b/pkg/huggingface-api/client.go index 74494ed0b866..77c26a646fa3 100644 --- a/pkg/huggingface-api/client.go +++ b/pkg/huggingface-api/client.go @@ -148,10 +148,15 @@ func (c *Client) SetBaseURL(url string) { c.baseURL = url } -// ListFiles lists all files in a HuggingFace repository -func (c *Client) ListFiles(repoID string) ([]FileInfo, error) { +// listFilesInPath lists all files in a specific path of a HuggingFace repository (recursive helper) +func (c *Client) listFilesInPath(repoID, path string) ([]FileInfo, error) { baseURL := strings.TrimSuffix(c.baseURL, "/api/models") - url := fmt.Sprintf("%s/api/models/%s/tree/main", baseURL, repoID) + var url string + if path == "" { + url = fmt.Sprintf("%s/api/models/%s/tree/main", baseURL, repoID) + } else { + url = fmt.Sprintf("%s/api/models/%s/tree/main/%s", baseURL, repoID, path) + } req, err := http.NewRequest("GET", url, nil) if err != nil { @@ -173,12 +178,45 @@ func (c *Client) ListFiles(repoID string) ([]FileInfo, error) { return nil, fmt.Errorf("failed to read response body: %w", err) } - var files []FileInfo - if err := json.Unmarshal(body, &files); err != nil { + var items []FileInfo + if err := json.Unmarshal(body, &items); err != nil { return nil, fmt.Errorf("failed to parse JSON response: %w", err) } - return files, nil + var allFiles []FileInfo + for _, item := range items { + switch item.Type { + // If it's a directory/folder, recursively list its contents + case "directory", "folder": + // Build the subfolder path + subPath := item.Path + if path != "" { + subPath = fmt.Sprintf("%s/%s", path, item.Path) + } + + // Recursively get files from subfolder + // The recursive call will already prepend the subPath to each file's path + subFiles, err := c.listFilesInPath(repoID, subPath) + if err != nil { + return nil, fmt.Errorf("failed to list files in subfolder %s: %w", subPath, err) + } + + allFiles = append(allFiles, subFiles...) + case "file": + // It's a file, prepend the current path to make it relative to root + // if path != "" { + // item.Path = fmt.Sprintf("%s/%s", path, item.Path) + // } + allFiles = append(allFiles, item) + } + } + + return allFiles, nil +} + +// ListFiles lists all files in a HuggingFace repository, including files in subfolders +func (c *Client) ListFiles(repoID string) ([]FileInfo, error) { + return c.listFilesInPath(repoID, "") } // GetFileSHA gets the SHA256 checksum for a specific file by searching through the file list diff --git a/pkg/huggingface-api/client_test.go b/pkg/huggingface-api/client_test.go index 5ef5f7900930..8468af6e6a76 100644 --- a/pkg/huggingface-api/client_test.go +++ b/pkg/huggingface-api/client_test.go @@ -337,6 +337,137 @@ var _ = Describe("HuggingFace API Client", func() { }) }) + Context("when listing files with subfolders", func() { + BeforeEach(func() { + // Mock response for root directory with files and a subfolder + mockRootResponse := `[ + { + "type": "file", + "path": "README.md", + "size": 5000, + "oid": "readme123" + }, + { + "type": "directory", + "path": "subfolder", + "size": 0, + "oid": "dir123" + }, + { + "type": "file", + "path": "config.json", + "size": 1000, + "oid": "config123" + } + ]` + + // Mock response for subfolder directory + mockSubfolderResponse := `[ + { + "type": "file", + "path": "subfolder/file.bin", + "size": 2000000, + "oid": "filebin123", + "lfs": { + "oid": "filebin456", + "size": 2000000, + "pointerSize": 135 + } + }, + { + "type": "directory", + "path": "nested", + "size": 0, + "oid": "nesteddir123" + } + ]` + + // Mock response for nested subfolder + mockNestedResponse := `[ + { + "type": "file", + "path": "subfolder/nested/nested_file.gguf", + "size": 5000000, + "oid": "nested123", + "lfs": { + "oid": "nested456", + "size": 5000000, + "pointerSize": 135 + } + } + ]` + + server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + urlPath := r.URL.Path + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + + if strings.Contains(urlPath, "/tree/main/subfolder/nested") { + w.Write([]byte(mockNestedResponse)) + } else if strings.Contains(urlPath, "/tree/main/subfolder") { + w.Write([]byte(mockSubfolderResponse)) + } else if strings.Contains(urlPath, "/tree/main") { + w.Write([]byte(mockRootResponse)) + } else { + w.WriteHeader(http.StatusNotFound) + } + })) + + client.SetBaseURL(server.URL) + }) + + It("should recursively list all files including those in subfolders", func() { + files, err := client.ListFiles("test/model") + + Expect(err).ToNot(HaveOccurred()) + Expect(files).To(HaveLen(4)) + + // Verify root level files + readmeFile := findFileByPath(files, "README.md") + Expect(readmeFile).ToNot(BeNil()) + Expect(readmeFile.Size).To(Equal(int64(5000))) + Expect(readmeFile.Oid).To(Equal("readme123")) + + configFile := findFileByPath(files, "config.json") + Expect(configFile).ToNot(BeNil()) + Expect(configFile.Size).To(Equal(int64(1000))) + Expect(configFile.Oid).To(Equal("config123")) + + // Verify subfolder file with relative path + subfolderFile := findFileByPath(files, "subfolder/file.bin") + Expect(subfolderFile).ToNot(BeNil()) + Expect(subfolderFile.Size).To(Equal(int64(2000000))) + Expect(subfolderFile.LFS).ToNot(BeNil()) + Expect(subfolderFile.LFS.Oid).To(Equal("filebin456")) + + // Verify nested subfolder file + nestedFile := findFileByPath(files, "subfolder/nested/nested_file.gguf") + Expect(nestedFile).ToNot(BeNil()) + Expect(nestedFile.Size).To(Equal(int64(5000000))) + Expect(nestedFile.LFS).ToNot(BeNil()) + Expect(nestedFile.LFS.Oid).To(Equal("nested456")) + }) + + It("should handle files with correct relative paths", func() { + files, err := client.ListFiles("test/model") + + Expect(err).ToNot(HaveOccurred()) + + // Check that all paths are relative and correct + paths := make([]string, len(files)) + for i, file := range files { + paths[i] = file.Path + } + + Expect(paths).To(ContainElements( + "README.md", + "config.json", + "subfolder/file.bin", + "subfolder/nested/nested_file.gguf", + )) + }) + }) + Context("when getting file SHA", func() { BeforeEach(func() { mockFilesResponse := `[ @@ -405,6 +536,7 @@ var _ = Describe("HuggingFace API Client", func() { BeforeEach(func() { mockFilesResponse := `[ { + "type": "file", "path": "model-Q4_K_M.gguf", "size": 1000000, "oid": "abc123", @@ -416,6 +548,7 @@ var _ = Describe("HuggingFace API Client", func() { } }, { + "type": "file", "path": "README.md", "size": 5000, "oid": "readme123" @@ -538,4 +671,84 @@ var _ = Describe("HuggingFace API Client", func() { Expect(preferred).To(BeNil()) }) }) + + Context("integration test with real HuggingFace API", func() { + It("should recursively list all files including subfolders from real repository", func() { + // This test makes actual API calls to HuggingFace + // Skip if running in CI or if network is not available + realClient := hfapi.NewClient() + repoID := "bartowski/Qwen_Qwen3-Next-80B-A3B-Instruct-GGUF" + + files, err := realClient.ListFiles(repoID) + + Expect(err).ToNot(HaveOccurred()) + Expect(files).ToNot(BeEmpty(), "should return at least some files") + + // Verify that we get files from subfolders + // Based on the repository structure, there should be files in subfolders like: + // - Qwen_Qwen3-Next-80B-A3B-Instruct-Q4_1/... + // - Qwen_Qwen3-Next-80B-A3B-Instruct-Q5_K_L/... + // etc. + hasSubfolderFiles := false + rootLevelFiles := 0 + subfolderFiles := 0 + + for _, file := range files { + if strings.Contains(file.Path, "/") { + hasSubfolderFiles = true + subfolderFiles++ + // Verify the path format is correct (subfolder/file.gguf) + Expect(file.Path).ToNot(HavePrefix("/"), "paths should be relative, not absolute") + Expect(file.Path).ToNot(HaveSuffix("/"), "file paths should not end with /") + } else { + rootLevelFiles++ + } + } + + Expect(hasSubfolderFiles).To(BeTrue(), "should find files in subfolders") + Expect(rootLevelFiles).To(BeNumerically(">", 0), "should find files at root level") + Expect(subfolderFiles).To(BeNumerically(">", 0), "should find files in subfolders") + // Verify specific expected files exist + // Root level files + readmeFile := findFileByPath(files, "README.md") + Expect(readmeFile).ToNot(BeNil(), "README.md should exist at root level") + + // Verify we can find files in subfolders + // Look for any file in a subfolder (the exact structure may vary, can be nested) + foundSubfolderFile := false + for _, file := range files { + if strings.Contains(file.Path, "/") && strings.HasSuffix(file.Path, ".gguf") { + foundSubfolderFile = true + // Verify the path structure: can be nested like subfolder/subfolder/file.gguf + parts := strings.Split(file.Path, "/") + Expect(len(parts)).To(BeNumerically(">=", 2), "subfolder files should have at least subfolder/file.gguf format") + // The last part should be the filename + Expect(parts[len(parts)-1]).To(HaveSuffix(".gguf"), "file in subfolder should be a .gguf file") + Expect(parts[len(parts)-1]).ToNot(BeEmpty(), "filename should not be empty") + break + } + } + Expect(foundSubfolderFile).To(BeTrue(), "should find at least one .gguf file in a subfolder") + + // Verify file properties are populated + for _, file := range files { + Expect(file.Path).ToNot(BeEmpty(), "file path should not be empty") + Expect(file.Type).To(Equal("file"), "all returned items should be files, not directories") + // Size might be 0 for some files, but OID should be present + if file.LFS == nil { + Expect(file.Oid).ToNot(BeEmpty(), "file should have an OID if no LFS") + } + } + }) + }) }) + +// findFileByPath is a helper function to find a file by its path in a slice of FileInfo +func findFileByPath(files []hfapi.FileInfo, path string) *hfapi.FileInfo { + for i := range files { + if files[i].Path == path { + return &files[i] + } + } + return nil +}