In [100]:
import json
import pandas as pd


def unpack_content(row, max_char=4112):
    packed_content = json.loads(row.content)
    match row.provider:
        case "github":
            unpacked_content = packed_content.get("readme", "")
        case "hacker_news" | "arxiv":
            unpacked_content = packed_content.get("text", "")
        case _:
            unpacked_content = ""
    unpacked_content = unpacked_content or ""
    return unpacked_content[:max_char]


# Load the sample, derive a `_text` column holding at most 1028 characters of
# each row's unpacked text, and keep only rows whose text exceeds 250 characters.
content = (
    pd.read_parquet("data/content-sample.parquet")
    .assign(_text=lambda frame: frame.apply(unpack_content, axis=1, max_char=1028))
    .loc[lambda frame: frame["_text"].str.len() > 250]
)
content.head()

Unnamed: 0,id,url,provider,metadata,content,statistics,load_id,load_label,load_ts,load_date,_text
0,45445160-a2dd-437e-b7ce-8a0d32da4e23,https://api.github.com/repos/didi/logbook,github,"{""created_at"":""2022-01-12T08:29:01Z"",""descript...","{""readme"":""# logbook 简介\nlogbook 是一款面向 ToB 业务的...","{""forks"":9,""issues"":0,""stars"":103}",203ac5aa-fb5c-4f4b-ad54-8ce27ddd0622,,2023-06-16 00:38:04,2023-06-16,# logbook 简介\nlogbook 是一款面向 ToB 业务的服务端埋点方案\n\n...
1,e625534c-4792-4a64-964e-3559349eb66c,https://api.github.com/repos/scofield7419/THOR...,github,"{""created_at"":""2023-05-18T13:06:06Z"",""descript...","{""readme"":""## ����‍♀️⚡⚒️ THOR: Three-hop Reaso...","{""forks"":1,""issues"":0,""stars"":104}",203ac5aa-fb5c-4f4b-ad54-8ce27ddd0622,,2023-06-16 00:38:04,2023-06-16,## ����‍♀️⚡⚒️ THOR: Three-hop Reasoning for Im...
2,46878b47-b242-4c8d-9300-855fbb225947,https://api.github.com/repos/scofield7419/LAGC...,github,"{""created_at"":""2023-06-11T12:57:26Z"",""descript...","{""readme"":""# LAGCN SRL Pointer\nThis repositor...","{""forks"":0,""issues"":0,""stars"":50}",203ac5aa-fb5c-4f4b-ad54-8ce27ddd0622,,2023-06-16 00:38:04,2023-06-16,# LAGCN SRL Pointer\nThis repository includes ...
3,d7fc273a-ce2c-42b7-9bb3-c6ccb1ca3413,https://api.github.com/repos/scofield7419/DiaR...,github,"{""created_at"":""2022-04-24T08:25:20Z"",""descript...","{""readme"":""**Python implementation of IJCAI 20...","{""forks"":0,""issues"":0,""stars"":53}",203ac5aa-fb5c-4f4b-ad54-8ce27ddd0622,,2023-06-16 00:38:04,2023-06-16,**Python implementation of IJCAI 2022 Paper [G...
4,896542c4-ad6b-46e0-9b9e-aba6cfa13838,https://api.github.com/repos/kdoctor-io/kdoctor,github,"{""created_at"":""2023-06-05T03:59:18Z"",""descript...","{""readme"":""# kdoctor\n[![Auto Release Version]...","{""forks"":10,""issues"":6,""stars"":62}",203ac5aa-fb5c-4f4b-ad54-8ce27ddd0622,,2023-06-16 00:38:04,2023-06-16,# kdoctor\n[![Auto Release Version](https://gi...


In [102]:
# Show the frame's (rows, columns) shape alongside its distinct provider values.
content.shape, content["provider"].unique()

((6257, 11),
 array(['github', 'hacker_news', 'arxiv'], dtype=object),
 (1109, 11))

In [103]:
from qdrant_client import QdrantClient

# Qdrant client created with the ":memory:" location.
client = QdrantClient(":memory:")

# Add every `_text` value to the "content" collection, keyed by the row's id,
# with the row's provider and url attached as metadata.
ids = client.add(
    collection_name="content",
    documents=content["_text"],
    metadata=content[["provider", "url"]].to_dict(orient="records"),
    ids=content["id"],
)
ids[:10]

['45445160-a2dd-437e-b7ce-8a0d32da4e23',
 'e625534c-4792-4a64-964e-3559349eb66c',
 '46878b47-b242-4c8d-9300-855fbb225947',
 'd7fc273a-ce2c-42b7-9bb3-c6ccb1ca3413',
 '896542c4-ad6b-46e0-9b9e-aba6cfa13838',
 '861baf10-312f-4ff3-b435-19009a8c7b07',
 '33d557cb-ac6f-4d36-9a95-4b867541a0b8',
 '03efd920-2d87-4872-a7dd-011afd92ea8a',
 '8e3e39fe-bf5c-494b-aa88-3276a067cc9a',
 '042399be-e69d-417e-b17d-6eca76fab74b']

In [104]:
# Query the "content" collection with one free-text prompt, returning at most
# three hits. `query_text` is passed as a plain string, the type documented for
# QdrantClient.query; a list of prompts belongs to query_batch instead.
result = client.query(
    collection_name="content",
    query_text="a document for machine learning engineers",
    limit=3,
)
result

[QueryResponse(id='5a6887d6-8220-4a5e-81fb-13a476178287', embedding=None, metadata={'document': '# Top-down learning path: Machine Learning for Software Engineers\n\n<p align="center">\n  <a href="https://github.com/ZuzooVn/machine-learning-for-software-engineers">\n    <img alt="Top-down learning path: Machine Learning for Software Engineers" src="https://img.shields.io/badge/Machine%20Learning-Software%20Engineers-blue.svg">\n  </a>\n  <a href="https://github.com/ZuzooVn/machine-learning-for-software-engineers/stargazers">\n    <img alt="GitHub stars" src="https://img.shields.io/github/stars/ZuzooVn/machine-learning-for-software-engineers.svg">\n  </a>\n  <a href="https://github.com/ZuzooVn/machine-learning-for-software-engineers/network">\n    <img alt="GitHub forks" src="https://img.shields.io/github/forks/ZuzooVn/machine-learning-for-software-engineers.svg">\n  </a>\n</p>\n\nInspired by [Coding Interview University](https://github.com/jwasham/coding-interview-university).\n\nTrans