<a href="https://colab.research.google.com/github/panchambanerjee/langchain_experiments/blob/main/langchain_issues_dataset_2024_01_18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Source idea from HF:: https://huggingface.co/learn/nlp-course/chapter5/5?fw=pt

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers

In [2]:
!git config --global user.email "panchajanya.banerjee@gmail.com"
!git config --global user.name "panchambanerjee"

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!pip install requests



In [5]:
import requests

url = "https://api.github.com/repos/langchain-ai/langchain/issues?page=1&per_page=1"
response = requests.get(url)

In [6]:
response.status_code

200

In [7]:
response.json()

[{'url': 'https://api.github.com/repos/langchain-ai/langchain/issues/16179',
  'repository_url': 'https://api.github.com/repos/langchain-ai/langchain',
  'labels_url': 'https://api.github.com/repos/langchain-ai/langchain/issues/16179/labels{/name}',
  'comments_url': 'https://api.github.com/repos/langchain-ai/langchain/issues/16179/comments',
  'events_url': 'https://api.github.com/repos/langchain-ai/langchain/issues/16179/events',
  'html_url': 'https://github.com/langchain-ai/langchain/issues/16179',
  'id': 2087527966,
  'node_id': 'I_kwDOIPDwls58bSYe',
  'number': 16179,
  'title': "How can I embed chat logs into the specified collection 'QDRANT_COLLECTION_NAME' in Qrdant, and what approach should I use to write the chain for short-term memory?",
  'user': {'login': 'yen111445',
   'id': 103471919,
   'node_id': 'U_kgDOBirbLw',
   'avatar_url': 'https://avatars.githubusercontent.com/u/103471919?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/yen111445',
   'html

In [8]:
GITHUB_TOKEN = ''  # Copy your GitHub token here
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

In [9]:
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm


def fetch_issues(
    owner="langchain-ai",
    repo="langchain",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [10]:
fetch_issues()

  0%|          | 0/100 [00:00<?, ?it/s]

Reached GitHub rate limit. Sleeping for one hour ...
Downloaded all the issues for langchain! Dataset stored at ./langchain-issues.jsonl


In [13]:
from datasets import load_dataset

issues_dataset = load_dataset("json", data_files="langchain-issues.jsonl", split="train")
issues_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request'],
    num_rows: 10000
})

In [16]:
sample = issues_dataset.shuffle(seed=21).select(range(3))

# Print out the URL and pull request entries
for url, pr in zip(sample["html_url"], sample["pull_request"]):
    print(f">> URL: {url}")
    print(f">> Pull request: {pr}\n")

>> URL: https://github.com/langchain-ai/langchain/pull/15150
>> Pull request: {'url': 'https://api.github.com/repos/langchain-ai/langchain/pulls/15150', 'html_url': 'https://github.com/langchain-ai/langchain/pull/15150', 'diff_url': 'https://github.com/langchain-ai/langchain/pull/15150.diff', 'patch_url': 'https://github.com/langchain-ai/langchain/pull/15150.patch', 'merged_at': datetime.datetime(2024, 1, 15, 20, 29, 14)}

>> URL: https://github.com/langchain-ai/langchain/issues/12877
>> Pull request: None

>> URL: https://github.com/langchain-ai/langchain/issues/8319
>> Pull request: None



In [17]:
issues_dataset = issues_dataset.map(
    lambda x: {"is_pull_request": False if x["pull_request"] is None else True}
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [18]:
issue_number = 16181
url = f"https://api.github.com/repos/langchain-ai/langchain/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/langchain-ai/langchain/issues/comments/1897852586',
  'html_url': 'https://github.com/langchain-ai/langchain/issues/16181#issuecomment-1897852586',
  'issue_url': 'https://api.github.com/repos/langchain-ai/langchain/issues/16181',
  'id': 1897852586,
  'node_id': 'IC_kwDOIPDwls5xHu6q',
  'user': {'login': 'dosubot[bot]',
   'id': 131922026,
   'node_id': 'BOT_kgDOB9z4ag',
   'avatar_url': 'https://avatars.githubusercontent.com/in/324583?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/dosubot%5Bbot%5D',
   'html_url': 'https://github.com/apps/dosubot',
   'followers_url': 'https://api.github.com/users/dosubot%5Bbot%5D/followers',
   'following_url': 'https://api.github.com/users/dosubot%5Bbot%5D/following{/other_user}',
   'gists_url': 'https://api.github.com/users/dosubot%5Bbot%5D/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/dosubot%5Bbot%5D/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.gith

In [19]:
issue_number = 16182 # Latest issue, 2024-01-18
url = f"https://api.github.com/repos/langchain-ai/langchain/issues/{issue_number}/comments"
response = requests.get(url, headers=headers)
response.json()

[{'url': 'https://api.github.com/repos/langchain-ai/langchain/issues/comments/1897869836',
  'html_url': 'https://github.com/langchain-ai/langchain/issues/16182#issuecomment-1897869836',
  'issue_url': 'https://api.github.com/repos/langchain-ai/langchain/issues/16182',
  'id': 1897869836,
  'node_id': 'IC_kwDOIPDwls5xHzIM',
  'user': {'login': 'dosubot[bot]',
   'id': 131922026,
   'node_id': 'BOT_kgDOB9z4ag',
   'avatar_url': 'https://avatars.githubusercontent.com/in/324583?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/dosubot%5Bbot%5D',
   'html_url': 'https://github.com/apps/dosubot',
   'followers_url': 'https://api.github.com/users/dosubot%5Bbot%5D/followers',
   'following_url': 'https://api.github.com/users/dosubot%5Bbot%5D/following{/other_user}',
   'gists_url': 'https://api.github.com/users/dosubot%5Bbot%5D/gists{/gist_id}',
   'starred_url': 'https://api.github.com/users/dosubot%5Bbot%5D/starred{/owner}{/repo}',
   'subscriptions_url': 'https://api.gith

In [20]:
def get_comments(issue_number):
    url = f"https://api.github.com/repos/langchain-ai/langchain/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]


# Test our function works as expected
get_comments(16182)

['_🤖_\n<!-- Greeting -->\nHello @shrimad-mishra-cognoai! :wave: I\'m [Dosu](https://dosu.dev), a friendly bot here to assist you in solving bugs, answering questions, and making you a contributing champ while we wait for a human maintainer. Happy to have you here! Let\'s get started, shall we? :rocket:\n\n<!-- Answer -->\nBased on the information provided, it seems like you\'re encountering an issue where the `get_openai_callback()` function is not returning the expected token usage when streaming output from the language model. This is a known issue that has been discussed in the LangChain repository before.\n\nOne of the solutions that has been suggested in a similar issue ([#3114](https://github.com/langchain-ai/langchain/issues/3114)) involves creating a custom async and cost calculator \'handler\' that uses the `tiktoken` dependency to calculate the cost of tokens used. Here\'s the suggested code:\n\n```python\nfrom langchain.callbacks.base import AsyncCallbackHandler\nfrom langch

In [21]:
get_comments(634) # Oldest issue currently on langchain repo, 2023-01-17

["Perhaps not quite the same scenario, but I'm getting exactly the same error when running the [VectorDB Question Answering with Sources](https://github.com/hwchase17/langchain/blob/30abfc41c24a8b7b003ef94c4cef760da4947a53/docs/modules/chains/combine_docs_examples/vector_db_qa_with_sources.ipynb) example.\r\n\r\nPerhaps adding some [exponential backoff as OpenAI recommend](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_handle_rate_limits.ipynb)?",
 'I ran into rate limits when using `FAISS.from_texts` on one markdown file with ~800 lines with the [Question Answering with Sources sample](https://langchain.readthedocs.io/en/latest/modules/chains/combine_docs_examples/qa_with_sources.html). I worked around it like this. Posting in case it is useful for other users:\r\n\r\n```python\r\ndef chunks(lst, n):\r\n  # https://stackoverflow.com/a/312464/18903720\r\n  """Yield successive n-sized chunks from lst."""\r\n  for i in range(0, len(lst), n):\r\n    yield lst[i:i + n]

In [23]:
# # Depending on your internet connection, this can take a few minutes...
# issues_with_comments_dataset = issues_dataset.map(
#     lambda x: {"comments": get_comments(x["number"])}
# )

In [24]:
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'body', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'draft', 'pull_request', 'is_pull_request'],
    num_rows: 10000
})

In [26]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
issues_dataset.push_to_hub("langchain-issues")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-65a8d522-5fca4c7d3f1cf7d60cc36a34;9b1ca7fa-bade-424f-b8c1-eb65e7fc7e6d)

Repository Not Found for url: https://huggingface.co/api/datasets/langchain-issues/preupload/main.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated.
Note: Creating a commit assumes that the repo already exists on the Huggingface Hub. Please use `create_repo` if it's not the case.