# Fetch labelled issues using data from GH Archive
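The purpose of this notebook is to download the hourly event archives published by GH Archive, extract the issue events from them, and shape those into pandas dataframes of issues and their labels that we can filter and count.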

In [1]:
import pandas as pd
import numpy as np
import requests
from dotenv import load_dotenv
import os
import requests
from concurrent.futures import ThreadPoolExecutor

## Download data locally

In [3]:
def download_data(partition, timeout=60):
    """
    Download one GH Archive partition and save it as ../data/{partition}.json.gz.

    Parameters
    ----------
    partition : str
        Archive name without the ".json.gz" extension; it is interpolated
        into both the download URL and the local file name.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        connection would block the notebook indefinitely.

    Notes
    -----
    A non-200 response only prints a message and writes nothing. Exceptions
    raised by ``requests`` (connection errors, timeouts) propagate to the
    caller.
    """

    url = f"http://data.gharchive.org/{partition}.json.gz"
    target_path = f"../data/{partition}.json.gz"
    # Stream into a temporary file so an interrupted transfer never leaves a
    # truncated archive under the final name.
    partial_path = f"{target_path}.part"

    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download data for partition {partition}")
            return

        try:
            with open(partial_path, "wb") as file:
                # Write in 1 MiB chunks instead of holding the whole archive
                # in memory via response.content.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        except BaseException:
            # Also covers KeyboardInterrupt from an interrupted notebook cell.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

        os.replace(partial_path, target_path)

Run this function to build the issues dataframe (one row per issue event) for a single downloaded partition.

In [39]:
def get_issues_df(partition):
    """
    Build a dataframe of issue events from a downloaded partition.

    Reads ``../data/{partition}.json.gz`` as line-delimited JSON, keeps the
    rows whose ``type`` is ``IssuesEvent``, flattens their ``payload`` and
    keeps a fixed set of columns. ``issue.labels`` is reduced from a list of
    label objects to a list of label names.

    Parameters
    ----------
    partition : str
        Archive name without the ".json.gz" extension.

    Returns
    -------
    pandas.DataFrame
        One row per issue event. If the partition holds no issue events, an
        empty frame with the expected columns is returned.
    """
    issue_columns = [
        "issue.id",
        "issue.url",
        "action",
        "issue.user.login",
        "issue.title",
        "issue.labels",
        "issue.body",
    ]

    events_df = pd.read_json(f"../data/{partition}.json.gz", lines=True)

    issues_events_df = events_df.query("type == 'IssuesEvent'")

    # With no issue events the normalised frame would have no columns at all,
    # and the label handling below would fail with a KeyError.
    if issues_events_df.empty:
        return pd.DataFrame(columns=issue_columns)

    issues_df = pd.json_normalize(issues_events_df["payload"])

    # filter() silently drops any requested column that is absent.
    issues_df = issues_df.filter(items=issue_columns)

    if "issue.labels" in issues_df.columns:
        # A payload without labels shows up as NaN after normalisation;
        # treat anything that is not a list as "no labels".
        issues_df["issue.labels"] = issues_df["issue.labels"].apply(
            lambda labels: [label["name"] for label in labels]
            if isinstance(labels, list)
            else []
        )

    return issues_df

Run this function to keep only the rows of the issues dataframe that carry a given label. Matching is done against the lowercased label names.

In [38]:
def filter_df_by_label(df, label_name: str):
    """
    Return the rows of ``df`` whose ``issue.labels`` list contains ``label_name``.

    The comparison is case-insensitive on both sides, so "bug" and "Bug"
    select the same rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``issue.labels`` column holding lists of label names.
    label_name : str
        Label to look for.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with all columns of ``df``.
    """
    # Normalise the query once rather than once per row.
    wanted = label_name.lower()

    has_label = df["issue.labels"].apply(
        lambda labels: any(label.lower() == wanted for label in labels)
    ).astype(bool)  # keeps the mask boolean even when df has no rows

    return df[has_label]

Run this function to download all 24 hourly archives for one date and save the combined issues dataframe as a pickle.

In [37]:
def download_date_data(year, month, day):
    """
    Download the 24 hourly partitions of one date and pickle their issues.

    Each partition is named ``{year}-{month:02d}-{day:02d}-{hour}`` with
    ``hour`` running from 0 to 23. After downloading, the issue events of all
    hours are concatenated and written to ``../data/{date}-issues.pkl``.

    Parameters
    ----------
    year, month, day : int
        Calendar date; month and day are zero-padded to two digits.

    Notes
    -----
    ``download_data`` only prints a message when a download fails, so a
    missing archive surfaces here as an error from ``get_issues_df`` when it
    tries to read the file.
    """
    date = f"{year}-{month:02d}-{day:02d}"
    partitions = [f"{date}-{hour}" for hour in range(24)]

    for partition in partitions:
        download_data(partition)

    # ignore_index avoids repeating the same row labels for every hour in the
    # combined frame.
    issues_df = pd.concat(
        [get_issues_df(partition) for partition in partitions],
        ignore_index=True,
    )

    issues_df.to_pickle(f"../data/{date}-issues.pkl")
    # issues_df.to_csv(f"../data/{date}-issues.csv", index=False)

Code to download data across an entire month

In [None]:
# NOTE(review): this cell is disabled. Uncommented, it would call
# download_date_data for every day of May 2024 (24 hourly archives per day),
# printing a message for any day that raises, and then combine the per-day
# pickles into one dataframe. It does not write "../data/2024-05-issues.pkl",
# which the next section reads. TODO: confirm how that file was produced.
# for i in range(1, 32):
#     try:
#         print(f"Downloading data for 2024-05-{i}")
#         download_date_data(2024, 5, i)
#     except:
#         print(f"Failed to download data for 2024-05-{i}")

# issues_df = pd.concat([pd.read_pickle(f"../data/2024-05-{'{:02d}'.format(day)}-issues.pkl") for day in range(1, 32)])

## Generate issues dataframe

In [41]:
# Load the combined issues dataframe from a local pickle. No code visible in
# this notebook writes this file. TODO: confirm how it is generated.
# Only unpickle files you created yourself; loading a pickle can run arbitrary code.
# issues_df = get_issues_df("2024-05-01-01")
issues_df = pd.read_pickle("../data/2024-05-issues.pkl")

### Count issues by label

In [35]:
# One row per (issue event, label), then count the label names. Unlike
# np.concatenate, explode also works when the frame has no rows; events
# without labels become NaN, which value_counts drops by default.
issues_df["issue.labels"].explode().value_counts().head(20)

bug                                        189328
status                                     178838
enhancement                                143259
documentation                               23865
Mend: dependency security vulnerability     20460
feature                                     17586
question                                    17201
good first issue                            16762
stale                                       13340
help wanted                                 12728
migration:no-jira                           11543
Bug                                         11049
triage                                       7669
kind/bug                                     7641
type: bug                                    6387
b2b-marketing-services                       6287
under review                                 5841
frontend                                     5394
Mend: IaC violation                          5356
state:verified_fixed                         5165
