# Import the requirements and security keys


In [1]:
import json
import os
import re

import nltk
import pandas as pd
import requests

In [2]:
# importing `config.py` to access its variables, not working on windows
from config import github_token_value

	pd.options.display.max_columns = 20 #Changes the number of columns diplayed (default is 20) 250
	pd.options.display.max_rows = 60 #Changes the number of rows diplayed (default is 60) 250 
	pd.options.display.max_colwidth = 50 #Changes the number of characters in a cell so that the contents don't get truncated (default is 50) 250

# Starts the import using the GitHub API

In [3]:
import requests

# GitHub API endpoint template for listing a repository's issues
url = "https://api.github.com/repos/{owner}/{repo}/issues"

# Repository to query
owner = "Significant-Gravitas"
repo = "Auto-GPT"

# Authentication token loaded from config.py; sent below as the basic-auth username
token = github_token_value

# Query parameters for the API request
params = {
    "state": "open",  # only open issues are requested
    "per_page": 100,  # retrieve up to 100 issues per page
    "page": 1,  # start with the first page of results
}

issues = []

# Fetch page after page until the API stops advertising a "next" page
while True:
    response = requests.get(
        url.format(owner=owner, repo=repo),
        params=params,
        auth=(token, ""),
        timeout=30,  # fail instead of hanging forever on a stalled connection
    )

    if response.status_code != 200:
        print("Error retrieving issues from GitHub API")
        print(f"Status code: {response.status_code}")
        break

    page_issues = response.json()
    issues += page_issues

    # requests parses the Link header into `response.links`, keyed by each
    # entry's rel value; without a "next" entry this was the last page
    if "next" not in response.links:
        break

    print(params)
    params["page"] += 1

print(f"Retrieved {len(issues)} issues from GitHub")

# Print the headers of the last response received
print("Response Headers:")
for key, value in response.headers.items():
    print(f"{key}: {value}")

{'state': 'open', 'per_page': 100, 'page': 1}
{'state': 'open', 'per_page': 100, 'page': 2}
{'state': 'open', 'per_page': 100, 'page': 3}
{'state': 'open', 'per_page': 100, 'page': 4}
{'state': 'open', 'per_page': 100, 'page': 5}
{'state': 'open', 'per_page': 100, 'page': 6}
Retrieved 643 issues from GitHub
Response Headers:
Server: GitHub.com
Date: Wed, 03 May 2023 00:05:46 GMT
Content-Type: application/json; charset=utf-8
Transfer-Encoding: chunked
Cache-Control: private, max-age=60, s-maxage=60
Vary: Accept, Authorization, Cookie, X-GitHub-OTP, Accept-Encoding, Accept, X-Requested-With
ETag: W/"119af5862a48a482b940c6129c4773cdce323d2f297ad143da7551663c406c5e"
github-authentication-token-expiration: 2023-07-26 11:35:33 -0400
X-GitHub-Media-Type: github.v3; format=json
Link: <https://api.github.com/repositories/614765452/issues?state=open&per_page=100&page=6>; rel="prev", <https://api.github.com/repositories/614765452/issues?state=open&per_page=100&page=1>; rel="first"
x-github-api-ve

# Saves the API response as a dataframe and outputs it in various ways for review.

In [4]:
# Create the output subdirectory; exist_ok makes this a no-op when the
# directory is already there and avoids a separate existence check
os.makedirs("subdata", exist_ok=True)

# Save each issue as its own JSON file in the subdirectory. Files are named by
# 1-based position in `issues` (issue-1.json, issue-2.json, ...), not by the
# GitHub issue number.
for i, issue in enumerate(issues):
    filename = f"subdata/issue-{i + 1}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(issue, f)

In [5]:
# Export the complete list of issues to a single JSON file
with open("github-issues.json", "w", encoding="utf-8") as outfile:
    json.dump(issues, outfile)

# Flatten the issue records into a dataframe
issues_df = pd.json_normalize(issues)

# Print the number of rows in the dataframe, then save it as CSV
print(len(issues_df.index))
issues_df.to_csv("issues_df.csv")

# Makes copies of the issues dataframes for later use

In [6]:
# Two independent copies of the issues dataframe (DataFrame.copy is deep by
# default), so later cells can modify them without touching issues_df
issues_title, issues_body = issues_df.copy(), issues_df.copy()

In [7]:
# Keep only the issue number and the body text
issues_body = issues_body.loc[:, ["number", "body"]]

# Show the narrowed dataframe
print(issues_body)

# Turn the current index into a regular column, then show the result to
# verify the new column is present
issues_body = issues_body.reset_index()

print(issues_body)

In [8]:
# Name of the text file holding the issues template; the file itself is
# opened and read in a later cell
file_name = "issue_autogpt.txt"

# Starts the text analysis of the body columns

In [9]:
# Downloads the latest "punkt" NLTK tokenizer data, used for sentence tokenization
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smsma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Read the issue template and split it into sentences. The resulting list is
# what a later cell removes from each issue body.

with open(file_name, "r", encoding="utf-8") as template_file:
    text = template_file.read()

# Tokenize the template text into sentences
sentences = nltk.sent_tokenize(text)

# Trim surrounding whitespace and drop any sentence that ends up empty
sentences_array = [trimmed for trimmed in map(str.strip, sentences) if trimmed]


### revert issues_body_clean back to original

In [None]:
# Work on a separate copy of issues_body so the two can be compared, and so
# the cleaning can be restarted from here without rerunning earlier cells.
# DataFrame.copy is deep by default.

issues_body_clean = issues_body.copy()

## Start removing the template text.

In [None]:
# Remove every template sentence from each issue body.
# regex=False makes each sentence match literally; without it, pandas versions
# before 2.0 treat the sentence as a regular expression, so punctuation such
# as "?", "(", "[" or "*" in the template would mis-match or raise.
for sentence1 in sentences_array:
    issues_body_clean["body"] = issues_body_clean["body"].str.replace(
        sentence1, "", regex=False
    )

In [None]:
# Spot check: print the template sentence at index 10 (raises IndexError if
# the template produced fewer than 11 sentences)
print(sentences_array[10])

In [None]:
# Print a short summary of the cleaned dataframe (verbose=False omits the per-column listing)
issues_body_clean.info(verbose=False)

In [None]:
# Bare expression: in a notebook this shows the dataframe as the cell's output
issues_body_clean

In [None]:
# Checkpoint: keep an independent copy of the dataframe as it stands after
# the template text has been removed
issues_body_clean_good = issues_body_clean.copy()

In [None]:
# Restore the working dataframe from the checkpoint copy
issues_body_clean = issues_body_clean_good.copy()

In [None]:
# Matches any single character that is not an ASCII letter, a digit, or whitespace
regex_unknown = re.compile(r"[^a-zA-Z0-9\s]")


def _blank_unknown_chars(cell):
    """Return ``cell`` with every character matched by ``regex_unknown`` replaced by a space."""
    return regex_unknown.sub(" ", cell)


# Convert every column to string, then clean each cell of the dataframe
issues_body_clean = issues_body_clean.astype(str).applymap(_blank_unknown_chars)

In [None]:
# Write the cleaned dataframe to disk; the content is CSV (to_csv) even though the file name ends in .md
issues_body_clean.to_csv("issue_body_after_remove.md")

In [None]:
# List the column names of the cleaned dataframe, one per line
for col in issues_body_clean.columns:
    print(col)

In [None]:
# Preview the first rows of the cleaned dataframe
issues_body_clean.head()

In [None]:
# Write only the "body" column, without the row index; the content is CSV even though the file name ends in .md
issues_body_clean.to_csv("my_data.md", index=False, columns=["body"])

In [None]:
# Matches any character outside the listed set (word characters, whitespace,
# common punctuation and markdown syntax). The trailing `|_` alternative also
# matches underscores, so they are replaced as well.
regex_known = re.compile(r"[^\w\s.,!?;:\-'" r"\*\(\)\[\]#`_\{\}\+\-\\/]|_")


def _blank_unlisted_chars(cell):
    """Return ``cell`` with every match of ``regex_known`` replaced by a space."""
    return regex_known.sub(" ", cell)


# Convert every column to string, then clean each cell of the dataframe
issues_body_clean = issues_body_clean.astype(str).applymap(_blank_unlisted_chars)


def extract_phrases(text, phrase_len):
    """Return every run of ``phrase_len`` consecutive words found in ``text``.

    ``text`` is split on whitespace and a window of ``phrase_len`` words is
    slid across it one word at a time; the words in each window are joined
    with single spaces. The result is an empty list when ``text`` has fewer
    than ``phrase_len`` words.
    """
    tokens = text.split()
    # Number of window start positions that still leave a full phrase
    window_count = len(tokens) - phrase_len + 1
    return [
        " ".join(tokens[start : start + phrase_len])
        for start in range(window_count)
    ]


# Number of consecutive words that make up one phrase
phrase_len = 3

# Maps each phrase to the number of times it occurs across all issue bodies
phrase_counts = {}

# Tally the phrases found in the "body" column of every row
for _, row in issues_body_clean.iterrows():
    for phrase in extract_phrases(row["body"], phrase_len):
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1

# Build a phrase/count dataframe and order it from most to least frequent
phrase_df = pd.DataFrame(
    list(phrase_counts.items()), columns=["phrase", "count"]
).sort_values("count", ascending=False)

# Print the 100 most frequent phrases
print(phrase_df.head(100))

In [None]:
# Save the phrase counts to phrase.csv without the dataframe index
phrase_df.to_csv("phrase.csv", index=False)

In [None]:
# Preview the first rows of the phrase counts. The dataframe is named
# phrase_df (lowercase), and head must be called to produce the preview.
phrase_df.head()