In [None]:
from get_repo_data import main
import plotly.express as px
import pandas as pd
import nbformat
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from openai import OpenAI
import os

In [3]:
# import data
commits = pd.read_csv('data/commits.csv')
users = pd.read_csv('data/users.csv')
issues = pd.read_csv('data/issues.csv')

pass 

# Community Analytics
## Issues: User Counts

In [None]:
# Create a bar chart of the user issues counts
issues_counts = issues.value_counts(subset=['user'])

issues_counts = issues_counts.reset_index()

issues_counts.sort_values(by='count', ascending=False, inplace=True)

fig = px.bar(data_frame=issues_counts, x='user', y='count', title='Issues per User')

fig.show()

In [None]:
## Users: User Type, Counts

In [None]:
# Create a bar chart of the user type counts
users_counts = users.value_counts(subset=['type'])

users_counts = users_counts.reset_index()

users_counts.sort_values(by='count', ascending=False, inplace=True)

fig = px.bar(data_frame=users_counts, x='type', y='count', title='User Type Counts')

fig.show()

## Users: User Company, Counts

In [None]:
# Create a bar chart of the user company counts
users_counts = users.value_counts(subset=['company'])

users_counts = users_counts.reset_index()

users_counts.sort_values(by='count', ascending=False, inplace=True)

fig = px.bar(data_frame=users_counts, x='company', y='count', title='User Company Counts')

fig.show()

## Users: User Location, Counts

In [None]:
# Create a bar chart of the user location counts
users_counts = users.value_counts(subset=['location'])

users_counts = users_counts.reset_index()

users_counts.sort_values(by='count', ascending=False, inplace=True)

fig = px.bar(data_frame=users_counts, x='location', y='count', title='User Location Counts')

fig.show()

## Users: User Bios, Counts

In [None]:
# Create a bar chart of the user bio counts
users_counts = users.value_counts(subset=['bio'])

users_counts = users_counts.reset_index()

users_counts.sort_values(by='count', ascending=False, inplace=True)

fig = px.bar(data_frame=users_counts, x='bio', y='count', title='User Bio Counts')

fig.show()

# Product Analytics
## Issues: Issue Titles, Wordcloud

In [None]:
# generate word cloud from issue titles (https://github.com/amueller/word_cloud/blob/main/examples/simple.py)

text = ' '.join(issues['title'])

lemma = WordNetLemmatizer()
text_lemmatized = [lemma.lemmatize(w.lower()) for w in text.split()]
text = ' '.join(text_lemmatized)

stopwords = set(STOPWORDS)
stopwords.update([
    'parson','python', 'connector', 'fix', 'upsert', 'add','addition', 'update',
    'remove', 'change', 'doc', 'docs','documentation', 'table',
    'function', 'use', 'error', 'data', 'bump', 'version','type',
    'test', 'release', 'feat', 'feature', 'support','method',    
    'column','bug','code','added','file','phone','string','added',
    'name','class','list','issue','py','number','empty','github',
    'create','query','option','row','import'
])


wordcloud = WordCloud(max_font_size=40,stopwords=stopwords).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()



## Issues: Open Issues Titles, Wordcloud

In [None]:
# word cloud of open issues

open_issues = issues[issues['state'] == 'open']

import inspect, parsons

modules = [module[0] for module in inspect.getmembers(parsons, inspect.ismodule)]

text = open_issues['title'].tolist()

text = [item.lower().strip() for item in text]

norm_text = [item.split() for item in text]

flattened_list = [item for sublist in norm_text for item in sublist]


filtered_text = [item.lower().strip() for item in flattened_list if item in modules]

text = " ".join(filtered_text)

lemma = WordNetLemmatizer()
text_lemmatized = [lemma.lemmatize(w.lower()) for w in text.split()]
text = ' '.join(text_lemmatized)

stopwords = set(STOPWORDS)
stopwords.update([
    'parson','python', 'connector', 'fix', 'upsert', 'add','addition', 'update',
    'remove', 'change', 'doc', 'docs','documentation', 'table',
    'function', 'use', 'error', 'data', 'bump', 'version','type',
    'test', 'release', 'feat', 'feature', 'support','method',    
    'column','bug','code','added','file','phone','string','added',
    'name','class','list','issue','py','number','empty','github',
    'create','query','option','row','import'
])

wordcloud = WordCloud(max_font_size=40,stopwords=stopwords).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Issues: Open Issues, Wordcloud--Only Names of Parsons Modules

In [None]:
# wordclould with only names of parsons modules for open issues
import inspect, parsons
from collections import Counter

modules = [module[0] for module in inspect.getmembers(parsons, inspect.ismodule)]

titles_unnorm = open_issues['title'].tolist()

titles = [list(set(title.lower().split())) for title in titles_unnorm] #use set because don't want repeated module name per title

print(titles)

titles_modules = [title_item for title in titles for title_item in title if title_item in modules ] 

print(Counter(titles_modules))

titles_modules = " ".join(titles_modules)

wordcloud = WordCloud(max_font_size=40,stopwords=stopwords).generate(titles_modules)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Issues: Open Issues, Wordcloud (Other Than Parsons Modules Names)

In [None]:
# wordclould with only names of *non* parsons-modules for open issues
import inspect, parsons
from collections import Counter

modules = [module[0] for module in inspect.getmembers(parsons, inspect.ismodule)]

titles_unnorm = open_issues['title'].tolist()

titles = [list(set(title.lower().split())) for title in titles_unnorm] #use set because don't want repeated module name per title

print(titles)

titles_not_modules = [title_item for title in titles for title_item in title if title_item not in modules ] #do this for each title and then take set

print(Counter(titles_not_modules))

stopwords = set(STOPWORDS)

titles_not_modules = " ".join(titles_not_modules)

wordcloud = WordCloud(max_font_size=40,stopwords=stopwords).generate(titles_not_modules)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Issues: Open Issues, States, Barchart

In [None]:
# Create a bar chart of the issue states
issues_counts = issues.value_counts(subset=['state'])

issues_counts = issues_counts.reset_index()

issues_counts.sort_values(by='count', ascending=False, inplace=True)

fig = px.bar(data_frame=issues_counts, x='state', y='count', title='Issue States')

fig.show()

## Issues: Issues, Body, Wordcloud

In [None]:
# analyze issues body

body = issues['body'].dropna().tolist()

body = " ".join(body)

lemma = WordNetLemmatizer()

body_lemmatized = [lemma.lemmatize(w.lower()) for w in body.split()]

body = ' '.join(body_lemmatized)

stopwords = set(STOPWORDS)

wordcloud = WordCloud(max_font_size=40,stopwords=stopwords).generate(body)

plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()



pass

## Issues: Issues, Body, Summary

In [9]:
# summarize text in issues.body

# get openai api key
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# create openai client
client = OpenAI(api_key=OPENAI_API_KEY)


# code used for interacting with the OpenAI API

def format_message(role, content):
        return {"role": role, "content": content}


def get_response(messages):
    completion = client.chat.completions.create(
        model='gpt-4-1106-preview', # newest, cheapest model
        messages=messages,
    )
    content = completion.choices[0].message.content
    return content

# get body fields
body_fields = issues['body'].dropna().tolist()

# make list to hold summaries that will later itself be summarized
summaries = []

# get quotes

for j in range(len(body_fields)):
    
    quotes = body_fields[j]

    instructions = f"""

    You will be provided with the body of an issue from a GitHub repository. Summarize this text in one sentence, 
    no more than half the length of the input, or 20 words, whichever is shorter. The summary should be concise and informative, 
    focusing on the key issues raised by the author. The audience for these summaries are software developers who are familiar with the project. 
    The summaries will be used to identify common issues and help developers address them more efficiently.

    Quotes: {quotes}
    """

    message = format_message("system", instructions) # system means high priority 
    messages = [message] # ChatGPT API expects any message to be in a list
    response = get_response(messages)
    print(response)
    summaries.append(response)

# summarize summaries

quotes = " ".join(summaries)

instructions = f"""

You will be provided with your previous summaries of the body of issues from a GitHub repository. Summarize this text in 2-3 paragraphs,
focusing on common themes across issues. The audience for this summary of summaries are software developers who are familiar with the project. 
The 2-3 paragraphs you produce will be used to identify common issues and help developers understand common issues facing users and developers.

Quotes: {quotes}
"""

message = format_message("system", instructions) # system means high priority 
messages = [message] # ChatGPT API expects any message to be in a list
response = get_response(messages)
print(response)



The issue details two problems: Mac+Python 3.9 causing psycopg2 install errors and a global issue with limited dependency installs due to a pydantic release.
Added a method to return the full export unparsed for ELT, branching from @jburchard's Empower connector PR.
This PR updates the 'civis' package from version 1.16.1 to 2.4.0, adding features like customizable retries, YAML validation, and Python 3.13 support, while also deprecating and removing some methods and features.
Bumps grpcio from 1.62.2 to 1.68.0, including core improvements, C++ build updates, Python support for 3.13, and various bug fixes.
This GitHub issue updates the CodeQL Action from version 3.26.12 to 3.27.4 and discusses changes such as fixing a setup issue, performance improvements with Zstandard for bundles, and updating the default CodeQL bundle version.
Dependabot updates the [install-pinned/uv](https://github.com/install-pinned/uv) dependency from de03c60 to 5743f94 with multiple README and pins updates.
The 

RateLimitError: Error code: 429 - {'error': {'message': 'Request too large for gpt-4-turbo-preview in organization org-dnsLBIN9IeStdXjb3ul4pJ1n on tokens per min (TPM): Limit 30000, Requested 35743. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}