## Mine the same GitHub Java repositories as the paper and get last 2 months data

<p> The code put in comments is used to run the script with colab. </p>

In [1]:
%pip install PyGithub
# !pip install PyGithub

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: C:\Users\eleon\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import requests
from github import Github
from datetime import datetime, timedelta
import os
import pandas as pd
from github import GithubException
import time

In [None]:
# If you want to run the code un colab, mount the drive and then unzip the data folder
"""
from google.colab import drive
drive.mount('/content/drive')

!unzip '/content/drive/My Drive/github_old.zip'
"""

### Directories

In [3]:
# Directory with dataset of the original paper
dir_path = './github_old/'

# Directory to sabe the new dataset
new_gh_data_dir = "./data/"

### Functions

In [4]:
"""Returns true if the csv file is empty"""
def is_csv_empty(path):
    try:
        if os.stat(dir_path + path).st_size == 0:
            return True
            
    except FileNotFoundError:
        return False

In [5]:
def get_commit(repo,id):
    try :
        return repo.get_commit(id)
    except:
        return None



In [6]:
def extract_data(raw_data, key):
    try :
        if key in raw_data:
            return raw_data[key]
        else:
            return None
    except:
        return None

In [7]:
def get_file_content(file_url):
    try:
        response = requests.get(file_url)
        if response.status_code == 200:  
            content = response.content.decode('utf-8')
            return content
        else:
            return None
    except:
        return None

In [8]:
def find_file(files, file_path):
    try:
        target_file_name = file_path.split("/")[-1]
        for file in files:
            current_file_name = file["filename"].split("/")[-1]
            if current_file_name == target_file_name:
                return file["raw_url"]
    except Exception as e:
        print(e)
        return None

In [9]:
def handle_rate_limit_exception(e):
    # Extract the retry-after value from the exception
    retry_after = int(e.headers.get("x-ratelimit-reset")) + 5
    current_timestamp = time.time()
    time_difference = retry_after - current_timestamp
    print(time_difference)
    time.sleep(time_difference)
    # Retry the failed operation

### Delete empty folders in the dataset of the original paper

In [10]:
entries = os.listdir(dir_path)

for entry in entries:
    if is_csv_empty(entry):
        os.remove(dir_path + entry)

print(len(entries))


1764


### Get all the names of the projects

In [11]:
gh_projects = []
for filename in os.listdir(dir_path):
    df = pd.read_csv(dir_path + filename)
    gh_project = df['project'].iloc[0]
    gh_projects.append(gh_project)

  df = pd.read_csv(dir_path + filename)


In [12]:
gh_projects

['52North/sensorweb-server-helgoland',
 '52North/SensorWebClient',
 '52North/sos-importer',
 '52North/WPS',
 'A24Group/ssGWT-lib',
 'aaberg/sql2o',
 'aboutsip/pkts',
 'abstractj/kalium',
 'ACRA/acra',
 'Activiti/Activiti',
 'AddstarMC/Minigames',
 'AddstarMC/Prism-Bukkit',
 'addthis/stream-lib',
 'adennie/vroom',
 'adjust/android_sdk',
 'Adobe-Consulting-Services/acs-aem-commons',
 'adobe-research/mesh',
 'aerogear-attic/aerogear-simplepush-server',
 'aerogear/aerogear-android-cookbook',
 'aerogear/aerogear-unifiedpush-java-client',
 'aerogear/aerogear-unifiedpush-server',
 'aeshell/aesh-extensions',
 'aeshell/aesh',
 'Agilefreaks/DroidOmni',
 'aikuma/aikuma',
 'airlift/aircompressor',
 'airlift/airlift',
 'airlift/slice',
 'ajanata/PretendYoureXyzzy',
 'akvo/akvo-flow-mobile',
 'akvo/akvo-flow',
 'alecgorge/jsonapi',
 'alengel/ASE5',
 'alengel/ASE5_Java_Server',
 'alexo/wro4j',
 'Alfresco/alfresco-android-app',
 'Alfresco/alfresco-android-sdk',
 'Alfresco/gytheio',
 'algolia/algoliase

### Get two_months ago's timestamp

In [13]:
two_months_ago = datetime.now() - timedelta(days=60)

### Set up PyGithub variables

In [14]:
# Create personal token under GitHub settings
token = "MY_TOKEN"

In [15]:
g = Github(token)

print(g.rate_limiting)
print(g.rate_limiting_resettime)

(5000, 5000)
1684496676


### Run script

In [16]:
entries = gh_projects

In [17]:
for entry in entries:
    repo_name = entry

    try:
        repo = g.get_repo(repo_name)
        pulls = repo.get_pulls(state='closed', sort='created', direction='desc')
        recent_pull_requests = [pr for pr in pulls if pr.created_at >= two_months_ago]

        if recent_pull_requests:
            print(f"Extracting {repo_name} data ({len(recent_pull_requests)} pull requests) ...")
            review_ids = []
            review_datas = []
            for pull in recent_pull_requests:
                reviews = pull.get_reviews()
                for review in reviews:
                    #print(review.raw_data)
                    review_data = {
                        "project": repo_name,
                        "pull_number": pull.number,
                        "pull_id": pull.id,
                        "review_id": review.raw_data["id"],
                        "user": review.raw_data["user"]["login"],
                        "user_id": review.raw_data["user"]["id"],
                        "message": review.raw_data["body"],
                        "state": review.raw_data["state"],
                        "submitted_at": review.raw_data["submitted_at"],
                        "pull_request_id": review.raw_data["pull_request_url"].split("/")[-1],
                        "pull_request_url": review.raw_data["pull_request_url"],
                        "commit_id" : review.raw_data["commit_id"] if "commit_id" in review.raw_data else None,  
                        "comments" : []
                    }
                    review_datas.append(review_data)

            for pull in recent_pull_requests:
                review_comments = pull.get_review_comments()

                for review_comment in review_comments:
                    pr_review_id = extract_data(review_comment.raw_data, "pull_request_review_id")

                    commit_before_id = extract_data(pull.raw_data["base"], "sha")
                    commit_while_id = extract_data(review_comment.raw_data, "original_commit_id")
                    commit_after_id = extract_data(review_comment.raw_data, "commit_id")
                    
                    commit_before = get_commit(repo,commit_before_id)
                    commit_while = get_commit(repo,commit_while_id)
                    commit_after = get_commit(repo,commit_after_id)
                #print(commit_before.raw_data["files"][0])
                    try:
                        comment_data = {
                            "comment_id" : extract_data(review_comment.raw_data, "id"),
                            "commit_id" : extract_data(review_comment.raw_data, "commit_id"),
                            "original_commit_id" : extract_data(review_comment.raw_data, "original_commit_id"),
                            "html_url": extract_data(review_comment.raw_data, "html_url"),
                            "user_id" : extract_data(review_comment.raw_data["user"], "id"),
                            "comment" :  extract_data(review_comment.raw_data, "body"),
                            "created_at" : extract_data(review_comment.raw_data, "created_at"),
                            "updated_at" : extract_data(review_comment.raw_data, "updated_at"),
                            "start_line" : extract_data(review_comment.raw_data, "start_line"),
                            "original_start_line" : extract_data(review_comment.raw_data, "original_start_line"),
                            "original_line" : extract_data(review_comment.raw_data, "original_line"),
                            "start_side" : extract_data(review_comment.raw_data, "start_side"),
                            "side" : extract_data(review_comment.raw_data, "side"),
                            "line" : extract_data(review_comment.raw_data, "line"),
                            "filename" : extract_data(review_comment.raw_data, "path"),
                            "commit_before" : commit_before_id,
                            "commit_while" : commit_while_id,
                            "commit_after" : commit_after_id,
                            "url_source_before" : extract_data(commit_before.raw_data["files"][0], "raw_url") if commit_before is not None else None,
                            "url_source_while" : extract_data(commit_while.raw_data["files"][0], "raw_url") if commit_while is not None else None,
                            "url_source_after" : extract_data(commit_after.raw_data["files"][0], "raw_url") if commit_after is not None else None,
                            "file_content_before" : get_file_content(find_file(commit_before.raw_data["files"], review_comment.raw_data["path"])) if commit_before is not None else None,
                            "file_content_while" : get_file_content(find_file(commit_while.raw_data["files"], review_comment.raw_data["path"])) if commit_while is not None else None,
                            "file_content_after" : get_file_content(find_file(commit_after.raw_data["files"], review_comment.raw_data["path"])) if commit_after is not None else None,

                        }

                    except:
                        pass

                    if pr_review_id is not None:
                        # in the review_datas find the review_id and add the comment to the review
                        for current_review_data in review_datas:
                            if current_review_data["review_id"] == pr_review_id:
                                current_review_data["comments"].append(comment_data)
                                break

            
            filtered_review_datas = [review for review in review_datas if not review['comments'] == []]



            new_data = pd.DataFrame(columns=['project',
                                            'url',
                                            'discussion',
                                            'pull_number',
                                            'pull_id',
                                            'filename',
                                            'start_line',
                                            'original_start_line',
                                            'start_side',
                                            'side',
                                            'line',
                                            'original_line',
                                            'message',
                                            'owner_id',
                                            'user_id',
                                            'created_at',
                                            'updated_at',
                                            'commit_before',
                                            'commit_while',
                                            'commit_after',
                                            'url_source_before',
                                            'url_source_while',
                                            'url_source_after',
                                            'file_content_before',
                                            'file_content_while',
                                            'file_content_after'])
            
            for filtered_review_data in filtered_review_datas:
                for comment in filtered_review_data['comments']:
                    data = {'project': filtered_review_data['project'],
                            'url': 'https://github.com/'+ filtered_review_data['project'],
                            'discussion': comment['html_url'],
                            'pull_number': filtered_review_data['pull_number'],
                            'pull_id': filtered_review_data['pull_id'],
                            'filename': comment['filename'],
                            'start_line': comment['start_line'],
                            'original_start_line': comment['original_start_line'],
                            'start_side': comment['start_side'],
                            'side': comment['side'],
                            'line': comment['line'],
                            'original_line': comment['original_line'],
                            'message': comment['comment'],
                            'owner_id': filtered_review_data['user_id'],
                            'user_id': comment['user_id'],
                            'created_at': comment['created_at'],
                            'updated_at': comment['updated_at'],
                            'commit_before': comment['commit_before'],
                            'commit_while': comment['commit_while'],
                            'commit_after': comment['commit_after'],
                            'url_source_before': comment['url_source_before'],
                            'url_source_while': comment['url_source_while'],
                            'url_source_after': comment['url_source_after'],
                            'file_content_before': comment['file_content_before'],
                            'file_content_while': comment['file_content_while'],
                            'file_content_after': comment['file_content_after']
                            }

                        


                    df_dictionary = pd.DataFrame([data])
                    new_data = pd.concat([new_data, df_dictionary], ignore_index=True)

            new_data = new_data[new_data['file_content_before'] != new_data['file_content_after']].reset_index(drop=True)
            new_data = new_data[new_data['file_content_while'].notna()].reset_index(drop=True)
            new_data = new_data[new_data['file_content_after'].notna()].reset_index(drop=True)
            
            if not new_data.empty:
                print(f"Saving {repo_name} data...")
                repo_name = repo_name.replace('/', '_') + ".csv"
                new_data.to_csv(new_gh_data_dir + repo_name, index=False)

    except GithubException as e :
        if e.status == 403:
            handle_rate_limit_exception(e)


Extracting 52North/sensorweb-server-helgoland data (2 pull requests) ...
Extracting ACRA/acra data (18 pull requests) ...
Extracting Activiti/Activiti data (7 pull requests) ...
Saving Activiti/Activiti data...
Extracting AddstarMC/Minigames data (1 pull requests) ...
Extracting AddstarMC/Prism-Bukkit data (2 pull requests) ...
Extracting adjust/android_sdk data (1 pull requests) ...
Extracting Adobe-Consulting-Services/acs-aem-commons data (13 pull requests) ...
Saving Adobe-Consulting-Services/acs-aem-commons data...
Extracting airlift/airlift data (12 pull requests) ...
Saving airlift/airlift data...
Extracting airlift/slice data (2 pull requests) ...
Extracting Alfresco/gytheio data (44 pull requests) ...
Extracting alibaba/canal data (1 pull requests) ...
Extracting alibaba/druid data (15 pull requests) ...
Extracting alibaba/fastjson data (1 pull requests) ...
Extracting alkacon/opencms-core data (1 pull requests) ...
Extracting Alluxio/alluxio data (213 pull requests) ...


KeyboardInterrupt: 