In [1]:
# importing required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math
from tqdm import tqdm
import datetime
import re
from urllib.parse import urlparse
import urllib.parse
import time

In [2]:
def scrape_github_pull_requests(pr_url):
    base_url = 'https://github.com'

    # Initialize empty lists to store pull request titles and links
    pr_titles = []
    pr_links = []

    # Initialize a variable to keep track of the page number
    page_number = 1

    while True:
        # Construct the URL for the current page
        current_url = pr_url + f'?page={page_number}'

        r = requests.get(current_url)
        if r.status_code != 200:
            break  # Break the loop if the page is not found
        else:
            soup = BeautifulSoup(r.text, 'html.parser')

        # Find all anchor elements with the specified class and attributes for pull request links
        pr_links_on_page = soup.find_all('a', class_='Link--primary', attrs={'data-hovercard-type': 'pull_request'})

        # Extract the title and link for each pull request and append them to the lists
        for link in pr_links_on_page:
            pr_title = link.text.strip()  # Get the text (title) inside the anchor tag
            pull_url = base_url + link['href']  # Get the href attribute (link)

            pr_titles.append(pr_title)
            pr_links.append(pull_url)

        # Check if there is a next page
        pagination = soup.find('div', {'class': 'pagination'})
        next_page = pagination.find('a', {'class': 'next_page'}) if pagination else None
        if next_page:
            page_number += 1
            time.sleep(2)
        else:
            break  # Exit the loop if there is no next page

    return pr_titles, pr_links

# # Example usage:
# pr_url = 'https://github.com/TheAlgorithms/Python/pulls'  # Repository's Pull Requests URL
# titles, links = scrape_github_pull_requests(pr_url)
# for title, link in zip(titles, links):
#     print(f'Title: {title}')
#     print(f'Link: {link}')

In [3]:
def scrape_github_pull_requests_closed(pr_url):
    base_url = 'https://github.com'

    # Initialize empty lists to store pull request titles and links
    pr_titles = []
    pr_links = []

    # Initialize a variable to keep track of the page number
    page_number = 1

    while True:
        # Construct the URL for the current page
        current_url = pr_url + f'?q=is%3Apr+is%3Aclosed&page={page_number}'

        r = requests.get(current_url)
        if r.status_code != 200:
            break  # Break the loop if the page is not found
        else:
            soup = BeautifulSoup(r.text, 'html.parser')

        # Find all anchor elements with the specified class and attributes for pull request links
        pr_links_on_page = soup.find_all('a', class_='Link--primary', attrs={'data-hovercard-type': 'pull_request'})

        # Extract the title and link for each pull request and append them to the lists
        for link in pr_links_on_page:
            pr_title = link.text.strip()  # Get the text (title) inside the anchor tag
            pull_url = base_url + link['href']  # Get the href attribute (link)

            pr_titles.append(pr_title)
            pr_links.append(pull_url)

        # Check if there is a next page
        pagination = soup.find('div', {'class': 'pagination'})
        next_page = pagination.find('a', {'class': 'next_page'}) if pagination else None
        # print(current_url)
        if next_page:
            page_number += 1
            time.sleep(2)
        else:
            break  # Exit the loop if there is no next page

    return pr_titles, pr_links , len(pr_links)

# # Example usage:
# pr_url = 'https://github.com/TheAlgorithms/Python/pulls'  # Repository's Pull Requests URL
# titles, links , num = scrape_github_pull_requests_closed(pr_url)
# print(num)
# # for title, link in zip(titles, links):
# #     print(f'Title: {title}')
# #     print(f'Link: {link}')

In [4]:
def scrape_github_issues(url):
    base_url = 'https://github.com'

    # Initialize empty lists to store issue titles and links
    issue_titles = []
    issue_links = []

    # Initialize a variable to keep track of the page number
    page_number = 1

    while True:
        # Construct the URL for the current page
        current_url = url + f'?page={page_number}'

        r = requests.get(current_url)
        if r.status_code != 200:
            break  # Break the loop if the page is not found
        else:
            soup = BeautifulSoup(r.text, 'html.parser')

        # Find all anchor elements with the specified class
        issue_links_on_page = soup.find_all('a', class_='Link--primary')

        # Extract the title and link for each issue and append them to the lists
        for link in issue_links_on_page:
            issue_title = link.text.strip()  # Get the text (title) inside the anchor tag
            issue_url = base_url + link['href']  # Get the href attribute (link)

            issue_titles.append(issue_title)
            issue_links.append(issue_url)

        # Check if there is a next page
        pagination = soup.find('div', {'class': 'pagination'})
        next_page = pagination.find('a', {'class': 'next_page'}) if pagination else None
        if next_page:
            page_number += 1
            time.sleep(2)
        else:
            break  # Exit the loop if there is no next page

    return issue_titles, issue_links

# # Example usage:
# pr_url = 'https://github.com/TheAlgorithms/Python/issues'
# issue_titles, issue_links = scrape_github_issues(pr_url)
# print("Issue Titles:")
# print(issue_titles)
# print("\nIssue Links:")
# print(issue_links)

In [5]:
def scrape_github_issues_closed(url):
    base_url = 'https://github.com'

    # Initialize empty lists to store issue titles and links
    issue_titles = []
    issue_links = []

    # Initialize a variable to keep track of the page number
    page_number = 1

    while True:
        # Construct the URL for the current page
        current_url = url + f'?q=is%3Aissue+is%3Aclosed&page={page_number}'

        r = requests.get(current_url)
        if r.status_code != 200:
            break  # Break the loop if the page is not found
        else:
            soup = BeautifulSoup(r.text, 'html.parser')

        # Find all anchor elements with the specified class
        issue_links_on_page = soup.find_all('a', class_='Link--primary')

        # Extract the title and link for each issue and append them to the lists
        for link in issue_links_on_page:
            issue_title = link.text.strip()  # Get the text (title) inside the anchor tag
            issue_url = base_url + link['href']  # Get the href attribute (link)

            issue_titles.append(issue_title)
            issue_links.append(issue_url)

        # Check if there is a next page
        pagination = soup.find('div', {'class': 'pagination'})
        next_page = pagination.find('a', {'class': 'next_page'}) if pagination else None
        # print(current_url)
        if next_page:
            page_number += 1
            time.sleep(2)
        else:
            break  # Exit the loop if there is no next page

    return issue_titles, issue_links , len(issue_links)

# # Example usage:
# pr_url = 'https://github.com/TheAlgorithms/Python/issues'
# issue_titles, issue_links , num = scrape_github_issues_closed(pr_url)
# print(num)
# print("Issue Titles:")
# print(issue_titles)
# print("\nIssue Links:")
# print(issue_links)

In [6]:
def get_popular_repo_details(url):

      # creating another dictionary to strore the data of popular repositories & their details
    pop_repo_details ={
       'PR_URL':[],'repo_name':[],'Repository_username':[],'link_profile_username':[],
       'Stars':[],'Forks':[],'Commits':[],'Last_committed':[] ,'issues_link':[],
       'num_issues_open':[],'num_issues_closed':[],'titles_issues_open':[],'links_issues_open':[],'titles_issues_closed':[],'links_issues_closed':[],
       'pull_requests_link':[],'num_pull_requests_open':[],'num_pull_requests_closed':[],'titles_pull_open':[],'links_pull_open':[],'titles_pull_closed':[],'links_pull_closed':[],}




    pr_urls = url    # repo URL

    r =requests.get(pr_urls)
    if r.status_code != 200: i-=1
    else:
      pr_soup2 =BeautifulSoup(r.text, 'html.parser')

    base_url = 'https://github.com/'


    # Parse the URL
    parsed_url = urlparse(pr_urls)
    # Split the path of the URL
    path_parts = parsed_url.path.split('/')
    # Extract user name and repository name
    user_name = path_parts[1]
    repo_name = path_parts[2]
    # Construct the user profile link
    user_profile_link = base_url + user_name


    # locating & extracting tags for star counts
    star_span_tag = pr_soup2.find_all('span',{'id':'repo-stars-counter-star'})
    stars = int(star_span_tag[0]['aria-label'].split()[0])

    # locating & extracting tags for forks counts
    forks_span_tag =pr_soup2.find_all('span',{'id':'repo-network-counter'})
    forks = int(forks_span_tag[0]['title'].replace(',', ''))

    # locating & extracting tags for commits
    commit_span_tags = pr_soup2.find_all('span',{'class':'d-none d-sm-inline'})
    commits = int(commit_span_tags[1].strong.text.replace(',', '')) if len(commit_span_tags)==2 else int(commit_span_tags[0].strong.text.replace(',', ''))

    # locating & extracting tags for commits
    commit_span_tags = pr_soup2.find_all('span',{'class':'d-none d-sm-inline'})
    commits = int(commit_span_tags[1].strong.text.replace(',', '')) if len(commit_span_tags)==2 else int(commit_span_tags[0].strong.text.replace(',', ''))

    time.sleep(2)
    # locating & extracting tags for last committed time
    last_commit_atag =pr_soup2.find_all('a',{'class':'Link--secondary ml-2'})
    last_updated = last_commit_atag[0].find_all('relative-time')[0]['datetime'] if len(last_commit_atag)>=1 else None


   # locating & extracting tags for issues counts
    issue_span_tag =pr_soup2.find_all('span',{'id':"issues-repo-tab-count"})
    issues = int(issue_span_tag[0]['title'])

    # Locate and extract the <a> tag with the id 'issues-tab'
    issues_tab_a_tag = pr_soup2.find('a', {'id': 'issues-tab'})
    # Extract the 'href' attribute to get the link for issues
    issues_link = issues_tab_a_tag['href']

    full_issues_link = base_url + issues_link

    issue_titles_open, issue_links_open = scrape_github_issues(full_issues_link)

    issue_titles_closed, issue_links_closed , num_issues_closed = scrape_github_issues_closed(full_issues_link)

    time.sleep(2)

    # locating & extracting tags for pull_requests counts
    pull_span_tag =pr_soup2.find_all('span',{'id':"pull-requests-repo-tab-count"})
    pull_requests= int(pull_span_tag[0]['title'])

    # Locate and extract the <a> tag with the id 'pull_tab'
    pull_tab_a_tag = pr_soup2.find('a', {'id': 'pull-requests-tab'})
    # Extract the 'href' attribute to get the link for  pull_requests
    pull_requests_link = pull_tab_a_tag['href']

    full_pull_requests_link = base_url + pull_requests_link


    titles_pull_open, links_pull_open = scrape_github_pull_requests(full_pull_requests_link)

    titles_pull_closed, links_pull_closed , num_pull_requests_closed = scrape_github_pull_requests_closed(full_pull_requests_link)


    # appending scraped data for popular repository to the dictionary
    pop_repo_details['PR_URL'].append(pr_urls)
    pop_repo_details['repo_name'].append(repo_name)
    pop_repo_details['Repository_username'].append(user_name)
    pop_repo_details['link_profile_username'].append(user_profile_link)
    pop_repo_details['Stars'].append(stars)
    pop_repo_details['Forks'].append(forks)
    pop_repo_details['Commits'].append(commits)
    pop_repo_details['Last_committed'].append(last_updated)

    # issues
    pop_repo_details['issues_link'].append(full_issues_link)
    pop_repo_details['num_issues_open'].append(issues)
    pop_repo_details['num_issues_closed'].append(num_issues_closed)
    pop_repo_details['titles_issues_open'].append(issue_titles_open)
    pop_repo_details['links_issues_open'].append(issue_links_open)
    pop_repo_details['titles_issues_closed'].append(issue_titles_closed)
    pop_repo_details['links_issues_closed'].append(issue_links_closed)

    #pull_requests
    pop_repo_details['pull_requests_link'].append(full_pull_requests_link)
    pop_repo_details['num_pull_requests_open'].append(pull_requests)
    pop_repo_details['num_pull_requests_closed'].append(num_pull_requests_closed)
    pop_repo_details['titles_pull_open'].append(titles_pull_open)
    pop_repo_details['links_pull_open'].append(links_pull_open)
    pop_repo_details['titles_pull_closed'].append(titles_pull_closed)
    pop_repo_details['links_pull_closed'].append(links_pull_closed)


    return pop_repo_details



In [7]:
# df = get_popular_repo_details('https://github.com/ljpzzz/machinelearning')

In [8]:
# df

In [9]:
# Example list of repository URLs
repo_urls = [
    'https://github.com/PrithivirajDamodaran/Gramformer',
    'https://github.com/Show-Me-the-Code/python',
    'https://github.com/sebastianruder/NLP-progress',
    'https://github.com/explosion/sense2vec',
    'https://github.com/stanfordnlp/GloVe'

]

In [10]:
# Define the function to get popular repo details for a list of URLs
def get_details_saving(url_list):
    # Initialize an empty list to store the results
    repo_details_list = []

    # Iterate through the list of URLs and call get_popular_repo_details for each URL
    for url in url_list:
        repo_details = get_popular_repo_details(url)
        repo_details_list.append(repo_details)

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(repo_details_list)

    # Extract values from lists in each column using the provided keys
    df['PR_URL'] = df['PR_URL'].apply(lambda x: x[0])
    df['repo_name'] = df['repo_name'].apply(lambda x: x[0])
    df['Repository_username'] = df['Repository_username'].apply(lambda x: x[0])
    df['link_profile_username'] = df['link_profile_username'].apply(lambda x: x[0])
    df['Stars'] = df['Stars'].apply(lambda x: x[0])
    df['Forks'] = df['Forks'].apply(lambda x: x[0])
    df['Commits'] = df['Commits'].apply(lambda x: x[0])
    df['Last_committed'] = df['Last_committed'].apply(lambda x: x[0])
    df['issues_link'] = df['issues_link'].apply(lambda x: x[0])
    df['num_issues_open'] = df['num_issues_open'].apply(lambda x: x[0])
    df['num_issues_closed'] = df['num_issues_closed'].apply(lambda x: x[0])
    df['titles_issues_open'] = df['titles_issues_open'].apply(lambda x: x[0])
    df['links_issues_open'] = df['links_issues_open'].apply(lambda x: x[0])
    df['titles_issues_closed'] = df['titles_issues_closed'].apply(lambda x: x[0])
    df['links_issues_closed'] = df['links_issues_closed'].apply(lambda x: x[0])
    df['pull_requests_link'] = df['pull_requests_link'].apply(lambda x: x[0])
    df['num_pull_requests_open'] = df['num_pull_requests_open'].apply(lambda x: x[0])
    df['num_pull_requests_closed'] = df['num_pull_requests_closed'].apply(lambda x: x[0])
    df['titles_pull_open'] = df['titles_pull_open'].apply(lambda x: x[0])
    df['links_pull_open'] = df['links_pull_open'].apply(lambda x: x[0])
    df['titles_pull_closed'] = df['titles_pull_closed'].apply(lambda x: x[0])
    df['links_pull_closed'] = df['links_pull_closed'].apply(lambda x: x[0])


    return df

In [11]:
# Record the start time
start_time = time.time()

df = get_details_saving(repo_urls)

# Record the end time
end_time = time.time()
# Calculate the runtime in seconds
runtime_seconds = end_time - start_time

# Calculate the runtime in minutes
runtime_minutes = runtime_seconds / 60

# Print the runtime in minutes
print(f"Runtime: {runtime_minutes:.2f} minutes")

Runtime: 2.67 minutes


In [12]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width',100)

In [13]:
df.head()

Unnamed: 0,PR_URL,repo_name,Repository_username,link_profile_username,Stars,Forks,Commits,Last_committed,issues_link,num_issues_open,num_issues_closed,titles_issues_open,links_issues_open,titles_issues_closed,links_issues_closed,pull_requests_link,num_pull_requests_open,num_pull_requests_closed,titles_pull_open,links_pull_open,titles_pull_closed,links_pull_closed
0,https://github.com/PrithivirajDamodaran/Gramfo...,Gramformer,PrithivirajDamodaran,https://github.com/PrithivirajDamodaran,1390,163,73,2022-12-27T16:45:05Z,https://github.com//PrithivirajDamodaran/Gramf...,8,15,[No module named 'annotated_text' in streamlit...,[https://github.com/PrithivirajDamodaran/Gramf...,[OSError: [E050] Can't find model 'en'. It doe...,[https://github.com/PrithivirajDamodaran/Gramf...,https://github.com//PrithivirajDamodaran/Gramf...,1,12,[Spacy 3],[https://github.com/PrithivirajDamodaran/Gramf...,"[Change Topp to Beam search, updated banner, R...",[https://github.com/PrithivirajDamodaran/Gramf...
1,https://github.com/Show-Me-the-Code/python,python,Show-Me-the-Code,https://github.com/Show-Me-the-Code,3633,2867,1499,2017-09-18T08:11:20Z,https://github.com//Show-Me-the-Code/python/is...,42,18,"[Python, Hey give me. Your. Name, rename files...",[https://github.com/Show-Me-the-Code/python/is...,"[Python3, 123, Code, Snap, P, Codr, Py, How to...",[https://github.com/Show-Me-the-Code/python/is...,https://github.com//Show-Me-the-Code/python/pulls,108,260,"[Fake_Data_Generator, Add a Clue game that I l...",[https://github.com/Show-Me-the-Code/python/pu...,"[Create hashes from a string, Create try, Crea...",[https://github.com/Show-Me-the-Code/python/pu...
2,https://github.com/sebastianruder/NLP-progress,NLP-progress,sebastianruder,https://github.com/sebastianruder,21891,3580,791,2023-03-09T10:36:42Z,https://github.com//sebastianruder/NLP-progres...,35,65,"[Tasks are not the right measure anymore, Depe...",[https://github.com/sebastianruder/NLP-progres...,[Add CFF (citation file format) to the reposit...,[https://github.com/sebastianruder/NLP-progres...,https://github.com//sebastianruder/NLP-progres...,16,521,[Keyphrase Extraction and Generation for Engli...,[https://github.com/sebastianruder/NLP-progres...,"[Added introduction of Multimodal NLP, Add zer...",[https://github.com/sebastianruder/NLP-progres...
3,https://github.com/explosion/sense2vec,sense2vec,explosion,https://github.com/explosion,1532,241,460,2023-04-20T14:53:46Z,https://github.com//explosion/sense2vec/issues,20,91,"[provide citation, Train sense2vec in Chinese,...",[https://github.com/explosion/sense2vec/issues...,[s2v standalone breaks if require_gpu() is cal...,[https://github.com/explosion/sense2vec/issues...,https://github.com//explosion/sense2vec/pulls,0,48,[],[],"[CI: Switch from Azure to GHA, Set version to ...",[https://github.com/explosion/sense2vec/pull/1...
4,https://github.com/stanfordnlp/GloVe,GloVe,stanfordnlp,https://github.com/stanfordnlp,6481,1475,174,2023-09-19T19:41:56Z,https://github.com//stanfordnlp/GloVe/issues,78,81,[there may be a bug in cooccur.c when the bin...,[https://github.com/stanfordnlp/GloVe/issues/2...,"[utf-8 bug, Overflow in ""overflow_threshold"" (...",[https://github.com/stanfordnlp/GloVe/issues/2...,https://github.com//stanfordnlp/GloVe/pulls,2,59,[Step size and gradient clipping for bias term...,[https://github.com/stanfordnlp/GloVe/pull/209...,[Make check-nan static to resolve undefined re...,[https://github.com/stanfordnlp/GloVe/pull/220...


In [14]:
# Save the DataFrame to a CSV file
df.to_csv('/content/drive/MyDrive/hamza/repos_details.csv', index=False)