Using the filtered project results returned from MongoDB, we should be able to collect all the READMEs and release information.

First, get the user's GitHub token:

In [5]:
from github import Github
import getpass
import time
import pandas as pd
import csv
import langdetect
import base64
import os
import json

# Prompt for the personal access token (getpass does not echo the input) and
# build the GitHub API client used by the cells below. per_page=100 requests
# 100 items per page for paginated API results.
g = Github(getpass.getpass("Enter access token:"), per_page=100)

Make directories to store the readmes and releases for each language if they don't exist.

In [6]:
# Languages whose projects are collected; each one gets its own sub-folder.
languages = ["python", "go", "java", "ruby"]

# Top-level output folders for README files and release data.
readme_folder = "readmes"
releases_folder = "releases"

# exist_ok=True makes directory creation idempotent without a separate
# (race-prone) os.path.exists check, so this cell can be re-run safely.
os.makedirs(readme_folder, exist_ok=True)
os.makedirs(releases_folder, exist_ok=True)

for language in languages:
    # One sub-folder per language under each output folder.
    language_readme_folder = f'{readme_folder}/{language}'
    os.makedirs(language_readme_folder, exist_ok=True)

    language_releases_folder = f'{releases_folder}/{language}'
    os.makedirs(language_releases_folder, exist_ok=True)

With the GitHub token, collect the READMEs and the releases for each repository.
Collect the project README first, then try to collect the release information.
Write the results to the per-language `readmes` and `releases` folders. Storing the results there will let us do some analysis later if needed.

In [8]:
# Folder holding the filtered project lists (one JSON file per language).
project_details_folder = "project_details"

# For every language: read its filtered project list, then download each
# project's README (as Markdown) and releases (as JSON Lines) from GitHub.
# Failures are printed and skipped so one bad repository does not abort the run.
for language in languages:

    language_readme_folder = f'{readme_folder}/{language}'
    language_releases_folder = f'{releases_folder}/{language}'

    # The context manager closes the file handle as soon as the JSON is parsed.
    projects_path = f'{project_details_folder}/filtered/{language}_projects_filtered.json'
    with open(projects_path) as f:
        projects_data = json.load(f)

    print(f"Retrieving the readme files for {len(projects_data)} repositories in {language}...")

    collectedReadmes = 0
    collectedReleases = 0
    for project in projects_data:

        # ProjectID is used as "<owner>/<name>".
        projectId = project["ProjectID"]
        id_parts = projectId.split('/')
        projectOwner = id_parts[0]
        projectName = id_parts[1]

        # Files to store the project's readme & releases
        readme_file = f"{language_readme_folder}/{projectOwner}_{projectName}_readme.md"
        release_file = f"{language_releases_folder}/{projectOwner}_{projectName}_releases.jsonl"

        # Either file existing means this project was handled in an earlier
        # run, which makes the cell resumable after an interruption.
        if os.path.exists(readme_file) or os.path.exists(release_file):
            continue

        try:
            repo = g.get_repo(projectId)
        except Exception as e:
            print(f"{projectId}: {e}")
            # Without a repository object there is nothing to fetch. Skipping
            # also prevents a stale `repo` from the previous iteration being
            # saved under this project's file names.
            time.sleep(1)
            continue

        # Try to collect the readme information
        time.sleep(0.25)  # small pause between API calls
        try:
            # The API returns the README body base64-encoded.
            encoded_readme = repo.get_readme().content
            readme = base64.b64decode(encoded_readme.encode()).decode("UTF-8")
            with open(readme_file, "w", encoding="utf-8") as file:
                file.write(readme)
            collectedReadmes += 1
        except Exception as e:
            # Best effort: e.g. a 404 when the repository has no README.
            print(f"{projectId}: {e}")
            time.sleep(1)

        # Try to collect the release information
        time.sleep(0.25)
        try:
            releases = repo.get_releases()

            # Only create a file when the repository has at least one release.
            if releases.totalCount >= 1:
                with open(release_file, "w", encoding="utf-8") as file:
                    for release in releases:
                        # One JSON document per line (JSON Lines).
                        file.write(json.dumps(release.raw_data) + "\n")
                collectedReleases += 1
        except Exception as e:
            print(f"{projectId}: {e}")
            # Remove a partially written file so it is not mistaken for a
            # complete result.
            if os.path.exists(release_file):
                os.remove(release_file)
            time.sleep(1)

    print(f"{collectedReadmes} new readme files were collected for {language} projects.")
    print(f"{collectedReleases} new release files were collected for {language} projects.")

Retrieving the readme files for 4663 repositories in python...
google/eng-edu: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository-readme"}
mehulsingh10/Age-Transformation-Synthesis: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository-readme"}
tensorflow/workshops: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository-readme"}
LibreOffice/mso-dumper: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository-readme"}
cyyself/yaffs2: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository-readme"}
cdemu/cdemu: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository-readme"}
ros/common_msgs: 404 {"message": "Not Found", "documentation_url": "htt