In [1]:
import requests
from datetime import datetime, timedelta
import math
from ghapi.all import GhApi

In [2]:
import base64
import json

In [3]:
# Read the GitHub token from the local .env file and build the GhApi client
# that the rest of the notebook uses.
github_key = None
with open(".env", "r") as f:
    for line in f:
        # Split on the first "=" only, so a value that itself contains "="
        # is kept intact.
        name, sep, value = line.partition("=")
        # Require the exact variable name: a prefix test would also accept
        # e.g. GITHUB_TOKEN_OLD=... and pick up the wrong value. Lines without
        # "=" carry no value and are ignored.
        if sep and name.strip() == "GITHUB_TOKEN":
            # No early exit: if the variable is defined more than once, the
            # last definition wins.
            github_key = value.strip().strip('"')

# A missing variable and an empty value are both unusable, so reject both here.
assert github_key, "Github key not found in .env file"
github = GhApi(owner='azure',token=github_key)

In [4]:
# #### print to see option
# # api.repos
# azure_sdk_readmes = []
# sdk_list = api.repos.get_content(owner='azure', repo='azure-sdk-for-net', path='sdk', ref='main')

# for subdirs in sdk_list:
#     print(subdirs.path)
#     sdks = api.repos.get_content(owner='azure', repo='azure-sdk-for-net', path=subdirs.path, ref='main')

#     for sdk in sdks:
#         sdk_contents = api.repos.get_content(owner='azure', repo='azure-sdk-for-net', path=sdk.path, ref='main')
#         # print (sdk_contents)
#         # print ("deeznuts!")
#         # print(sdk.path)
#         # readme = api.repos.get_readme(owner='azure', repo='azure-sdk-for-net', path=sdk.path, ref='main')
#         # print (readme)
#         for content in sdk_contents:
#             if isinstance(content, str):
#                 continue
#             if content.type == "file" and content.name == 'README.md':
#                 # print("found one!")
#                 # print(content)
#                 azure_sdk_readmes.append(content)
#                 break

# len(azure_sdk_readmes)

In [5]:
def get_azure_sdk_readmes(owner, repo, ref='main', api=None):
    """Collect README.md entries from a repository's ``sdk/`` tree.

    Lists ``sdk/``, then every directory inside it (a "service"), then every
    directory inside each service (a "package"), and keeps the first
    ``README.md`` file entry found in each package directory.

    Args:
        owner: Owner (organisation or user) of the repository.
        repo: Repository name.
        ref: Git ref passed to every contents request. Defaults to ``'main'``.
        api: Object exposing ``repos.get_content(owner=, repo=, path=, ref=)``.
            When omitted, the notebook-level ``github`` client is used.

    Returns:
        list: The matching entries exactly as returned by ``api``, in
        traversal order.
    """
    if api is None:
        # Looked up at call time so the notebook-level client is only needed
        # when the caller does not supply one.
        api = github

    def list_dir(path):
        return api.repos.get_content(owner=owner, repo=repo, path=path, ref=ref)

    def is_readme(entry):
        return entry.type == "file" and entry.name == 'README.md'

    azure_sdk_readmes = []

    for service in list_dir('sdk'):
        # The str guard is kept from the original code (presumably it covers
        # responses that are a single object rather than a list -- TODO confirm).
        # Non-directories are skipped up front: listing them costs one API
        # request each and cannot yield a README entry.
        if isinstance(service, str) or service.type != "dir":
            continue

        for package in list_dir(service.path):
            if isinstance(package, str):
                continue
            if is_readme(package):
                # NOTE(review): a README directly under the service directory
                # ends the scan of that service, so packages listed after it
                # are never visited. Kept as-is; confirm this is intended.
                azure_sdk_readmes.append(package)
                break
            if package.type != "dir":
                # Plain files (other than the README handled above) have no
                # children to inspect; do not spend an API request on them.
                continue

            for content in list_dir(package.path):
                if isinstance(content, str):
                    continue
                if is_readme(content):
                    azure_sdk_readmes.append(content)
                    break

    return azure_sdk_readmes

In [6]:
def get_azure_sdk_samples (owner, repo, ref='main', api=None, verbose=True):
    """Collect one file entry from each package's ``samples`` directory.

    Lists ``sdk/``, every directory inside it (a "service"), and every
    directory inside each service (a "package"). For each package that contains
    a directory named ``samples``, the first entry of type ``"file"`` in that
    directory is kept.

    Args:
        owner: Owner (organisation or user) of the repository.
        repo: Repository name.
        ref: Git ref passed to every contents request. Defaults to ``'main'``.
        api: Object exposing ``repos.get_content(owner=, repo=, path=, ref=)``.
            When omitted, the notebook-level ``github`` client is used.
        verbose: When true (the default), print every collected entry.

    Returns:
        list: The collected entries exactly as returned by ``api``, in
        traversal order.
    """
    if api is None:
        # Looked up at call time so the notebook-level client is only needed
        # when the caller does not supply one.
        api = github

    def list_dir(path):
        return api.repos.get_content(owner=owner, repo=repo, path=path, ref=ref)

    azure_sdk_samples = []

    for service in list_dir('sdk'):
        # The str guard is kept from the original code (presumably it covers
        # responses that are a single object rather than a list -- TODO confirm).
        # Non-directories are skipped: listing them costs one API request each
        # and cannot lead to a samples directory.
        if isinstance(service, str) or service.type != "dir":
            continue

        for package in list_dir(service.path):
            if isinstance(package, str) or package.type != "dir":
                continue

            for content in list_dir(package.path):
                if isinstance(content, str):
                    continue
                if not (content.type == "dir" and content.name == 'samples'):
                    continue

                for sample in list_dir(content.path):
                    # Same guard as the outer loops, for consistency.
                    if isinstance(sample, str):
                        continue
                    if sample.type == "file":
                        # NOTE(review): only the first file is kept, whatever
                        # it is (it may be a build file rather than sample
                        # code). Kept as-is; confirm this is intended.
                        azure_sdk_samples.append(sample)
                        if verbose:
                            print(sample)
                        break

    return azure_sdk_samples

In [7]:
def getREADMEContent(repos, language, api=github):
    """Download each listed file and wrap it in a JSON-serialisable record.

    Args:
        repos: Iterable of entries exposing ``download_url``, ``path`` and
            ``html_url`` attributes (e.g. the lists returned by
            ``get_azure_sdk_readmes`` / ``get_azure_sdk_samples``).
        language: Stored unchanged under the ``"language"`` key of every record.
        api: Not used by this function; kept so existing calls keep working.

    Returns:
        list[dict]: One record per entry that was downloaded successfully, with
        the keys ``README_text`` (downloaded text), ``repo_name`` (entry path
        without a leading ``sdk/`` and a trailing ``/README.md``),
        ``link_to_repo`` (``html_url`` without a trailing ``/README.md``) and
        ``language``. Entries that raise are reported on stdout and skipped.
    """
    sdk_prefix = 'sdk/'
    readme_suffix = '/README.md'
    # Upper bound in seconds for a single download; without one a stalled
    # connection would block the whole run.
    download_timeout = 30

    def strip_prefix(text, prefix):
        return text[len(prefix):] if text.startswith(prefix) else text

    def strip_suffix(text, suffix):
        return text[:-len(suffix)] if text.endswith(suffix) else text

    results = []
    for repoJSON in repos:
        try:
            response = requests.get(repoJSON.download_url, timeout=download_timeout)
            # Treat HTTP errors as failures so an error page is never stored
            # as if it were the file's content.
            response.raise_for_status()

            newObject = {
                'README_text': response.text,
                # Trim only the leading/trailing markers: a blanket replace
                # would also rewrite directory names that happen to end in
                # "sdk" (e.g. ".../azure-iot-sdk/...").
                'repo_name': strip_suffix(strip_prefix(repoJSON.path, sdk_prefix), readme_suffix),
                'link_to_repo': strip_suffix(repoJSON.html_url, readme_suffix),
                "language": language,
            }
            results.append(newObject)
        except Exception as err:
            # Best effort: one bad entry must not abort the whole export.
            print(f'Error occurred while processing {repoJSON}: {err}')

    return results

In [8]:
# api.repos

In [9]:
# all_data = getXNumberReposSinceDateInGivenOrg(orgs=['Azure','Azure-Samples','MicrosoftDocs'],numberOfReposToSearchThrough=20000,per_page=100,since='2022-06-07T22:49:39Z',platform='github',output_file_path="../data/READMEs/azure_past_365days.json")

In [10]:
# output_file_path="../data/READMEs/azure_sdk_readme_net_list.json"

# net_data = get_azure_sdk_readmes(owner='Azure', repo='azure-sdk-for-net')
# JSON_readme_list = getREADMEContent(net_data, ["C#", ".NET"])
# with open(output_file_path, "w") as outfile:
#     json.dump(JSON_readme_list, outfile)

# output_file_path="../data/READMEs/azure_sdk_readme_java_list.json"
# java_data = get_azure_sdk_readmes(owner='Azure', repo='azure-sdk-for-java')
# JSON_readme_list = getREADMEContent(java_data, ["Java"])
# with open(output_file_path, "a") as outfile:
#     json.dump(JSON_readme_list, outfile)

# output_file_path="../data/READMEs/azure_sdk_readme_javascript_list.json"
# javascript_data = get_azure_sdk_readmes(owner='Azure', repo='azure-sdk-for-js')
# JSON_readme_list = getREADMEContent(javascript_data, ["JavaScript"])
# with open(output_file_path, "a") as outfile:
#     json.dump(JSON_readme_list, outfile)

# output_file_path="../data/READMEs/azure_sdk_readme_python_list.json"
# python_data = get_azure_sdk_readmes(owner='Azure', repo='azure-sdk-for-python')
# JSON_readme_list = getREADMEContent(python_data, ["Python"])
# with open(output_file_path, "a") as outfile:
#     json.dump(JSON_readme_list, outfile)

# output_file_path="../data/READMEs/azure_sdk_samples_net_list.json"
# net_data = get_azure_sdk_samples(owner='Azure', repo='azure-sdk-for-net')
# JSON_readme_list = getREADMEContent(net_data, ["C#", ".NET"])
# with open(output_file_path, "w") as outfile:
#     json.dump(JSON_readme_list, outfile)

# print("finished writing to file: ",output_file_path)

# output_file_path="../data/READMEs/azure_sdk_samples_python_list.json"    
# net_data = get_azure_sdk_samples(owner='Azure', repo='azure-sdk-for-python')
# JSON_readme_list = getREADMEContent(net_data, ["Python"])
# with open(output_file_path, "w") as outfile:
#     json.dump(JSON_readme_list, outfile)

# Export the sample files collected from each Azure SDK repository to its own
# JSON file. Each row is (repository name, language labels, output path).
# The per-repository names java_data / javascript_data / rust_data are no longer
# bound; no active cell visible in this notebook reads them.
SAMPLE_EXPORTS = [
    ("azure-sdk-for-java", ["Java"], "../data/READMEs/azure_sdk_samples_java_list.json"),
    ("azure-sdk-for-js", ["JavaScript"], "../data/READMEs/azure_sdk_samples_javascript_list.json"),
    ("azure-sdk-for-rust", ["Rust"], "../data/READMEs/azure_sdk_samples_rust_list.json"),
]

for sample_repo, sample_languages, output_file_path in SAMPLE_EXPORTS:
    sample_entries = get_azure_sdk_samples(owner='Azure', repo=sample_repo)
    JSON_readme_list = getREADMEContent(sample_entries, sample_languages)
    with open(output_file_path, "w") as outfile:
        json.dump(JSON_readme_list, outfile)
    print("finished writing to file: ",output_file_path)

{'name': 'pom.xml', 'path': 'sdk/applicationinsights/microsoft-azure-applicationinsights-query/samples/pom.xml', 'sha': '9336ce554fa19c7da15c70dde6d69eae3e37f839', 'size': 3211, 'url': 'https://api.github.com/repos/Azure/azure-sdk-for-java/contents/sdk/applicationinsights/microsoft-azure-applicationinsights-query/samples/pom.xml?ref=main', 'html_url': 'https://github.com/Azure/azure-sdk-for-java/blob/main/sdk/applicationinsights/microsoft-azure-applicationinsights-query/samples/pom.xml', 'git_url': 'https://api.github.com/repos/Azure/azure-sdk-for-java/git/blobs/9336ce554fa19c7da15c70dde6d69eae3e37f839', 'download_url': 'https://raw.githubusercontent.com/Azure/azure-sdk-for-java/main/sdk/applicationinsights/microsoft-azure-applicationinsights-query/samples/pom.xml', 'type': 'file', '_links': {'self': 'https://api.github.com/repos/Azure/azure-sdk-for-java/contents/sdk/applicationinsights/microsoft-azure-applicationinsights-query/samples/pom.xml?ref=main', 'git': 'https://api.github.com/

In [11]:
# len(net_data)
# print(net_data[4])
# JSON_readme_list = getREADMEContent(net_data, ["C#", ".NET"])
# len(JSON_readme_list)
# print(JSON_readme_list[0])  

In [12]:
# def load_json_from_file(file_path):
#     with open(file_path, "r") as infile:
#         data = json.load(infile)
#     return data


In [13]:
# data = load_json_from_file("../data/READMEs/azure_sdk_readme_list.json")

In [14]:
# data 

In [15]:
# len(data)

In [16]:
# data[-6:]