Get random repos

In [97]:
import json
import requests
import datetime
import random
from tqdm import tqdm 
import pandas as pd

In [2]:
def get(url, base_url='https://api.github.com/', timeout=30):
    """Send a GET request to ``base_url + url`` and return the decoded JSON body.

    Args:
        url: Path (and query string) appended to ``base_url``.
        base_url: Prefix prepended to ``url``. Pass ``''`` when ``url`` is
            already a complete address.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(base_url + url, timeout=timeout)

    # Fail here, with the status code in the message, instead of handing the
    # caller an error payload that only blows up later as a KeyError.
    response.raise_for_status()

    return response.json()

In [52]:
def get_random_date(start=None, end=None):
    """Draw a uniformly random calendar date from ``[start, end]`` (inclusive).

    Args:
        start: Earliest date that may be returned. Defaults to 2008-04-01.
        end: Latest date that may be returned. Defaults to today's date.
            Because that default moves every day, pass a fixed ``end`` when
            the seeded sequence must be identical from one run to the next.

    Returns:
        A ``datetime.date`` chosen with the module-level ``random`` generator,
        so ``random.seed`` controls the draw.

    Raises:
        ValueError: If ``start`` is later than ``end``.
    """
    if start is None:
        start = datetime.date(year=2008, month=4, day=1)
    if end is None:
        end = datetime.date.today()
    if start > end:
        raise ValueError(f'start ({start}) must not be after end ({end})')

    # Ordinals number days consecutively, so a uniform integer between the two
    # ordinals (randint includes both ends) is a uniform day in the range.
    rand_ordinal = random.randint(start.toordinal(), end.toordinal())
    return datetime.date.fromordinal(rand_ordinal)

## Scrape data!

In [89]:
# Seed Python's global RNG so the sequence of random draws is repeatable.
seed = 42
random.seed(a=seed)

# Raw repository records from every search response accumulate in this list.
raw = list()

search/repositories?q=created:2022-08-04&sort=star&ord=desc&per_page=100
110550
search/repositories?q=created:2010-09-30&sort=star&ord=desc&per_page=100
671
search/repositories?q=created:2008-10-22&sort=star&ord=desc&per_page=100
107
search/repositories?q=created:2014-06-02&sort=star&ord=desc&per_page=100
8671
search/repositories?q=created:2013-09-28&sort=star&ord=desc&per_page=100
4546


In [93]:
# Number of random creation dates to query on each run of this cell.
# Results are appended to `raw`, so re-running the cell grows the sample.
num_samples = 5

for _ in range(num_samples):
    rand_date = get_random_date()
    # The search API's documented parameter names are `sort=stars` and
    # `order=desc`; each request returns at most `per_page` repositories.
    query = (
        f'search/repositories?q=created:{rand_date.isoformat()}'
        '&sort=stars&order=desc&per_page=100'
    )
    print(query)
    js = get(query)
    # A payload without `items` is not a search result (presumably an error
    # such as a rate limit); stop with a readable message instead of a KeyError.
    if 'items' not in js:
        raise RuntimeError(
            f"GitHub search failed for {query!r}: {js.get('message', js)}"
        )
    print(js['total_count'])
    raw.extend(js['items'])

search/repositories?q=created:2013-06-19&sort=star&ord=desc&per_page=100
5372
search/repositories?q=created:2019-08-01&sort=star&ord=desc&per_page=100
43534
search/repositories?q=created:2021-10-01&sort=star&ord=desc&per_page=100
90345
search/repositories?q=created:2008-11-04&sort=star&ord=desc&per_page=100
93
search/repositories?q=created:2020-11-01&sort=star&ord=desc&per_page=100
55402


In [94]:
# Number of raw repository records collected so far.
len(raw)

1993

## Turn JSON into a table

In [96]:
# Column names of the flattened table, in output order.
cols = [
    'full_name', 'num_stars', 'num_forks', 'num_watchers', 'num_open_issues',
    'topics', 'main_language', 'created_at', 'updated_at', 'license_key', 'description'
]

# One list per column; each loop iteration appends one value to every list.
data = {name: [] for name in cols}

for repo in tqdm(raw):
    license_info = repo['license']
    # Build the whole row first so a missing key leaves `data` untouched for
    # this repo instead of producing columns of unequal length.
    # NOTE(review): in the displayed output num_watchers equals num_stars on
    # every visible row - confirm `watchers` is the field that is wanted.
    row = {
        'full_name': repo['full_name'],
        'num_stars': repo['stargazers_count'],
        'num_forks': repo['forks_count'],
        'num_watchers': repo['watchers'],
        'num_open_issues': repo['open_issues'],
        'topics': ', '.join(repo['topics']),
        'main_language': repo['language'],
        'created_at': repo['created_at'],
        'updated_at': repo['updated_at'],
        # `license` is falsy for repos without one; store None in that case.
        'license_key': license_info['key'] if license_info else None,
        'description': repo['description'],
    }
    for name in cols:
        data[name].append(row[name])

100%|████████████████████████████████████| 1993/1993 [00:00<00:00, 98501.70it/s]


In [98]:
# Build the table from the per-column lists; the dict keys become the column names.
df = pd.DataFrame(data)

In [99]:
# Show the flattened table (the rendering is truncated to the first and last rows).
df

Unnamed: 0,full_name,num_stars,num_forks,num_watchers,num_open_issues,topics,main_language,created_at,updated_at,license_key,description
0,recloudstream/cloudstream,1623,126,1623,52,"android, anime, media-center, streaming-app, v...",Kotlin,2022-08-04T08:42:38Z,2022-12-03T23:06:29Z,gpl-3.0,Android app for streaming and downloading Movi...
1,ProjectPhysX/FluidX3D,1041,98,1041,5,"cfd, computational-fluid-dynamics, fluid-simul...",C++,2022-08-04T08:49:44Z,2022-12-03T23:27:17Z,other,The fastest and most memory efficient lattice ...
2,serhack/pdf-diff,695,33,695,6,"editor-tools, pdf, pdf-diff, pdf-difference, t...",Go,2022-08-04T19:54:17Z,2022-11-18T03:43:17Z,mit,A tool for visualizing differences between two...
3,7eu7d7/pixiv_AI_crawler,375,25,375,4,,Python,2022-08-04T02:53:14Z,2022-12-03T19:01:11Z,mit,基于深度学习的p站高质量涩图AI爬虫，可以学会你的XP
4,pratikratadiya/awesome-ml-internships,357,36,357,1,"awesome-list, awesome-resources, data-science,...",,2022-08-04T04:35:49Z,2022-12-03T08:34:09Z,,List of companies offering Machine learning an...
...,...,...,...,...,...,...,...,...,...,...,...
1988,hrishikeshathalye/Hospital-Management-System-DBMS,49,39,49,0,,JavaScript,2020-11-01T14:58:29Z,2022-11-28T05:41:00Z,,Hospital Management System made for Sem 5 DBMS...
1989,ryjjin/Obsidian-shortcuts-extender,48,5,48,6,"obsidian, obsidian-plugin",TypeScript,2020-11-01T22:04:24Z,2022-11-30T11:43:43Z,,Plugin for Obsidian: Use shortcuts for input s...
1990,ckoval7/df-aggregator,47,10,47,1,"df-aggregator, fox-hunting, geolocation, ham-r...",Python,2020-11-01T21:36:20Z,2022-10-27T23:00:41Z,gpl-3.0,Networked DFing software that can handle multi...
1991,apachecn/apachecn-dsai-wiki,47,13,47,0,,JavaScript,2020-11-01T11:23:53Z,2022-11-04T09:47:04Z,other,:books: ApacheCN 数据科学和人工智能知识库


In [100]:
# Save the flattened table as CSV in the working directory.
# With pandas' default settings the row index is written as the first column.
df.to_csv('2k_random_repos.csv')

In [101]:
import pickle

# Keep the unflattened API records next to the CSV.
# The context manager closes the file as soon as the dump finishes, so the
# data is flushed to disk and no handle is left open in the kernel.
# Plain 'wb' is enough because the file is only written, never read back here.
with open('2k_random_repos.pkl', 'wb') as pkl_file:
    pickle.dump(raw, pkl_file)