In [1]:
from top_github_scraper import (get_top_repo_urls, get_top_repos, get_top_contributors, 
get_top_user_urls, get_top_users)
#import datapane as dp 
import pandas as pd 
import numpy as np
from tqdm import tqdm 
from folium import plugins
import geopandas
from geopy.geocoders import Nominatim
import folium
from folium.plugins import Search
import requests
import os
from bs4 import BeautifulSoup
import time
USERNAME = os.getenv("GITHUB_USERNAME")
TOKEN = os.getenv("GITHUB_TOKEN")

In [7]:
keywords = ["data science","api"]
github_topics = ['3D','Algorithm','Android','API','Arduino','Atom','aws','azure','bash','bootstrap','chrome','compiler','crytocurrency','data structures','database','data visualization','deep learning','data science','deployment','flask','front end','git','google','iOS','json','library','machine learning','macOS','mobile','modeling','natural language processing','neural network','operating system','parsing','software','server','virtual reality','windows']
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'}

In [8]:
def get_repo_info(keyword, stop=10):
    repos = get_repo_urls(keyword, stop=stop)
    
    all_repo_info = dict()
    info_to_scrape = ['name',"stargazers_count", "forks_count", 'subscribers_count', 'topics', 'language', 'created_at','updated_at']
    for repo in tqdm(repos,desc="Scraping top repo info..."):
        repo_url = repo
        repo_info_url = f"https://api.github.com/repos{repo_url}"
        repo_info = requests.get(repo_info_url, auth=(USERNAME, TOKEN))
        if repo_info.status_code == 429:
            time.sleep(30)
        repo_info = repo_info.json()
        repo_name = repo_info['id']
        repo_important_info = {}
        for info in info_to_scrape:
            repo_important_info[info] = repo_info[info]
        repo_important_info['url'] = repo_url
        repo_important_info['search_word'] = keyword
        all_repo_info[repo_name] = repo_important_info
    repo_df = pd.DataFrame.from_dict(all_repo_info, orient='index', columns=info_to_scrape+['url','search_word'])
    return repo_df

def all_repo_info(keywords, stop=10):
    repo_df = pd.DataFrame(columns=['name',"stargazers_count", "forks_count", 'subscribers_count', 'topics', 'language', 'created_at','updated_at','url','search_word'])
    for k in keywords:
        new_repo = get_repo_info(k, stop=stop)
        print(k,len(new_repo.index))
        repo_df = pd.concat([repo_df,new_repo])
        repo_df.to_csv('most_updated_repo_info_stop25to75.csv')
    return repo_df
            

def topic_relationship_table(repo_df):
    id_list = []
    topic_list = []
    for i in repo_df.index:
        topics = repo_df.loc[i,'topics']
        for t in topics:
            id_list.append(i)
            topic_list.append(t)
    df = pd.DataFrame({'id':id_list,'topic':topic_list})
    return df


SCRAPE_CLASS = {'Users': 'mr-1', 'Repositories': "v-align-middle"}
TYPE = 'Repositories'
def get_repo_urls(keyword, stop=10):
    urls = []
    page = None
    for page_num in tqdm(range(25, stop),desc="Scraping top GitHub URLs..."):
        keyword_no_space = ("+").join(keyword.split(" "))
        url = f"https://github.com/search?o=desc&p={str(page_num)}&q={keyword_no_space}&s=&type={TYPE}"
        if page and page.status_code == 429:
            time.sleep(60)
        page = requests.get(url, headers={'User-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15'})
        if page.status_code == 429:
            time.sleep(60)
        soup = BeautifulSoup(page.text, "html.parser")
        a_tags = soup.find_all("a", class_=SCRAPE_CLASS[TYPE])
        new_urls = [a_tag.get("href") for a_tag in a_tags]
        urls.extend(new_urls)
        time.sleep(5)
    return urls

In [21]:
def get_repo_contributors(repo_url, repo_contributor_rel, repo_id, contributors_set,n_contributors=10):
    contributor_url = (f"https://api.github.com/repos{repo_url}/contributors")
    contributor_page = requests.get(contributor_url, auth=(USERNAME, TOKEN),headers=headers).json()
    max_n_top_contributors = min(len(contributor_page),n_contributors)
    
    all_contributors = dict()
    profile=None
    profile_features = ["login","url","type","name","company","location","hireable","bio","public_repos","public_gists","followers","following","created_at"]
    if max_n_top_contributors>0 :
        for n in range(max_n_top_contributors):
            contributor = contributor_page[n]
            repo_contributor_rel.add((repo_id, contributor["login"], contributor["contributions"]))
            if contributor["login"] not in contributors_set and contributor["contributions"]>10:
                contributors_set.add(contributor["login"])
                if profile and profile.status_code == 429:
                    time.sleep(30)
                profile = requests.get(contributor["url"], auth=(USERNAME, TOKEN),headers=headers)
                all_contributors[contributor["login"]] = {key: val for key, val in profile.json().items() if key in profile_features}
                if profile and profile.status_code == 429:
                    time.sleep(30)
    return pd.DataFrame.from_dict(all_contributors,orient='index')

def get_all_contributors(repos,repo_contributor_rel,contributors_set,n_contributors=10):
    contributor_df = pd.DataFrame(columns=["login","url","type","name","company","location","hireable","bio","public_repos","public_gists","followers","following","created_at"])
    for url,r_id in tqdm(repos,desc="Scraping top contributors info..."):
        new_contributors = get_repo_contributors(url, repo_contributor_rel, r_id, contributors_set,n_contributors=n_contributors)
        #print(url,len(new_contributors.index))
        contributor_df = pd.concat([contributor_df,new_contributors]).drop_duplicates()
        contributor_df.to_csv(f'most_updated_{n_contributors}_contributor_info_stop75.csv')
        pd.DataFrame(repo_contributor_rel, columns=['Repo','Contributor','Contributions']).sort_values('Repo').to_csv('repo_contributor_relationship_table_stop75.csv')

In [24]:
repos = pd.read_csv('repo_info_stop75.csv', index_col=0)
repo_contributor_rel = set()
contributors = set()
repos_zip = list(zip(list(repos['url']),list(repos.index)))[1323:]
#len(repos.index)


In [25]:
get_all_contributors(repos_zip,repo_contributor_rel,contributors)

Scraping top contributors info...:   1%|          | 140/22884 [01:26<3:53:56,  1.62it/s]


KeyError: 0

In [68]:
contributor_url = (f"https://api.github.com/repos/dragonir/3d/contributors")
#requests.get(contributor_url, auth=(USERNAME, TOKEN)).json()
requests.get('https://api.github.com/users/dragonir', auth=(USERNAME, TOKEN)).json()

{'login': 'dragonir',
 'id': 21058931,
 'node_id': 'MDQ6VXNlcjIxMDU4OTMx',
 'avatar_url': 'https://avatars.githubusercontent.com/u/21058931?v=4',
 'gravatar_id': '',
 'url': 'https://api.github.com/users/dragonir',
 'html_url': 'https://github.com/dragonir',
 'followers_url': 'https://api.github.com/users/dragonir/followers',
 'following_url': 'https://api.github.com/users/dragonir/following{/other_user}',
 'gists_url': 'https://api.github.com/users/dragonir/gists{/gist_id}',
 'starred_url': 'https://api.github.com/users/dragonir/starred{/owner}{/repo}',
 'subscriptions_url': 'https://api.github.com/users/dragonir/subscriptions',
 'organizations_url': 'https://api.github.com/users/dragonir/orgs',
 'repos_url': 'https://api.github.com/users/dragonir/repos',
 'events_url': 'https://api.github.com/users/dragonir/events{/privacy}',
 'received_events_url': 'https://api.github.com/users/dragonir/received_events',
 'type': 'User',
 'site_admin': False,
 'name': 'dragonir',
 'company': None,
 

In [57]:
pd.read_csv('most_updated_contributor_info_stop25.csv',index_col=0)

Unnamed: 0,login,url,type,name,company,location,hireable,bio,public_repos,public_gists,followers,following
dragonir,dragonir,https://api.github.com/users/dragonir,User,dragonir,,,True,我自食其力,285,5,342,34
nelsonkuang,nelsonkuang,https://api.github.com/users/nelsonkuang,User,茶布多,Aspire,广州,True,"图形学 is fascinating, I love CG",55,0,156,22
mrdoob,mrdoob,https://api.github.com/users/mrdoob,User,,,,,,42,68,18723,169
Mugen87,Mugen87,https://api.github.com/users/Mugen87,User,Michael Herzog,Human Interactive,Germany,,I :heart: three.js,11,1,664,34
alteredq,alteredq,https://api.github.com/users/alteredq,User,AlteredQualia,,,,,6,3,1131,4
WestLangley,WestLangley,https://api.github.com/users/WestLangley,User,,,,,,1,1,198,0
bhouston,bhouston,https://api.github.com/users/bhouston,User,Ben Houston,ThreeKit,"Ottawa, Canada",,Coding computer graphics since 1990. Founder &...,14,19,182,33
takahirox,takahirox,https://api.github.com/users/takahirox,User,Takahiro,Mozilla,Mountain View,True,Average level computer engineer,122,0,544,14
looeee,looeee,https://api.github.com/users/looeee,User,Lewy Blue,Discover three.js,Ireland,True,Author of the book Discover three.js! \r\n\r\n...,22,2,283,2
greggman,greggman,https://api.github.com/users/greggman,User,Greggman,,Earth,,30 years of games\r\n5 years of Chrome,305,333,1369,3


In [32]:
contributor_info = get_repo_contributors(list(repos['url'])[0],10, repo_contributor_rel, list(repos.index)[0])
pd.DataFrame.from_dict(contributor_info,orient='index')

Unnamed: 0,login,url,type,name,company,location,hireable,bio,public_repos,public_gists,followers,following
dragonir,dragonir,https://api.github.com/users/dragonir,User,dragonir,,,True,我自食其力,285,5,342,34
nelsonkuang,nelsonkuang,https://api.github.com/users/nelsonkuang,User,茶布多,Aspire,广州,True,"图形学 is fascinating, I love CG",55,0,156,22


In [54]:
pd.DataFrame(repo_contributor_rel, columns=['Repo','Contributor','Contributions']).sort_values('Repo')

Unnamed: 0,Repo,Contributor,Contributions
10,576201,Mugen87,3976
1,576201,mrdoob,17869
2,576201,looeee,697
19,576201,sunag,588
18,576201,greggman,636
5,576201,gero3,503
17,576201,takahirox,800
15,576201,bhouston,840
14,576201,alteredq,1753
11,576201,WestLangley,1028


In [141]:
info = all_repo_info(github_topics, stop=75)
topic_rel = topic_relationship_table(info)
topic_rel.to_csv('topic_relationship_table_stop25to75.csv')

Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:36<00:00, 11.52s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:09<00:00,  6.43it/s]


3D 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:42<00:00, 11.65s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:09<00:00,  6.47it/s]


Algorithm 448


Scraping top GitHub URLs...: 100%|██████████| 50/50 [11:42<00:00, 14.06s/it]
Scraping top repo info...: 100%|██████████| 430/430 [01:06<00:00,  6.48it/s]


Android 420


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:45<00:00, 11.70s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:10<00:00,  6.35it/s]


API 391


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:36<00:00, 11.53s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:09<00:00,  6.47it/s]


Arduino 439


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:37<00:00, 11.54s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:08<00:00,  6.61it/s]


Atom 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:37<00:00, 11.55s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:12<00:00,  6.17it/s]


aws 448


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:36<00:00, 11.53s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:17<00:00,  5.83it/s]


azure 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:36<00:00, 11.52s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:07<00:00,  6.62it/s]


bash 448


Scraping top GitHub URLs...: 100%|██████████| 50/50 [10:37<00:00, 12.74s/it]
Scraping top repo info...: 100%|██████████| 440/440 [01:02<00:00,  7.02it/s]


bootstrap 435


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:34<00:00, 11.50s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:35<00:00,  4.71it/s]


chrome 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.59s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:35<00:00,  4.72it/s]


compiler 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:27<00:00, 11.36s/it]
Scraping top repo info...: 100%|██████████| 46/46 [00:10<00:00,  4.58it/s]


crytocurrency 46


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:43<00:00, 11.67s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:34<00:00,  4.74it/s]


data structures 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:41<00:00, 11.62s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:10<00:00,  6.43it/s]


database 440


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:38<00:00, 11.57s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:09<00:00,  6.44it/s]


data visualization 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.60s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:34<00:00,  4.79it/s]


deep learning 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:41<00:00, 11.63s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:35<00:00,  4.73it/s]


data science 446


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:41<00:00, 11.62s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:10<00:00,  6.36it/s]


deployment 445


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:38<00:00, 11.57s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:07<00:00,  6.64it/s]


flask 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.58s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:12<00:00,  6.21it/s]


front end 439


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:46<00:00, 11.74s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:07<00:00,  6.64it/s]


git 324


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:38<00:00, 11.58s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:22<00:00,  5.43it/s]


google 438


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.59s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:36<00:00,  4.66it/s]


iOS 436


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:38<00:00, 11.58s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:37<00:00,  4.64it/s]


json 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:44<00:00, 11.69s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:38<00:00,  4.57it/s]


library 440


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:44<00:00, 11.69s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:35<00:00,  4.73it/s]


machine learning 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.58s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:46<00:00,  4.21it/s]


macOS 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:41<00:00, 11.62s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:36<00:00,  4.66it/s]


mobile 445


Scraping top GitHub URLs...: 100%|██████████| 50/50 [11:40<00:00, 14.00s/it]
Scraping top repo info...: 100%|██████████| 430/430 [01:30<00:00,  4.78it/s]


modeling 430


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:38<00:00, 11.57s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:38<00:00,  4.56it/s]


natural language processing 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:40<00:00, 11.62s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:36<00:00,  4.67it/s]


neural network 450


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:38<00:00, 11.57s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:36<00:00,  4.64it/s]


operating system 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.60s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:32<00:00,  4.87it/s]


parsing 449


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:39<00:00, 11.59s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:36<00:00,  4.68it/s]


software 448


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:45<00:00, 11.71s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:36<00:00,  4.66it/s]


server 428


Scraping top GitHub URLs...: 100%|██████████| 50/50 [13:36<00:00, 16.32s/it]
Scraping top repo info...: 100%|██████████| 410/410 [01:31<00:00,  4.50it/s]


virtual reality 410


Scraping top GitHub URLs...: 100%|██████████| 50/50 [09:40<00:00, 11.61s/it]
Scraping top repo info...: 100%|██████████| 450/450 [01:34<00:00,  4.78it/s]


windows 449


In [90]:
len(github_topics)

38

In [139]:
info[info.language != np.nan]

Unnamed: 0,name,stargazers_count,forks_count,subscribers_count,topics,language,created_at,updated_at,url,search_word
442942525,3d,1276,745,25,"[3d, canvas, css, html, javascript, reactjs, s...",JavaScript,2021-12-30T02:19:09Z,2022-04-14T02:03:05Z,/dragonir/3d,3D
576201,three.js,80956,31338,2545,"[3d, augmented-reality, canvas, html5, javascr...",JavaScript,2010-03-23T18:58:01Z,2022-04-14T09:56:48Z,/mrdoob/three.js,3D
100120455,3D-Machine-Learning,7811,1625,588,"[3d, 3d-reconstruction, constructive-solid-geo...",,2017-08-12T15:20:54Z,2022-04-13T14:14:44Z,/timzhang642/3D-Machine-Learning,3D
16971353,3D,131,65,52,[],,2014-02-19T02:27:32Z,2021-12-29T06:33:38Z,/iliterobotics/3D,3D
254127753,3d-photo-inpainting,5869,908,145,"[3d-photo, novel-view-synthesis]",Python,2020-04-08T15:31:45Z,2022-04-14T09:53:49Z,/vt-vl-lab/3d-photo-inpainting,3D
...,...,...,...,...,...,...,...,...,...,...
66768460,Windows-Rootkits,348,166,27,[],C,2016-08-28T13:03:25Z,2022-04-09T00:31:55Z,/ciyze0101/Windows-Rootkits,windows
351432397,Swin-Transformer,7781,1269,122,"[ade20k, image-classification, imagenet, mask-...",Python,2021-03-25T12:42:36Z,2022-04-14T13:24:53Z,/microsoft/Swin-Transformer,windows
38767563,Rocket.Chat.Electron,1348,565,113,"[chat, collaboration, desktop, electron, foss,...",TypeScript,2015-07-08T16:56:41Z,2022-04-12T05:17:19Z,/RocketChat/Rocket.Chat.Electron,windows
51450581,awesome-windows-exploitation,55,590,3,[],,2016-02-10T15:41:56Z,2022-04-07T20:09:21Z,/GuardianRG/awesome-windows-exploitation,windows


In [127]:
topic_rel

Unnamed: 0,id,topic
0,442942525,3d
1,442942525,canvas
2,442942525,css
3,442942525,html
4,442942525,javascript
...,...,...
26060,38767563,rocketchat
26061,38767563,windows
26062,62802730,afl
26063,62802730,fuzzing


In [47]:
repo_info

{'id': 155662306,
 'node_id': 'MDEwOlJlcG9zaXRvcnkxNTU2NjIzMDY=',
 'name': 'homemade-machine-learning',
 'full_name': 'trekhleb/homemade-machine-learning',
 'private': False,
 'owner': {'login': 'trekhleb',
  'id': 3000285,
  'node_id': 'MDQ6VXNlcjMwMDAyODU=',
  'avatar_url': 'https://avatars.githubusercontent.com/u/3000285?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/trekhleb',
  'html_url': 'https://github.com/trekhleb',
  'followers_url': 'https://api.github.com/users/trekhleb/followers',
  'following_url': 'https://api.github.com/users/trekhleb/following{/other_user}',
  'gists_url': 'https://api.github.com/users/trekhleb/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/trekhleb/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/trekhleb/subscriptions',
  'organizations_url': 'https://api.github.com/users/trekhleb/orgs',
  'repos_url': 'https://api.github.com/users/trekhleb/repos',
  'events_url': 'https://api.github.c