In [57]:
from top_github_scraper import (get_top_repo_urls, get_top_repos, get_top_contributors, 
get_top_user_urls, get_top_users)
#import datapane as dp 
import pandas as pd 
from tqdm import tqdm 
from folium import plugins
import geopandas
from geopy.geocoders import Nominatim
import folium
from folium.plugins import Search
import requests
import os
from bs4 import BeautifulSoup
import time
USERNAME = os.getenv("GITHUB_USERNAME")
TOKEN = os.getenv("GITHUB_TOKEN")

In [89]:
keywords = ["data science","api"]
github_topics = ['3D','Algorithm','Android','API','Arduino','Atom','aws','azure','bash','bootstrap','chrome','compiler','crytocurrency','data structures','database','data visualization','deep learning','data science','deployment','flask','front end','git','google','iOS','json','library','machine learning','macOS','mobile','modeling','natural language processing','neural network','operating system','parsing','software','server','virtual reality','windows']

In [103]:
def get_repo_info(keyword, stop=10):
    repos = get_repo_urls(keyword, stop=stop)
    
    all_repo_info = dict()
    info_to_scrape = ['name',"stargazers_count", "forks_count", 'subscribers_count', 'topics', 'language', 'created_at','updated_at']
    for repo in tqdm(repos,desc="Scraping top repo info..."):
        repo_url = repo
        repo_info_url = f"https://api.github.com/repos{repo_url}"
        repo_info = requests.get(repo_info_url, auth=(USERNAME, TOKEN))
        if repo_info.status_code == 429:
            time.sleep(30)
        repo_info = repo_info.json()
        repo_name = repo_info['id']
        repo_important_info = {}
        for info in info_to_scrape:
            repo_important_info[info] = repo_info[info]
        all_repo_info[repo_name] = repo_important_info
    repo_df = pd.DataFrame.from_dict(all_repo_info, orient='index', columns=info_to_scrape)
    return repo_df

def all_repo_info(keyword, stop=10):
    repo_df = pd.DataFrame(columns=['name',"stargazers_count", "forks_count", 'subscribers_count', 'topics', 'language', 'created_at','updated_at'])
    for k in keywords:
        new_repo = get_repo_info(k, stop=stop)
        print(k,len(new_repo.index))
        repo_df = pd.concat([repo_df,new_repo])
        repo_df.to_csv('most_updated_repo_info.csv')
    return repo_df
            

def topic_relationship_table(repo_df):
    id_list = []
    topic_list = []
    for i in repo_df.index:
        topics = repo_df.loc[i,'topics']
        for t in topics:
            id_list.append(i)
            topic_list.append(t)
    df = pd.DataFrame({'id':id_list,'topic':topic_list})
    return df


SCRAPE_CLASS = {'Users': 'mr-1', 'Repositories': "v-align-middle"}
TYPE = 'Repositories'
def get_repo_urls(keyword, stop=10):
    urls = []
    page = None
    for page_num in tqdm(range(0, stop),desc="Scraping top GitHub URLs..."):
        keyword_no_space = ("+").join(keyword.split(" "))
        url = f"https://github.com/search?o=desc&p={str(page_num)}&q={keyword_no_space}&s=&type={TYPE}"
        if page and page.status_code == 429:
            print('sleeping')
            time.sleep(60)
        page = requests.get(url, headers={'User-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15'})
        if page.status_code == 429:
            print('sleeping')
            time.sleep(60)
        soup = BeautifulSoup(page.text, "html.parser")
        a_tags = soup.find_all("a", class_=SCRAPE_CLASS[TYPE])
        new_urls = [a_tag.get("href") for a_tag in a_tags]
        urls.extend(new_urls)
        time.sleep(5)
    return urls

In [None]:
info = all_repo_info(github_topics, stop=10)
#repos_1 = get_repo_urls("machine learning", stop=10)
#print(len(repos_1))

Scraping top GitHub URLs...:   0%|          | 0/10 [00:00<?, ?it/s]

In [90]:
len(github_topics)

38

In [105]:
info

Unnamed: 0,name,stargazers_count,forks_count,subscribers_count,topics,language,created_at,updated_at
65388917,PythonDataScienceHandbook,34099,15223,1749,"[jupyter-notebook, matplotlib, numpy, pandas, ...",Jupyter Notebook,2016-08-10T14:24:36Z,2022-04-13T22:31:48Z
29749635,data-science-ipython-notebooks,22939,7111,1637,"[aws, big-data, caffe, data-science, deep-lear...",Python,2015-01-23T19:38:29Z,2022-04-13T20:13:34Z
61486207,data-science,14626,2569,917,[],,2016-06-19T15:15:36Z,2022-04-14T01:07:02Z
26382146,data-science-from-scratch,6852,3860,631,[],Python,2014-11-09T02:31:24Z,2022-04-13T21:51:01Z
85111422,Data-Analysis,4223,3389,349,[],Jupyter Notebook,2017-03-15T19:13:06Z,2022-04-13T11:56:56Z
...,...,...,...,...,...,...,...,...
58159876,api,509,53,27,[],TypeScript,2016-05-05T20:51:29Z,2022-04-13T10:47:13Z
102723966,api,335,58,25,"[database, exploits, python, python-wrapper, s...",Python,2017-09-07T10:27:06Z,2022-04-02T07:57:04Z
4372486,api,186,69,37,[],,2012-05-18T19:02:23Z,2022-03-10T20:40:48Z
109716868,api,103,88,14,"[api, api-rest, api-restfull, laravel, laravel...",PHP,2017-11-06T15:57:48Z,2022-04-08T06:31:57Z


In [100]:
topic_df = topic_relationship_table(info)
topic_df

Unnamed: 0,id,topic
0,86140645,adaboost
1,86140645,adaboost-algorithm
2,86140645,decision-tree
3,86140645,knn
4,86140645,logistic
...,...,...
529,298262517,youtube
530,298262517,youtube-playlist
531,48062442,andrew-ng
532,48062442,coursera-machine-learning


In [47]:
repo_info

{'id': 155662306,
 'node_id': 'MDEwOlJlcG9zaXRvcnkxNTU2NjIzMDY=',
 'name': 'homemade-machine-learning',
 'full_name': 'trekhleb/homemade-machine-learning',
 'private': False,
 'owner': {'login': 'trekhleb',
  'id': 3000285,
  'node_id': 'MDQ6VXNlcjMwMDAyODU=',
  'avatar_url': 'https://avatars.githubusercontent.com/u/3000285?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/trekhleb',
  'html_url': 'https://github.com/trekhleb',
  'followers_url': 'https://api.github.com/users/trekhleb/followers',
  'following_url': 'https://api.github.com/users/trekhleb/following{/other_user}',
  'gists_url': 'https://api.github.com/users/trekhleb/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/trekhleb/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/trekhleb/subscriptions',
  'organizations_url': 'https://api.github.com/users/trekhleb/orgs',
  'repos_url': 'https://api.github.com/users/trekhleb/repos',
  'events_url': 'https://api.github.c