# Web-scraping: github-topics

In [1]:
#importing the modules
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'https://github.com/topics'

response = requests.get(url)
response.status_code

200

In [3]:
soup = BeautifulSoup(response.text, 'html.parser')

# Titles

In [6]:
selected_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'

topic_title_tags = soup.find_all('p', {'class' : selected_class})

In [7]:
topic_title_tags[0].text.strip()

'3D'

In [24]:
# Collect the whitespace-stripped text of every matched title tag.
topic_titles = [title_tag.text.strip() for title_tag in topic_title_tags]

print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


# Description

In [9]:
selected_class = 'f5 color-fg-muted mb-0 mt-1'

topic_desc_tags = soup.find_all('p', {'class' : selected_class})

In [11]:
topic_desc_tags[0].text.strip()

'3D modeling is the process of virtually developing the surface and structure of a 3D object.'

In [14]:
# Collect the whitespace-stripped text of every matched description tag.
topic_desc = [desc_tag.text.strip() for desc_tag in topic_desc_tags]

topic_desc[:5]

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

# Url

In [16]:
selected_class = 'no-underline flex-grow-0'

topic_url_tags = soup.find_all('a', {'class' : selected_class})

In [18]:
topic_url_tags[0]['href']

'/topics/3d'

In [20]:
base_url = 'https://github.com'
topic_url = base_url + topic_url_tags[0]['href']
topic_url

'https://github.com/topics/3d'

In [21]:
# The hrefs are site-relative (e.g. '/topics/3d'), so prefix each with base_url.
topic_urls = [base_url + link_tag['href'] for link_tag in topic_url_tags]

topic_urls[:4]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp']

**Create a dataframe**

In [22]:
import pandas as pd

In [25]:
topic_dict = {
    'title' : topic_titles,
    'description' : topic_desc,
    'url' : topic_urls
}

In [26]:
topics_df = pd.DataFrame(topic_dict)
topics_df

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


# Topic page

- We will first scrape the repositories for a single topic, then repeat the process for all the other topics.

In [28]:
topic_page_url = topic_urls[0]

In [29]:
#topic page url - '3d'
topic_page_url

'https://github.com/topics/3d'

In [31]:
response = requests.get(topic_page_url)
response.status_code

200

In [32]:
topic_doc = BeautifulSoup(response.text, 'html.parser')

In [33]:
h3_selected_tags = 'f3 color-fg-muted text-normal lh-condensed'

repo_tags = topic_doc.find_all('h3', {'class' : h3_selected_tags})

In [42]:
a_tags = repo_tags[0].find_all('a')

In [44]:
#username
a_tags[0].text.strip()

'mrdoob'

In [45]:
#repo_name
a_tags[1].text.strip()

'three.js'

In [52]:
#repo-url
base_url = 'https://github.com'
repo_url = base_url + a_tags[1]['href']
print(repo_url)

https://github.com/mrdoob/three.js


In [63]:
#star-tags
star_tags = topic_doc.find_all('span', {'class' : 'Counter js-social-count'})

star_tags[0].text

'81.6k'

In [64]:
#function to parse the star count
def parse_star_count(star_str):
    """Convert a displayed star count such as '81.6k' into an integer.

    Surrounding whitespace and thousands separators (',') are ignored, and
    the magnitude suffix is matched case-insensitively.

    Parameters
    ----------
    star_str : str
        Text of the star counter, e.g. '950', '1,234', '81.6k' or '1.5m'.

    Returns
    -------
    int
        The star count as a whole number.

    Raises
    ------
    ValueError
        If the text is empty after stripping or is not a recognisable number.
    """
    cleaned = star_str.strip().replace(',', '').lower()
    if not cleaned:
        # Indexing cleaned[-1] on an empty string would raise an opaque IndexError.
        raise ValueError('Empty star count: {!r}'.format(star_str))

    multipliers = {'k': 1_000, 'm': 1_000_000}
    factor = multipliers.get(cleaned[-1])
    if factor is not None:
        # round() rather than bare int(): a binary float product can land just
        # below the intended integer, and int() alone would truncate it.
        return int(round(float(cleaned[:-1]) * factor))
    return int(cleaned)

In [65]:
#star count
parse_star_count(star_tags[0].text)

81600

In [68]:
#define a function to extract repo name, username, url, stars
def get_repo_tags(repo_tag, star_tag):
    """Pull the owner, repository name, star count and URL for one repository.

    The first <a> inside ``repo_tag`` supplies the username and the second
    supplies the repository name and its relative href; ``star_tag`` supplies
    the star counter text.

    Returns a tuple ``(username, repo_name, stars, repo_url)``.
    """
    links = repo_tag.find_all('a')
    owner_link, repo_link = links[0], links[1]

    username = owner_link.text.strip()
    repo_name = repo_link.text.strip()
    # href is site-relative, so prefix it with the notebook-level base_url.
    repo_url = base_url + repo_link['href']
    stars = parse_star_count(star_tag.text)

    return username, repo_name, stars, repo_url

In [69]:
#repo info
get_repo_tags(repo_tags[0], star_tags[0])

('mrdoob', 'three.js', 81600, 'https://github.com/mrdoob/three.js')

In [71]:
# Column-oriented accumulator: one list per DataFrame column.
repo_columns = ('username', 'repo_name', 'stars', 'repo_url')
topic_repos_dict = {column: [] for column in repo_columns}

# Pair each repository heading with the star counter at the same position.
for position, repo_tag in enumerate(repo_tags):
    repo_info = get_repo_tags(repo_tag, star_tags[position])
    for slot, column in enumerate(repo_columns):
        topic_repos_dict[column].append(repo_info[slot])


In [72]:
topic_repos_df = pd.DataFrame(topic_repos_dict)
topic_repos_df

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,81600,https://github.com/mrdoob/three.js
1,libgdx,libgdx,20000,https://github.com/libgdx/libgdx
2,pmndrs,react-three-fiber,17800,https://github.com/pmndrs/react-three-fiber
3,BabylonJS,Babylon.js,16600,https://github.com/BabylonJS/Babylon.js
4,aframevr,aframe,14100,https://github.com/aframevr/aframe
5,ssloy,tinyrenderer,13600,https://github.com/ssloy/tinyrenderer
6,lettier,3d-game-shaders-for-beginners,12800,https://github.com/lettier/3d-game-shaders-for...
7,FreeCAD,FreeCAD,11200,https://github.com/FreeCAD/FreeCAD
8,metafizzy,zdog,9100,https://github.com/metafizzy/zdog
9,CesiumGS,cesium,8600,https://github.com/CesiumGS/cesium


In [107]:
def get_topic_page(topic_url):
    """Download ``topic_url`` and return it parsed as a BeautifulSoup document.

    Raises ``Exception`` when the response status code is anything other
    than 200.
    """
    response = requests.get(topic_url)

    page_loaded = response.status_code == 200
    if not page_loaded:
        raise Exception(f"Failed to load page {topic_url}")

    return BeautifulSoup(response.text, 'html.parser')


#define a function to extract repo name, username, url, stars
def get_repo_tags(repo_tag, star_tag):
    """Return ``(username, repo_name, stars, repo_url)`` for one repository.

    ``repo_tag`` is expected to contain at least two <a> elements: the first
    holds the username, the second holds the repository name and its relative
    href. ``star_tag`` holds the star counter text.
    """
    anchors = repo_tag.find_all('a')
    user_anchor, repo_anchor = anchors[0], anchors[1]

    username = user_anchor.text.strip()
    repo_name = repo_anchor.text.strip()
    # Relative href -> absolute URL using the notebook-level base_url.
    repo_url = base_url + repo_anchor['href']
    stars = parse_star_count(star_tag.text)

    return username, repo_name, stars, repo_url


def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed in a parsed topic page.

    Repository headings (<h3>) and star counters (<span>) are located by CSS
    class and paired by position. The result has the columns ``username``,
    ``repo_name``, ``stars`` and ``repo_url``.
    """
    repo_heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    repo_tags = topic_doc.find_all('h3', {'class': repo_heading_class})
    star_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    # One list per output column, filled row by row below.
    columns = ('username', 'repo_name', 'stars', 'repo_url')
    topic_repos_dict = {column: [] for column in columns}

    for position, repo_tag in enumerate(repo_tags):
        repo_info = get_repo_tags(repo_tag, star_tags[position])
        for slot, column in enumerate(columns):
            topic_repos_dict[column].append(repo_info[slot])

    return pd.DataFrame(topic_repos_dict)


def scrape_topic(topic_url, path):
    """Scrape the repositories of one topic page and save them as a CSV file.

    Parameters
    ----------
    topic_url : str
        URL of the topic page to download.
    path : str
        Destination CSV path. If a file already exists there, nothing is
        downloaded or written.

    Returns
    -------
    None
    """
    # Local import keeps this function usable even when the cell that imports
    # os at notebook level has not been executed yet.
    import os

    if os.path.exists(path):
        print("The file {} already exists".format(path))
        return

    topic_df = get_topic_repos(get_topic_page(topic_url))

    # to_csv does not create missing directories, so make sure the parent
    # folder (e.g. 'data') exists before writing.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    topic_df.to_csv(path, index=False)

In [79]:
topic_urls[8]

'https://github.com/topics/arduino'

In [86]:
get_topic_repos(get_topic_page(topic_urls[8])).to_csv('arduino.csv', index=None)

In [108]:
#get the list of topics (titles, descriptions, urls) from the topics index page

def get_topic_title(doc):
    """Return the stripped topic titles found in a parsed topics page.

    Parameters
    ----------
    doc : BeautifulSoup-like
        Parsed document; only its ``find_all`` method is used.

    Returns
    -------
    list of str
        Text of every <p> tag carrying the topic-title CSS class.
    """
    selected_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    # Search the document that was passed in, not the notebook-global `soup`.
    topic_title_tags = doc.find_all('p', {'class': selected_class})
    return [tag.text.strip() for tag in topic_title_tags]


def get_topic_desc(doc):
    """Return the stripped topic descriptions found in a parsed topics page.

    Parameters
    ----------
    doc : BeautifulSoup-like
        Parsed document; only its ``find_all`` method is used.

    Returns
    -------
    list of str
        Text of every <p> tag carrying the topic-description CSS class.
    """
    selected_class = 'f5 color-fg-muted mb-0 mt-1'
    # Search the document that was passed in, not the notebook-global `soup`.
    topic_desc_tags = doc.find_all('p', {'class': selected_class})
    return [tag.text.strip() for tag in topic_desc_tags]


def get_topic_urls(doc, base_url='https://github.com'):
    """Return the absolute URLs of the topics linked from a parsed topics page.

    Parameters
    ----------
    doc : BeautifulSoup-like
        Parsed document; only its ``find_all`` method is used.
    base_url : str, optional
        Prefix prepended to each relative ``href``. Defaults to the GitHub
        origin so the function no longer depends on a notebook-global variable.

    Returns
    -------
    list of str
        ``base_url`` + ``href`` for every <a> tag carrying the topic-link
        CSS class.
    """
    selected_class = 'no-underline flex-grow-0'
    # Search the document that was passed in, not the notebook-global `soup`.
    topic_url_tags = doc.find_all('a', {'class': selected_class})
    return [base_url + tag['href'] for tag in topic_url_tags]
                                   

                                   
def scrape_topics():
    """Download the GitHub topics index and return its topics as a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description`` and ``url``, extracted from the
        page downloaded by this call.

    Raises
    ------
    Exception
        If the topics page does not respond with status code 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Report the URL that was actually requested (previously this
        # formatted the unrelated name `topic_url`).
        raise Exception('Failed to load page {}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')
    # Extract from the freshly parsed page instead of reusing lists left
    # behind by earlier notebook cells.
    topic_dict = {
        'title': get_topic_title(doc),
        'description': get_topic_desc(doc),
        'url': get_topic_urls(doc),
    }
    return pd.DataFrame(topic_dict)

In [94]:
scrape_topics()

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [103]:
import os

def scrape_topic_repos():
    """Scrape the topic list, then save each topic's repositories to a CSV.

    One file per topic is written as ``data/<title>.csv`` via ``scrape_topic``.
    Any exception raised while scraping a topic propagates to the caller.
    """
    print('Scraping list of topics')
    topics_df = scrape_topics()

    # Ensure the output folder exists; exist_ok=True keeps re-runs from
    # failing when the folder is already there.
    os.makedirs('data', exist_ok=True)

    for _, row in topics_df.iterrows():
        print('Scraping top repositories for {}'.format(row['title']))
        scrape_topic(row['url'], 'data/{}.csv'.format(row['title']))

In [109]:
scrape_topic_repos()

Scraping list of topics
Scraping top repositories for 3D
Scraping top repositories for Ajax
Scraping top repositories for Algorithm
Scraping top repositories for Amp
Scraping top repositories for Android
Scraping top repositories for Angular
Scraping top repositories for Ansible
Scraping top repositories for API
Scraping top repositories for Arduino
Scraping top repositories for ASP.NET
Scraping top repositories for Atom
Scraping top repositories for Awesome Lists
Scraping top repositories for Amazon Web Services
Scraping top repositories for Azure
Scraping top repositories for Babel
Scraping top repositories for Bash
Scraping top repositories for Bitcoin
Scraping top repositories for Bootstrap
Scraping top repositories for Bot
Scraping top repositories for C
Scraping top repositories for Chrome
Scraping top repositories for Chrome extension
Scraping top repositories for Command line interface
Scraping top repositories for Clojure
Scraping top repositories for Code quality
Scraping top

Exception: Failed to load page https://github.com/topics/cpp