# Web Scraping GitHub

This notebook scrapes data from https://github.com/topics. For each topic it collects the topic name, the topic's description, the top repositories for that topic, and the creators of those repositories.

## Importing necessary modules

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Getting data

In [3]:
# Page that the following cells download and parse for topic data.
url = "https://github.com/topics"

In [4]:
# Download the topics page. The timeout stops the cell from hanging
# indefinitely if the server never answers.
data = requests.get(url, timeout=10)
# Last expression of the cell: displays the HTTP status code.
data.status_code

200

In [5]:
# Keep the response body as text so it can be handed to the HTML parser.
web_page = data.text

## Scraping information

In [7]:
# Parse the downloaded HTML; later cells search this object for topic data.
doc = BeautifulSoup(web_page, "html.parser")

In [8]:
list_of_topics = []

# Drill down through the nested containers, one lookup per line, to the
# section that is searched for topic titles.
topic_section = (
    doc.find('div', class_="application-main")
    .find("main")
    .find("div", class_="d-lg-flex container-lg p-responsive")
    .find("div", class_="col-lg-9 position-relative pr-lg-5 mb-6 mr-lg-5")
)
topics = topic_section.find_all("p", class_="f3 lh-condensed mb-0 mt-1 Link--primary")

# Record the raw text of every title paragraph, in page order.
list_of_topics.extend(title_tag.text for title_tag in topics)

In [9]:
list_of_topics_desc = []

# Every <p> with exactly this class string is treated as a topic description.
topic_desc_class = "f5 color-fg-muted mb-0 mt-1"
topic_desc_section = doc.find_all('p', class_=topic_desc_class)

# Store each description with surrounding whitespace removed.
list_of_topics_desc += [desc_tag.text.strip() for desc_tag in topic_desc_section]

In [10]:
topic_urls = []
base_url = "https://github.com"

# Anchors with this class string carry the link to each topic page.
topic_url_class = "no-underline flex-grow-0"
topic_url_section = doc.find_all('a', class_=topic_url_class)

# Prefix each href with the site root to build the full topic URL.
topic_urls.extend(base_url + anchor['href'] for anchor in topic_url_section)

## Converting into dataframe

In [11]:
# Column name -> scraped list; the three lists are expected to line up by position.
topic_info_dict = {
    'topic_name': list_of_topics,
    'topic_description': list_of_topics_desc,
    'topic_url': topic_urls,
}

In [13]:
# Build the topics table; the dict keys become the column names.
topic_df = pd.DataFrame(topic_info_dict)

In [14]:
topic_df

Unnamed: 0,topic_name,topic_description,topic_url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [15]:
# topic_df.to_csv("topics.csv", index=None)

## Working on individual topic page

In [20]:
# Use the first scraped topic URL as the example for a single topic page.
topic0 = topic_urls[0]

In [23]:
# Download the example topic page. The timeout stops the cell from hanging
# indefinitely if the server never answers.
res = requests.get(topic0, timeout=10)
# Last expression of the cell: displays the HTTP status code.
res.status_code

200

In [25]:
# Raw HTML of the example topic page.
topic_page = res.text

In [27]:
# Parse the example topic page so its elements can be searched.
topic_doc = BeautifulSoup(topic_page, "html.parser")

### Getting topic name, topic description, repo name and repo owner name

In [33]:
# Text of the first <h1 class="h1">, with surrounding whitespace removed, is used as the topic name.
topic_name = topic_doc.find('h1', class_='h1').text.strip()

In [34]:
# The first <p> inside the 'markdown-body f5 mb-2' container is used as the topic description.
topic_desc = topic_doc.find('div', class_='markdown-body f5 mb-2').find('p').text.strip()

In [50]:
# Text of the first link in the first repo heading (<h3>) is taken as the repository owner.
# NOTE: here repo_creator is a string; a later cell rebinds the same name to the
# list of <h3> tags returned by find_all().
repo_creator = topic_doc.find('h3', class_='f3 color-fg-muted text-normal lh-condensed').find('a').text.strip()

In [48]:
# Text of the last link in the first repo heading (<h3>) is taken as the repository name.
repo_name = topic_doc.find('h3', class_='f3 color-fg-muted text-normal lh-condensed').find_all('a')[-1].text.strip()

In [55]:
repo_owner = []

# One <h3> heading per repository card on the topic page.
repo_creator = topic_doc.find_all('h3', class_='f3 color-fg-muted text-normal lh-condensed')

# The first link inside each heading is read as the owner.
repo_owner.extend(heading.find('a').text.strip() for heading in repo_creator)

In [62]:
# Inspect the last link inside the first repo heading; its text is what later cells read as the repository name.
repo_creator[0].find_all('a')[-1]

<a class="text-bold wb-break-word" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="517d3d5cb9d89752156923904a4238816bc9b51ab7772f3e3644ce897d8dd4e5" data-turbo="false" data-view-component="true" href="/mrdoob/three.js">
            three.js
</a>

In [63]:
repo_names = []

# The last link inside each repo heading is read as the repository name.
repo_names.extend(heading.find_all('a')[-1].text.strip() for heading in repo_creator)

In [91]:
# NOTE(review): topic_names and topic_descs are not assigned in any cell shown above
# this one; they first appear in the "Defining a function" cell below. On a fresh
# top-to-bottom run this line would raise NameError. Confirm, then move this check
# to after the scraping loop.
len(repo_owner), len(repo_names), len(topic_names), len(topic_descs)

(520, 520, 26, 26)

## Defining a function 

In [77]:
# Module-level accumulators that topics_page() appends to.
# Re-running this cell resets them to empty lists.
topic_names = []
topic_descs = []
repo_names = []
repo_owner = []

def topics_page(topic_url, timeout=10):
    """Scrape one GitHub topic page into the module-level result lists.

    Parameters
    ----------
    topic_url : str
        URL of the topic page to download.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    tuple or int
        ``(topic_names, topic_descs, repo_owner, repo_names)`` (the shared
        lists themselves) when the page was scraped, otherwise ``404``.

    Notes
    -----
    Everything is parsed into local values first and only then appended to
    the shared lists, so a page with a missing element cannot leave the
    lists out of step with one another.
    """
    try:
        res = requests.get(topic_url, timeout=timeout)
        # Treat HTTP error responses as failures instead of parsing an error page.
        res.raise_for_status()
        topic_doc = BeautifulSoup(res.text, "html.parser")

        # find() returns None for a missing element, so the attribute access
        # below raises AttributeError, which is handled as a failed page.
        name = topic_doc.find('h1', class_='h1').text.strip()
        desc = topic_doc.find('div', class_='markdown-body f5 mb-2').find('p').text.strip()

        repo_headings = topic_doc.find_all('h3', class_='f3 color-fg-muted text-normal lh-condensed')
        # First link in a heading is read as the owner, last link as the repo name.
        owners = [repo.find('a').text.strip() for repo in repo_headings]
        names = [repo.find_all('a')[-1].text.strip() for repo in repo_headings]
    except (requests.RequestException, AttributeError, IndexError):
        # Only the expected fetch/parse failures are swallowed; anything else
        # (including KeyboardInterrupt) propagates to the caller.
        return 404

    # Commit the page's data in one go now that every field parsed.
    topic_names.append(name)
    topic_descs.append(desc)
    repo_owner.extend(owners)
    repo_names.extend(names)

    return topic_names, topic_descs, repo_owner, repo_names

In [78]:
# Scrape every topic page. A dedicated loop name keeps the landing-page `url`
# defined earlier from being overwritten by the last topic URL.
failed_topic_urls = []
for topic_url in topic_df['topic_url']:
    # topics_page() signals a page it could not scrape by returning 404;
    # remember those URLs instead of silently dropping them.
    if topics_page(topic_url) == 404:
        failed_topic_urls.append(topic_url)

In [84]:
# Pair the owner and name lists that topics_page() filled; they are expected to line up by position.
repo_info_dict = {'repo_owner': repo_owner, 'repo_names': repo_names}

In [85]:
# Build the repository table; the dict keys become the column names.
repo_info_df = pd.DataFrame(repo_info_dict)

In [95]:
repo_info_df

Unnamed: 0,repo_owner,repo_names
0,mrdoob,three.js
1,libgdx,libgdx
2,pmndrs,react-three-fiber
3,BabylonJS,Babylon.js
4,ssloy,tinyrenderer
...,...,...
515,code-review-checklists,java-concurrency
516,jiangsir404,Audit-Learning
517,TeamCodeStream,codestream
518,enlightn,enlightn


In [96]:
# Save the scraped owner/name pairs. index=False is the documented boolean
# form for leaving the row index out of the file.
repo_info_df.to_csv("repo_info_df.csv", index=False)