## Use the requests library to download web pages

In [1]:
import requests

In [2]:
# GitHub topics index page; this is the first page the notebook downloads.
topics_url = 'https://github.com/topics'

In [3]:
# Always pass a timeout: without one, requests.get can wait forever on a
# stalled connection and hang the kernel.
response = requests.get(topics_url, timeout=30)

In [4]:
## check the status code

In [5]:
# Show the HTTP status code returned for the topics page.
response.status_code

200

In [50]:
# Response body as text; this HTML is what gets saved and parsed in later cells.
page_contents = response.text

In [7]:
# Save the downloaded topics page to webpage.html (UTF-8 encoded).
with open('webpage.html', 'w', encoding='utf-8') as html_file:
    html_file.write(page_contents)

## Using Beautiful Soup to extract information

In [8]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install beautifulsoup4 --upgrade




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
from bs4 import BeautifulSoup

## Topic Title Selector

In [10]:
# Parse the downloaded topics page with Python's built-in HTML parser.
doc = BeautifulSoup(page_contents , 'html.parser')
# Full class attribute of the <p> elements that hold topic titles (see output below).
selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p' , {'class' : selection_class})
# Preview the first five matches.
topic_title_tags[:5]

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>]

## Topic Description Selector

In [11]:
# Class attribute of the <p> elements that hold topic descriptions.
desc_selector = 'f5 color-fg-muted mb-0 mt-1'
topic_desc_tags = doc.find_all('p' , {'class' : desc_selector})
# The raw text is padded with newlines and spaces (see output); it is stripped later.
topic_desc_tags[0].text
# len(topic_desc_tags)

'\n          3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.\n        '

## Topic Link Selector

In [12]:
# <a> elements whose href points at each topic's page.
topic_link_tags = doc.find_all('a' , {'class' : 'no-underline flex-1 d-flex flex-column'})
# Number of topic links found on the page.
len(topic_link_tags)

30

In [13]:
# The href is site-relative (e.g. /topics/3d), so prefix the host to get a full URL.
topic0_url = "https://github.com" + topic_link_tags[0]['href']
print(topic0_url)

https://github.com/topics/3d


## Extracting the topic titles into a list

In [14]:
# Collect the text of every topic-title tag, in page order.
topic_titles = [title_tag.text for title_tag in topic_title_tags]
print(topic_titles)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command-line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'C++', 'Cryptocurrency', 'Crystal']


## Extracting the descriptions into a list

In [15]:
# Collect every topic description, trimming the surrounding whitespace.
topic_desc = [desc_tag.text.strip() for desc_tag in topic_desc_tags]
print(topic_desc)

['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.', 'Ajax is a technique for creating interactive web applications.', 'Algorithms are self-contained sequences that carry out a variety of tasks.', 'Amp is a non-blocking concurrency library for PHP.', 'Android is an operating system built by Google designed for mobile devices.', 'Angular is an open source web application platform.', 'Ansible is a simple and powerful automation engine.', 'An API (Application Programming Interface) is a collection of protocols and subroutines for building software.', 'Arduino is an open source platform for building electronic devices.', 'ASP.NET is a web framework for building modern web apps and services.', 'An awesome list is a list of awesome things curated by the community.', 'Amazon Web Services provides on-demand cloud computing platforms on a subscription basis.', 'Azure is a cloud computing service created by Microsoft.', 'Babel is a compiler for w

## Extracting the URLs into a list

In [16]:
# Host prefix for the site-relative hrefs; later cells also read base_url.
base_url = 'https://github.com'
# Build the absolute URL of every topic page, in page order.
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]
print(topic_urls)

['https://github.com/topics/3d', 'https://github.com/topics/ajax', 'https://github.com/topics/algorithm', 'https://github.com/topics/amphp', 'https://github.com/topics/android', 'https://github.com/topics/angular', 'https://github.com/topics/ansible', 'https://github.com/topics/api', 'https://github.com/topics/arduino', 'https://github.com/topics/aspnet', 'https://github.com/topics/awesome', 'https://github.com/topics/aws', 'https://github.com/topics/azure', 'https://github.com/topics/babel', 'https://github.com/topics/bash', 'https://github.com/topics/bitcoin', 'https://github.com/topics/bootstrap', 'https://github.com/topics/bot', 'https://github.com/topics/c', 'https://github.com/topics/chrome', 'https://github.com/topics/chrome-extension', 'https://github.com/topics/cli', 'https://github.com/topics/clojure', 'https://github.com/topics/code-quality', 'https://github.com/topics/code-review', 'https://github.com/topics/compiler', 'https://github.com/topics/continuous-integration', 'ht

## Installing Pandas & creating the df

In [17]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pandas




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import pandas as pd

In [19]:
## Creating a topic dictionary: column name -> list of scraped values.
## The three lists are assumed to be index-aligned (same topic at the same position).
topic_dict = dict(
    title=topic_titles,
    description=topic_desc,
    url=topic_urls,
)

In [20]:
## Creating the data frame: one row per topic, one column per key of topic_dict.
topics_df = pd.DataFrame(topic_dict)
topics_df

Unnamed: 0,title,description,url
0,3D,3D refers to the use of three-dimensional grap...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source platform for buildin...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


## Creating CSV files of the data frame

In [23]:
# Write the topics table to topics.csv; index = None omits the row-index column.
topics_df.to_csv('topics.csv' , index = None)

## Getting info out of a topic page

In [35]:
# Use the first scraped topic URL (the 3D topic, per the output) as the worked example.
_3Drepo_url = topic_urls[0]
_3Drepo_url

'https://github.com/topics/3d'

In [51]:
# Always pass a timeout: without one, requests.get can wait forever on a
# stalled connection and hang the kernel.
_3Drepo = requests.get(_3Drepo_url, timeout=30)

In [52]:
# Show the HTTP status code returned for the topic page.
_3Drepo.status_code

200

In [53]:
# Size of the downloaded page, in characters.
len(_3Drepo.text)

519556

In [59]:
# Save the downloaded topic page to 3drepo.html (UTF-8 encoded).
with open('3drepo.html', 'w', encoding='utf-8') as html_file:
    html_file.write(_3Drepo.text)

In [96]:
# Parse the topic page HTML with Python's built-in HTML parser.
_3d_repoparser = BeautifulSoup(_3Drepo.text , 'html.parser')

## Selecting Repository Header Tags

In [95]:
# Class attribute of the <h3> headings; each heading holds the owner and repository links.
h3_selection_class = 'f3 color-fg-muted text-normal lh-condensed'
repo_tags = _3d_repoparser.find_all('h3' , {'class' : h3_selection_class})

In [69]:
# Number of repository headings found on the topic page.
no_of_3drepo = len(repo_tags)
no_of_3drepo

20

In [71]:
## Selecting the first repository heading as a worked example
first_repo = repo_tags[0]

In [73]:
# The heading contains two links: the owner first, then the repository (see output).
a_tags = first_repo.find_all('a')
a_tags

[<a class="Link" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"OWNER","click_visual_representation":"REPOSITORY_OWNER_HEADING","actor_id":null,"record_id":97088,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="c72fbd5c69a8ee7c9c53a4e65de2b93c8fc7552dd793945819639bc165c0f0ba" data-turbo="false" data-view-component="true" href="/mrdoob">
             mrdoob
 </a>,
 <a class="Link text-bold wb-break-word" data-hydro-click='{"event_type":"explore.click","payload":{"click_context":"REPOSITORY_CARD","click_target":"REPOSITORY","click_visual_representation":"REPOSITORY_NAME_HEADING","actor_id":null,"record_id":576201,"originating_url":"https://github.com/topics/3d","user_id":null}}' data-hydro-click-hmac="4a2667db3d63a1739c412e059e5da95afe419df83f70949b5d59dc3478f5c79a" data-turbo="false" data-view-component="true" href="/mrdoob/three.js">
             three.js
 </a>]

In [79]:
# Owner name: text of the first link, with the surrounding whitespace removed.
first_repo_username = a_tags[0].text.strip()
first_repo_username

'mrdoob'

In [80]:
# Repository name: text of the second link, with the surrounding whitespace removed.
first_repo_name = a_tags[1].text.strip()
first_repo_name

'three.js'

## Selecting the star-count elements

In [90]:
# NOTE(review): star_class is assigned but not used by the lookup below, which
# selects on the literal 'Counter js-social-count' instead.
star_class = 'js-toggler-target rounded-left-2 btn-with-aria-count btn-sm btn BtnGroup-item color-bg-default'
# <span> elements whose text is the star count of each repository.
repo_star = _3d_repoparser.find_all('span' , {'class' : 'Counter js-social-count'})

In [91]:
# Number of star counters found; compare with the number of repository headings.
len(repo_star)

20

In [89]:
# Raw counter text of the first repository; abbreviated with a 'k' suffix (see output).
repo_star[0].text

'101k'

## Defining Star Parser function

In [92]:
def parse_star_count(stars_str):
    """Convert a star-count label such as '101k' or '1,234' to an int.

    Args:
        stars_str: Text of a star counter. Surrounding whitespace and ','
            thousands separators are ignored. An optional, case-insensitive
            suffix scales the number: 'k' (x 1,000) or 'm' (x 1,000,000).

    Returns:
        The star count as an int, e.g. '26.8k' -> 26800.

    Raises:
        ValueError: If the text is empty or is not a valid number.
    """
    multipliers = {'k': 1_000, 'm': 1_000_000}
    cleaned = stars_str.strip().replace(',', '')
    if not cleaned:
        # Indexing cleaned[-1] on an empty string would raise an unhelpful IndexError.
        raise ValueError('empty star count: {!r}'.format(stars_str))
    suffix = cleaned[-1].lower()
    if suffix in multipliers:
        # round() rather than int(): a float product that lands just below a
        # whole number must not be truncated down by one.
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)

In [94]:
# Index 0 is the first repository (counter text '101k'); index 1 is the second one.
first_repo_starcount = parse_star_count(repo_star[0].text)
first_repo_starcount

101000

## Defining RepoInfo function

In [107]:
def get_repo_info(h1_tag , star_tag):
    """Return (username, repo_name, stars, repo_url) for one repository.

    h1_tag is the repository heading element (the notebook passes <h3> tags
    despite the parameter name). Its first link holds the owner; its second
    link holds the repository name and a site-relative href.
    star_tag is the element whose text is the star count.
    """
    links = h1_tag.find_all('a')
    owner_link = links[0]
    repo_link = links[1]

    username = owner_link.text.strip()
    repo_name = repo_link.text.strip()
    # base_url is a notebook-level variable defined in an earlier cell.
    repo_url = base_url + repo_link['href']
    stars = parse_star_count(star_tag.text)
    return (username, repo_name, stars, repo_url)

In [109]:
# repo_tags -> all repository heading tags
# repo_star -> all star-counter tags
# Try the helper on the first repository.
get_repo_info(repo_tags[0] , repo_star[0])

('mrdoob', 'three.js', 101000, 'https://github.com/mrdoob/three.js')

## Using a loop to collect all repository info for the 3D topic

In [115]:
## Accumulate the repo info column by column: column name -> list of values
_3dtopic_repos_dict = {
    column: [] for column in ('username', 'repo_name', 'stars', 'repo_url')
}

for idx, header_tag in enumerate(repo_tags):
    # get_repo_info -> ('mrdoob', 'three.js', 101000, 'https://github.com/mrdoob/three.js')
    username, repo_name, stars, repo_url = get_repo_info(header_tag, repo_star[idx])
    _3dtopic_repos_dict['username'].append(username)
    _3dtopic_repos_dict['repo_name'].append(repo_name)
    _3dtopic_repos_dict['stars'].append(stars)
    _3dtopic_repos_dict['repo_url'].append(repo_url)

In [117]:
# One row per repository of the 3D topic.
_3dtopic_data = pd.DataFrame(_3dtopic_repos_dict)
_3dtopic_data

Unnamed: 0,username,repo_name,stars,repo_url
0,mrdoob,three.js,101000,https://github.com/mrdoob/three.js
1,pmndrs,react-three-fiber,26800,https://github.com/pmndrs/react-three-fiber
2,libgdx,libgdx,23000,https://github.com/libgdx/libgdx
3,BabylonJS,Babylon.js,22800,https://github.com/BabylonJS/Babylon.js
4,ssloy,tinyrenderer,19900,https://github.com/ssloy/tinyrenderer
5,FreeCAD,FreeCAD,18400,https://github.com/FreeCAD/FreeCAD
6,lettier,3d-game-shaders-for-beginners,17500,https://github.com/lettier/3d-game-shaders-for...
7,aframevr,aframe,16400,https://github.com/aframevr/aframe
8,CesiumGS,cesium,12500,https://github.com/CesiumGS/cesium
9,blender,blender,12300,https://github.com/blender/blender


## Final Code

In [129]:
import os

def get_topic_page(topic_url, timeout=30):
    """Download a topic page and parse it with Beautiful Soup.

    Args:
        topic_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on a stalled connection.

    Returns:
        A BeautifulSoup document built from the response text with 'html.parser'.

    Raises:
        Exception: If the response status code is not 200.
    """
    # Download the page
    response = requests.get(topic_url, timeout=timeout)
    # Check successful response; include the status code to make failures diagnosable
    if response.status_code != 200:
        raise Exception('Failed to load page {} (status code {})'.format(
            topic_url, response.status_code))
    # Parse using Beautiful Soup
    return BeautifulSoup(response.text, 'html.parser')


def get_repo_info(h1_tag , star_tag):
    """Extract one repository's details from its heading and star-counter tags.

    NOTE(review): this redefines a get_repo_info function with the same
    signature that an earlier cell of this notebook already defines.

    Args:
        h1_tag: Heading element with two links, owner first and repository
            second (get_topic_repos passes <h3> tags despite the name).
        star_tag: Element whose text is the star count.

    Returns:
        Tuple of (username, repo_name, stars, repo_url).
    """
    anchors = h1_tag.find_all('a')
    owner_anchor, repo_anchor = anchors[0], anchors[1]
    username = owner_anchor.text.strip()
    repo_name = repo_anchor.text.strip()
    # The href is site-relative; base_url is defined in an earlier notebook cell.
    repo_url = base_url + repo_anchor['href']
    stars = parse_star_count(star_tag.text)
    return username, repo_name, stars, repo_url

def get_topic_repos(topic_repoPage):
    """Collect the repositories listed on a parsed topic page into a DataFrame.

    Args:
        topic_repoPage: Parsed topic page (as returned by get_topic_page).

    Returns:
        A pandas DataFrame with the columns 'username', 'repo_name', 'stars'
        and 'repo_url', one row per repository. The columns are present even
        when no repository is found.
    """
    # header tags -> repo title, repo URL & username
    h3_selection_class = 'f3 color-fg-muted text-normal lh-condensed'
    repo_tags = topic_repoPage.find_all('h3', {'class': h3_selection_class})
    # getting star tags
    repo_star = topic_repoPage.find_all('span', {'class': 'Counter js-social-count'})

    columns = ['username', 'repo_name', 'stars', 'repo_url']
    # zip pairs heading and counter by position and stops at the shorter list,
    # so a page with fewer counters than headings no longer raises IndexError.
    # Each row -> ('mrdoob', 'three.js', 101000, 'https://github.com/mrdoob/three.js')
    rows = [
        get_repo_info(header_tag, star_tag)
        for header_tag, star_tag in zip(repo_tags, repo_star)
    ]
    return pd.DataFrame(rows, columns=columns)

## Write a single function to:
1. Get the list of topics from the topics page
2. Get the list of top repos from the individual topic pages
3. For each topic, create a CSV of the top repos for the topic

In [125]:
# Topic page URLs collected earlier; these are the pages the functions below download.
topic_urls

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android',
 'https://github.com/topics/angular',
 'https://github.com/topics/ansible',
 'https://github.com/topics/api',
 'https://github.com/topics/arduino',
 'https://github.com/topics/aspnet',
 'https://github.com/topics/awesome',
 'https://github.com/topics/aws',
 'https://github.com/topics/azure',
 'https://github.com/topics/babel',
 'https://github.com/topics/bash',
 'https://github.com/topics/bitcoin',
 'https://github.com/topics/bootstrap',
 'https://github.com/topics/bot',
 'https://github.com/topics/c',
 'https://github.com/topics/chrome',
 'https://github.com/topics/chrome-extension',
 'https://github.com/topics/cli',
 'https://github.com/topics/clojure',
 'https://github.com/topics/code-quality',
 'https://github.com/topics/code-review',
 'https://github.com/topics/compiler',
 'https://github.com/topics/co

In [160]:
# Get the list of top repos from the individual topic pages
def getallTopicRepo():
    """Download every page in topic_urls and return one repo DataFrame per topic.

    Reads the notebook-level topic_urls list; the returned list has the same
    order as topic_urls.
    """
    return [
        get_topic_repos(get_topic_page(page_url))
        for page_url in topic_urls
    ]

# For each topic, create a CSV of the top repos for the topic
def scrape_topics_repos(dataFrame):
    """Write each DataFrame in dataFrame to data/<topic title>.csv.

    dataFrame is an iterable of DataFrames. The n-th one is written under the
    n-th title of the notebook-level topic_titles list. The data directory is
    created if it does not exist.

    NOTE(review): the progress messages contain typos ('Scrapping', 'lsit');
    they are left as-is so the printed output does not change.
    """
    print('Scrapping lsit of topics')
    os.makedirs('data', exist_ok=True)
    for position, repo_frame in enumerate(dataFrame):
        title = topic_titles[position]
        print('Scrapping top repositories for ' + title + '....')
        repo_frame.to_csv('data/{}.csv'.format(title), index=None)

In [161]:
# Downloads one page per URL in topic_urls and builds a DataFrame for each topic.
allTopicRepoDf = getallTopicRepo()

In [162]:
# One DataFrame per topic is expected.
print(len(allTopicRepoDf))

30


In [163]:
# Write one CSV per topic into the data/ directory.
scrape_topics_repos(allTopicRepoDf)

Scrapping lsit of topics
Scrapping top repositories for 3D....
Scrapping top repositories for Ajax....
Scrapping top repositories for Algorithm....
Scrapping top repositories for Amp....
Scrapping top repositories for Android....
Scrapping top repositories for Angular....
Scrapping top repositories for Ansible....
Scrapping top repositories for API....
Scrapping top repositories for Arduino....
Scrapping top repositories for ASP.NET....
Scrapping top repositories for Awesome Lists....
Scrapping top repositories for Amazon Web Services....
Scrapping top repositories for Azure....
Scrapping top repositories for Babel....
Scrapping top repositories for Bash....
Scrapping top repositories for Bitcoin....
Scrapping top repositories for Bootstrap....
Scrapping top repositories for Bot....
Scrapping top repositories for C....
Scrapping top repositories for Chrome....
Scrapping top repositories for Chrome extension....
Scrapping top repositories for Command-line interface....
Scrapping top rep