# A Web Scrapping Project on GitHub Top Repositories

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
url = "https://github.com/topics"

In [3]:
res = requests.get(url)

In [4]:
res.status_code

200

In [5]:
content = res.text

In [6]:
content[:1000]

'\n\n<!DOCTYPE html>\n<html\n  lang="en"\n  \n  data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"\n  data-a11y-animated-images="system" data-a11y-link-underlines="true"\n  \n  >\n\n\n\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n  \n\n  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-605318cbe3a1.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-bd1cb5575fff.css" /><link data-color-theme="dark_dimmed" cross

In [7]:
data = BeautifulSoup(content,"html.parser")

In [9]:
len(data)

5

In [11]:
data.find('a',class_='no-underline flex-1 d-flex flex-column')

<a class="no-underline flex-1 d-flex flex-column" href="/topics/3d">
<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>
<p class="f5 color-fg-muted mb-0 mt-1">
          3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.
        </p>
</a>

In [12]:
data.find('a',class_='no-underline flex-1 d-flex flex-column')['href']

'/topics/3d'

In [15]:
data.find('a',class_='no-underline flex-1 d-flex flex-column').find('p').text

'3D'

In [19]:
data.find('a',class_='no-underline flex-1 d-flex flex-column').find_all('p')[1].text

'\n          3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.\n        '

In [26]:
topic = []
desc = []
link = []

In [27]:
for a in data.find_all('a',class_='no-underline flex-1 d-flex flex-column'):
    link.append("https://github.com"+a['href'])
    topic.append(a.find('p').text)
    desc.append(a.find_all('p')[1].text)

In [28]:
len(topic) , len(desc) , len(link)

(30, 30, 30)

In [29]:
ds = {
    'GitHub_Topic':topic,
    'Topic_Desc':desc,
    'Topic_Link':link
}

In [30]:
df1 = pd.DataFrame(ds)

In [31]:
df1.shape

(30, 3)

In [32]:
df1

Unnamed: 0,GitHub_Topic,Topic_Desc,Topic_Link
0,3D,\n 3D refers to the use of three-dime...,https://github.com/topics/3d
1,Ajax,\n Ajax is a technique for creating i...,https://github.com/topics/ajax
2,Algorithm,\n Algorithms are self-contained sequ...,https://github.com/topics/algorithm
3,Amp,\n Amp is a non-blocking concurrency ...,https://github.com/topics/amphp
4,Android,\n Android is an operating system bui...,https://github.com/topics/android
5,Angular,\n Angular is an open source web appl...,https://github.com/topics/angular
6,Ansible,\n Ansible is a simple and powerful a...,https://github.com/topics/ansible
7,API,\n An API (Application Programming In...,https://github.com/topics/api
8,Arduino,\n Arduino is an open source platform...,https://github.com/topics/arduino
9,ASP.NET,\n ASP.NET is a web framework for bui...,https://github.com/topics/aspnet


In [33]:
df1.to_csv('GitHub_Topics.csv')

In [35]:
df1['Topic_Link']

0                         https://github.com/topics/3d
1                       https://github.com/topics/ajax
2                  https://github.com/topics/algorithm
3                      https://github.com/topics/amphp
4                    https://github.com/topics/android
5                    https://github.com/topics/angular
6                    https://github.com/topics/ansible
7                        https://github.com/topics/api
8                    https://github.com/topics/arduino
9                     https://github.com/topics/aspnet
10                   https://github.com/topics/awesome
11                       https://github.com/topics/aws
12                     https://github.com/topics/azure
13                     https://github.com/topics/babel
14                      https://github.com/topics/bash
15                   https://github.com/topics/bitcoin
16                 https://github.com/topics/bootstrap
17                       https://github.com/topics/bot
18        

In [60]:
user = []
repo = []
repolink = []
star = []

In [61]:
for url in df1['Topic_Link']:
    data = BeautifulSoup(requests.get(url).content,'html.parser')
    for a in data.find_all('div',class_='d-flex flex-justify-between flex-items-start flex-wrap gap-2 my-3'):
        u = a.find('a').text
        user.append(u)
        r = a.find_all('a')[1].text
        repo.append(r)
        repolink.append("https://github.com/"+u+"/"+r)
        star.append(a.find('span',class_='Counter js-social-count')['title'])

In [62]:
len(user) , len(repo) , len(repolink) , len(star)

(600, 600, 600, 600)

In [63]:
ds = {
    'Username' : user,
    'Repo_Name' : repo,
    'Repo_Link' : repolink,
    'Stars' : star
}

In [64]:
df2 = pd.DataFrame(ds)

In [65]:
df2.shape

(600, 4)

In [66]:
df2.head(10)

Unnamed: 0,Username,Repo_Name,Repo_Link,Stars
0,mrdoob,three.js,https://github.com/mrdoob/three.js,104790
1,pmndrs,react-three-fiber,https://github.com/pmndrs/react-three-fiber,28326
2,libgdx,libgdx,https://github.com/libgdx/libgdx,23804
3,BabylonJS,Babylon.js,https://github.com/BabylonJS/Babylon.js,23712
4,FreeCAD,FreeCAD,https://github.com/FreeCAD/FreeCAD,23391
5,ssloy,tinyrenderer,https://github.com/ssloy/tinyrenderer,21413
6,lettier,3d-game-shaders-for-beginners,https://github.com/lettier/3d-game-shaders-for...,18430
7,aframevr,aframe,https://github.com/aframevr/aframe,16923
8,blender,blender,https://github.com/blender/blender,14424
9,CesiumGS,cesium,https://github.com/CesiumGS/cesium,13386


In [67]:
df2.to_csv('RepoReport.csv')

# Project End