## Installing Packages

In [8]:
!pip install requests --upgrade --quiet
!pip install BeautifulSoup4 --upgrade --quiet
!pip install Pandas --upgrade --quiet

## Importing Packages

In [106]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import os

## Getting HTML Response for all topics using requests

In [10]:
# Starting point of the scrape: GitHub's topic listing page.
topic_url='https://github.com/topics'

In [11]:
# Download the topic listing page.
response = requests.get(topic_url)
# Fail fast on a 4xx/5xx answer so an error page is never parsed as if it
# were the topic listing.
response.raise_for_status()

In [4]:
# Body of the HTTP response decoded to a str.
html_response = response.text

In [12]:
# Number of characters downloaded -- a quick check that a real page came back.
len(html_response)

155437

## Save HTML Response

In [14]:
# Write the downloaded HTML to disk, encoded as UTF-8.
with open('topic_webpage.html', mode='w', encoding='UTF-8') as html_file:
    html_file.write(html_response)

## Parse HTML_Response using BeautifulSoup

In [17]:
# Parse the downloaded HTML using the 'html.parser' backend.
document =  BeautifulSoup(html_response,'html.parser')

In [18]:
# NOTE(review): echoing the whole soup renders the entire page in the cell
# output; something like `document.title` would be a lighter sanity check.
document


<!DOCTYPE html>

<html data-a11y-animated-images="system" data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="" href="https://github.githubassets.com" rel="preconnect"/>
<link href="https://avatars.githubusercontent.com" rel="preconnect"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/light-0946cdc16f15.css" media="all" rel="stylesheet"><link crossorigin="anonymous" href="https://github.githubassets.com/assets/dark-3946c959759a.css" media="all" rel="stylesheet"><link crossorigin="anonymous" data-color-theme="dark_dimmed" data-href="https://github.githubassets.com/assets/dark_dimmed-9b9a8c91acc5.c

## Finding the tags that contain the topic names

In [20]:
# Exact class string of the <p> tags that carry a topic title; the dict form
# of find_all matches on the `class` attribute.
selection_class_topics = 'f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = document.find_all('p',{'class': selection_class_topics})

In [21]:
# Show the matched <p> tags to verify that the selector picked up the topic titles.
topic_title_tags

[<p class="f3 lh-condensed mb-0 mt-1 Link--primary">3D</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ajax</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Algorithm</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amp</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Android</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Angular</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Ansible</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">API</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Arduino</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">ASP.NET</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Atom</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Awesome Lists</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Amazon Web Services</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Azure</p>,
 <p class="f3 lh-condensed mb-0 mt-1 Link--primary">Babel</p>,
 <p class="f3 lh-condensed m

In [23]:
# `.text` is the tag's text content, i.e. the topic name itself.
topic_title_tags[0].text

'3D'

## Fetching Other Details 

In [27]:
# Class strings identifying the link and description elements of each topic.
selections_class_link = 'no-underline flex-1 d-flex flex-column'
selections_class_description = 'f5 color-fg-muted mb-0 mt-1'

# Three independent queries; their results are paired purely by position below.
topic_title_tags = document.find_all('p', {'class': selection_class_topics})
p_tag_description = document.find_all('p', {'class': selections_class_description})
a_tag_link = document.find_all('a', {'class': selections_class_link})

# One entry per title tag: name, stripped description and absolute topic URL.
topic_positions = range(len(topic_title_tags))
topics_name = [topic_title_tags[pos].text for pos in topic_positions]
topic_desc = [p_tag_description[pos].text.strip() for pos in topic_positions]
topic_link = ['https://github.com' + a_tag_link[pos]['href'] for pos in topic_positions]

In [29]:
# Preview the first five topic names.
topics_name[:5]

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android']

In [30]:
# Preview the first five topic descriptions.
topic_desc[:5]

['3D refers to the use of three-dimensional graphics, modeling, and animation in various industries.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency library for PHP.',
 'Android is an operating system built by Google designed for mobile devices.']

In [31]:
# Preview the first five topic URLs.
topic_link[:5]

['https://github.com/topics/3d',
 'https://github.com/topics/ajax',
 'https://github.com/topics/algorithm',
 'https://github.com/topics/amphp',
 'https://github.com/topics/android']

## Creating DataFrame and Saving into File

In [33]:
# Materialise the rows as a list: a bare zip() is a one-shot iterator, so
# re-running the DataFrame cell on its own would otherwise build an empty frame.
col = list(zip(topics_name, topic_desc, topic_link))

In [34]:
# One row per topic; `col` supplies the (name, description, link) tuples.
Topic_DataFrame = pd.DataFrame(
    list(col),
    columns=['Topics_Name', 'Topics_Description', 'Topics_Link'],
)

In [36]:
# Preview the first rows of the topic table.
Topic_DataFrame.head()

Unnamed: 0,Topics_Name,Topics_Description,Topics_Link
0,3D,3D refers to the use of three-dimensional grap...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency library for ...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android


In [38]:
# index=False is the documented way to leave the row index out of the CSV.
Topic_DataFrame.to_csv('Topics_Info.csv', index=False)

## Next step is to fetch all repo info topic-wise

In [39]:
# Class strings used to locate repository data on a topic page:
# - stars_class: class of the <span> tags collected as star counters
# - class_username_repo: class of the <h3> tags holding the user and repo links
stars_class='Counter js-social-count'
class_username_repo='f3 color-fg-muted text-normal lh-condensed'

## Function to get required Tag

In [40]:
def get_parent_tag(link):
    """Download a topic page and collect the tags holding repo and star data.

    Args:
        link: URL of the topic page to fetch.

    Returns:
        A ``(h3_userrepo, stars_tag)`` pair: the ``<h3>`` tags matching
        ``class_username_repo`` and the ``<span>`` tags matching
        ``stars_class``. Both are empty lists when the server does not answer
        with HTTP 200, so callers can always unpack the result.
    """
    response = requests.get(link)
    if response.status_code != 200:
        # This path used to fall off the end of the function and return None,
        # which made `a, b = get_parent_tag(...)` fail with a TypeError.
        return [], []

    # The notebook imports the parser class as `BeautifulSoup`; the `bs`
    # alias used here before was never defined.
    topics = BeautifulSoup(response.text, 'html.parser')
    h3_userrepo = topics.find_all('h3', class_username_repo)
    stars_tag = topics.find_all('span', {'class': stars_class})
    return h3_userrepo, stars_tag

## Function to convert String to Int 

In [58]:
def str_to_int(star):
    """Convert a star count such as ``'92.2k'`` or ``'875'`` to an integer.

    Args:
        star: Star count text. Surrounding whitespace and thousands
            separators (``,``) are ignored; a trailing ``k``/``K`` multiplies
            the number by 1000.

    Returns:
        The count as an ``int``. If the text cannot be parsed as a number the
        original value is returned unchanged.
    """
    text = str(star).strip().replace(',', '')
    try:
        if text[-1:] in ('k', 'K'):
            # round() rather than int(): float multiplication can land a hair
            # below the exact value, and truncation would then be off by one.
            return round(float(text[:-1]) * 1000)
        return int(text)
    except (ValueError, OverflowError):
        # Pass unparseable values through rather than failing the whole scrape.
        return star


## Function to get the required info

In [62]:
def get_info(parent_tag, star_tag, topic):
    """Extract user, repository and star details from the tags of one topic page.

    Args:
        parent_tag: Sequence of tags; the first two ``<a>`` children of each
            are read as the user link and the repository link.
        star_tag: Sequence of tags whose text is the star count, indexed in
            step with ``parent_tag``.
        topic: Topic name recorded once for every extracted repository.

    Returns:
        Six equally long lists, in this order: topic names, user names,
        user URLs, repository names, repository URLs, star counts.
    """
    base_url = 'https://www.github.com'
    topic_names, users, user_urls = [], [], []
    repos, repo_urls, star_counts = [], [], []

    for position, heading in enumerate(parent_tag):
        anchors = heading.find_all('a')

        user_anchor = anchors[0]
        users.append(user_anchor.text.strip())
        user_urls.append(base_url + user_anchor['href'])

        repo_anchor = anchors[1]
        repos.append(repo_anchor.text.strip())
        repo_urls.append(base_url + repo_anchor['href'])

        # Star tags are matched to headings by position only.
        star_counts.append(str_to_int(star_tag[position].text))
        topic_names.append(topic)

    return topic_names, users, user_urls, repos, repo_urls, star_counts

## Function to create Dataframe of all the required info

In [63]:
def create_dataframe(df):
    """Scrape every topic page listed in ``df`` into one repository table.

    Args:
        df: DataFrame with a ``Topics_Name`` and a ``Topics_Link`` column;
            one page is fetched per row.

    Returns:
        DataFrame with the columns ``Topic_Name``, ``Username``,
        ``Username_Link``, ``Repo_Name``, ``Repo_Link`` and
        ``Number_of_Stars``, one row per repository found.
    """
    columns = ['Topic_Name', 'Username', 'Username_Link', 'Repo_Name', 'Repo_Link', 'Number_of_Stars']
    rows = []

    for topic_name, topic_link in zip(df['Topics_Name'], df['Topics_Link']):
        # Getting the tags that contain the required details
        tags = get_parent_tag(topic_link)
        if tags is None:
            # A failed request must not abort the whole crawl with an
            # unpacking TypeError; report it and move on to the next topic.
            print(f'Skipping topic {topic_name!r}: could not fetch {topic_link}')
            continue
        parent_tag, stars_tag = tags

        # get_info returns six parallel lists in column order; zip(*...)
        # turns them into one tuple per repository.
        rows.extend(zip(*get_info(parent_tag, stars_tag, topic_name)))

    # Creating DataFrame
    return pd.DataFrame(rows, columns=columns)

In [93]:
# Scrape every topic page listed in Topic_DataFrame (one HTTP request per topic).
topicwise_info_df =  create_dataframe(Topic_DataFrame)

In [98]:
# Preview the first rows of the per-repository table.
topicwise_info_df.head()

Unnamed: 0,Topic_Name,Username,Username_Link,Repo_Name,Repo_Link,Number_of_Stars
0,3D,mrdoob,https://www.github.com/mrdoob,three.js,https://www.github.com/mrdoob/three.js,92200
1,3D,pmndrs,https://www.github.com/pmndrs,react-three-fiber,https://www.github.com/pmndrs/react-three-fiber,22700
2,3D,libgdx,https://www.github.com/libgdx,libgdx,https://www.github.com/libgdx/libgdx,21500
3,3D,BabylonJS,https://www.github.com/BabylonJS,Babylon.js,https://www.github.com/BabylonJS/Babylon.js,20700
4,3D,ssloy,https://www.github.com/ssloy,tinyrenderer,https://www.github.com/ssloy/tinyrenderer,17000


12000


## Saving info for each topic in a file

In [109]:
# Write one CSV per topic into `path`, named after the topic.
path = 'data/'
# Create the output folder once, up front; exist_ok keeps the cell re-runnable.
os.makedirs(path, exist_ok=True)
for name in Topic_DataFrame['Topics_Name']:
    df = topicwise_info_df[topicwise_info_df['Topic_Name'] == name]
    # The topic is already encoded in the file name, so drop the redundant column.
    df = df.drop('Topic_Name', axis=1)
    df.to_csv(os.path.join(path, name + '.csv'), index=False)



False