# Top Repositories for GitHub Topics

Here are the steps we'll follow:

- We're going to scrape https://github.com/topics
- We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories in the topic from the topic page
- For each repository, we'll grab the repo name, username, stars and repo URL
- For each topic we'll create a CSV file in the following format:

```
Repo Name,Username,Stars,Repo URL
three.js,mrdoob,69700,https://github.com/mrdoob/three.js
libgdx,libgdx,18300,https://github.com/libgdx/libgdx
```

## Scrape the list of topics from GitHub

- use requests to download the page
- use BS4 to parse and extract information
- convert to a Pandas dataframe

Let's write a function to download the page.

In [None]:
import requests

In [None]:
topics_url= 'https://github.com/topics'

In [None]:
response = requests.get(topics_url)

In [None]:
response.status_code

In [None]:
len(response.text)

In [None]:
page_contents=response.text

In [None]:
response.text[:5]

In [None]:
page_contents[:1000]

In [None]:
import bs4

In [None]:
from bs4 import BeautifulSoup

In [None]:
doc =BeautifulSoup(page_contents, 'html.parser')

#### breakdown of each component:
 1. The find_all() method takes an HTML tag as a string argument and returns the list of elements that match the provided tag.
 2. We can search for tags of a specific class as well by providing the class_ argument. Beautiful Soup uses class_ because class is a reserved keyword in Python.
 3. What if we wanted to fetch the links embedded inside the a tags? Let's retrieve a link's href attribute using the find() option. It works just like find_all() but it returns the first matching element instead of a list. 
 4. get_text() function retrieves all the text from the HTML document.

In [None]:
selection_class ='f3 lh-condensed mb-0 mt-1 Link--primary'
topic_title_tags = doc.find_all('p',{'class':selection_class})

In [None]:
len(topic_title_tags)

In [None]:
topic_title_tags[:5]

In [None]:
desc_selector = 'f5 color-fg-muted mb-0 mt-1'
topic_desc_tags = doc.find_all('p',{'class':desc_selector})

In [None]:
topic_desc_tags[:5]

In [None]:
# Collect the raw text of every topic title tag found on the topics page.
topic_titles = [title_tag.text for title_tag in topic_title_tags]
print(topic_titles)

In [None]:
# Description text with surrounding whitespace removed, one entry per tag.
topic_descs = [desc_tag.text.strip() for desc_tag in topic_desc_tags]
topic_descs[:5]

In [None]:
topic_link_tags=doc.find_all('a',{'class':'no-underline flex-grow-0'})

In [None]:
len(topic_link_tags)

In [None]:
# The href attributes are site-relative, so prefix them with the site root.
base_url = 'https://github.com'
topic_urls = [base_url + link_tag['href'] for link_tag in topic_link_tags]
topic_urls

In [None]:
import pandas as pd

In [None]:
topics_dict={
    'title':topic_titles,
    'description':topic_descs,
    'url':topic_urls
}

In [None]:
topics_df=pd.DataFrame(topics_dict)

In [None]:
topics_df

# saving as csv file

In [None]:
topics_df.to_csv('topics.csv',index=None)

## Getting information out of the topic page

In [None]:
topic_page_url = topic_urls[0]

In [None]:
topic_page_url

In [None]:
response=requests.get(topic_page_url)

In [None]:
len(response.text)

In [None]:
topic_doc=BeautifulSoup(response.text,'html.parser')

In [None]:
repo_tags= topic_doc.find_all('h3',{'class':'f3 color-fg-muted text-normal lh-condensed'})

In [None]:
len(repo_tags)

In [None]:
a_tags=repo_tags[0].find_all('a')

In [None]:
a_tags[0].text.strip()

In [None]:
a_tags[1].text.strip()

In [None]:
base_url='https://github.com'
repo_url=base_url+a_tags[1]['href']
print(repo_url)

In [None]:
star_tags=topic_doc.find_all('span',{'class':'Counter js-social-count'})

In [None]:
len(star_tags)

In [None]:
star_tags[0].text.strip()

In [None]:
def parse_star_count(stars_str):
    """Convert a star counter string such as '69.7k' or '512' into an int.

    Surrounding whitespace and thousands separators (',') are ignored.
    A trailing 'k' or 'm' (either case) multiplies the number by one
    thousand or one million respectively.

    Raises:
        ValueError: if the text is empty or is not a number.
    """
    multipliers = {'k': 1000, 'm': 1000000}
    cleaned = stars_str.strip().replace(',', '').lower()
    # Slicing instead of indexing keeps an empty string from raising
    # IndexError; it falls through to int('') and raises ValueError instead.
    suffix = cleaned[-1:]
    if suffix in multipliers:
        # round() rather than int(): binary float error can leave the product
        # a hair below the intended whole number, which int() would truncate.
        return round(float(cleaned[:-1]) * multipliers[suffix])
    return int(cleaned)

In [None]:
parse_star_count(star_tags[0].text.strip())

In [None]:
def get_repo_info(h1_tag,star_tag):
    """Return (username, repo_name, stars, repo_url) for one repository.

    h1_tag is the heading tag whose first link is the username and whose
    second link is the repository; star_tag holds the star counter text.
    The repository URL is built from the module-level base_url.
    """
    links = h1_tag.find_all('a')
    username = links[0].text.strip()
    repo_link = links[1]
    repo_name = repo_link.text.strip()
    # The href is site-relative, so it is appended to base_url.
    repo_url = '{}{}'.format(base_url, repo_link['href'])
    star_text = star_tag.text.strip()
    return username, repo_name, parse_star_count(star_text), repo_url

In [None]:
get_repo_info(repo_tags[0],star_tags[0])

In [None]:
# Column-oriented accumulator: one list per output column.
repo_columns = ('username', 'repo_name', 'stars', 'repo_url')
topic_repos_dict = {column: [] for column in repo_columns}

# Repo headings and star counters are paired by their position on the page.
for position, repo_tag in enumerate(repo_tags):
    repo_info = get_repo_info(repo_tag, star_tags[position])
    for column, value in zip(repo_columns, repo_info):
        topic_repos_dict[column].append(value)

In [None]:
import os
def get_topic_page(topic_url, timeout=30):
    """Download a topic page and return it as a parsed BeautifulSoup document.

    Args:
        topic_url: URL of the page to download.
        timeout: seconds to wait for the server before giving up, passed
            straight to requests.get.

    Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.RequestException: if the request itself fails
            (for example, the timeout expires).
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(topic_url, timeout=timeout)
    # Anything other than 200 means we did not get the page we asked for.
    if response.status_code != 200:
        raise Exception('failed to load page {} (status code {})'.format(
            topic_url, response.status_code))
    return BeautifulSoup(response.text, 'html.parser')

def get_repo_info(h1_tag,star_tag):
    """Extract the details of a single repository from its page tags.

    Returns a tuple (username, repo_name, stars, repo_url). The first link
    inside h1_tag supplies the username, the second supplies the repository
    name and its site-relative href; star_tag supplies the star counter text.
    """
    anchors = h1_tag.find_all('a')
    owner_anchor = anchors[0]
    repo_anchor = anchors[1]
    owner = owner_anchor.text.strip()
    name = repo_anchor.text.strip()
    # base_url is a module-level constant holding the site root.
    link = base_url + repo_anchor['href']
    star_count = parse_star_count(star_tag.text.strip())
    return (owner, name, star_count, link)

def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed on a parsed topic page.

    The result has the columns username, repo_name, stars and repo_url,
    with one row per repository heading found in topic_doc.
    """
    # h3 tags containing the repo title, repo URL and username
    heading_tags = topic_doc.find_all('h3', {'class': 'f3 color-fg-muted text-normal lh-condensed'})
    # star counter tags, matched to the headings by position
    counter_tags = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    # one (username, repo_name, stars, repo_url) tuple per repository
    rows = [
        get_repo_info(heading, counter_tags[position])
        for position, heading in enumerate(heading_tags)
    ]

    # pivot the row tuples into one list per column
    column_names = ('username', 'repo_name', 'stars', 'repo_url')
    columns = {
        name: [row[col] for row in rows]
        for col, name in enumerate(column_names)
    }
    return pd.DataFrame(columns)

def scrape_topic(topic_url,path):
    """Scrape one topic page and save its repositories to a CSV file.

    Args:
        topic_url: URL of the topic page to scrape.
        path: destination CSV file. If it already exists nothing is
            downloaded and the file is left untouched.
    """
    # Skip topics that were already scraped so a re-run does not refetch them.
    if os.path.exists(path):
        print('the file {} already exists. skipping...'.format(path))
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    # index=False leaves the DataFrame's row index out of the file.
    topic_df.to_csv(path, index=False)

In [None]:
get_topic_repos(get_topic_page(topic_urls[6])).to_csv('ansible.csv',index=None)

In [None]:
topic_repos_df= pd.DataFrame(topic_repos_dict)

#### write a single function to:
 1. Get the list of topics from the topics page
 2. Get the list of top repos from the individual topic pages
 3. For each topic, create a CSV of the top repos for the topic

In [None]:
def get_topic_titles(doc):
    """Return the topic titles found in a parsed topics page.

    Args:
        doc: parsed page exposing find_all (a BeautifulSoup document).

    Returns:
        A list with the whitespace-stripped text of every title paragraph.
    """
    # Titles live in <p> tags carrying exactly this class string. They are
    # looked up in the doc passed in, not in a module-level variable.
    selection_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    title_tags = doc.find_all('p', {'class': selection_class})
    return [tag.text.strip() for tag in title_tags]

def get_topic_desc(doc):
    """Return the whitespace-stripped topic descriptions found in doc."""
    # Descriptions are the <p> tags carrying exactly this class string.
    paragraph_tags = doc.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'})
    return [paragraph.text.strip() for paragraph in paragraph_tags]

def get_topic_urls(doc):
    """Return the absolute URL of every topic link found in doc."""
    site_root = 'https://github.com'
    link_tags = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
    # Each href is site-relative, so it is appended to the site root.
    return [site_root + link['href'] for link in link_tags]
    
def scrape_topics():
    """Download the GitHub topics page and return its topics as a DataFrame.

    Returns:
        A DataFrame with the columns title, description and url.

    Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.RequestException: if the request itself fails
            (for example, the timeout expires).
    """
    topics_url = 'https://github.com/topics'
    # Without a timeout a stalled connection would block forever.
    response = requests.get(topics_url, timeout=30)
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topics_url))
    # Parse the page that was just downloaded instead of relying on a
    # module-level document left over from an earlier cell.
    doc = BeautifulSoup(response.text, 'html.parser')
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_desc(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)

In [None]:
def scrape_topics_repos():
    """Scrape every topic on the topics page into its own CSV under data/.

    One file named data/<topic title>.csv is written per topic. Path
    separators in a title are replaced with '_' to keep the file in data/.
    """
    print('scraping list of topics')
    topics_df = scrape_topics()
    # creating folder
    os.makedirs('data', exist_ok=True)
    for _, row in topics_df.iterrows():
        title = row['title']
        print('scraping top repositories for "{}"'.format(title))
        # A separator inside the title would otherwise point into a
        # sub-directory of data/ that does not exist.
        file_stem = str(title).replace('/', '_').replace(os.sep, '_')
        scrape_topic(row['url'], 'data/{}.csv'.format(file_stem))

In [None]:
scrape_topics_repos()