# Part 1: Getting Category + Subcategory information

In [2]:
import time
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from itertools import chain
import os

In [3]:
# Root URL of the site being scraped. The category links collected below are
# site-relative paths (e.g. '/category/technology') that get appended to it.
homepage_url = "https://www.findareddit.com/"

## Identifying possible categories

In [4]:
# Download the landing page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page be parsed as if it were the homepage.
homepage_response = requests.get(homepage_url, timeout=30)
homepage_response.raise_for_status()
homepage_raw = homepage_response.text

In [6]:
# Parse the raw homepage HTML so elements can be looked up by class and tag.
homepage_soup = BeautifulSoup(homepage_raw, 'html.parser')

First, we look at the general categories available when first loading the website.

In [14]:
# CSS class carried by every entry in the homepage's category panel.
CATEGORY_LINK_CLASS = 'l-mobile-categoryPanel__scrollable-list__list-item'

category_links = homepage_soup.find_all(class_=CATEGORY_LINK_CLASS)

In [15]:
category_links[0]

<a class="l-mobile-categoryPanel__scrollable-list__list-item active" href="/">
<img alt="icon for all categories" class="l-mobile-categoryPanel__category-img" src="https://i.imgur.com/XHCl2NO.png"/>
<div style="word-break: break-word;">
<span class="l-mobile-categoryPanel__scrollable-list__list-item__text">All Categories</span>
</div>
</a>

Each category is provided as a link, with the name of the category as the main text.

In [16]:
[a['href'] for a in category_links]

['/',
 '/category/psychedelics-drugs-hallucinogens',
 '/category/animals-plants-and-home',
 '/category/art-design-and-crafts',
 '/category/business-finance-and-crypto',
 '/category/reading-writing-education',
 '/category/fashion',
 '/category/health-and-fitness',
 '/category/history-and-culture',
 '/category/travel-and-food',
 '/category/funny-memes',
 '/category/pictures-videos-and-gifs',
 '/category/music',
 '/category/news-and-politics',
 '/category/relationships-and-sex',
 '/category/philosophy-religion-and-spirituality',
 '/category/science-and-engineering',
 '/category/sports-and-games',
 '/category/tv-movies-and-gaming',
 '/category/technology']

In [19]:
[a.find_all('span')[0].text for a in category_links]

['All Categories',
 'Psychedelics/Drugs/Hallucinogens',
 'Animals, Plants and Home',
 'Art, Design and Crafts',
 'Business, Finance and Crypto',
 'Reading, Writing and Education',
 'Fashion',
 'Health and Fitness',
 'History and Culture',
 'Travel and Food',
 'Funny/Memes',
 'Pictures, Videos and GIFs',
 'Music',
 'News and Politics',
 'Relationships and Sex',
 'Philosophy, Religion and Spirituality',
 'Sciences and Engineering',
 'Sports and Games',
 'TV, Movies and Gaming',
 'Technology']

We create a function that iterates through a list of `<a>` links and maps the name to the url — this will be useful throughout this project.

In [68]:
def label_to_url_map(link_list):
    """Build a ``{label: href}`` dictionary from a sequence of link elements.

    Parameters
    ----------
    link_list : iterable
        Elements exposing a ``.text`` string and supporting ``element['href']``
        (for example the ``<a>`` tags returned by ``find_all``).

    Returns
    -------
    dict
        Each element's text, stripped of surrounding whitespace, mapped to its
        ``href`` value. When two elements share a label, the later one wins.
    """
    url_by_label = {}
    for link in link_list:
        label = link.text.strip()
        url_by_label[label] = link['href']
    return url_by_label

In [62]:
# The first panel entry is the "All Categories" link pointing at '/', which is
# not a category page of its own, so it is left out of the mapping.
real_category_links = category_links[1:]
category_dict = label_to_url_map(real_category_links)

In [63]:
category_dict

{'Psychedelics/Drugs/Hallucinogens': '/category/psychedelics-drugs-hallucinogens',
 'Animals, Plants and Home': '/category/animals-plants-and-home',
 'Art, Design and Crafts': '/category/art-design-and-crafts',
 'Business, Finance and Crypto': '/category/business-finance-and-crypto',
 'Reading, Writing and Education': '/category/reading-writing-education',
 'Fashion': '/category/fashion',
 'Health and Fitness': '/category/health-and-fitness',
 'History and Culture': '/category/history-and-culture',
 'Travel and Food': '/category/travel-and-food',
 'Funny/Memes': '/category/funny-memes',
 'Pictures, Videos and GIFs': '/category/pictures-videos-and-gifs',
 'Music': '/category/music',
 'News and Politics': '/category/news-and-politics',
 'Relationships and Sex': '/category/relationships-and-sex',
 'Philosophy, Religion and Spirituality': '/category/philosophy-religion-and-spirituality',
 'Sciences and Engineering': '/category/science-and-engineering',
 'Sports and Games': '/category/sport

## Identifying sub-categories
We'll start by working through one example, then repeat the process for all the general categories.

In [25]:
# Build the absolute URL for one example category. rstrip('/') avoids a
# doubled slash when joining with the site-relative path and, unlike slicing
# off the last character, stays correct if homepage_url is ever written
# without its trailing slash.
category_url = homepage_url.rstrip('/') + category_dict['Technology']
print(category_url)

https://www.findareddit.com/category/technology


In [26]:
# Fetch and parse the example category page. The timeout keeps a stalled
# connection from hanging the notebook, and raise_for_status() surfaces HTTP
# errors here instead of letting an error page be parsed.
category_response = requests.get(category_url, timeout=30)
category_response.raise_for_status()
category_raw = category_response.text
category_soup = BeautifulSoup(category_raw, 'html.parser')

For this example, our general category (`Category1` in the finalized data file) is Technology. The first level of subcategory (`Category2`) is provided as a sort of header in `row` class divs, and the more specific subcategories (`Category3`) are provided as links below the heading.

![A screenshot of findareddit.com with a large header "Technology" labeled as Category1, a small header "AI and Robotics" labeled as Category2, and a link "Supercomputing" labeled as Category3](GroupingExample.png "An example of how the Category values are selected")

In [40]:
# Walk down to the subcategory blocks one step at a time: the first
# 'c-categoryHeader' element, its first 'row' descendant, and then every
# 'row' nested inside that one.
category_header = category_soup.find_all(class_='c-categoryHeader')[0]
outer_row = category_header.find_all(class_='row')[0]
subcategories_raw = outer_row.find_all(class_='row')

In [77]:
subcategories_raw[0].find_all('span')[0].text

'AI and Robotics'

In [71]:
label_to_url_map(subcategories_raw[0].find_all('a'))

{'AI': '/category/technology/subcategory/ai-and-robotics/tag/ai',
 'Futurism': '/category/technology/subcategory/ai-and-robotics/tag/futurism',
 'Nanotech': '/category/technology/subcategory/ai-and-robotics/tag/nanotech',
 'Robotics': '/category/technology/subcategory/ai-and-robotics/tag/robotics',
 'Semiconductors': '/category/technology/subcategory/ai-and-robotics/tag/semiconductors',
 'Supercomputing': '/category/technology/subcategory/ai-and-robotics/tag/supercomputing',
 'Virtual Reality/Augmented Reality': '/category/technology/subcategory/ai-and-robotics/tag/virtual-reality-and-augmented-reality'}

We can once again use our `label_to_url_map` function to create a name-to-URL dictionary for all the subcategories of "AI and Robotics"

In [79]:
# For every subcategory block, pair its heading (the text of its first <span>)
# with the label-to-URL mapping of the links inside that block.
subcategories_dict = dict(
    (block.find_all('span')[0].text, label_to_url_map(block.find_all('a')))
    for block in subcategories_raw
)

In [80]:
subcategories_dict

{'AI and Robotics': {'AI': '/category/technology/subcategory/ai-and-robotics/tag/ai',
  'Futurism': '/category/technology/subcategory/ai-and-robotics/tag/futurism',
  'Nanotech': '/category/technology/subcategory/ai-and-robotics/tag/nanotech',
  'Robotics': '/category/technology/subcategory/ai-and-robotics/tag/robotics',
  'Semiconductors': '/category/technology/subcategory/ai-and-robotics/tag/semiconductors',
  'Supercomputing': '/category/technology/subcategory/ai-and-robotics/tag/supercomputing',
  'Virtual Reality/Augmented Reality': '/category/technology/subcategory/ai-and-robotics/tag/virtual-reality-and-augmented-reality'},
 'Development': {'Computer Science': '/category/technology/subcategory/development/tag/computer-science',
  'Databases': '/category/technology/subcategory/development/tag/databases',
  'Embedded Systems': '/category/technology/subcategory/development/tag/embedded-systems',
  'Machine Learning/Deep Learning': '/category/technology/subcategory/development/tag/m

Next, we want to apply the above workflow to ALL the high-level categories provided by FindAReddit. To do this, we'll iterate over all the categories we found earlier and stored in `category_dict`. For each high-level category, we scrape the HTML for that page. We then get all the subcategory names and create a dictionary that maps each subcategory to the output from `label_to_url_map`.

The result is a dictionary that looks like this:

```
{
    Category1: 
        {
            Category2:
                {
                    Category3: url,
                    Category3: url,
                    Category3: url,
                }
        }
}
        
```

We then save this to a file called `categories.json` to assist with the more intensive webscraping in the next notebook.

In [84]:
def get_subcategory_dict(category_url, timeout=30):
    """Scrape one category page into a ``{Category2: {Category3: url}}`` dict.

    Parameters
    ----------
    category_url : str
        Absolute URL of the category page to download.
    timeout : float, optional
        Seconds to wait for the server, passed to ``requests.get``. Without a
        timeout a stalled connection would block indefinitely.

    Returns
    -------
    dict
        Maps each subcategory heading (the text of the first ``<span>`` in a
        subcategory block) to the dictionary ``label_to_url_map`` builds from
        the ``<a>`` links inside that block.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    IndexError
        If the page lacks the expected ``c-categoryHeader`` / ``row`` elements,
        or a subcategory block contains no ``<span>``.
    """
    response = requests.get(category_url, timeout=timeout)
    # Stop on an HTTP error rather than trying to parse an error page.
    response.raise_for_status()
    category_soup = BeautifulSoup(response.text, 'html.parser')

    # First 'c-categoryHeader' -> its first 'row' -> every 'row' nested in it.
    category_header = category_soup.find_all(class_='c-categoryHeader')[0]
    outer_row = category_header.find_all(class_='row')[0]
    subcategories_raw = outer_row.find_all(class_='row')

    return {
        block.find_all('span')[0].text: label_to_url_map(block.find_all('a'))
        for block in subcategories_raw
    }

In [85]:
# One request per top-level category. rstrip('/') avoids a doubled slash when
# joining with the site-relative category paths and, unlike slicing off the
# last character, still works if homepage_url has no trailing slash.
site_root = homepage_url.rstrip('/')
all_subcategories = {
    category_name: get_subcategory_dict(site_root + category_path)
    for category_name, category_path in category_dict.items()
}

In [86]:
all_subcategories

{'Psychedelics/Drugs/Hallucinogens': {'Cannabis': {'CBD/cannabis/Marijuana': '/category/psychedelics-drugs-hallucinogens/subcategory/cannabis/tag/cbd-cannabis-marijuana',
   'microgrowery': '/category/psychedelics-drugs-hallucinogens/subcategory/cannabis/tag/microgrowery'},
  'Psychedelics': {'Psychedelics': '/category/psychedelics-drugs-hallucinogens/subcategory/psychedelics/tag/psychedelics'},
  'Support Groups': {'Alcohol': '/category/psychedelics-drugs-hallucinogens/subcategory/support-groups/tag/alcohol',
   'Drugs': '/category/psychedelics-drugs-hallucinogens/subcategory/support-groups/tag/drugs'}},
 'Animals, Plants and Home': {'Animals': {'General': '/category/animals-plants-and-home/subcategory/animals/tag/general-animals',
   'Biology': '/category/animals-plants-and-home/subcategory/animals/tag/biology',
   'Conservation': '/category/animals-plants-and-home/subcategory/animals/tag/conservation',
   'Domestic': '/category/animals-plants-and-home/subcategory/animals/tag/domesti

In [87]:
import json

In [88]:
# Persist the nested category mapping as JSON so the next notebook can load it.
with open("categories.json", 'w') as categories_file:
    categories_file.write(json.dumps(all_subcategories))