## Read Khan Academy Subject Page

In [1]:
import json
import os
import re
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup

### Read Subject Page

In [12]:
# Fetch the Khan Academy subject page that the following cells parse.
headers = {
   'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
   'X-Requested-With': 'XMLHttpRequest'
}
domain = 'https://www.khanacademy.org'
# Alternative subject pages (swap in as needed):
# source = domain + '/math/probability'
source = domain + '/math/statistics-probability'
# source = domain + '/math/ap-statistics'

# Without an explicit timeout requests waits indefinitely on an unresponsive server.
response = requests.get(source, headers=headers, timeout=30)
# Stop here on 4xx/5xx instead of handing an error page to the parser below.
response.raise_for_status()
response

<Response [200]>

#### Read units

In [6]:
# Build the `lessons` list: one record per lesson reachable from the subject page.
# Walks every unit block on the subject page, follows each lesson-group link,
# and records the lessons listed on the group page.

# Seconds to wait for each group page; requests has no timeout by default.
REQUEST_TIMEOUT = 30

# Lessons
lessons = []
_id = 0

# Parse the subject page fetched in the previous cell
bsSubject = BeautifulSoup(response.text, 'html.parser')

breadcrumbs_tag = bsSubject.find('div', attrs={"aria-label": "breadcrumbs"})
subject_tag     = bsSubject.find('h1',  attrs={"data-test-id": "unit-block-title"})
if subject_tag is None:
   # The subject name is part of every record id, so there is nothing useful to build without it.
   raise ValueError('Subject title not found on ' + response.url + '; the page markup may have changed')

field   = breadcrumbs_tag.get_text() if breadcrumbs_tag is not None else ''
subject = subject_tag.get_text()

unit_sequence = 0
for unit_block in bsSubject.find_all('div', class_='_12yy6f6l'):
   unit_sequence += 1

   unit_title = unit_block.find('h3').get_text()
   print(unit_title)

   for group in unit_block.find_all('a', attrs={"data-test-id": "lesson-link"}):
      group_title = group.get('title') or group.get_text(strip=True)
      group_link  = group.get('href')
      if not group_link:
         # An anchor without a target cannot be followed.
         continue

      # Load group page (urljoin copes with both relative and absolute hrefs)
      group_page = requests.get(urljoin(domain, group_link), headers=headers, timeout=REQUEST_TIMEOUT)
      # Abort rather than silently collect a partial lesson list from an error page.
      group_page.raise_for_status()
      print("\t" + group_title)
      bsGroup = BeautifulSoup(group_page.text, 'html.parser')
      unit_group = bsGroup.find('ul', attrs={"data-test-id": "learnable-content-cells"})
      if unit_group is None:
         print("\t\t(no lesson list found - skipped)")
         continue

      # Browse through lessons
      for lesson_item in unit_group.find_all('li'):
         lesson_anchor = lesson_item.find('a')
         if lesson_anchor is None:
            # A list item without a link is not a lesson entry.
            continue
         lesson_title = lesson_item.get_text()
         lesson_link = lesson_anchor.get('href')

         # Get lesson type from the icon's aria-label; anything that is not
         # an Article or a Video (including a missing icon) is treated as an Exercise.
         type_icon = lesson_item.find(['span', 'svg'], attrs={"role": "img"})
         lesson_type = type_icon.get('aria-label') if type_icon is not None else None
         if lesson_type not in ['Article', 'Video']:
            lesson_type = 'Exercise'

         # Load lesson metadata
         # meta_data = getLessonMeta(domain + lesson_link, lesson_type)
         print("\t\t" + lesson_title + ' - ' + lesson_type)

         # Lesson content object; `_id` is a running counter across the whole subject
         _id += 1
         lesson_record = {
            '_id':           subject + ' - ' + str(_id).zfill(3),
            'subject':       subject,
            'unit':          unit_title,
            'unit_sequence': unit_sequence,
            'group':         group_title,
            'lesson':        lesson_title,
            'lesson_type':   lesson_type,
            'lesson_link':   lesson_link
         }
         lessons.append(lesson_record)


Unit 1: Displaying a single quantitative variable
	Frequency tables and dot plots
		Frequency tables & dot plots - Video
		Creating dot plots - Exercise
		Reading dot plots & frequency tables - Exercise
	Histograms
		Creating a histogram - Video
		Create histograms - Exercise
		Interpreting a histogram - Video
		Read histograms - Exercise
	Mean and median in data displays
		Statistics intro: Mean, median, & mode - Video
		Median in a histogram - Video
		Calculating mean and median from data displays - Exercise
		Estimating mean and median in data displays - Video
		Estimating mean and median in data displays - Exercise
		Choosing the "best" measure of center - Article
	Interquartile range
		Interquartile range (IQR) - Video
		Interquartile range (IQR) - Exercise
	Box and whisker plots
		Worked example: Creating a box plot (odd number of data points) - Video
		Worked example: Creating a box plot (even number of data points) - Video
		Creating box plots - Exercise
		Reading box plots - V

#### Print lessons and save them in the database

In [7]:
# Pretty-print the scraped lessons as indented JSON.
# ensure_ascii=False keeps non-ASCII characters in titles readable
# instead of emitting them as \uXXXX escapes.
print(json.dumps(lessons, indent=4, ensure_ascii=False))

[
    {
        "_id": "High school statistics - 001",
        "subject": "High school statistics",
        "unit": "Unit 1: Displaying a single quantitative variable",
        "unit_sequence": 1,
        "group": "Frequency tables and dot plots",
        "lesson": "Frequency tables & dot plots",
        "lesson_type": "Video",
        "lesson_link": "/math/probability/xa88397b6:display-quantitative/xa88397b6:frequency-tables-dot-plots/v/frequency-tables-and-dot-plots"
    },
    {
        "_id": "High school statistics - 002",
        "subject": "High school statistics",
        "unit": "Unit 1: Displaying a single quantitative variable",
        "unit_sequence": 1,
        "group": "Frequency tables and dot plots",
        "lesson": "Creating dot plots",
        "lesson_type": "Exercise",
        "lesson_link": "/math/probability/xa88397b6:display-quantitative/xa88397b6:frequency-tables-dot-plots/e/creating-dot-plots"
    },
    {
        "_id": "High school statistics - 003",
      

In [8]:
# Save lessons in the database, replacing any previously stored lessons of the same subject.

# Check before touching the database: the delete below runs first, and
# insert_many rejects an empty list, so an empty scrape would otherwise
# wipe the stored subject and then fail.
if not lessons:
   raise ValueError('No lessons were scraped; refusing to replace the stored lessons for "' + subject + '"')

# Create Khan Academy database
client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['khan-academy']

# Lessons collection
colLessons = db["lessons"]

# Remove the subject's old records so the fixed `_id` values can be inserted again on re-runs
query = { 'subject': subject }
delete_result = colLessons.delete_many(query)
x = colLessons.insert_many(lessons)

#print list of the _id values of the inserted documents:
print(x.inserted_ids)

['High school statistics - 001', 'High school statistics - 002', 'High school statistics - 003', 'High school statistics - 004', 'High school statistics - 005', 'High school statistics - 006', 'High school statistics - 007', 'High school statistics - 008', 'High school statistics - 009', 'High school statistics - 010', 'High school statistics - 011', 'High school statistics - 012', 'High school statistics - 013', 'High school statistics - 014', 'High school statistics - 015', 'High school statistics - 016', 'High school statistics - 017', 'High school statistics - 018', 'High school statistics - 019', 'High school statistics - 020', 'High school statistics - 021', 'High school statistics - 022', 'High school statistics - 023', 'High school statistics - 024', 'High school statistics - 025', 'High school statistics - 026', 'High school statistics - 027', 'High school statistics - 028', 'High school statistics - 029', 'High school statistics - 030', 'High school statistics - 031', 'High s