## Read the Khan Academy Lessons MongoDB Collection and Generate Word Templates

In [1]:
import os
import pymongo
import pandas as pd
import re
import docx
from docx.shared import Pt
from docx.shared import RGBColor
from docx.oxml.ns import qn

In [2]:
# Script Variables

# Khan Academy subject to export. The value is used as the MongoDB filter and
# as the name of the output sub-directory; uncomment one alternative to switch.
subject = 'High school statistics'
# subject = 'Statistics and probability'
# subject = 'AP®︎/College Statistics'

# Target directory
# Defaults to the original local folder. Set the KHAN_TEMPLATES_ROOT environment
# variable to generate the documents somewhere else without editing the notebook.
root_dir = os.environ.get(
    'KHAN_TEMPLATES_ROOT',
    'd:\\Primeway\\Treinamento\\Data Science & AI\\Statistics',
)



In [8]:
# Connect to MongoDB and load lessons collection into Pandas Dataframe

client = pymongo.MongoClient('mongodb://localhost:27017/')
db = client['khan-academy']

# Filter: keep only Article and Video lessons of the configured subject.
query = {
    'subject': subject,
    'lesson_type': {"$in": ["Article", "Video"]},
}

# Projection: every field listed here is excluded (value 0), so the returned
# documents carry only the remaining fields.
columns = dict.fromkeys(
    ('_id', 'subject', 'unit_sequence', 'lesson_type', 'lesson_link'),
    0,
)

lessons = db['lessons'].find(query, columns)

# Drain the cursor into a list once, then build the frame from that list.
list_lessons = [lesson for lesson in lessons]
df = pd.DataFrame(list_lessons)

df.head()

Unnamed: 0,unit,group,lesson
0,Unit 1: Displaying a single quantitative variable,Frequency tables and dot plots,Frequency tables & dot plots
1,Unit 1: Displaying a single quantitative variable,Histograms,Creating a histogram
2,Unit 1: Displaying a single quantitative variable,Histograms,Interpreting a histogram
3,Unit 1: Displaying a single quantitative variable,Mean and median in data displays,"Statistics intro: Mean, median, & mode"
4,Unit 1: Displaying a single quantitative variable,Mean and median in data displays,Median in a histogram


In [4]:
# Display the DataFrame's row index to see how many lesson rows were loaded.
df.index

RangeIndex(start=0, stop=112, step=1)

In [15]:
# Prepare Word Document

def prepare_document(unit):
    """Build a new Word document with customised built-in styles and a title.

    The 'Normal', 'Title', 'Heading 1' and 'Heading 2' styles of a fresh
    python-docx document are restyled (font family, size, colour), then
    `unit` is added as a level-0 heading.

    Parameters
    ----------
    unit : str
        Text of the level-0 heading placed at the top of the document.

    Returns
    -------
    The python-docx document object, ready to receive further content.
    """
    # https://python-docx.readthedocs.io/en/latest/user/styles-using.html
    doc = docx.Document()

    black = (0x00, 0x00, 0x00)
    heading_blue = (0x2F, 0x54, 0x96)

    # One row per style:
    # (style name, font name, size in pt, RGB colour or None,
    #  clear explicit bold?, override the theme font?)
    style_table = (
        ('Normal',    'Calibri',       11, None,         False, False),
        ('Title',     'Calibri Light', 28, black,        False, True),
        ('Heading 1', 'Calibri Light', 16, heading_blue, True,  True),
        ('Heading 2', 'Calibri Light', 13, heading_blue, True,  True),
    )

    for style_name, font_name, points, rgb, clear_bold, set_theme in style_table:
        style = doc.styles[style_name]
        font = style.font

        font.name = font_name
        if clear_bold:
            # None drops the explicit bold setting instead of forcing it on or off.
            font.bold = None
        font.size = Pt(points)
        if rgb is not None:
            font.color.rgb = RGBColor(*rgb)
        if set_theme:
            # Setting font.name alone is not enough for these styles; the
            # w:asciiTheme attribute has to be overwritten as well.
            # https://stackoverflow.com/questions/60921603/how-do-i-change-heading-font-face-and-size-in-python-docx
            r_fonts = style.element.rPr.rFonts
            r_fonts.set(qn("w:asciiTheme"), font_name)

    # Generate the document content
    doc.add_heading(unit, level=0)

    return doc

In [32]:
# Create target directory

os.chdir(root_dir)
parent_dir = os.getcwd()

# The subject is used as a folder name, so clean it first: a value such as
# 'AP®︎/College Statistics' contains a path separator and would otherwise point
# at a nested folder whose parent does not exist. ':' becomes ' -' and the
# characters \ / * ? " < > | are dropped. Subjects without those characters
# are left unchanged.
subject_dir = re.sub(r':', " -", subject)
subject_dir = re.sub(r'[\\/*?"<>|]', "", subject_dir)

path = os.path.join(parent_dir, subject_dir)
print(path)

# Create the directory (no-op when it already exists)
os.makedirs(path, exist_ok=True)

# The generated documents are saved relative to the working directory,
# so switch into the target folder.
os.chdir(path)
os.getcwd()



d:\Primeway\Treinamento\Data Science & AI\Statistics\High school statistics


'd:\\Primeway\\Treinamento\\Data Science & AI\\Statistics\\High school statistics'

In [36]:
# Iterate through the DataFrame and save one Word document per unit

def _save_unit_document(document, unit):
    """Save `document` as '<sanitized unit>.docx' in the current directory.

    The unit title is turned into a file name by replacing ':' with ' -' and
    dropping the characters \\ / * ? " < > |.

    An existing file is never overwritten -- presumably the templates get
    filled in by hand after generation, so regenerating must not wipe them.

    Returns True when the file was written, False when it was skipped.
    """
    # Sanitize filename
    filename = re.sub(r':', " -", unit)
    filename = re.sub(r'[\\/*?"<>|]', "", filename)
    filename = filename + '.docx'

    # Check the real target name, including the '.docx' extension.
    if os.path.isfile(os.path.join(os.getcwd(), filename)):
        print('Skipped (already exists): ' + filename)
        return False

    document.save(filename)
    return True


unit = ''
group = ''
document = None

# Rows are expected to arrive grouped by unit and, inside a unit, by group:
# a change in either value starts a new document / a new section.
for row in df.itertuples(index=False):

    if document is None or row.unit != unit:
        # The previous unit is complete: write it out before starting the next.
        if document is not None:
            _save_unit_document(document, unit)
        # New unit
        unit = row.unit
        document = prepare_document(re.sub(r':', " -", unit))
        group = ''

    if row.group != group:
        # Every group after the first one starts on a fresh page.
        if group != '':
            document.add_page_break()
        document.add_heading(row.group, level=1)
        document.add_paragraph('', style='Normal')
        group = row.group

    # Lesson heading followed by three tab-indented blank paragraphs for notes.
    document.add_heading("\t" + row.lesson, level=2)
    for _ in range(3):
        document.add_paragraph('\t', style='Normal')

# Saving inside the loop only happens when the unit changes, so the document
# of the last unit is still pending here.
if document is not None:
    _save_unit_document(document, unit)
