# EdX to Markdown format

See [here for a nice tutorial on beautifulsoup for xml files](http://www2.hawaii.edu/~takebaya/cent110/xml_parse/xml_parse.html).

In [1]:
import pathlib
from bs4 import BeautifulSoup

## First read in a sample xml problem file

In [2]:
#problem_file_name = 'test2_p8_vA.xml' # mcq
problem_file_name = 'test2_p8_vC.xml' # numerical response

path = pathlib.Path('../../edx_bank/library003/problem/' + problem_file_name)

#path = pathlib.Path('../../edx_bank/library102/problem/test1_p9_vF.xml') # something with images
path = pathlib.Path('/Users/firasm/Sync/EL/Physics OER Bank/edx_bank/library065/problem/test4_p7_vA.xml')

In [3]:
soup = BeautifulSoup(path.read_text(),'xml')
soup

<?xml version="1.0" encoding="utf-8"?>
<problem display_name="7.sigma" lg="4.250,4.260,4.270" markdown="null" max_attempts="1" rerandomize="per_student" showanswer="never">
<script type="text/javascript">
   (function() {
    $( ".check-label" ).text("Submit");
   })();
</script>
<script system_path="python_lib" type="text/python">
a = 35
r = 0.1*random.randint(5, 15)
tin = random.randint(18, 22)
tout = 5
nperson = 2
pperson = 80
sola = round(a*(tin-tout)/r-nperson*pperson, 2)
pheater = 500
eff = 0.5
hours = random.randint(5, 9)
days = 5*random.randint(10, 14)
cost = 0.12
solb = round(pheater*hours*days*3600/eff/3.6e6*cost, 2)
  </script>
<p>7a) Consider a room in a house that has composite walls with an \(R_{tot} = $r\text{ m}^2\text{K/W}\). The total cross sectional area of the walls is $a m\(^2\). During winter, the temperature outside the room is \(T_{out} = $tout^\circ\text{C}\). Calculate the power (in W) of a heater that is required to keep the room's temperature constant at $ti

In [4]:
[tag.name for tag in soup.find_all()]

['problem',
 'script',
 'script',
 'p',
 'numericalresponse',
 'responseparam',
 'textline',
 'p',
 'numericalresponse',
 'responseparam',
 'textline',
 'demandhint',
 'hint']

### Title

In [5]:
title = soup.find_all('problem')[0].get('display_name')
title

'7.sigma'

### Author and Source

In [6]:
author = "edX Physics 100 Team"
source = "https://github.com/open-resources/edx_bank"

### Deal with images

In [7]:
try: 
    image = pathlib.Path(soup.find_all('img')[0].get('src')).name
    
except IndexError: # No images in the problem
    image = 'undefined'

### Problem Text

In [8]:
problem_text = soup.find_all("p")[0].contents[0]

# Clean up problem text
problem_text = (
    problem_text
    .replace("\(", "")  # get rid of left brackets
    .replace("\)", "")  # get rid of right brackets
    .replace('. ','.\n') # Each sentence on a new line
    .replace('? ','.\n') # Each question on a new line
    .replace('! ','.\n') # Each excited sentence on a new line
    + "\n\n" # Add some new line chars
)

print(problem_text)

7a) Consider a room in a house that has composite walls with an R_{tot} = $r\text{ m}^2\text{K/W}.
The total cross sectional area of the walls is $a m^2.
During winter, the temperature outside the room is T_{out} = $tout^\circ\text{C}.
Calculate the power (in W) of a heater that is required to keep the room's temperature constant at $tin^oC if the room is going to be occupied by $nperson people.
Assume that the radiative power emitted by a person is $pperson W.





In [9]:
# Replace $param with {{ param }}

import re

# Testing the regex
result = re.findall("\$([^\s]+)",problem_text)

if result:
    print("Search successful.")
    print(result)
else:
    print("Search unsuccessful.")

def edx_var_to_pl(match):
    text = match.group().replace('$','')
    return '{{ '+text+' }}'

reg = re.compile("\$([^\s|-]+)") #Match anything like "$m rest of sentence" or "$m-kg rest of sentence (leaves dash)."

problem_text = reg.sub(edx_var_to_pl,problem_text)

print(problem_text)

Search successful.
['r\\text{', 'a', 'tout^\\circ\\text{C}.', 'tin^oC', 'nperson', 'pperson']
7a) Consider a room in a house that has composite walls with an R_{tot} = {{ r\text{ }} m}^2\text{K/W}.
The total cross sectional area of the walls is {{ a }} m^2.
During winter, the temperature outside the room is T_{out} = {{ tout^\circ\text{C}. }}
Calculate the power (in W) of a heater that is required to keep the room's temperature constant at {{ tin^oC }} if the room is going to be occupied by {{ nperson }} people.
Assume that the radiative power emitted by a person is {{ pperson }} W.





### Problem Type and Answer Choices (if applicable)

See all edX [problem types here](https://edx.readthedocs.io/projects/edx-partner-course-staff/en/latest/exercises_tools/create_exercises_and_tools.html).

In [10]:
problem_type = ""
answer_section = "## Answer Section\n\n"
mcq_python = ""

for tag in soup.find_all():

    if tag.name == "numericalresponse":
        problem_type = "numeric"

        answer_section += (
            f"""The answer is: || { soup.find_all('numericalresponse')[0].get('answer') } &&."""
            .replace("||", "{{")
            .replace("&&", "}}")
            .replace("$", "")
        )

    elif tag.name == "multiplechoiceresponse":
        problem_type = "mc"

        for i, response in enumerate(
            soup.find_all("multiplechoiceresponse")[0].find_all("choice")):

            answer_section += (
                f"- || { response.string.replace('$', '') } &&\n"
                .replace('||', "{{")
                .replace("&&", "}}")
            )

            mcq_python += f"data['params']['ans{ i+1 }'] = { response.string.replace('$', '') }\n"

            if response.attrs["correct"] == "true":
                mcq_python += f"data['params']['correct_answer'] = { response.string.replace('$', '') }\n"

    elif tag.name == "choiceresponse":
        problem_type = "ma"
    elif tag.name == "stringresponse":
        print("String response - plain text; not implemented yet")
        raise NotImplementedError
    elif tag.name == "optionresponse":
        print("Dropdown response; not implemented yet")
        raise NotImplementedError

if problem_type == "":
    raise NotImplementedError

### Python Solution/Code

In [11]:
imports = "import random\nimport problem_bank_helpers as pbh"

python_code = pathlib.Path('python_front.md').read_text()

for tag in soup.find_all('script'):
    
    if tag['type'] == 'text/javascript':
        pass
    elif tag['type'] == 'text/python':
        python_code += tag.contents[0]
        
python_code #+= 'θ'
# Remove import of random
python_code.replace('import random','')
        
full_python = python_code + pathlib.Path('python_end.md').read_text() + mcq_python

print(full_python)

data2 = pbh.create_data2()

# define or load names/items/objects

# store phrases etc

# Randomize Variables

a = 35
r = 0.1*random.randint(5, 15)
tin = random.randint(18, 22)
tout = 5
nperson = 2
pperson = 80
sola = round(a*(tin-tout)/r-nperson*pperson, 2)
pheater = 500
eff = 0.5
hours = random.randint(5, 9)
days = 5*random.randint(10, 14)
cost = 0.12
solb = round(pheater*hours*days*3600/eff/3.6e6*cost, 2)
  
# store the variables in the dictionary params

# define possible answers

# Update the data object with a new dict
data.update(data2)


In [12]:
full_python

'data2 = pbh.create_data2()\n\n# define or load names/items/objects\n\n# store phrases etc\n\n# Randomize Variables\n\na = 35\nr = 0.1*random.randint(5, 15)\ntin = random.randint(18, 22)\ntout = 5\nnperson = 2\npperson = 80\nsola = round(a*(tin-tout)/r-nperson*pperson, 2)\npheater = 500\neff = 0.5\nhours = random.randint(5, 9)\ndays = 5*random.randint(10, 14)\ncost = 0.12\nsolb = round(pheater*hours*days*3600/eff/3.6e6*cost, 2)\n  \n# store the variables in the dictionary params\n\n# define possible answers\n\n# Update the data object with a new dict\ndata.update(data2)'

### Keep the feedback/Hints

In [13]:
python_code

'data2 = pbh.create_data2()\n\n# define or load names/items/objects\n\n# store phrases etc\n\n# Randomize Variables\n\na = 35\nr = 0.1*random.randint(5, 15)\ntin = random.randint(18, 22)\ntout = 5\nnperson = 2\npperson = 80\nsola = round(a*(tin-tout)/r-nperson*pperson, 2)\npheater = 500\neff = 0.5\nhours = random.randint(5, 9)\ndays = 5*random.randint(10, 14)\ncost = 0.12\nsolb = round(pheater*hours*days*3600/eff/3.6e6*cost, 2)\n  '

In [14]:
feedback = "\n\n".join([h.contents[0] for h in soup.find_all('hint')])

In [21]:
yaml_dict = {}

yaml_dict['title'] = title
yaml_dict['topic'] = 'undefined'
yaml_dict['author'] = author
yaml_dict['source'] = source
yaml_dict['template_version']: 1.1
yaml_dict['attribution'] = 'standard'
yaml_dict['outcomes'] = 'undefined'
yaml_dict['difficulty'] = 'Average'
yaml_dict['randomization'] = 'undefined'
yaml_dict['tags'] = 'undefined'
yaml_dict['assets'] = image

yaml_dict['server'] = {}
yaml_dict['server']['imports'] = imports
yaml_dict['server']['generate'] = full_python
yaml_dict['server']['prepare'] = 'pass'
yaml_dict['server']['parse'] = 'pass'
yaml_dict['server']['grade'] = 'pass'

yaml_dict['part1'] = {}
yaml_dict['part1']['type'] = problem_type
yaml_dict['part1']['pl-customizations'] = {}
yaml_dict['part1']['pl-customizations']['weight'] = 1

In [22]:
yaml_dict

{'title': '7.sigma',
 'topic': 'undefined',
 'author': 'edX Physics 100 Team',
 'source': 'https://github.com/open-resources/edx_bank',
 'attribution': 'standard',
 'outcomes': 'undefined',
 'difficulty': 'Average',
 'randomization': 'undefined',
 'tags': 'undefined',
 'assets': 'undefined',
 'server': {'imports': 'import random\nimport problem_bank_helpers as pbh',
  'generate': 'data2 = pbh.create_data2()\n\n# define or load names/items/objects\n\n# store phrases etc\n\n# Randomize Variables\n\na = 35\nr = 0.1*random.randint(5, 15)\ntin = random.randint(18, 22)\ntout = 5\nnperson = 2\npperson = 80\nsola = round(a*(tin-tout)/r-nperson*pperson, 2)\npheater = 500\neff = 0.5\nhours = random.randint(5, 9)\ndays = 5*random.randint(10, 14)\ncost = 0.12\nsolb = round(pheater*hours*days*3600/eff/3.6e6*cost, 2)\n  \n# store the variables in the dictionary params\n\n# define possible answers\n\n# Update the data object with a new dict\ndata.update(data2)',
  'prepare': 'pass',
  'parse': 'pass'

In [24]:
## Preparing the YAML dictionary

import yaml

# This solution is copied from this SO answer: https://stackoverflow.com/a/45004775/2217577 

yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
    
def repr_str(dumper, data):
    #raise KeyError
    #print(data)
    if ('\n' in data):
        data = re.sub('\\n[\s].*\\n','\n\n',data)


        return dumper.represent_scalar(u'tag:yaml.org,2002:str', data.replace('\n  ','\n'), style='|')
    return dumper.org_represent_str(data)

yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)

print(yaml.safe_dump(yaml_dict,sort_keys=False))

title: 7.sigma
topic: undefined
author: edX Physics 100 Team
source: https://github.com/open-resources/edx_bank
attribution: standard
outcomes: undefined
difficulty: Average
randomization: undefined
tags: undefined
assets: undefined
server:
  imports: |-
    import random
    import problem_bank_helpers as pbh
  generate: |-
    data2 = pbh.create_data2()


    # store phrases etc


    a = 35
    r = 0.1*random.randint(5, 15)
    tin = random.randint(18, 22)
    tout = 5
    nperson = 2
    pperson = 80
    sola = round(a*(tin-tout)/r-nperson*pperson, 2)
    pheater = 500
    eff = 0.5
    hours = random.randint(5, 9)
    days = 5*random.randint(10, 14)
    cost = 0.12
    solb = round(pheater*hours*days*3600/eff/3.6e6*cost, 2)

    # store the variables in the dictionary params


    # Update the data object with a new dict
    data.update(data2)
  prepare: pass
  parse: pass
  grade: pass
part1:
  type: numeric
  pl-customizations:
    weight: 1



### Rest of the MD format

In [25]:
start = "# {{ vars.title }}\n\n## Question Text\n\n"
end = pathlib.Path('md_end.md').read_text()

In [26]:
pathlib.Path("test_question.md").write_text('---\n' +\
                                    yaml.safe_dump(yaml_dict,sort_keys=False) +\
                                    '---\n' +\
                                    start +\
                                    problem_text +\
                                    answer_section +\
                                    end +\
                                    feedback)

2191

## Converting the above to a function

In [30]:
filepath = "../../edx_bank/library003/problem/" + problem_file_name

import re
import yaml

def convert_edx_to_md(filepath):
    
    # Load Path
    path = pathlib.Path(filepath)
    
    # Beautiful Soup XML
    soup = BeautifulSoup(path.read_text(),"xml")
    
    # XML processing
    title = soup.find_all("problem")[0].get("display_name")
    
    # Info from the template
    author = "edX Physics 100 Team"
    source = "https://github.com/open-resources/edx_bank"
    
    # Deal with images
    
    try: 
        image = pathlib.Path(soup.find_all("img")[0].get("src")).name

    except IndexError: # No images in the problem
        image = "undefined"
    
    # Load problem 

    problem_text = soup.find_all("p")[0].contents[0]

    # Clean up problem text
    problem_text = (
        problem_text
        .replace("\(", "")  # get rid of left brackets
        .replace("\)", "")  # get rid of right brackets
        .replace(". ",".\n") # Each sentence on a new line
        .replace("? ",".\n") # Each question on a new line
        .replace("! ",".\n") # Each excited sentence on a new line
        + "\n\n" # Add some new line chars
    )

    # Replace $param with {{ param }}
    def edx_var_to_pl(match):
        text = match.group().replace("$","")
        return "{{ "+text+" }}"

    reg = re.compile("\$([^\s|-]+)") #Match anything like "$m rest of sentence" or "$m-kg rest of sentence (leaves dash)."
    problem_text = reg.sub(edx_var_to_pl,problem_text)
    
    # Building the problem template file

    problem_type = ""
    answer_section = "## Answer Section\n\n"
    mcq_python = ""

    for tag in soup.find_all():

        if tag.name == "numericalresponse":
            problem_type = "numeric"

            answer_section += (
                f"The answer is: || { soup.find_all('numericalresponse')[0].get("answer") } &&."
                .replace("||", "{{")
                .replace("&&", "}}")
                .replace("$", "")
            )

        elif tag.name == "multiplechoiceresponse":
            problem_type = "mc"

            for i, response in enumerate(
                soup.find_all("multiplechoiceresponse")[0].find_all("choice")):

                answer_section += (
                    f"- || { response.string.replace("$", '') } &&\n"
                    .replace("||", "{{")
                    .replace("&&", "}}")
                )

                mcq_python += f"data["params"]["ans{ i+1 }"] = { response.string.replace("$", "") }\n"

                if response.attrs["correct"] == "true":
                    mcq_python += f"data["params"]["correct_answer"] = { response.string.replace("$", "") }\n"

        elif tag.name == "choiceresponse":
            problem_type = "ma"
        elif tag.name == "stringresponse":
            print("String response - plain text; not implemented yet")
            raise NotImplementedError
        elif tag.name == "optionresponse":
            print("Dropdown response; not implemented yet")
            raise NotImplementedError

    if problem_type == "":
        raise NotImplementedError
        
    # Solutions
    
    imports = "import random\nimport problem_bank_helpers as pbh"

    python_code = pathlib.Path("python_front.md").read_text()

    for tag in soup.find_all("script"):

        if tag["type"] == "text/javascript":
            pass
        elif tag["type"] == "text/python":
            python_code += tag.contents[0]

    # Remove import of random
    python_code.replace("import random","")

    full_python = python_code + pathlib.Path("python_end.md").read_text() + mcq_python
    
    # Feedback and hints
    feedback = "\n\n" + "\n\n".join([h.contents[0] for h in soup.find_all("hint")])
        
    # Preparing the YAML dictionary
    # This solution is copied from this SO answer: https://stackoverflow.com/a/45004775/2217577 

    yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str

    def repr_str(dumper, data):
        if "\n" in data:
            data = re.sub('\\n[\s].* \\n','\n\n',data)
            return dumper.represent_scalar(u"tag:yaml.org,2002:str", data, style="|")
        return dumper.org_represent_str(data)

    yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
    
    # Create YAML dict
    
    yaml_dict = {}

    yaml_dict["title"] = title
    yaml_dict["topic"] = "undefined"
    yaml_dict["author"] = author
    yaml_dict["source"] = source
    yaml_dict["template_version"]: 1.1
    yaml_dict["attribution"] = "standard"
    yaml_dict["outcomes"] = "undefined"
    yaml_dict["difficulty"] = "Average"
    yaml_dict["randomization"] = "undefined"
    yaml_dict["tags"] = "undefined"
    yaml_dict["assets"] = "undefined"
    yaml_dict["server"] = {}

    yaml_dict["server"]["imports"] = imports
    yaml_dict["server"]["generate"] = full_python
    yaml_dict["server"]["prepare"] = "pass"
    yaml_dict["server"]["parse"] = "pass"
    yaml_dict["server"]["grade"] = "pass"

    yaml_dict["part1"] = {}
    yaml_dict["part1"]["type"] = problem_type
    yaml_dict["part1"]["pl-customizations"] = {}
    yaml_dict["part1"]["pl-customizations"]["weight"] = 1
    
    # Finalize template
    start = "# {{ vars.title }}\n\n## Question Text\n\n"
    end = pathlib.Path("md_end.md").read_text()
    
    path.with_suffix(".md").write_text("---\n" +\
                                        yaml.safe_dump(yaml_dict,sort_keys=False) +\
                                        "---\n" +\
                                        start +\
                                        problem_text +\
                                        answer_section +\
                                        end +\
                                        feedback)

SyntaxError: invalid syntax (<ipython-input-30-6ccd316df394>, line 64)

In [19]:
# Test it once

convert_edx_to_md(filepath)

In [20]:
# Release the hounds

for f in pathlib.Path('../../edx_bank/').glob('**/*.xml'):
    if 'library' in f.name:
        continue
    else:
        try:
            convert_edx_to_md(f)
        except:
            print(f"failed: {f.name}")

failed: 99eb62fe04ff419d860883a42743bcfd.xml
failed: edba445f66a042209784ef7160c664a9.xml
failed: 963333e5fdff44b1b45b8c0af4e2cc5a.xml


In [21]:
filepath

'../../edx_bank/library003/problem/test2_p8_vC.xml'