# Data processing that generated the YAML file

The pre-processing required to produce the datasets includes these steps:

  - Obtain a short `path` for each domain:
    - infer a path for `Domain`s from contents
    - infer a path for `Cluster`s from contents
  - `guid`: ensure existence of the globally unique identifier (GUID) provided by the common core
  - ensure ccssUri returns a 200
  - meld with cleaned edge data:
    - remove `||` comments
    - expand `,`- and `;`-separated lists
    - confirm existence of `Standard`s for all edge endpoints


In [1]:
import json
from pprint import pprint

In [2]:
# Get data from ASN
!wget http://asn.jesandco.org/resources/D10003FB_full.json
asn_data = json.load(open('D10003FB_full.json'))

--2016-06-24 17:31:35--  http://asn.jesandco.org/resources/D10003FB_full.json
Resolving asn.jesandco.org... 52.6.235.143
Connecting to asn.jesandco.org|52.6.235.143|:80... connected.
HTTP request sent, awaiting response... 303 See Other
Location: http://s3.amazonaws.com/asnstaticd2l/data/rdf/D10003FB.json [following]
--2016-06-24 17:31:36--  http://s3.amazonaws.com/asnstaticd2l/data/rdf/D10003FB.json
Resolving s3.amazonaws.com... 54.231.40.178
Connecting to s3.amazonaws.com|54.231.40.178|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1768569 (1.7M) [application/json]
Saving to: 'D10003FB_full.json.2'


2016-06-24 17:31:45 (193 KB/s) - 'D10003FB_full.json.2' saved [1768569/1768569]



In [3]:
len(asn_data)


def dictify_asn(asn_data):
    """
    Flatten the raw ASN export into plain dicts keyed by ASN id.

    For every record:
      - each key is shortened by removing the known namespace URIs,
      - each list of ``{'value': ...}`` objects becomes the list of its values,
      - keys listed in SKIP_KEYS are not copied over,
      - ``type`` is collapsed to a single short string; a record without a
        type is tagged 'ContainerDocument',
      - ``statementLabel`` (when present) is collapsed to a single string.

    Prints the distinct types and labels in first-seen order and returns the
    ``{asnid: record}`` dict.
    """
    SKIP_KEYS = ['authorityStatus', 'educationLevel', 'listID', 'language', 'indexingStatus']
    NAMESPACES = ['http://purl.org/ASN/schema/core/',
                  'http://purl.org/dc/terms/',
                  'http://www.w3.org/2004/02/skos/core#',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
                  'http://purl.org/gem/qualifiers/']
    results = {}
    types_seen_so_far = []
    labels_seen_so_far = []
    for asnid, data in asn_data.items():
        result = {}
        for key, values in data.items():
            newkey = key
            for namespace in NAMESPACES:
                newkey = newkey.replace(namespace, '')
            if newkey in SKIP_KEYS:            # do not copy over SKIP_KEYS
                continue
            result[newkey] = [valitem['value'] for valitem in values]
        #
        # every record ends up with exactly one short type string
        if 'type' in result:
            result['type'] = result['type'][0].replace('http://purl.org/ASN/schema/core/', '')
        else:
            result['type'] = 'ContainerDocument'  # add type for the main container document
        if result['type'] not in types_seen_so_far:
            types_seen_so_far.append(result['type'])
        #
        if 'statementLabel' in result:
            label = result['statementLabel'][0]
            result['statementLabel'] = label
            if label not in labels_seen_so_far:
                labels_seen_so_far.append(label)
        results[asnid] = result
    print("ALL typeS SEEN: %s" % (types_seen_so_far,))
    print("ALL labelS SEEN: %s" % (labels_seen_so_far,))
    return results

asn_dictified = dictify_asn(asn_data)

ALL typeS SEEN: [u'Statement', u'StandardDocument', 'ContainerDocument']
ALL labelS SEEN: [u'Standard', u'Component', u'Cluster', u'Domain']


In [4]:
asn_dictified.keys()[26]
asn_dictified['http://asn.jesandco.org/resources/S1143469']

{u'altStatementNotation': [u'2.MD.10'],
 u'description': [u'Draw a picture graph and a bar graph (with single-unit scale) to represent a data set with up to four categories. Solve simple put-together, take-apart, and compare problems using information presented in a bar graph.'],
 u'exactMatch': [u'http://corestandards.org/Math/Content/2/MD/D/10',
  u'urn:guid:1E9847DB7B284F2C9E412F7EBF3A7A4D'],
 u'identifier': [u'http://purl.org/ASN/resources/S1143469'],
 u'isChildOf': [u'http://asn.jesandco.org/resources/S2390250'],
 u'isPartOf': [u'http://asn.jesandco.org/resources/D10003FB'],
 u'statementLabel': u'Standard',
 u'statementNotation': [u'CCSS.Math.Content.2.MD.D.10'],
 u'subject': [u'http://purl.org/ASN/scheme/ASNTopic/math'],
 u'type': u'Statement'}

## Step 2: add relationships

In [5]:
def add_prerequisites_and_usedfors(datadict):
    """
    Add the hierarchy links ``ispartof`` and ``contents`` to every record, in place.

    Note: despite its name, this function only builds the parent/child links;
    it does not add any prerequisite or usedfor data.

    ``ispartof`` is a copy of the record's ``isChildOf`` list (empty when the
    record has none, in which case a "has no parents" line is printed).
    ``contents`` is the inverse relation: the ids of all records that name
    this record in their ``ispartof``.

    Raises KeyError if a record names a parent id that is not in `datadict`.
    """
    # first pass to add ispartof and (empty) contents attributes
    for asnid, data in datadict.items():
        data['ispartof'] = list(data.get('isChildOf', []))
        data['contents'] = []
        if 'isChildOf' not in data:
            print("%s %s has no parents" % (asnid, data['type']))
    # second pass to add contents relationships (inverse of ispartof);
    # every record has an ispartof list by now, so no existence check is needed
    for asnid, data in datadict.items():
        for parent_id in data['ispartof']:
            datadict[parent_id]['contents'].append(asnid)
                
            
add_prerequisites_and_usedfors(asn_dictified)

http://asn.jesandco.org/resources/D10003FB StandardDocument has no parents
http://asn.jesandco.org/resources/D10003FB.xml ContainerDocument has no parents


## Step 3: add paths

In [6]:

def strip_cluster_to_domain(instr):
    """
    Reduce a statement notation to the domain part of its name.

    Removes, in this order and at most once each:
      1. a leading 'CCSS.Math.Content.' or 'CCSS.Math.Practice.',
      2. a leading grade prefix 'K.', '1.' ... '8.',
      3. a trailing cluster letter '.A' ... '.D'.

    Only the matched prefix/suffix itself is cut off.  (Using str.replace here
    would also delete identical text elsewhere in the string, e.g. turning
    'HSA.APR.A' into 'HSAPR'.)
    """
    pre_prefixes = ['CCSS.Math.Content.', 'CCSS.Math.Practice.']
    prefixes = ['K.', '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.']
    suffixes = ['.A', '.B', '.C', '.D']
    domain = instr
    for pre_prefix in pre_prefixes:
        if domain.startswith(pre_prefix):
            domain = domain[len(pre_prefix):]
            break
    for prefix in prefixes:
        if domain.startswith(prefix):
            domain = domain[len(prefix):]
            break
    for suffix in suffixes:
        if domain.endswith(suffix):
            domain = domain[:-len(suffix)]
            break
    return domain


def _infer_domain_path(data, datadict):
    """Set data['path'] for a Domain from its children's notations; return True on success."""
    if 'contents' not in data:
        print("No contents! so can't infer Domain")
        return False
    current_guess = None
    for child_id in data['contents']:
        child = datadict[child_id]
        if 'statementNotation' not in child:
            continue
        guess = strip_cluster_to_domain(child['statementNotation'][0])
        if current_guess and guess != current_guess:
            print("error children infer different parent...")
            print("%s %s" % (current_guess, guess))
        current_guess = guess   # the last child with a notation wins
    if not current_guess:
        print("Couldnt find Deomain for %s" % (data,))
        pprint(data['contents'])
        return False
    data['path'] = current_guess
    data['statementLabel'] = 'Domain'
    return True


def _infer_cluster_path(data):
    """Set data['path'] for a Cluster from its corestandards.org URL; return True on success."""
    urls = [val for val in data.get('exactMatch', [])
            if val.startswith('http://corestandards.org')]
    if not urls:
        print("Failed to recognize cluser: no corestandards.org URL in exactMatch")
        return False
    url = urls[0]
    path = url.replace('http://corestandards.org/Math/', '')
    path = path.replace('Practice/', '')
    path = path.replace('Content/', '')
    path = path.replace('/', '.')
    data['path'] = path
    print("Cluser: %s recognized from %s" % (path, url))
    return True


def _infer_unlabelled_path(asnid, data):
    """Set data['path'] for a record without a statementLabel; return True on success."""
    if asnid == 'http://asn.jesandco.org/resources/D10003FB':
        data['path'] = 'CCSSM'
        data['statementLabel'] = 'Standard'
        return True
    if asnid == 'http://asn.jesandco.org/resources/D10003FB.xml':
        # gets no path: addpaths deletes this record once the loop is done
        return True
    if 'description' in data:
        # use the upper-case letters of the description as the path
        desc = data['description'][0]
        data['path'] = ''.join([c for c in desc if c.isupper()])
        data['statementLabel'] = 'Domain'
        return True
    print("no statementLabel and no description")
    return False


def addpaths(datadict):
    """
    Ensures each data item has a `path` so we'll be able to pivot on it in the next step.

    Records with a `statementNotation` get it as their path, minus the
    'CCSS.Math.Content.' / 'CCSS.Math.Practice.' prefix.  For the others the
    path is inferred:
      - label 'Domain': from the notations of the records in `contents`,
      - label 'Cluster': from the corestandards.org URL in `exactMatch`,
      - no label: fixed path 'CCSSM' for the standard document, otherwise the
        upper-case letters of the description (the record is then labelled 'Domain').
    A message is printed for every record whose path could not be inferred.

    Modifies `datadict` in place and finally removes the
    'http://asn.jesandco.org/resources/D10003FB.xml' container record.
    """
    for asnid, data in datadict.items():
        if 'statementNotation' in data:
            full_path = data['statementNotation'][0]
            if 'Content' in full_path:
                data['path'] = full_path.replace('CCSS.Math.Content.', '')
            elif 'Practice' in full_path:
                data['path'] = full_path.replace('CCSS.Math.Practice.', '')
            else:
                print("don't know what to do with %s" % (full_path,))
            continue
        #
        # no statementNotation so we must infer a path from contents...
        if 'statementLabel' not in data:
            path_infer_success = _infer_unlabelled_path(asnid, data)
        elif data['statementLabel'] == 'Domain':
            path_infer_success = _infer_domain_path(data, datadict)
        elif data['statementLabel'] == 'Cluster':
            path_infer_success = _infer_cluster_path(data)
        else:
            print("UNEXPECTED LABEL %s %s" % (data['statementLabel'], asnid))
            path_infer_success = False
        #
        if not path_infer_success:
            print("Failed to infer path %s" % (asnid,))
    #
    # The ASN ContainerDocument has no links so safe to delete
    datadict.pop('http://asn.jesandco.org/resources/D10003FB.xml', None)

addpaths(asn_dictified)

In [7]:
asn_dictified.keys()[120:121]

[u'http://asn.jesandco.org/resources/S114342E']

In [8]:
asn_dictified['http://asn.jesandco.org/resources/S2390246']

{'contents': [u'http://asn.jesandco.org/resources/S1143454'],
 u'description': [u'Add and subtract within 20.'],
 u'exactMatch': [u'http://corestandards.org/Math/Content/2/OA/B',
  u'urn:guid:21FF72D85AF248E28B8AD028ABF94DDE'],
 u'hasChild': [u'http://asn.jesandco.org/resources/S1143454'],
 u'isChildOf': [u'http://asn.jesandco.org/resources/S114340A'],
 u'isPartOf': [u'http://asn.jesandco.org/resources/D10003FB'],
 'ispartof': [u'http://asn.jesandco.org/resources/S114340A'],
 'path': u'2.OA.B',
 u'statementLabel': u'Cluster',
 u'statementNotation': [u'CCSS.Math.Content.2.OA.B'],
 u'subject': [u'http://purl.org/ASN/scheme/ASNTopic/math'],
 u'type': u'Statement'}

In [9]:
# confirm paths exist and unique
paths_seen_so_far = []
for asnid, data in asn_dictified.iteritems():
    #
    # PATHs check
    if data.has_key('path'):
        path = data['path']
        if path in paths_seen_so_far:
            print "ERROR---duplicate path", path
        else:
            paths_seen_so_far.append(path)
    else:
        print 'NO PATH'
        print data

# report summary statistics
print "Number of items:", len(paths_seen_so_far)

def report_counts(asn_dictified):
    """
    Print how many records carry each statementLabel.

    Counts the labels 'Standard', 'Component', 'Cluster' and 'Domain' and
    prints the resulting dict.  A record with any other label is reported as
    "UNKNOWN LABEL!"; a record with a missing or empty label is reported as
    "Has no label".  Returns None.
    """
    counter_for_label = {'Standard': 'standard',
                         'Component': 'component',
                         'Cluster': 'cluster',
                         'Domain': 'domain'}
    counts = {'standard': 0, 'component': 0, 'cluster': 0, 'domain': 0}
    for asnid, data in asn_dictified.items():
        # .get() so that a record without the key is reported, not a KeyError
        label = data.get('statementLabel')
        if not label:
            print("Has no label %s" % (asnid,))
        elif label in counter_for_label:
            counts[counter_for_label[label]] += 1
        else:
            print("UNKNOWN LABEL! %s" % (asnid,))
    print(counts)



Number of items: 704


In [10]:
# print 
report_counts(asn_dictified)

{'cluster': 147, 'domain': 39, 'component': 124, 'standard': 394}


## Step 4: normalize attributes


In [11]:
# Copy every record, not just the outer dict.  With a plain .copy() both names
# share the same record dicts, so normalizing would also rewrite asn_dictified,
# and re-running this cell would apply [0] to an already-unwrapped string.
# A one-level copy per record is enough here because the loop below only
# reassigns or deletes keys; it never mutates a nested list in place.
asn_normalized = dict((asnid, dict(data)) for asnid, data in asn_dictified.items())
for asnid, data in asn_normalized.items():
    if 'description' in data:
        data['description'] = data['description'][0]
    if 'isPartOf' in data:
        del data['isPartOf']  # all identical to 'http://asn.jesandco.org/resources/D10003FB'
    if 'identifier' in data:
        data['identifier'] = data['identifier'][0]
        data['asn_url'] = data['identifier']
    if 'statementNotation' in data:
        data['statementNotation'] = data['statementNotation'][0]
    if 'altStatementNotation' in data:
        data['alias'] = data['altStatementNotation'][0]
    if 'subject' in data:
        del data['subject']  # we kind of know it's math ;)
    if 'exactMatch' in data:
        # split the foreign keys into the corestandards.org URL and the bare GUID
        for fk in data['exactMatch']:
            if fk.startswith('http://corestandards.org'):
                data['ccss_url'] = fk
            elif fk.startswith('urn:guid:'):
                data['ccss_guid'] = fk.replace('urn:guid:', '')

In [12]:
report_counts(asn_dictified)
report_counts(asn_normalized)

{'cluster': 147, 'domain': 39, 'component': 124, 'standard': 394}
{'cluster': 147, 'domain': 39, 'component': 124, 'standard': 394}


## Step 5: Load and process edge data

In [13]:
import itertools
import csv
# read all edge rows into a list of dicts (one per CSV row, keyed by column header);
# the with-block closes the file as soon as the rows have been read
with open('GraphData-EdgesOnly-Grade-00-08-2011-06-01-cleaned - EdgeSet.csv') as edges_file:
    edges_raw = list(csv.DictReader(edges_file))

In [14]:
# expand comma- and semicolon-separated lists, which represent multiple nodes
# convert 0. to K.


def csv_to_list(instr):
    """
    Expand a comma-separated endpoint spec into a list of node names.

    The first piece is taken as the complete name of the first node.  Each
    later piece replaces the final character of that name, so '3.OA.1,2'
    gives ['3.OA.1', '3.OA.2'].  A string without a comma is returned
    unchanged as a one-element list.

    NOTE(review): only the last character of the first name is replaced, so
    this presumably expects single-character final components -- confirm
    against the edge data.
    """
    if ',' not in instr:
        return [instr]
    pieces = instr.split(',')
    first_node = pieces[0]
    stem = first_node[:-1]
    return [first_node] + [stem + piece for piece in pieces[1:]]

                
def split_comments(instr):
    """
    Processes special formatting ( || and , and ;) in an edge endpoint spec.

    - Everything after the first '||' is the comment (None when there is no '||').
    - The part before it is split on ';'; each piece is expanded with
      csv_to_list, which handles ','-separated lists.
    - A leading grade '0.' is renamed to 'K.'.
    - The node names 'K.OA', '1.OA' and '2.OA' are replaced by the domain 'OA'.

    Returns a tuple (list of node names, comment).
    """
    if '||' in instr:
        # maxsplit=1: a second '||' stays in the comment instead of raising ValueError
        nodedata, comment = instr.split('||', 1)
    else:
        nodedata, comment = instr, None

    if ';' in nodedata:
        # for ,-separated list within a ;-separated list
        nodes = []
        for node in nodedata.split(';'):
            nodes.extend(csv_to_list(node))
    elif ',' in nodedata:
        nodes = csv_to_list(nodedata)
    else:
        nodes = [nodedata]

    renamed_nodes = []
    for node in nodes:
        # rename 0. to K. -- only the leading grade, not a later '0.' such as in '10.'
        if node.startswith('0.'):
            node = 'K.' + node[len('0.'):]
        # manually fix domain links 'K.OA', '1.OA', '2.OA'
        if node in ('K.OA', '1.OA', '2.OA'):
            node = 'OA'
        renamed_nodes.append(node)
    return renamed_nodes, comment


# Expand every raw CSV row into one edge per (start node, end node) pair.
# EdgeDesc decides the edge type; rows with any other EdgeDesc are reported and skipped.
edge_type_for_desc = {'Arrow': 'usedfor', 'Nondirectional link': 'related'}

edges = []
for raw_edge in edges_raw:
    if raw_edge['Note']:
        print("NOTE:: %s" % (raw_edge['Note'],))
    start_nodes, start_comment = split_comments( raw_edge['Begin'] )
    end_nodes, end_comment = split_comments( raw_edge['End'] )

    edge_type = edge_type_for_desc.get(raw_edge['EdgeDesc'])
    if edge_type is None:
        print("UKNOWN EDGE TYPE")
        continue
    for start_node, end_node in itertools.product(start_nodes, end_nodes):
        edges.append({'start': start_node,
                      'start_comment': start_comment,
                      'end': end_node,
                      'end_comment': end_comment,
                      'type': edge_type})

        
# clean edges: remove duplicates and self-edges
from collections import defaultdict

# group the edges by their (start, end) pair
edges_dict = defaultdict(list)
for edge in edges:
    edges_dict[ (edge['start'], edge['end']) ].append(edge)

# keep the first edge of every group, except groups whose start and end coincide
cleaned_edges = [edgelist[0]
                 for (start, end), edgelist in edges_dict.items()
                 if start != end]

In [15]:
len(edges), len(edges_dict), len(cleaned_edges)

(588, 567, 542)

In [16]:

# utility function
def find_by_path_or_alias(path, datadict=None):
    """
    Looks through `datadict` (default: the global `asn_normalized`) and
    returns the asnid of the element whose alias or path equals `path`.

    Matching order:
      1. the first record whose alias matches, or which has no alias and
         whose path matches;
      2. otherwise the first record that has an alias but matches by path
         (previously such records could not be found by their path at all).
    Prints "using path for ..." when the match was made on the path, and
    "not found ..." (returning None) when nothing matches.
    """
    if datadict is None:
        datadict = asn_normalized
    matched_by_path_only = None
    for asnid, data in datadict.items():
        if 'alias' in data:
            if path == data['alias']:
                return asnid
            if matched_by_path_only is None and 'path' in data and path == data['path']:
                # remember it, but keep looking for a match by the original rules
                matched_by_path_only = asnid
        elif 'path' in data:
            if path == data['path']:
                print("using path for %s" % (path,))
                return asnid
        else:
            print("No alias or path")
    if matched_by_path_only is not None:
        print("using path for %s" % (path,))
        return matched_by_path_only
    print("not found %s" % (path,))
    return None


# ensure we can find both endpoints of every relationship;
# stop here if any is missing, because the next step indexes asn_normalized
# with the lookup result and would fail on a None
unresolved_endpoints = [endpoint
                        for edge in cleaned_edges
                        for endpoint in (edge['start'], edge['end'])
                        if find_by_path_or_alias(endpoint) is None]
assert not unresolved_endpoints, "edge endpoints not found: %s" % (unresolved_endpoints,)




using path for OA


## Step 6: add prerequisite and related data

In [17]:
def create_or_append_to_list_attribute(thedict, attr_name, value):
    """Append `value` to the list stored at thedict[attr_name], creating the list if the key is absent."""
    thedict.setdefault(attr_name, []).append(value)
    

def _add_link(record, attr_name, target_id):
    """Record `target_id` under `attr_name` unless that link is already there."""
    if target_id not in record.get(attr_name, []):
        create_or_append_to_list_attribute(record, attr_name, target_id)


# Attach every edge to both of its endpoint records.
# Links are added only once: cleaned_edges is de-duplicated per ordered
# (start, end) pair, so a 'related' pair listed in both directions, or a
# re-run of this cell, would otherwise append the same id twice.
for edge in cleaned_edges:
    start_id = find_by_path_or_alias(edge['start'])
    start = asn_normalized[start_id]
    end_id = find_by_path_or_alias(edge['end'])
    end = asn_normalized[end_id]

    if edge['type'] == 'usedfor':
        _add_link(start, 'usedfors',      end_id)
        _add_link(end,   'prerequisites', start_id)

    elif edge['type'] == 'related':
        _add_link(start, 'related', end_id)
        _add_link(end,   'related', start_id)


using path for OA


## Step 7: replace all FK with `path`s instead of asnids

In [18]:
# pivot asn_normalized: records become keyed by their path, and every
# foreign-key list holds paths instead of asnids
fk_fields = ['contents', 'ispartof', 'related', 'prerequisites', 'usedfors']

path_dict = {}
for asnid, data in asn_normalized.items():
    clone = data.copy()
    for field in fk_fields:
        if field not in data:
            continue
        # translate each referenced asnid into the path of the record it names
        clone[field] = [asn_normalized[target_id]['path'] for target_id in data[field]]
    #
    path_dict[data['path']] = clone

In [19]:
report_counts(path_dict)

{'cluster': 147, 'domain': 39, 'component': 124, 'standard': 394}


In [20]:
path_dict['OA']

{'asn_url': u'http://purl.org/ASN/resources/S114340A',
 'contents': [u'K.OA.A',
  u'3.OA.C',
  u'3.OA.B',
  u'2.OA.B',
  u'2.OA.A',
  u'1.OA.C',
  u'1.OA.A',
  u'1.OA.B',
  u'3.OA.A',
  u'3.OA.D',
  u'1.OA.D',
  u'5.OA.A',
  u'5.OA.B',
  u'2.OA.C',
  u'4.OA.C',
  u'4.OA.B',
  u'4.OA.A'],
 u'description': u'Operations and Algebraic Thinking',
 u'hasChild': [u'http://asn.jesandco.org/resources/S1143411',
  u'http://asn.jesandco.org/resources/S114342D',
  u'http://asn.jesandco.org/resources/S2390245',
  u'http://asn.jesandco.org/resources/S114342E',
  u'http://asn.jesandco.org/resources/S114342F',
  u'http://asn.jesandco.org/resources/S2390246',
  u'http://asn.jesandco.org/resources/S1143430',
  u'http://asn.jesandco.org/resources/S114344F',
  u'http://asn.jesandco.org/resources/S114346D',
  u'http://asn.jesandco.org/resources/S114346E',
  u'http://asn.jesandco.org/resources/S114346F',
  u'http://asn.jesandco.org/resources/S1143470',
  u'http://asn.jesandco.org/resources/S1143490',
  u'ht

### Convert from alias to aliases and other normalization

In [21]:
# Final per-item clean-up on copies of the path_dict records:
#   alias    -> 'aliases' list (dropped when it just repeats the path)
#   comment  -> list joined into one newline-separated string
#   empty 'contents' / 'ispartof' lists are removed
#   statementLabel is renamed to __class__
normalized_path_dict = {}
for item_key, source_item in path_dict.items():
    item = source_item.copy()
    if 'alias' in item:
        if item['alias'] != item['path']:
            item['aliases'] = [item['alias']]
        del item['alias']
    if 'comment' in item:
        item['comment'] = '\n'.join( item['comment'] )
    for list_attr in ('contents', 'ispartof'):
        if list_attr in item and len(item[list_attr]) == 0:
            del item[list_attr]
    item['__class__'] = item.pop('statementLabel')
    normalized_path_dict[item_key] = item
        

## Step 8: write out YAML files to ccssm/ directory

In [22]:
import yaml
from collections import OrderedDict
DATA_DIR = 'ccssm/'

In [23]:
### Se

In [24]:
# attributes to export, in the order in which they are written for each item
export_attributes = [
    'path', '__class__', 'aliases', 'description', 'comment',
    'contents', 'ispartof', 'prerequisites', 'usedfors', 'related',
    'ccss_guid', 'ccss_url', 'asn_url',
]

# one OrderedDict per item, holding only the export attributes the item has
export_list = []
for path, data in normalized_path_dict.items():
    export_list.append(
        OrderedDict((attr, data[attr]) for attr in export_attributes if attr in data)
    )


    

In [25]:
len(export_list)

704

In [26]:
# analytics

def counts_by_grade(export_list):
    """
    Print how many items fall into each grade / high-school category.

    An item is counted under every grade prefix its path starts with
    ('K.', '1.' ... '8.', 'HSA', 'HSF', 'HSG', 'HSN', 'HSS').  Items matching
    no prefix are counted as 'practice' (path starts with 'MP'), 'domain' or
    'standard' (by their __class__), or else 'unrecognized'.

    Pretty-prints all counts followed by their total, and returns the list of
    unrecognized items (also printed when it is not empty).
    """
    grade_prefixes = ['K.', '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.',
                      'HSA', 'HSF', 'HSG', 'HSN', 'HSS']
    grades_counts = dict.fromkeys(grade_prefixes, 0)
    other_counts = {'practice': 0, 'domain': 0, 'standard': 0, 'unrecognized': 0}
    unrecognized_list = []
    for item in export_list:
        path = item['path']
        matching_prefixes = [prefix for prefix in grade_prefixes if path.startswith(prefix)]
        for prefix in matching_prefixes:
            grades_counts[prefix] += 1
        if matching_prefixes:
            continue
        # no grade prefix: classify by practice prefix or by __class__
        item_class = item.get('__class__', None)
        if path.startswith('MP'):
            other_counts['practice'] += 1
        elif item_class == 'Domain':
            other_counts['domain'] += 1
        elif item_class == 'Standard':
            other_counts['standard'] += 1
        else:
            other_counts['unrecognized'] += 1
            unrecognized_list.append(item)

    if len(unrecognized_list) > 0:
        print("Unrecognized nodes!  %s" % (unrecognized_list,))
    all_counts = grades_counts.copy()
    all_counts.update(other_counts)
    pprint( all_counts )
    print("total %s" % (sum( all_counts.values() ),))
    return unrecognized_list


unrec_list = counts_by_grade(export_list)
for item in unrec_list:
    print item['path']


{'1.': 35,
 '2.': 38,
 '3.': 48,
 '4.': 49,
 '5.': 51,
 '6.': 57,
 '7.': 52,
 '8.': 46,
 'HSA': 50,
 'HSF': 60,
 'HSG': 67,
 'HSN': 46,
 'HSS': 50,
 'K.': 34,
 'domain': 12,
 'practice': 8,
 'standard': 1,
 'unrecognized': 0}
total 704


### YAML writer helper function

In [27]:
import codecs
import yaml


# custom representer to handle OrderedDict
def represent_ordereddict(dumper, data):
    """
    YAML representer that writes an OrderedDict as a plain mapping, keeping
    the order of its items.

    The value stored under the key 'description' is represented as a string
    scalar with style '>'; all other values use the dumper's default
    representation.
    """
    def value_node_for(key_node, raw_value):
        if key_node.value == 'description':
            return dumper.represent_scalar(u'tag:yaml.org,2002:str', raw_value, style='>')
        return dumper.represent_data(raw_value)

    pairs = []
    for raw_key, raw_value in data.items():
        key_node = dumper.represent_data(raw_key)
        pairs.append((key_node, value_node_for(key_node, raw_value)))
    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', pairs)
yaml.SafeDumper.add_representer(OrderedDict, represent_ordereddict)




def write_list_to_file(export_list, filename):
    """
    Dump `export_list` as YAML to `filename` (UTF-8) and print a one-line summary.

    The text produced by yaml.safe_dump is post-processed line by line:
      - 'description: >-' becomes 'description: >',
      - each '- path:' item start is split into '-' plus '  path:' on the next line,
      - every '  -' is indented by two more spaces.
    The written file does not end with a newline.
    """
    dense_yaml = yaml.safe_dump(export_list,
                                allow_unicode=True,
                                default_flow_style=False,
                                indent=2)
    sparse_yaml_lines = []
    for line in dense_yaml.splitlines():
        line = line.replace('  description: >-',
                            '  description: >')
        line = line.replace('- path:',
                            "-\n  path:")
        line = line.replace('  -',
                            '    -')
        sparse_yaml_lines.append(line)

    dataout = '\n'.join(sparse_yaml_lines)
    # safe_dump may hand back UTF-8 encoded bytes; the codecs writer needs text
    if isinstance(dataout, bytes):
        dataout = dataout.decode('utf-8')
    # the with-block makes sure the file is flushed and closed
    with codecs.open(filename, 'w', encoding='utf-8') as outfile:
        outfile.write(dataout)
    print("wrote %s items to %s" % (len(export_list), filename))

    




## Sort alphabetically


In [28]:
# Order alphabetically

def alpha_but_kfirst_sort_function(path):
    """
    Sort key that puts kindergarten before grade 1.

    For a path starting with 'K.', returns the path with each 'K.' replaced
    by '0.'; any other path is returned unchanged.
    """
    return path.replace('K.', '0.') if path.startswith('K.') else path

# sort whole list by path, kindergarten first
sorted_export_list = sorted(export_list, key=lambda v: alpha_but_kfirst_sort_function(v['path']))

# sort fk-like attributes the same way
# ('prerequisites' must be spelled exactly like the attribute, or that list is silently left unsorted)
attrs_to_sort = ['prerequisites', 'usedfors', 'related', 'contents', 'ispartof']
for item in sorted_export_list:
    for attr in attrs_to_sort:
        if attr in item:
            item[attr] = sorted(item[attr], key=alpha_but_kfirst_sort_function)


In [29]:
# EXPORT ALL IN ONE SHOT
#
# outfile_name = DATA_DIR + 'all.yml'
# write_list_to_file(sorted_export_list, outfile_name)

print "skipping all.yml export step"










skipping all.yml export step


## Split by grade level and __class__

In [30]:

def split_by_grade_and_class(export_list):
    """
    Split the export items into one list per grade / category.

    An item goes into the list of every grade prefix its path starts with
    ('K.', '1.' ... '8.', 'HSA', 'HSF', 'HSG', 'HSN', 'HSS').  Items matching
    no prefix go to 'practice' (path starts with 'MP'), 'domain' or
    'standard' (by their __class__), or else 'unrecognized'.

    Returns a dict {category: [items]}.  The 'unrecognized' key is present
    only when there are such items (they are also printed).  Prints the total
    number of items placed.
    """
    grade_prefixes = ['K.', '1.', '2.', '3.', '4.', '5.', '6.', '7.', '8.',
                      'HSA', 'HSF', 'HSG', 'HSN', 'HSS']
    grades_items = dict((prefix, []) for prefix in grade_prefixes)
    other_items = {'practice': [], 'domain': [], 'standard': [], 'unrecognized': []}
    for item in export_list:
        path = item['path']
        matching_prefixes = [prefix for prefix in grade_prefixes if path.startswith(prefix)]
        for prefix in matching_prefixes:
            grades_items[prefix].append(item)
        if matching_prefixes:
            continue
        # no grade prefix: classify by practice prefix or by __class__
        item_class = item.get('__class__', None)
        if path.startswith('MP'):
            other_items['practice'].append(item)
        elif item_class == 'Domain':
            other_items['domain'].append(item)
        elif item_class == 'Standard':
            other_items['standard'].append(item)
        else:
            other_items['unrecognized'].append(item)
    if len(other_items['unrecognized']) > 0:
        print("Unrecognized nodes!  %s" % (other_items['unrecognized'],))
    else:
        del other_items['unrecognized']
    all_items = grades_items.copy()
    all_items.update(other_items)
    print("total %s" % (sum(len(items) for items in all_items.values()),))
    return all_items


In [31]:
export_lists = split_by_grade_and_class(sorted_export_list)

total 704


In [32]:
len(export_lists['domain'])

12

In [33]:
# EXPORT TO INDIVIDUAL FILES
# one YAML file per category; a grade key such as 'K.' loses its trailing dot in the filename
for list_name, list_items in export_lists.items():
    file_stem = list_name[:-1] if list_name.endswith('.') else list_name
    write_list_to_file(list_items, DATA_DIR + file_stem + '.yml')
    

wrote 12 items to ccssm/domain.yml
wrote 50 items to ccssm/HSA.yml
wrote 46 items to ccssm/8.yml
wrote 67 items to ccssm/HSG.yml
wrote 52 items to ccssm/7.yml
wrote 8 items to ccssm/practice.yml
wrote 46 items to ccssm/HSN.yml
wrote 34 items to ccssm/K.yml
wrote 57 items to ccssm/6.yml
wrote 1 items to ccssm/standard.yml
wrote 49 items to ccssm/4.yml
wrote 50 items to ccssm/HSS.yml
wrote 51 items to ccssm/5.yml
wrote 48 items to ccssm/3.yml
wrote 60 items to ccssm/HSF.yml
wrote 38 items to ccssm/2.yml
wrote 35 items to ccssm/1.yml


# Confirm we can load data back in


In [34]:
import yaml

# Read one of the exported files back in.
# safe_load is the counterpart of the safe_dump used for writing: it builds
# plain Python objects only and, unlike yaml.load, needs no Loader argument.
ccssm_data = None
grade5 = DATA_DIR + '5.yml'
with open(grade5, 'r') as yaml_file:
    ccssm_data = yaml.safe_load(yaml_file)

    


In [35]:
for data in ccssm_data:
    if data['path'] == '5.NF.B.5b':
        pprint(data)

{'__class__': 'Component',
 'aliases': ['5.NF.5.b'],
 'ccss_guid': 'FD677276B89E4F55AEEC482260D345C2',
 'ccss_url': 'http://corestandards.org/Math/Content/5/NF/B/5/b',
 'description': u'Explaining why multiplying a given number by a fraction greater than 1 results in a product greater than the given number (recognizing multiplication by whole numbers greater than 1 as a familiar case); explaining why multiplying a given number by a fraction less than 1 results in a product smaller than the given number; and relating the principle of fraction equivalence a/b = (n\xd7a)/(n\xd7b) to the effect of multiplying a/b by 1.\n',
 'ispartof': ['5.NF.B.5'],
 'path': '5.NF.B.5b'}


In [36]:
## TODO: Round trip load test

