In [138]:
# INFO 300
# Muatasim Qazi
# Parse UW Course Evaluation Data
# 12/8/2018

In [None]:
import pandas as pd
import numpy as np
import json
import sys
import ijson
from ijson import parse
import os
import ujson
from pandas.io.json import json_normalize
from pandas.io.json import read_json

In [139]:
# Load the small sample file. The encoding is stated explicitly so decoding
# does not depend on the platform's default locale (JSON interchange text is
# expected to be UTF-8).
# NOTE(review): nothing visible in this notebook reads `data` before it is
# reassigned in a later cell - confirm whether this cell is still needed.
with open('sample.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [140]:
# extract department name from the class value
def get_dept(class_name):
    """Return the department portion of a full class title.

    The title is split on single spaces and scanned for the first token that
    is either all digits or whose cased characters are all upper case
    (for example "A", "INFO" or "101"). Everything before that token,
    re-joined with single spaces, is returned as the department name.

    Returns None when no such token exists and the split produced no empty
    token. If no token qualifies but an empty token is present (empty title
    or consecutive spaces), the text before that empty token is returned.
    """
    tokens = class_name.split(' ')

    # A single capital letter already satisfies isupper(), so one test covers
    # both one-letter and multi-letter course-code prefixes.
    marker = ""
    for token in tokens:
        if token.isdigit() or token.isupper():
            marker = token
            break

    if marker not in tokens:
        return None
    return " ".join(tokens[:tokens.index(marker)])

In [141]:
# remove extra characters and convert to int
def conv_to_int(a_str):
    """Parse an integer from a string, ignoring surrounding double quotes.

    Only leading and trailing '"' characters are removed; the remainder is
    handed to int(), which raises ValueError if it is not a valid integer.
    """
    unquoted = a_str.strip('"')
    return int(unquoted)

In [142]:
# helper functions to categorize the quarter and year data
def fall():
    """Return the display name used for quarter code 'AU'."""
    return "Fall"


def winter():
    """Return the display name used for quarter code 'WI'."""
    return "Winter"


def spring():
    """Return the display name used for quarter code 'SP'."""
    return "Spring"


def summer():
    """Return the display name used for quarter code 'SU'."""
    return "Summer"


# two-letter quarter code -> function producing the quarter's display name
switcherA = {
        'SU': summer,
        "AU": fall,
        'WI': winter,
        'SP': spring
    }


def qt_to_quarter(qt):
    """Translate a two-letter quarter code into its display name.

    qt: one of the keys of switcherA ('SU', 'AU', 'WI', 'SP').

    Returns the matching quarter name, or the string "nothing" when the code
    is not in switcherA, so callers should check for that sentinel.
    """
    # The looked-up value is always called, so the fallback has to be callable
    # too; a bare string default would raise TypeError for unknown codes.
    func = switcherA.get(qt, lambda: "nothing")
    return func()


def get_quarter_and_year(a_str):
    """Split a quarter string such as "SU18" into ("Summer", 2018).

    The first two characters are translated by qt_to_quarter; the remaining
    characters are prefixed with "20" and parsed as an integer year.
    """
    quarter_code, year_suffix = a_str[:2], a_str[2:]
    return qt_to_quarter(quarter_code), int('20' + year_suffix)
    

In [143]:
# helper functions to categorize data
def excellent(num):
    """Pair a raw statistic value with the label "excellent"."""
    return "excellent", num


def very_good(num):
    """Pair a raw statistic value with the label "very_good"."""
    return "very_good", num


def good(num):
    """Pair a raw statistic value with the label "good"."""
    return "good", num


def fair(num):
    """Pair a raw statistic value with the label "fair"."""
    return "fair", num


def poor(num):
    """Pair a raw statistic value with the label "poor"."""
    return "poor", num


def very_poor(num):
    """Pair a raw statistic value with the label "very_poor"."""
    return "very_poor", num


def median(num):
    """Pair a statistic value, converted to float, with the label "median"."""
    return "median", float(num)


# position of a value inside one statistics row -> labelling function
switcherB = {
        0: excellent,
        1: very_good,
        2: good,
        3: fair,
        4: poor,
        5: very_poor,
        6: median
    }


def num_to_map(per, val):
    """Label one statistics entry according to its position in the row.

    per: the raw value found at that position.
    val: zero-based position of the value; 0-5 select the rating labels and
         6 selects the median (see switcherB).

    Returns a (label, value) tuple; the value is passed through unchanged
    except for the median, which is converted to float.

    Raises ValueError when val has no entry in switcherB.
    """
    func = switcherB.get(val)
    if func is None:
        # Fail with the offending position instead of attempting to call None.
        raise ValueError(
            "no statistic category defined for position {!r}".format(val))
    return func(per)

In [156]:
# a set of all UW department names
uw_department_names = set()

# the new data to be populated from the parsed data
all_new_data = []

# The handle gets its own name so it does not overwrite the `data` variable
# assigned in an earlier cell.
with open('all_course_evaluations_latest.json', 'rb') as evaluations_file:

    # ijson.items(..., 'item') iterates over the elements of the top-level
    # JSON array one object at a time.
    for obj in ijson.items(evaluations_file, 'item'):
        class_name = obj['class'][0]
        single_eval = {}

        # extract the name of the department
        department = get_dept(class_name)
        if department is None:
            # get_dept found no course-code token; report the offending title
            # here instead of failing later on len(None).
            raise ValueError(
                "cannot determine department for class {!r}".format(class_name))

        uw_department_names.add(department)

        single_eval["department"] = department
        # whatever follows the department name is the course code and section
        single_eval["class"] = class_name[len(department):].strip()

        single_eval["lecturer"] = obj['lecturer']
        single_eval["enrolled"] = conv_to_int(obj['enrolled'])

        qtr, year = get_quarter_and_year(obj['quarter'])

        single_eval["quarter_name"] = qtr
        single_eval["year"] = year

        statistics = obj['statistics']

        single_eval["statistic"] = {}

        # NOTE(review): question keys are stored verbatim. The preview at the
        # end of this notebook shows both 'The course as a whole:' and
        # 'The course as a whole: ' (trailing space) - consider normalizing
        # with key.strip() if downstream code groups by question.
        for key, stats_item_list in statistics.items():
            single_eval["statistic"][key] = []
            # the position of each value in the row decides its label
            for i, item in enumerate(stats_item_list):
                key_a, value = num_to_map(item, i)

                single_eval["statistic"][key].append({key_a: value})

        # add the single evaluation data to the new data
        all_new_data.append(single_eval)
        

In [157]:
# the total number of rows (parsed evaluations)
len(all_new_data)

15049

In [166]:
# Serialize the parsed evaluations to a JSON string.
# NOTE(review): after this line `data` is already JSON text; passing it to
# json.dump/json.dumps again would encode it a second time as one quoted string.
data = json.dumps(all_new_data)


In [167]:
# write the clean data to a new json file
# Dump the list of records itself: dumping an already serialized JSON string
# would wrap the whole payload in a second layer of encoding, leaving the file
# with a single quoted string instead of an array of objects.
with open('data_clean.json', 'w') as outfile:
    json.dump(all_new_data, outfile)

In [162]:
# one row per parsed evaluation; the list of record dicts goes straight to the constructor
df = pd.DataFrame(all_new_data)

In [164]:
# preview the first three parsed evaluations
df.head(3)

Unnamed: 0,class,department,enrolled,lecturer,quarter_name,statistic,year
0,A A 101 A,Aeronautics & Astronautics,8,Christopher Lum,Summer,{'The course as a whole:': [{'excellent': '71%...,2017
1,A A 198 A,Arts & Sciences,24,Christopher Lum,Fall,{'The course as a whole: ': [{'excellent': '39...,2017
2,A A 210 A,Aeronautics & Astronautics,70,Dana Dabiri,Spring,{'The course as a whole: ': [{'excellent': '27...,2018
