In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Load each course's raw Piazza export (a JSON document) into memory.
# `with` closes the file handle even when json.load raises, which the
# manual open()/close() pairs did not guarantee.
with open('piazza_raw/c100_raw_piazza') as f1:
    c100_raw = json.load(f1)

with open('piazza_raw/cs61a_raw_piazza') as f2:
    cs61a_raw = json.load(f2)

with open('piazza_raw/cs61b_raw_piazza') as f3:
    cs61b_raw = json.load(f3)

### Take Out All Notes (non-question posts)

In [3]:
def extract_question_posts(post_list):
    """Filter a list of posts down to the ones that are questions.

    Args:
        post_list: Iterable of post dicts.

    Returns:
        A new list, in the original order, holding every post whose
        ``"type"`` value equals ``"question"``. Posts without a ``"type"``
        key are dropped.
    """
    return [entry for entry in post_list if entry.get("type") == "question"]

def extract_answers(post_list):
    # NOTE(review): unfinished stub. It only creates an empty list and then
    # falls off the end of the function, so it implicitly returns None and
    # never looks at `post_list`. Nothing in the visible notebook calls it;
    # either finish it or delete it.
    answers = []
    

In [5]:
# Keep only the question posts from each course's raw export.
c100_questions, cs61a_questions, cs61b_questions = map(
    extract_question_posts, (c100_raw, cs61a_raw, cs61b_raw)
)

Sometimes important information about a question appears only in its subject line, so keep the `subject` key as well as the content.

In [6]:
# Example: show the "history" of the first question. Here the "content" text
# only makes sense when read together with the "subject" field.
c100_questions[0].get("history")

[{'anon': 'no',
  'uid': 'j6ij3qk58yu2kd',
  'subject': 'Proj3 Gradescope Score',
  'created': '2019-05-21T17:10:36Z',
  'content': '<p>I don&#39;t think that&#39;s been release. When will we see that score as well?</p>'}]

### Answers - Want to Extract Answers from Each Post

In [7]:
# Inspect the child posts attached to one question.
c100_questions[2].get("children")

[{'folders': [],
  'data': {'embed_links': []},
  'children': [],
  'created': '2019-05-23T19:33:45Z',
  'bucket_order': 3,
  'tag_endorse': [{'role': 'student',
    'name': 'Kiran Kaur Brar',
    'endorser': {},
    'admin': False,
    'photo': None,
    'id': 'idrin5by7cepg',
    'photo_url': None,
    'published': True,
    'us': False,
    'facebook_id': None}],
  'bucket_name': 'Today',
  'id': 'jw1210jzp1p588',
  'history': [{'anon': 'no',
    'uid': 'is6p9qynt5gz9',
    'subject': '',
    'created': '2019-05-23T19:33:45Z',
    'content': 'I accidentally overwrote some scores on Gradescope for the written part of project 2. However, this was after I moved all scores to okpy. I see your score for the written part in okpy, so you have nothing to worry about.\xa0'}],
  'type': 'i_answer',
  'tag_endorse_arr': ['idrin5by7cepg'],
  'config': {},
  'is_tag_endorse': False}]

## NOTE: It looks like every question has been answered, which is unusual for Piazza

In [8]:
# Split the c100 questions into "answered" posts and answered posts that have
# no instructor answer, and check whether any post ever has more than one
# instructor answer.
c100_answered = []
c100_student_answered = []
for post in c100_questions:
    # Count the direct children that are instructor answers.
    num_instructor_answer = sum(
        child.get("type") == "i_answer" for child in post.get("children")
    )
    # A post counts as answered when it does not carry the "unanswered" tag.
    is_answered = "unanswered" not in post.get("tags")
    if is_answered:
        c100_answered.append(post)
        # Answered, but not by an instructor.
        if num_instructor_answer == 0:
            c100_student_answered.append(post)
    if num_instructor_answer > 1:
        print("ERROR - more than one instructor answer")

In [22]:
# Split the cs61a questions into "answered" posts and answered posts that have
# no instructor answer, and check whether any post ever has more than one
# instructor answer.
cs61a_answered = []
cs61a_student_answered = []
for post in cs61a_questions:
    # Count the direct children that are instructor answers.
    num_instructor_answer = sum(
        child.get("type") == "i_answer" for child in post.get("children")
    )
    # A post counts as answered when it does not carry the "unanswered" tag.
    is_answered = "unanswered" not in post.get("tags")
    if is_answered:
        cs61a_answered.append(post)
        # Answered, but not by an instructor.
        if num_instructor_answer == 0:
            cs61a_student_answered.append(post)
    if num_instructor_answer > 1:
        print("ERROR - more than one instructor answer")

In [23]:
# Split the cs61b questions into "answered" posts and answered posts that have
# no instructor answer, and check whether any post ever has more than one
# instructor answer.
cs61b_answered = []
cs61b_student_answered = []
for post in cs61b_questions:
    # Count the direct children that are instructor answers.
    num_instructor_answer = sum(
        child.get("type") == "i_answer" for child in post.get("children")
    )
    # A post counts as answered when it does not carry the "unanswered" tag.
    is_answered = "unanswered" not in post.get("tags")
    if is_answered:
        cs61b_answered.append(post)
        # Answered, but not by an instructor.
        if num_instructor_answer == 0:
            cs61b_student_answered.append(post)
    if num_instructor_answer > 1:
        print("ERROR - more than one instructor answer")

We want: id, subject, content, answer_type, answer (but what about posts that have both a student answer and an instructor answer?)

How about: id, subject, content, student_answer, instructor_answer

In [9]:
def get_answers(post):
    """Return the instructor and student answer text for one post.

    Scans the post's direct ``children`` for entries whose ``type`` is
    ``"i_answer"`` or ``"s_answer"`` and reads ``content`` from the last
    entry of that child's ``history`` list.

    Args:
        post: Post dict; its ``children`` value (if any) is a list of dicts.

    Returns:
        Tuple ``(instructor_answer, student_answer)``. An element is the
        string ``"None"`` (a sentinel, not the ``None`` object) when no
        answer of that kind was found.

    Notes:
        * When several children share a type, the last one in the list wins.
        * A missing or empty ``children`` or ``history`` is skipped rather
          than raising ``TypeError`` / ``IndexError``.
        * NOTE(review): ``history[-1]`` is taken as the revision to keep;
          confirm whether the export orders history oldest-first or
          newest-first.
    """
    instructor_answer = "None"
    student_answer = "None"
    for child in post.get("children") or []:
        history = child.get("history")
        if not history:
            # No revision to read from; leave the current value untouched.
            continue
        kind = child.get("type")
        if kind == "i_answer":
            instructor_answer = history[-1].get("content")
        elif kind == "s_answer":
            student_answer = history[-1].get("content")
    return instructor_answer, student_answer

def get_question_content(post):
    """Return the ``content`` field of the last entry in the post's ``history``.

    Yields ``None`` when that entry has no ``content`` key.
    """
    last_entry = post.get("history")[-1]
    return last_entry.get("content")

def get_subject(post):
    """Return the ``subject`` field of the last entry in the post's ``history``.

    Yields ``None`` when that entry has no ``subject`` key.
    """
    last_entry = post.get("history")[-1]
    return last_entry.get("subject")

In [24]:
# Flatten every answered c100 post into one flat record per question.
formatted_QA_c100 = []
for post in c100_answered:
    i_answer, s_answer = get_answers(post)
    post_dict = dict(
        course="DS100",
        id=post.get("id"),
        subject=get_subject(post),
        content=get_question_content(post),
        student_answer=s_answer,
        instructor_answer=i_answer,
        folders=post.get("folders"),
    )
    formatted_QA_c100.append(post_dict)

In [25]:
# Flatten every answered cs61a post into one flat record per question.
formatted_QA_cs61a = []
for post in cs61a_answered:
    i_answer, s_answer = get_answers(post)
    post_dict = dict(
        course="CS61A",
        id=post.get("id"),
        subject=get_subject(post),
        content=get_question_content(post),
        student_answer=s_answer,
        instructor_answer=i_answer,
        folders=post.get("folders"),
    )
    formatted_QA_cs61a.append(post_dict)

In [26]:
# Flatten every answered cs61b post into one flat record per question.
formatted_QA_cs61b = []
for post in cs61b_answered:
    i_answer, s_answer = get_answers(post)
    # Fix: these records were labelled "DS100" (copy-paste from the c100
    # cell), which mislabelled every CS61B question in cs61b.json.
    post_dict = {"course": "CS61B",
                 "id": post.get("id"),
                 "subject": get_subject(post),
                 "content": get_question_content(post),
                 "student_answer": s_answer,
                 "instructor_answer": i_answer,
                 "folders": post.get("folders")}
    formatted_QA_cs61b.append(post_dict)

In [30]:
# Write the formatted c100 records to disk as JSON.
with open('c100.json', mode='w') as fout:
    json.dump(obj=formatted_QA_c100, fp=fout)

In [31]:
# Write the formatted cs61a records to disk as JSON.
with open('cs61a.json', mode='w') as fout:
    json.dump(obj=formatted_QA_cs61a, fp=fout)

In [32]:
# Write the formatted cs61b records to disk as JSON.
with open('cs61b.json', mode='w') as fout:
    json.dump(obj=formatted_QA_cs61b, fp=fout)