In [None]:
!pip install flashtext

In [None]:
import re
import os
import ast
import math
import nltk
import hashlib
import flashtext
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive. `data_dir` is a relative `drive/MyDrive/...` path, so this
# presumably relies on the working directory being /content (the Colab default) — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Term tag interpolated into `class_id` and `multi_turn_class_id` (presumably Fall 2023 — confirm).
m = "fa23"

In [None]:
# Folder name of the class inside "data (phase 0)", where the raw threads and the roster are read from.
class_id = f"data8_{m}"

In [None]:
# Folder name used for this notebook's own outputs inside the phase 1, 3 and 4 directories.
multi_turn_class_id = f"data8_{m}_multiturn"

In [None]:
# Root data directory on the mounted Drive; every phase path is built by appending to this string.
data_dir = 'drive/MyDrive/EdSupport/Deployment/Ed_Data_Processing/Data'

In [None]:
# NOTE(review): this silences every warning for the rest of the notebook, including pandas
# warnings (e.g. about chained assignment) that would otherwise point at real problems.
import warnings
warnings.filterwarnings('ignore')

## Phase 1 - anonymization

In [None]:
# Load the phase 0 thread data for this class.
threads = pd.read_json(f"{data_dir}/data (phase 0)/{class_id}/data.json")

Anonymization

In [None]:
def anonymize(df):
    """
    Substitutes the user metadata of each thread/answer/comment with an anonymous id and a role.

    Works in place: `df` gains `user_id` and `user_role` columns and loses `user`. Every dict in
    the nested `answers` / `comments` lists is rewritten the same way, recursively.

    :param df: a dataframe containing multiple threads/comments with user and thread metadata.
    :return: None; `df` and the nested dicts it references are modified in place.
    """
    if df.shape[0] == 0:
        return

    df["user_id"] = df["user"].apply(user_to_id)
    df["user_role"] = df["user"].apply(lambda s: s["role"])
    df.drop(columns=["user"], inplace=True)

    # Answers and comments are nested the same way, so both go through one code path.
    for column in ("answers", "comments"):
        if column in df.columns:
            for children in df[column]:
                _anonymize_children(children)


def _anonymize_children(children):
    """Anonymizes one nested list of answer/comment dicts in place."""
    # A row without children holds NaN (a float) or None instead of a list; there is
    # nothing to anonymize there, and pd.DataFrame(NaN) would raise.
    if not isinstance(children, list) or not children:
        return

    child_df = pd.DataFrame(children)
    # The frame's object columns reference the same nested lists/dicts as `children`,
    # so this call also anonymizes everything nested below this level.
    anonymize(child_df)
    for position, child in enumerate(children):
        child["user_id"] = child_df.loc[position, "user_id"]
        child["user_role"] = child_df.loc[position, "user_role"]
        del child["user"]

In [None]:
def user_to_id(user):
    """
    Derives a pseudonymous identifier for a user from their email address.

    The MD5 digest of the UTF-8 encoded email is read as a base-16 integer, written out in
    decimal, and cut down to its first 12 characters.

    :param user: user metadata; only the "email" entry is read.
    :return: the identifier, a string of decimal digits.
    """
    email_digest = hashlib.md5(user["email"].encode('utf-8')).hexdigest()
    decimal_digits = str(int(email_digest, 16))
    return decimal_digits[:12]

In [None]:
# Anonymize in place: `threads` loses its `user` column and gains `user_id` / `user_role`.
anonymize(threads)

Saving anonymized data

In [None]:
# Make sure the phase 1 output folder exists. makedirs also creates a missing
# "data (phase 1)" parent and does nothing when the folder is already there.
os.makedirs(data_dir + "/data (phase 1)/" + multi_turn_class_id, exist_ok=True)

# DataFrame.to_json does not accept index=False for its default orient ("columns"),
# so the frame is written with its index; pd.read_json reads this layout back.
threads.to_json(data_dir + "/data (phase 1)/" + multi_turn_class_id + "/data_anonymized.json")

Preliminary structuring and cleaning

In [None]:
# Reload the anonymized threads written in phase 1 and report how many there are.
threads = pd.read_json(f"{data_dir}/data (phase 1)/{multi_turn_class_id}/data_anonymized.json")
print(threads.shape[0])
threads

In [None]:
def subthread_extractor(comment):
    """
    Converts a thread/answer/comment into a nested dict holding only the conversation fields.

    :param comment: any object with a dict-style `.get` (a dict or a DataFrame row) describing
                    one post; its "comments" and "answers" entries, when present, are lists of
                    posts of the same shape.
    :return: dict with the keys text, user_id, user_role, endorsed, document, created_at and
             follow_ups. follow_ups holds the converted children: comments first, then answers.
    """
    copied_fields = ("text", "user_id", "user_role", "endorsed", "document", "created_at")
    comment_data = {field: comment.get(field) for field in copied_fields}
    comment_data["follow_ups"] = []

    for child_key in ("comments", "answers"):
        children = comment.get(child_key)
        # Only real sequences are walked. A missing child list can show up as None or as a
        # float NaN; NaN is truthy, so a plain truthiness test would try to iterate it.
        if isinstance(children, (list, tuple)):
            comment_data["follow_ups"].extend(subthread_extractor(child) for child in children)

    return comment_data

In [None]:
def _is_unendorsed_student(post):
    """True when `post` was written by a student and has not been endorsed."""
    return post['user_role'] == 'student' and not post['endorsed']


def find_conversations(comment, path=None, conversations=None, added_last=False):
    """
    Collects every "student comment -> reply" conversation in a comment tree.

    Walks `comment` and its nested 'follow_ups' depth-first. Whenever an unendorsed student
    comment is directly followed by a reply that is NOT an unendorsed student comment, the
    chain of comments leading down to that reply is recorded as one conversation. Replies
    further down extend the chain, so a long exchange yields one conversation per such reply.

    Before returning, each recorded conversation is trimmed from the front until it starts
    with an unendorsed student comment, and the 'follow_ups' key is removed from every comment
    in it. The comment dicts are shared with the input tree, so the tree is modified as well.

    :param comment: dict with 'user_role', 'endorsed' and 'follow_ups' (a list of such dicts).
    :param path: comments leading to `comment`; a new empty list when omitted.
    :param conversations: list the results are appended to; a new empty list when omitted.
    :param added_last: True when `comment` is already the last element of `path`.
    :return: `conversations`, a list of conversations, each a list of comment dicts.
    """
    # Fresh lists per call: a mutable default argument is shared between calls, so the
    # conversations found for one thread would otherwise show up in the next result.
    if path is None:
        path = []
    if conversations is None:
        conversations = []

    # Append the current comment to the path unless the caller already did.
    current_path = path + [comment] if not added_last else path

    if _is_unendorsed_student(comment):
        for follow_up in comment['follow_ups']:
            if not _is_unendorsed_student(follow_up):
                valid_convo = current_path + [follow_up]
                conversations.append(valid_convo)
                # Keep walking below the reply; it is already the tail of valid_convo.
                find_conversations(follow_up, valid_convo, conversations, added_last=True)
            else:
                find_conversations(follow_up, current_path, conversations)
    else:
        for follow_up in comment['follow_ups']:
            find_conversations(follow_up, current_path, conversations)

    for convo in conversations:
        # Drop leading comments until the conversation opens with an unendorsed student comment.
        while convo and not _is_unendorsed_student(convo[0]):
            convo.pop(0)

        for turn in convo:
            turn.pop('follow_ups', None)

    return conversations

In [None]:
def process_table(df):
    """
    Expands each thread row into one row per conversation found in that thread.

    Every row is converted with `subthread_extractor` and searched with `find_conversations`.
    Each conversation produces a copy of the row with the conversation stored under 'memory'.
    Rows without any conversation produce no output row.

    :param df: dataframe of threads, one thread per row.
    :return: new dataframe with the columns of `df` plus 'memory'.
    """
    new_rows = []
    for _, row in df.iterrows():
        comment_dict = subthread_extractor(row)
        convos = find_conversations(comment_dict, path=[], conversations=[], added_last=False)

        for convo in convos:
            # A conversation is expected to open with an unendorsed student comment;
            # drop one leading comment if it does not.
            if not (convo[0]['user_role'] == 'student' and not convo[0]['endorsed']):
                convo.pop(0)
            new_row = row.to_dict()
            new_row['memory'] = convo
            new_rows.append(new_row)

    # Naming the columns keeps the schema when no conversation was found at all; a bare
    # DataFrame([]) has no columns, which breaks any later column access or drop.
    columns = list(df.columns)
    if 'memory' not in columns:
        columns.append('memory')
    new_df = pd.DataFrame(new_rows, columns=columns)
    return new_df

# One row per conversation; the raw nested columns and the URL are no longer needed afterwards.
threads = process_table(threads).drop(columns=['url', 'answers', 'comments'])

In [None]:
def extract_and_remove_by_index(memory_list, index):
    """
    Takes the element at `index` out of `memory_list` and hands it back.

    :param memory_list: list to remove from; modified in place.
    :param index: position of the element (negative values count from the end).
    :return: the removed element.
    """
    removed = memory_list[index]
    del memory_list[index]
    return removed

# The last two turns of each conversation are its (question, answer) pair. Taking index -2
# first and then -1 removes both from `memory` in place, leaving only the earlier turns.
questions = threads['memory'].apply(lambda turns: extract_and_remove_by_index(turns, -2))
answers = threads['memory'].apply(lambda turns: extract_and_remove_by_index(turns, -1))

# Fields taken from the question turn (created_at replaces the thread's own timestamp).
threads['question'] = questions.apply(lambda turn: turn['text'])
threads['document_q'] = questions.apply(lambda turn: turn['document'])
threads['created_at'] = questions.apply(lambda turn: turn['created_at'])

# Fields taken from the answer turn.
threads['answer'] = answers.apply(lambda turn: turn['text'])
threads['document_a'] = answers.apply(lambda turn: turn['document'])

# The thread's own document becomes document_p, and the title is prefixed to the question.
threads = threads.rename(columns={'document': 'document_p'})
threads["question"] = threads["title"] + ": " + threads["question"]

# Keep only the columns used downstream, in this order.
threads = threads[[
    "type",
    "created_at",
    "category",
    "subcategory",
    "title",
    "text",
    "document_p",
    "memory",
    "question",
    "document_q",
    "answer",
    "document_a",
]]


Saving initially cleaned data

In [None]:
# Make sure the phase 3 output folder exists. makedirs also creates a missing
# "data (phase 3)" parent and does nothing when the folder is already there.
os.makedirs(data_dir + "/data (phase 3)/" + multi_turn_class_id, exist_ok=True)

threads.to_csv(data_dir + "/data (phase 3)/" + multi_turn_class_id + "/data_anonymized_cleaned.csv", index=False)

## Phase 4 - removing threads with confidential information

Reading cleaned data from phase 3.

In [None]:
# Reload the phase 3 CSV. `memory` comes back from the CSV as text, so it is parsed
# back into Python objects with ast.literal_eval.
qa_data = pd.read_csv(f"{data_dir}/data (phase 3)/{multi_turn_class_id}/data_anonymized_cleaned.csv")
qa_data["memory"] = qa_data["memory"].apply(ast.literal_eval)
qa_data

Replacing first names that appear in the Ed roster with the placeholders "Staff x" and "Student x".

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
from nltk.corpus import stopwords
# Download the NLTK stopword corpus that stopwords.words() reads from.
nltk.download('stopwords')

# Set of English stopwords, used to filter the roster first names.
english_stopwords = set(stopwords.words('english'))

In [None]:
# Load the class roster and ignore rows without a name.
roster = pd.read_csv(data_dir + "/data (phase 0)/" + class_id + "/roster.csv")
roster = roster[~roster["Name"].isna()]

# Anyone whose role is not "student" is treated as staff.
staff_names = roster[roster["Role"] != "student"]["Name"]
student_names = roster[roster["Role"] == "student"]["Name"]

# The first whitespace-separated token is the first name; the rest is the last name.
staff_first_names = staff_names.str.split().str[0]
staff_last_names = staff_names.str.split().str[1:].apply(lambda x: " ".join(x))

student_first_names = student_names.str.split().str[0]
student_last_names = student_names.str.split().str[1:].apply(lambda x: " ".join(x))

# Unique first names, minus those that are English stopwords. The comparison uses the
# lower-cased name: the NLTK stopword list is lower-case, so a capitalized roster name such
# as "Will" would never equal a stopword and every "will" in the text would later be replaced.
# NOTE(review): the flip side is that people whose first name is a stopword are not
# replaced at all — confirm this trade-off is acceptable.
# dropna() skips names that had no first token; sorting makes the list order reproducible.
staff_first_names = sorted(
    {name for name in staff_first_names.dropna() if name.lower() not in english_stopwords}
)
student_first_names = sorted(
    {name for name in student_first_names.dropna() if name.lower() not in english_stopwords}
)

In [None]:
# Matcher that rewrites every roster first name to a role placeholder.
keyword_processor = flashtext.KeywordProcessor()
keyword_dictionary = {
    "Staff x" : staff_first_names,
    "Student x" : student_first_names
}
keyword_processor.add_keywords_from_dict(keyword_dictionary)

for column in tqdm(qa_data.columns):
    # "memory" holds lists of dicts rather than text, so it is skipped here.
    if column == "memory":
        continue

    # Missing values become empty strings. fillna on the column replaces the chained
    # `qa_data[column][mask] = ""`, which assigns through a temporary object and is not
    # guaranteed to reach `qa_data`.
    qa_data[column] = qa_data[column].fillna("")

    replaced = []
    for cell_text in list(qa_data[column]):
        try:
            replaced.append(keyword_processor.replace_keywords(cell_text))
        except Exception:
            # Best effort: keep the value unchanged and ask for a manual check. Catching
            # Exception (not a bare except) still lets a keyboard interrupt stop the loop.
            print(f"ERROR: following text could not be parsed: \n{cell_text}\n")
            print("If there are any student/staff names contained in this text, please manually remove them.")
            replaced.append(cell_text)
    qa_data[column] = replaced


In [None]:
def process_dictionary(dictionary):
    """
    Runs every string value of `dictionary` through `keyword_processor.replace_keywords`.

    Non-string values are left untouched. The dictionary is updated in place.

    :param dictionary: dict whose string values should have keywords replaced.
    :return: the same dictionary object.
    """
    for key in dictionary:
        current = dictionary[key]
        if not isinstance(current, str):
            continue
        dictionary[key] = keyword_processor.replace_keywords(current)
    return dictionary

# Apply the same name replacement to every turn stored in `memory`.
qa_data["memory"] = qa_data["memory"].apply(lambda turns: list(map(process_dictionary, turns)))

Removing data points that contain the keywords "dsp", "extension", or "extenuating", or a "#" character

In [None]:
# Regex alternation used to filter rows: matches "dsp", "extension", "extenuating" or a "#".
keywords = r"(dsp)|(extension)|(extenuating)|(#)"

In [None]:
pattern = re.compile(keywords)

# Drop rows whose question or answer contains one of the keywords. The text is lower-cased
# before matching; na=False treats a missing value as "no match" instead of producing a
# NaN in the mask, which `~` cannot negate.
for column in ["question", "answer"]:
    if column in qa_data.columns:
        qa_data = qa_data[~qa_data[column].str.lower().str.contains(keywords, na=False)]

# Drop rows where any value of any turn in `memory` contains one of the keywords. Values are
# lower-cased here as well, so "DSP" or "Extension" in an earlier turn is caught exactly like
# it is in the question and answer columns.
if "memory" in qa_data.columns:
    flagged = qa_data["memory"].apply(
        lambda turns: any(
            pattern.search(str(value).lower()) for turn in turns for value in turn.values()
        )
    ).astype(bool)  # keeps the mask boolean even when no rows are left
    qa_data = qa_data[~flagged]


Saving the files.

In [None]:
# Make sure the phase 4 output folder exists. makedirs also creates a missing parent
# folder and does nothing when the folder is already there.
# NOTE(review): this path is spelled "data_(phase_4)" with underscores, unlike the
# "data (phase N)" folders used for the other phases — confirm that is intended.
os.makedirs(data_dir + "/data_(phase_4)/" + multi_turn_class_id, exist_ok=True)

qa_data.to_csv(data_dir + "/data_(phase_4)/" + multi_turn_class_id + "/qa.csv", index=False)

In [None]:
# Display the filtered data that was written to qa.csv.
qa_data