In [None]:
import pandas as pd
import numpy as np
import json
import firebase_admin
import pandas as pd
import random
import requests
import ast
import fire
import pymongo
import pickle
import itertools
from firebase_admin import credentials, firestore, storage
from random import choice
from string import ascii_uppercase
from adaptive_learning.scheduler import DashScheduler
from urllib.parse import quote_plus

cred = credentials.Certificate("machine-learning-database.json")
app = firebase_admin.initialize_app(cred,{'storageBucket': 'machine-learning-databas-9d23e.appspot.com'})
firestore_client = firestore.client()

In [None]:
       
class QuestionSelect:
    def __init__(self, course_name, dash_path, concept_path):
        self.course_name = course_name
        self.dash_params = self.load_pickle(dash_path)
        self.concept_file = self.load_pickle(concept_path)
    
    def load_pickle(self, pickle_file_path):
        with open(pickle_file_path, 'rb') as file:
            data = pickle.load(file)
        return data
    
    def GetQuestions(self):
        questions_dict = {}
        course_list = []
        course = self.course_name

        ref = firestore_client.collection("Courses").document(course).collection("Lectures")
        for lec in ref.get():
            ref = firestore_client.collection("Courses").document(course).collection("Lectures").document(lec.id).collection("Moments")
            for moment in ref.get():
                ref = firestore_client.collection("Courses").document(course).collection("Lectures").document(lec.id).collection("Moments").document(moment.id).collection("Multiple Choice Questions")
                for q in ref.get():
                    questions_dict[q.id] = q.to_dict()
                    course_list.append(course)

        question_statements = []
        A_list = []
        B_list = []
        C_list = []
        D_list = []
        week_list = []
        correct_list = []
        moment_list = []
        question_topics = []
        for q in questions_dict.values():
            question_statements.append(q["Question"])
            A_list.append(q["A"])
            B_list.append(q["B"])
            C_list.append(q["C"])
            D_list.append(q["D"])
            correct_list.append(q["Correct answer"])
            moment_list.append(q["Moment"])
            question_topics.append(q["Topics"])

        quest_df = pd.DataFrame({
        "Question ID": questions_dict.keys(),
        "Question": question_statements,
        "A": A_list,
        "B": B_list,
        "C": C_list,
        "D": D_list,
        "Correct": correct_list,
        "Moment" : moment_list,
        "Topics" : question_topics})

        quest_df['Selected'] = False
        quest_df['Topics'] = quest_df['Topics'].apply(lambda topics: [topic.lower() for topic in topics])

        return quest_df

    def explode_list(self, df, column_to_explode):
        df = df.reset_index(drop=True)
        s = df[column_to_explode]
        i = np.arange(len(s)).repeat(s.str.len())
        return df.iloc[i].assign(**{column_to_explode: np.concatenate(s)})


    def get_dash_memory(self, dash_params,concepts, progress):
        if progress is None: # if progress is not provided...
            # then, provide a default value for progress
            progress = {'progress': [(5, 0, '08/08/2023'), (16, 1, '08/09/2023'), (5, 0, '08/10/2023'), (9, 0, '08/11/2023'), (4, 1, '08/12/2023'),
                             (0, 1, '08/13/2023'), (10, 1, '08/14/2023'), (11, 1, '08/15/2023'), (19, 0, '08/16/2023'), (10, 1, '08/17/2023'),
                             (18, 0, '08/18/2023'), (3, 1, '08/19/2023'), (1, 1, '08/20/2023'), (5, 1, '08/21/2023'), (5, 0, '08/22/2023'),
                             (0, 1, '08/23/2023'), (17, 1, '08/24/2023'), (5, 0, '08/25/2023'), (2, 0, '08/26/2023'), (16, 1, '08/27/2023'),
                             (1, 0, '08/28/2023'), (12, 0, '08/29/2023'), (20, 1, '08/30/2023'), (7, 0, '08/31/2023'), (14, 1, '09/01/2023'),
                             (20, 1, '09/02/2023'), (8, 1, '09/03/2023'), (8, 1, '09/04/2023'), (4, 0, '09/05/2023'), (7, 1, '09/06/2023')]
                        }
        scheduler = DashScheduler(concepts, dash_params)
        return scheduler.get_memory(progress['progress'])    

    def GetFrequency(self, df, student_uniqname):
        rec_prob={}
        
        #Creating concept name to concept id dictionary
        id2concept = {k: c for k, c in enumerate(self.concept_file.nodes())}    
        id2concept = {k: v.strip() for k, v in id2concept.items()}
        concept2id = {v.strip(): k for k, v in id2concept.items()}
        
        output = None
        if df is None or df.empty:
            # no df provided, so get_dash_memory uses its default progress
            output = self.get_dash_memory(self.dash_params,self.concept_file,None)
            
        else:
            # Only process df if it's not None,
            df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')
            df['correct/incorrect'] = (df['response'] == df['correct_answer']).astype(int)
            df = df[['topics_covered', 'correct/incorrect', 'timestamp']]

            #Format student history in suitable format for dash api
            df_formatted = self.explode_list(df, 'topics_covered')
            df_formatted.columns = ['concept', 'correct/incorrect', 'timestamp']
            df_formatted.reset_index(drop=True, inplace=True)
            df_formatted['concept'] = df_formatted['concept'].map(concept2id)
            df_formatted = df_formatted[df_formatted['concept'].apply(lambda x: str(x).isdigit())]
            df_formatted['concept'] = df_formatted['concept'].astype(int)
            tuple_list = [tuple(x) for x in df_formatted.values]
            student_hist = {'progress': tuple_list}

            # Get recall probabilities from the api
            output = self.get_dash_memory(self.dash_params,self.concept_file,student_hist)
            
            
#         #student history data from the database (code for connecting an online database must be added here)
#         df = df[df['student_id']==student_uniqname]          #filter by student
#         df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')
#         df['correct/incorrect'] = (df['response'] == df['correct_answer']).astype(int)
#         df = df[['topics_covered', 'correct/incorrect', 'timestamp']]
        
#         #Format student history in suitable format for dash api
#         df_formatted = self.explode_list(df, 'topics_covered')
#         df_formatted.columns = ['concept', 'correct/incorrect', 'timestamp']
#         df_formatted.reset_index(drop=True, inplace=True)
#         df_formatted['concept'] = df_formatted['concept'].map(concept2id)
#         df_formatted = df_formatted[df_formatted['concept'].apply(lambda x: str(x).isdigit())]
#         df_formatted['concept'] = df_formatted['concept'].astype(int)
#         tuple_list = [tuple(x) for x in df_formatted.values]
#         student_hist = {'progress': tuple_list}
        
#         #Get recall probabilities from the api
#         output = self.get_dash_memory(self.dash_params, self.concept_file, student_hist['progress'])


        for i in output:
            k = int(i[1])
            v = float(i[0])
            rec_prob[k] = v
            
        #Formatting the recall probabilites to frequency in the correct format
        recall_prob_dict = {id2concept.get(k, k): v for k, v in rec_prob.items()}
        recall_prob_dict = {k.strip(): v for k, v in recall_prob_dict.items()}
        recall_prob_dict = dict(sorted(recall_prob_dict.items(), key=lambda item: item[1]))
        first_25_dict = dict(itertools.islice(recall_prob_dict.items(), 25))
        frequencies = {k: round(1/(v**0.5)) for k,v in first_25_dict.items()}
        return frequencies

    def QuestionSelect(self, frequencies, quest_df, max_questions):
        # Start with all topic frequencies being the target ones
        unsatisfied_freqs = {k: v for k, v in frequencies.items() if v != 0}
        selected_questions = []

        # Greedy selection of questions
        while unsatisfied_freqs and len(selected_questions) < max_questions:

            # Calculate the score of each question by conditionally filtering unselected questions, 
            # and only if they cover a topic with unsatisfied frequency remaining
            question_scores = {q: sum(unsatisfied_freqs[topic] for topic in topics 
                                      if topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0)
                               for q, topics in quest_df[quest_df.Selected == False].set_index('Question').Topics.items() 
                               if any(topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0 for topic in topics)}

            # If no question can satisfy the remaining unsatisfied frequencies, then break the loop
            if not question_scores:
                print("No more questions can satisfy the remaining topic frequencies.")
                break

            # Select the question with the highest score
            selected_q = max(question_scores, key=question_scores.get)
            selected_questions.append(selected_q)

            # Update the 'Selected' flag for the chosen question
            quest_df.loc[quest_df.Question == selected_q, 'Selected'] = True

            # Update the unsatisfied frequencies
            for topic_list in quest_df.set_index('Question').loc[selected_q].Topics:  # Assume here each topic_list is a list
                for topic in topic_list:  # Iterate over the items in each topic_list
                    if topic in unsatisfied_freqs:
                        unsatisfied_freqs[topic] -= 1
                        if unsatisfied_freqs[topic] == 0:
                            unsatisfied_freqs.pop(topic)

        selected_questions_df = pd.DataFrame(selected_questions, columns=['Question'])
        final_df = selected_questions_df.merge(quest_df, on='Question', how='left')
        return final_df

# class QuestionSelect:
    
#     with open('dash_params.pk', 'rb') as file:
#         dash = pickle.load(file)
    
#     with open('SIADS_542_dep.pk', 'rb') as file:
#         concept_file = pickle.load(file)
        
#     def GetQuestions(course_name):
#         questions_dict = {}
#         course_list = []
#         course = course_name

#         ref = firestore_client.collection("Courses").document(course).collection("Lectures")
#         for lec in ref.get():
#             ref = firestore_client.collection("Courses").document(course).collection("Lectures").document(lec.id).collection("Moments")
#             for moment in ref.get():
#                 ref = firestore_client.collection("Courses").document(course).collection("Lectures").document(lec.id).collection("Moments").document(moment.id).collection("Multiple Choice Questions")
#                 for q in ref.get():
#                     questions_dict[q.id] = q.to_dict()
#                     course_list.append(course)

#         question_statements = []
#         A_list = []
#         B_list = []
#         C_list = []
#         D_list = []
#         week_list = []
#         correct_list = []
#         moment_list = []
#         question_topics = []
#         for q in questions_dict.values():
#             question_statements.append(q["Question"])
#             A_list.append(q["A"])
#             B_list.append(q["B"])
#             C_list.append(q["C"])
#             D_list.append(q["D"])
#             correct_list.append(q["Correct answer"])
#             moment_list.append(q["Moment"])
#             question_topics.append(q["Topics"])

#         quest_df = pd.DataFrame({
#         "Question ID": questions_dict.keys(),
#         "Question": question_statements,
#         "A": A_list,
#         "B": B_list,
#         "C": C_list,
#         "D": D_list,
#         "Correct": correct_list,
#         "Moment" : moment_list,
#         "Topics" : question_topics})

#         quest_df['Selected'] = False
#         quest_df['Topics'] = quest_df['Topics'].apply(lambda topics: [topic.lower() for topic in topics])

#         return quest_df

#     def explode_list(df, column_to_explode, fill_values):
#         df = df.reset_index(drop=True)
#         s = df[column_to_explode]
#         i = np.arange(len(s)).repeat(s.str.len())
#         return df.iloc[i].assign(**{column_to_explode: np.concatenate(s)})


#     def get_dash_memory(dash_params,concepts, progress):
#         scheduler = DashScheduler(concepts, dash_params)
#         return scheduler.get_memory(progress)    

#     def GetFrequency(df, student_uniqname, concept_dict):
#         rec_prob={}
        
#         #Creating concept name to concept id dictionary
#         id2concept = {k: c for k, c in enumerate(concept_file.nodes())}
#         concept2id = {v.strip(): k for k, v in id2concept.items()}
        
#         #student history data from the database (code for connecting an online database must be added here)
#         df = df[df['student_id']==student_uniqname]
#         df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')
#         df['correct/incorrect'] = (df['response'] == df['correct_answer']).astype(int)
#         df = df[['topics_covered', 'correct/incorrect', 'timestamp']]
        
#         #Format student history in suitable format for dash api
#         df_formatted = explode_list(df, 'topics_covered', {'correct/incorrect': 'incorrect', 'timestamp': df['timestamp']})
#         df_formatted.columns = ['concept', 'correct/incorrect', 'timestamp']
#         df_formatted.reset_index(drop=True, inplace=True)
#         df_formatted['concept'] = df_formatted['concept'].map(concept2id)
#         df_formatted = df_formatted[df_formatted['concept'].apply(lambda x: str(x).isdigit())]
#         df_formatted['concept'] = df_formatted['concept'].astype(int)
#         tuple_list = [tuple(x) for x in df_formatted.values]
#         student_hist = {'progress': tuple_list}
        
#         #Get recall probabilities from the api
#         output = get_dash_memory(dash, concept_file, student_hist['progress'])
#         for i in output:
#             k = int(i[1])
#             v = float(i[0])
#             rec_prob[k] = v
            
#         #Formatting the recall probabiites to frequency in the correct format
#         recall_prob_dict = {id2concept.get(k, k): v for k, v in rec_prob.items()}
#         recall_prob_dict = {k.strip(): v for k, v in recall_prob_dict.items()}
#         recall_prob_dict = dict(sorted(recall_prob_dict.items(), key=lambda item: item[1]))
#         first_25_dict = dict(itertools.islice(recall_prob_dict.items(), 25))
#         frequencies = {k: round(1/(v**0.5)) for k,v in recall_prob_dict.items()}
#         return frequencies

#     def QuestionSelect(frequencies, quest_df):
#         # Start with all topic frequencies being the target ones
#         unsatisfied_freqs = {k: v for k, v in frequencies.items() if v != 0}
#         selected_questions = []

#         # Greedy selection of questions
#         while unsatisfied_freqs:

#             # Calculate the score of each question by conditionally filtering unselected questions, and only if they cover a topic with unsatisfied frequency remaining
#             question_scores = {q: sum(unsatisfied_freqs[topic] for topic in topics if topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0)
#                               for q, topics in quest_df[quest_df.Selected == False].set_index('Question').Topics.items() if any(topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0 for topic in topics)}

#             # If no question can satisfy the remaining unsatisfied frequencies, then break the loop
#             if not question_scores:
#                 print("No more questions can satisfy the remaining topic frequencies.")
#                 break

#             # Select the question with the highest score
#             selected_q = max(question_scores, key=question_scores.get)
#             selected_questions.append(selected_q)

#             # Update the 'Selected' flag for the chosen question
#             quest_df.loc[quest_df.Question == selected_q, 'Selected'] = True

#             # Update the unsatisfied frequencies
#             for topic_list in quest_df.set_index('Question').loc[selected_q].Topics:  # Assume here each topic_list is a list
#                 for topic in topic_list:  # Iterate over the items in each topic_list
#                     if topic in unsatisfied_freqs:
#                         unsatisfied_freqs[topic] -= 1
#                         if unsatisfied_freqs[topic] == 0:
#                             unsatisfied_freqs.pop(topic)

#         selected_questions_df = pd.DataFrame(selected_questions, columns=['Question'])
#         final_df = selected_questions_df.merge(quest_df, on='Question', how='left')
#         return final_df


In [3]:
dash_path = 'dash_params.pk'
concept_path = 'SIADS_542_dep.pk'
selector = QuestionSelect('SIADS 542',concept_path, dash_path)

In [4]:
quest_df = selector.GetQuestions()
quest_df

Unnamed: 0,Question ID,Question,A,B,C,D,Correct,Moment,Topics,Selected
0,3KVQWnBwdP6v583JjDLr,What is the practical application of the machi...,Automated quality control in food companies,Screening for rotten oranges during processing,Estimating the mass of different fruits,Both A and B,D,Moment 1,"[supervised learning, classification, feature ...",False
1,5RDWHCXtn9kGlet8yOcU,What types of fruit were included in the origi...,"Oranges, lemons, and apples",Oranges and lemons,Apples and oranges,Lemons and apples,A,Moment 1,"[supervised learning, classification, feature ...",False
2,5bXvM0sWuWJkMePjS6fk,Where can the dataset be found?,In the folder of materials downloaded for the ...,In the University of Edinburgh's database,In a nearby store,In a fruit shipping company's database,A,Moment 1,"[supervised learning, classification, feature ...",False
3,7uG85sDYzmhOBpTCS0Bu,What additional features were added to the ori...,Color score,Fruit type,Fruit size,Fruit weight,A,Moment 1,"[feature engineering, supervised learning, cla...",False
4,9HduaghaahWg3Ap4sLL7,Who originally created the dataset used in the...,Dr. Iain Murray,The author of the text,A food company,A fruit shipping company,A,Moment 1,"[supervised learning, classification, feature ...",False
...,...,...,...,...,...,...,...,...,...,...
1105,ao8nhfDoW7dBCKT6CnaH,What does the y-axis represent in the scatter ...,The regression target,The future value,The data set samples,The informative input variable,A,Moment 2,"[regression, supervised learning, linear regre...",False
1106,gLR5Buv5fv1VyDgLvlDb,What is the purpose of low dimensional example...,To make the model more complex,To understand how the model's complexity chang...,To reduce the model's complexity,None of the above,B,Moment 2,"[learning curve, supervised learning, unsuperv...",False
1107,mogJEvUhtAZSqfa1xdDP,What does the x-axis represent in the scatter ...,The regression target,The future value,The data set samples,The informative input variable,B,Moment 2,"[regression, classification, feature engineeri...",False
1108,tBmxLQbGIITOpX0GcFLy,What is the purpose of randomly flipping the c...,To make the classifier more challenging,To make the data set more complex,To reduce the complexity of the classifier,None of the above,A,Moment 2,"[supervised learning, classification, feature ...",False


In [57]:
sample_df = quest_df.sample(7)

In [58]:
# topic_list = [["Supervised learning", "Classification", "Feature engineering", "Data Cleaning", "Decision trees"],
# ["Supervised learning", "Feature engineering", "Data Cleaning", "Classification", "Regression"],
# ["Supervised learning", "Classification", "Feature engineering", "Decision trees", "k-NN"],
# ["Supervised learning", "Classification", "Feature engineering", "Bias variance tradeoff", "Train test split"],
# ["Data Cleaning", "Feature engineering", "Anomaly detection", "Supervised learning", "Unsupervised learning"],
# ["Feature engineering", "Supervised learning", "Classification", "Data Cleaning", "Clustering"],
# ["Data Cleaning", "Feature engineering", "Supervised learning", "Classification", "Regression"],
# ["k-NN", "Supervised learning", "Classification", "Feature engineering", "Majority vote"],
# ["k-NN", "Classification", "Feature engineering", "Supervised learning", "Decision trees"],
# ["k-NN", "Supervised learning", "Classification", "Feature engineering", "Decision trees"],
# ["k-NN", "Supervised learning", "Classification", "Bias variance tradeoff", "Feature engineering"],
# ["Supervised learning", "Classification", "Regression", "Feature engineering", "Artificial neural networks"],
# ["Supervised learning", "Crowdsourcing", "Human-in-the-loop", "Active learning", "Learning curve"],
# ["Unsupervised learning", "Clustering", "Anomaly detection", "Feature learning", "Online learning"],
# ["Supervised learning", "Unsupervised learning", "Classification", "Feature engineering", "Evaluation method"],
# ["k-NN", "Classification", "Overfitting", "Supervised learning", "Bias variance tradeoff"],
# ["Regression", "Linear regression", "Bias variance tradeoff", "Supervised learning", "Overfitting"],
# ["Supervised learning", "Classification", "Regression", "Overfitting", "Generalization"],
# ["Classification", "Clustering", "Regression", "Feature engineering", "Supervised learning"],
# ["Linear regression", "Classification", "Supervised learning", "Feature engineering", "Scatter plot"],
# ["Supervised learning", "Feature engineering", "dimension reduction", "Artificial neural networks", "Bias variance tradeoff"]]

In [59]:
# subset_df = quest_df.copy(True)
# subset_df['Topics_sorted'] = subset_df['Topics'].apply(lambda x: sorted(x))
# topic_list_sorted = [sorted(sublist) for sublist in topic_list]

# # Then we convert the sorted topics back into a single string
# subset_df['Topics_str'] = subset_df['Topics_sorted'].astype(str)
# topic_list_strs = [str(sublist) for sublist in topic_list_sorted]

# # Then we filter the DataFrame to only include rows with a 'Topics_str' in our topic_list_strs
# subset_df = subset_df[subset_df['Topics_str'].isin(topic_list_strs)]

# # And we remove the columns we created for this operation 
# subset_df = subset_df.drop(['Topics_sorted', 'Topics_str'], axis=1)

In [60]:
# subset_df['Selected'] = False
# subset_df['Topics'] = subset_df['Topics'].apply(lambda topics: [topic.lower() for topic in topics])
# subset_df

In [61]:
# quest_df.Topics.explode().unique()

In [62]:
# def get_dash_memory(current_app, concepts, progress):
#     scheduler = DashScheduler(concepts, current_app.dash_params)
#     return scheduler.get_memory(progress)

In [63]:
# import requests
# import json

# url = 'https://comphcithree.eecs.umich.edu:8100/concepts/siads542' #your url here

# # Sends a HTTP request to the specified URL and save 
# # the response from server in a response object called r
# r = requests.get(url)

# # Create a dictionary from JSON file
# concept_dict = r.json()

# # print the dictionary
# concept_dict

In [64]:
# type(app)

In [65]:

# def get_concept_tree(current_app, course_name):
#     concepts = current_app.concepts[course_name]
#     concept_dict = {"concept2id":
#                     {concept: idx for concept, idx in enumerate(concepts.nodes())},
#                     "edges": list(concepts.edges())}
#     return concept_dict

# get_concept_tree(app,"SIADS 542")

In [66]:
def random_date(start, end):
    """Generate a random datetime between `start` and `end`."""
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = random.randrange(int_delta)
    return start + timedelta(seconds=random_second)

In [67]:
from datetime import datetime, timedelta

start_date = datetime(2023, 8, 29)
end_date = datetime(2023, 10, 15)

In [68]:
def transform_row(row):
    return {
        "_id": row["Question ID"],  # Assuming Question ID is unique and can be used as _id
        "question_id": row["Question ID"],
        "question": row["Question"],
        "response": random.choice(["A", "B", "C", "D"]),
        "correct_answer": row["Correct"],
        "student_id": 'ABCDE',
        "timestamp": random_date(start_date, end_date),  # Random timestamp
        "topics_covered": row["Topics"],  # Assuming topics are separated by semicolons
        # Add or modify fields as necessary to match your MongoDB schema
    }

transformed_data = [transform_row(row) for index, row in sample_df.iterrows()]
transformed_data

[{'_id': 'vcLXNsUiJSycyILeGxjl',
  'question_id': 'vcLXNsUiJSycyILeGxjl',
  'question': 'What is the strategy of a dummy regressor?',
  'response': 'A',
  'correct_answer': 'D',
  'student_id': 'ABCDE',
  'timestamp': datetime.datetime(2023, 9, 8, 8, 44, 20),
  'topics_covered': ['regression',
   'supervised learning',
   'linear regression',
   'bias variance tradeoff',
   'statistical learning']},
 {'_id': 'yhX8I5hMcHhbRRRgy6v7',
  'question_id': 'yhX8I5hMcHhbRRRgy6v7',
  'question': 'What is the purpose of training and testing on the same data set?',
  'response': 'A',
  'correct_answer': 'B',
  'student_id': 'ABCDE',
  'timestamp': datetime.datetime(2023, 9, 9, 9, 42, 54),
  'topics_covered': ['supervised learning',
   'overfitting',
   'feature engineering',
   'model selection',
   'evaluation metrics',
   'training and testing',
   'cross-validation',
   'grid search',
   'accuracy']},
 {'_id': 'S1gJrP9CdXFRekbMZGcx',
  'question_id': 'S1gJrP9CdXFRekbMZGcx',
  'question': 'What 

In [71]:


uri = 'mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.0.1'
client = pymongo.MongoClient(uri)
db = client["student_histDB"]

collection = db["questions"]
documents = collection.find()
# for doc in documents:
#     print(doc)
doc_count = collection.count_documents({})    
print(doc_count)

df = pd.DataFrame(list(documents))
df

7


Unnamed: 0,_id,question_id,question,response,correct_answer,student_id,timestamp,topics_covered
0,vcLXNsUiJSycyILeGxjl,vcLXNsUiJSycyILeGxjl,What is the strategy of a dummy regressor?,A,D,ABCDE,2023-09-08 08:44:20,"[regression, supervised learning, linear regre..."
1,yhX8I5hMcHhbRRRgy6v7,yhX8I5hMcHhbRRRgy6v7,What is the purpose of training and testing on...,A,B,ABCDE,2023-09-09 09:42:54,"[supervised learning, overfitting, feature eng..."
2,S1gJrP9CdXFRekbMZGcx,S1gJrP9CdXFRekbMZGcx,What is the difference between ridge regressio...,D,B,ABCDE,2023-09-21 15:40:28,"[linear regression, regression, supervised lea..."
3,2pxk5NtR5sOV9uTsHqL4,2pxk5NtR5sOV9uTsHqL4,What is the difference between K nearest neigh...,B,C,ABCDE,2023-10-12 21:20:39,"[k-nn, linear regression, supervised learning,..."
4,4EDUpqKPGThAVcJ10Uvc,4EDUpqKPGThAVcJ10Uvc,What does a beta larger than one in the F-scor...,D,B,ABCDE,2023-09-11 15:38:33,"[supervised learning, classification, bias var..."
5,Mfxl89wLhpbtbDXprjLs,Mfxl89wLhpbtbDXprjLs,Does Mean Squared Error distinguish between ov...,A,B,ABCDE,2023-10-05 06:16:45,"[regression, linear regression, supervised lea..."
6,qgACHvxcUdxV1FGHDVRD,qgACHvxcUdxV1FGHDVRD,What does the string 'precision_micro' represe...,C,A,ABCDE,2023-09-04 01:08:41,"[supervised learning, classification, feature ..."


In [70]:
# Delete existing questions (be cautious with this approach)
collection.delete_many({})

# Insert new questions
collection.insert_many(transformed_data)


<pymongo.results.InsertManyResult at 0x7f893981c760>

In [72]:
collection = db["studenthist"]
documents = collection.find()
# for doc in documents:
#     print(doc)
doc_count = collection.count_documents({})    
print(doc_count)

df = pd.DataFrame(list(documents))
df

10


Unnamed: 0,_id,question_id,student_id,timestamp,question,response,correct_answer,topics_covered
0,6585277dcb09d40dbd65ef40,3DROdxl5vsYX75KG8Nh1,ABCDE,2023-12-22 01:06:53.195,What is an example of multi-label classification?,A,C,"[supervised learning, classification, regressi..."
1,6585277dcb09d40dbd65ef41,88VMmjviJzZ9Ns6RluNm,ABCDE,2023-12-22 01:06:53.237,What is the target value in a classification p...,C,B,"[supervised learning, classification, regressi..."
2,6585277dcb09d40dbd65ef42,XpUuyTKzrepqaDyAy8MO,ABCDE,2023-12-22 01:06:53.238,What does the term 'false negative' mean?,B,B,"[classification, supervised learning, bias var..."
3,6585277dcb09d40dbd65ef43,aBbyvV7gbu1l3yLOOCSb,ABCDE,2023-12-22 01:06:53.238,What does the term 'true positive' mean?,C,C,"[classification, supervised learning, artifici..."
4,6585277dcb09d40dbd65ef44,rR48xocrkcgI7IoLedH3,ABCDE,2023-12-22 01:06:53.239,What is the first parameter of the cross_val_s...,D,B,"[supervised learning, online learning, batch l..."
5,6585277dcb09d40dbd65ef45,FNZ11XAkzDkE9YM53RZS,ABCDE,2023-12-22 01:06:53.239,What is the goal of doing a real-world deploym...,A,A,"[supervised learning, data cleaning, feature e..."
6,6585277dcb09d40dbd65ef46,w4F7oVjQADat4DCvxuqL,ABCDE,2023-12-22 01:06:53.239,What is the problem in classification?,A,A,"[classification, supervised learning, k-nn, ov..."
7,6585277dcb09d40dbd65ef47,NPglbiW5dlDwY9AdQ4n2,ABCDE,2023-12-22 01:06:53.240,What is an example of a binary classification ...,B,B,"[supervised learning, classification, regressi..."
8,6585277dcb09d40dbd65ef48,5cdduX5WQqaDMTLdlIe9,ABCDE,2023-12-22 01:06:53.240,What does a higher value of C in logistic regr...,A,B,"[logistic regression, supervised learning, cla..."
9,6585277dcb09d40dbd65ef49,KGm8Zqtw04Y6rdnGwAc6,ABCDE,2023-12-22 01:06:53.241,What is the probability estimate y hat used for?,D,A,"[logistic regression, supervised learning, cla..."


In [77]:
collection = db["questions"]
documents = collection.find()
# for doc in documents:
#     print(doc)
doc_count = collection.count_documents({})    
print(doc_count)

df = pd.DataFrame(list(documents))
df

7


Unnamed: 0,_id,question_id,question,response,correct_answer,student_id,timestamp,topics_covered
0,vcLXNsUiJSycyILeGxjl,vcLXNsUiJSycyILeGxjl,What is the strategy of a dummy regressor?,A,D,ABCDE,2023-09-08 08:44:20,"[regression, supervised learning, linear regre..."
1,yhX8I5hMcHhbRRRgy6v7,yhX8I5hMcHhbRRRgy6v7,What is the purpose of training and testing on...,A,B,ABCDE,2023-09-09 09:42:54,"[supervised learning, overfitting, feature eng..."
2,S1gJrP9CdXFRekbMZGcx,S1gJrP9CdXFRekbMZGcx,What is the difference between ridge regressio...,D,B,ABCDE,2023-09-21 15:40:28,"[linear regression, regression, supervised lea..."
3,2pxk5NtR5sOV9uTsHqL4,2pxk5NtR5sOV9uTsHqL4,What is the difference between K nearest neigh...,B,C,ABCDE,2023-10-12 21:20:39,"[k-nn, linear regression, supervised learning,..."
4,4EDUpqKPGThAVcJ10Uvc,4EDUpqKPGThAVcJ10Uvc,What does a beta larger than one in the F-scor...,D,B,ABCDE,2023-09-11 15:38:33,"[supervised learning, classification, bias var..."
5,Mfxl89wLhpbtbDXprjLs,Mfxl89wLhpbtbDXprjLs,Does Mean Squared Error distinguish between ov...,A,B,ABCDE,2023-10-05 06:16:45,"[regression, linear regression, supervised lea..."
6,qgACHvxcUdxV1FGHDVRD,qgACHvxcUdxV1FGHDVRD,What does the string 'precision_micro' represe...,C,A,ABCDE,2023-09-04 01:08:41,"[supervised learning, classification, feature ..."


In [74]:
# df['student_id'] = 'ABCDE'
# # df['topics_covered'] = 'Supervised learning'
# # df['topics_covered'] = df['topics_covered'].apply(lambda x: [x])
# def random_abcd():
#     return random.choice(['A', 'B', 'C', 'D'])

# df['response'] = df.apply(lambda x: random_abcd(), axis=1)
# df['correct_answer'] = df.apply(lambda x: random_abcd(), axis=1)
# df

In [75]:
db.studenthist.drop()

In [76]:
newtab = db['studenthist']
records = df.to_dict('records')
newtab.insert_many(records)
# documents = newtab.find()
# for doc in documents:
#     print(doc)

<pymongo.results.InsertManyResult at 0x7f890d3b4d00>

In [None]:
# def test_dash_api(progress):
    
#     url = 'https://comphcithree.eecs.umich.edu:8100/dash/siads542'
#     headers = {'Content-type': 'application/json'}
#     resp = requests.post(url, json=progress, headers=headers)
#     return resp.json()

# def explode_list(df, column_to_explode, fill_values):
#         df = df.reset_index(drop=True)
#         s = df[column_to_explode]
#         i = np.arange(len(s)).repeat(s.str.len())
#         return df.iloc[i].assign(**{column_to_explode: np.concatenate(s)})

# SCENARIO 1: no student history

In [48]:
student_ids = [''.join(choice(ascii_uppercase) for _ in range(5)) for _ in range(10)]
topics = subset_df['Topics'].explode().unique().tolist()

In [None]:
def test_students_1():    

    start_date = pd.to_datetime('2023-08-29')
    end_date = pd.to_datetime('2023-10-10')

    feedback = []

    # Loop through each student
    for student_id in student_ids:
      # Loop through each question for current student
      for question_choice in quest_df['Question ID']:

          question_info = quest_df.loc[quest_df['Question ID'] == question_choice]

          question, correct_answer, topic = question_info['Question'].values[0], question_info['Correct'].values[0], question_info['Topics'].values[0]

          all_answers = list("ABCD")

          if correct_answer in all_answers:
              all_answers.remove(correct_answer)

          if np.random.rand() < 0.5:
              response = correct_answer
          else: 
              response = np.random.choice(all_answers)

          # randomly generate a date between the start and end dates
          timestamp =  start_date + pd.Timedelta(days = int(np.random.randint(0, (end_date - start_date).days+1))) 

          feedback.append((question_choice, 
                           student_id, 
                           timestamp, # use the randomly generated timestamp here
                           question,
                           response,
                           correct_answer,
                           topic))

    df_feedback = pd.DataFrame(feedback, columns=['question_id', 
                                                 'student_id', 
                                                 'timestamp', 
                                                 'question', 
                                                 'response',
                                                 'correct_answer',
                                                 'topics_covered'])
    
    df_feedback['correct/incorrect'] = (df_feedback['response'] == df_feedback['correct_answer']).astype(int)
    print(df_feedback.student_id.unique())
    
    for i in df_feedback['student_id'].unique():
        df = df_feedback[df_feedback['student_id']==i]

        df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')



        df_new = df[['topics_covered', 'correct/incorrect', 'timestamp']]

        df_new = explode_list(df_new, 'topics_covered', {'correct/incorrect': 'incorrect', 'timestamp': df['timestamp']})
        df_new.columns = ['concept', 'correct/incorrect', 'timestamp']
        df_new.reset_index(drop=True, inplace=True)
        df_new['concept'] = df_new['concept'].str.lower()

        concept2id = concept_dict['siads542']['concept_tree']['concept2id']
        id2concept = {v: k for k, v in concept2id.items()}

        df_new['concept'] = df_new['concept'].replace(id2concept)
        df_new = df_new[df_new['concept'].apply(lambda x: str(x).isdigit())]
        df_new['concept'] = df_new['concept'].astype(int)

        output_dict = df_new.apply(tuple, axis=1).tolist()

        student_hist = {'progress': output_dict}

        output = test_dash_api(student_hist)
        output = dict(sorted(output.items(), key=lambda item: item[0]))
        output['concept'] = sorted(output['concept'], key=lambda x: x[0])

        filtered_courses = {}
    #     filtered_courses['concept'] = output['concept'][:25]

        freq_dict = {int(pair[1]): pair[0] for pair in output['concept']}

        id_to_name = concept_dict['siads542']['concept_tree']['concept2id']
        frequencies = {id_to_name[str(k)]: v for k,v in freq_dict.items()}
        
        print("\n" + i + "\n")
        print("\n")
        print(frequencies)

In [None]:
test_students_1()

# SCENARIO 2: students who have already have 1 topic strong from week 1 

In [None]:
student_ids = [''.join(choice(ascii_uppercase) for _ in range(5)) for _ in range(10)]
topics = subset_df['Topics'].explode().unique().tolist() # Assumed 'Topics' as your column name
st_strengths = {st_id: np.random.choice(topics, size=1, replace=False).tolist() for st_id in student_ids}

In [None]:
def test_students_2():    

    start_date = pd.to_datetime('2023-08-29')
    end_date = pd.to_datetime('2023-10-10')


    feedback = []

    # Loop through each student
    for student_id in student_ids:
      # Loop through each question for current student
      for question_choice in quest_df['Question ID']:

        question_info = quest_df.loc[quest_df['Question ID'] == question_choice]

        question, correct_answer, question_topics = question_info['Question'].values[0], question_info['Correct'].values[0], question_info['Topics'].values[0]

        all_answers = list("ABCD")

        if correct_answer in all_answers:
            all_answers.remove(correct_answer)

        # Check if question covers one of the topics the student is strong at
        if any(topic in question_topics for topic in st_strengths[student_id]):
            response = correct_answer
        else:
            response = np.random.choice(all_answers) # Student answers incorrectly in their weak topics

        # randomly generate a date between the start and end dates
        timestamp =  start_date + pd.Timedelta(days = int(np.random.randint(0, (end_date - start_date).days+1)))

        feedback.append((question_choice, 
                         student_id, 
                         timestamp,
                         question,
                         response,
                         correct_answer,
                         question_topics))

    df_feedback = pd.DataFrame(feedback, columns=['question_id', 
                                                  'student_id', 
                                                  'timestamp', 
                                                  'question', 
                                                  'response',
                                                  'correct_answer',
                                                  'topics_covered'])
    
    df_feedback['correct/incorrect'] = (df_feedback['response'] == df_feedback['correct_answer']).astype(int)
    print(df_feedback.student_id.unique())
    
    for i in df_feedback['student_id'].unique():
        df = df_feedback[df_feedback['student_id']==i]

        df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')



        df_new = df[['topics_covered', 'correct/incorrect', 'timestamp']]

        df_new = explode_list(df_new, 'topics_covered', {'correct/incorrect': 'incorrect', 'timestamp': df['timestamp']})
        df_new.columns = ['concept', 'correct/incorrect', 'timestamp']
        df_new.reset_index(drop=True, inplace=True)
        df_new['concept'] = df_new['concept'].str.lower()

        concept2id = concept_dict['siads542']['concept_tree']['concept2id']
        id2concept = {v: k for k, v in concept2id.items()}

        df_new['concept'] = df_new['concept'].replace(id2concept)
        df_new = df_new[df_new['concept'].apply(lambda x: str(x).isdigit())]
        df_new['concept'] = df_new['concept'].astype(int)

        output_dict = df_new.apply(tuple, axis=1).tolist()

        student_hist = {'progress': output_dict}

        output = test_dash_api(student_hist)
        output = dict(sorted(output.items(), key=lambda item: item[0]))
        output['concept'] = sorted(output['concept'], key=lambda x: x[0])

        filtered_courses = {}
    #     filtered_courses['concept'] = output['concept'][:25]

        freq_dict = {int(pair[1]): pair[0] for pair in output['concept']}

        id_to_name = concept_dict['siads542']['concept_tree']['concept2id']
        frequencies = {id_to_name[str(k)]: v for k,v in freq_dict.items()}
        
        print("\n" + i + "\n")
        print(st_strengths[i])
        print("\n")
        print(frequencies)

In [None]:
test_students_2()

# SCENARIO 3: really advanced in topics from week 1

In [None]:
student_ids = [''.join(choice(ascii_uppercase) for _ in range(5)) for _ in range(10)]
topics = subset_df['Topics'].explode().unique().tolist() # Assumed 'Topics' as your column name
st_strengths = {st_id: np.random.choice(topics, size=5, replace=False).tolist() for st_id in student_ids}

In [None]:



def test_students_3():    

    start_date = pd.to_datetime('2023-08-29')
    end_date = pd.to_datetime('2023-10-10')


    feedback = []

    # Loop through each student
    for student_id in student_ids:
      # Loop through each question for current student
      for question_choice in quest_df['Question ID']:

        question_info = quest_df.loc[quest_df['Question ID'] == question_choice]

        question, correct_answer, question_topics = question_info['Question'].values[0], question_info['Correct'].values[0], question_info['Topics'].values[0]

        all_answers = list("ABCD")

        if correct_answer in all_answers:
            all_answers.remove(correct_answer)

        # Check if question covers one of the topics the student is strong at
        if any(topic in question_topics for topic in st_strengths[student_id]):
            response = correct_answer
        else:
            response = np.random.choice(all_answers) # Student answers incorrectly in their weak topics

        # randomly generate a date between the start and end dates
        timestamp =  start_date + pd.Timedelta(days = int(np.random.randint(0, (end_date - start_date).days+1)))

        feedback.append((question_choice, 
                         student_id, 
                         timestamp,
                         question,
                         response,
                         correct_answer,
                         question_topics))

    df_feedback = pd.DataFrame(feedback, columns=['question_id', 
                                                  'student_id', 
                                                  'timestamp', 
                                                  'question', 
                                                  'response',
                                                  'correct_answer',
                                                  'topics_covered'])
    
    df_feedback['correct/incorrect'] = (df_feedback['response'] == df_feedback['correct_answer']).astype(int)
    print(df_feedback.student_id.unique())
    
    for i in df_feedback['student_id'].unique():
        df = df_feedback[df_feedback['student_id']==i]

        df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')



        df_new = df[['topics_covered', 'correct/incorrect', 'timestamp']]

        df_new = explode_list(df_new, 'topics_covered', {'correct/incorrect': 'incorrect', 'timestamp': df['timestamp']})
        df_new.columns = ['concept', 'correct/incorrect', 'timestamp']
        df_new.reset_index(drop=True, inplace=True)
        df_new['concept'] = df_new['concept'].str.lower()

        concept2id = concept_dict['siads542']['concept_tree']['concept2id']
        id2concept = {v: k for k, v in concept2id.items()}

        df_new['concept'] = df_new['concept'].replace(id2concept)
        df_new = df_new[df_new['concept'].apply(lambda x: str(x).isdigit())]
        df_new['concept'] = df_new['concept'].astype(int)

        output_dict = df_new.apply(tuple, axis=1).tolist()

        student_hist = {'progress': output_dict}

        output = test_dash_api(student_hist)
        output = dict(sorted(output.items(), key=lambda item: item[0]))
        output['concept'] = sorted(output['concept'], key=lambda x: x[0])

        filtered_courses = {}
    #     filtered_courses['concept'] = output['concept'][:25]

        freq_dict = {int(pair[1]): pair[0] for pair in output['concept']}

        id_to_name = concept_dict['siads542']['concept_tree']['concept2id']
        frequencies = {id_to_name[str(k)]: v for k,v in freq_dict.items()}
        
        print("\n" + i + "\n")
        print(st_strengths[i])
        print("\n")
        print(frequencies)
#         for j in st_strengths[i]:
#             print(frequencies[j])







In [None]:
test_students_3()

In [12]:
student_ids = [''.join(choice(ascii_uppercase) for _ in range(5)) for _ in range(10)]
topics = quest_df['Topics'].explode().unique().tolist() 
st_strengths = {st_id: np.random.choice(topics, size=5, replace=False).tolist() for st_id in student_ids}

In [13]:
start_date = pd.to_datetime('2023-08-29')
end_date = pd.to_datetime('2023-10-10')


feedback = []

# Loop through each student
for student_id in student_ids:
  # Loop through each question for current student
  for question_choice in quest_df['Question ID']:

    question_info = quest_df.loc[quest_df['Question ID'] == question_choice]

    question, correct_answer, question_topics = question_info['Question'].values[0], question_info['Correct'].values[0], question_info['Topics'].values[0]

    all_answers = list("ABCD")

    if correct_answer in all_answers:
        all_answers.remove(correct_answer)

    if any(topic in question_topics for topic in st_strengths[student_id]):
        response = correct_answer
    else:
        response = np.random.choice(all_answers) # Student answers incorrectly in their weak topics

    timestamp =  start_date + pd.Timedelta(days = int(np.random.randint(0, (end_date - start_date).days+1)))

    feedback.append((question_choice, 
                     student_id, 
                     timestamp,
                     question,
                     response,
                     correct_answer,
                     question_topics))

df_feedback = pd.DataFrame(feedback, columns=['question_id', 
                                              'student_id', 
                                              'timestamp', 
                                              'question', 
                                              'response',
                                              'correct_answer',
                                              'topics_covered'])

df_feedback['correct/incorrect'] = (df_feedback['response'] == df_feedback['correct_answer']).astype(int)


In [14]:
df_feedback.student_id.unique()

array(['ADCVO', 'JJERJ', 'HJAQN', 'HUVBL', 'HVDIJ', 'EEILT', 'ZXTPC',
       'WKTGS', 'WIRAX', 'SSCRZ'], dtype=object)

In [15]:
st_strengths

{'ADCVO': ['binary classifier',
  'training data',
  'refinement',
  'linear regression',
  'regularization'],
 'JJERJ': ['feature extraction',
  'feature representation',
  'machine learning task',
  'precision-recall curve',
  'support vector'],
 'HJAQN': ['false negative predictions',
  'iterative process',
  'user community',
  'dummy regressors',
  'data manipulation'],
 'HUVBL': ['evaluation',
  'batch learning',
  'false negative predictions',
  'activation functions',
  'activepython'],
 'HVDIJ': ['multi-agent',
  'objective function',
  'imbalanced classes',
  'clustering',
  'feature pre-processing'],
 'EEILT': ['predict',
  'generalization',
  'kernel machines',
  'clustering',
  'stratified k-fold cross validation'],
 'ZXTPC': ['online documentation',
  'activation functions',
  'user community',
  'statistical distributions',
  'active learning'],
 'WKTGS': ['density estimation',
  'deep learning',
  'false positives',
  'multi-agent',
  'confusion matrix'],
 'WIRAX': ['mo

In [16]:
with open('dash_params.pk', 'rb') as file:
    dash = pickle.load(file)
    
with open('SIADS_542_dep.pk', 'rb') as file:
    concept_file = pickle.load(file)

In [17]:
def explode_list(df, column_to_explode, fill_values):
    df = df.reset_index(drop=True)
    s = df[column_to_explode]
    i = np.arange(len(s)).repeat(s.str.len())
    return df.iloc[i].assign(**{column_to_explode: np.concatenate(s)})


def get_dash_memory(dash_params,concepts, progress):
    scheduler = DashScheduler(concepts, dash_params)
    return scheduler.get_memory(progress)    

def get_freq(df, student_uniqname, concept_dict):
    rec_prob={}
    id2concept = {k: c for k, c in enumerate(concept_file.nodes())}
    id2concept = {k: v.strip() for k, v in id2concept.items()}
    concept2id = {v.strip(): k for k, v in id2concept.items()}
    df = df[df['student_id']==student_uniqname]
    df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')
    df['correct/incorrect'] = (df['response'] == df['correct_answer']).astype(int)
    df = df[['topics_covered', 'correct/incorrect', 'timestamp']]
    
    df_formatted = explode_list(df, 'topics_covered', {'correct/incorrect': 'incorrect', 'timestamp': df['timestamp']})
    df_formatted.columns = ['concept', 'correct/incorrect', 'timestamp']
    df_formatted.reset_index(drop=True, inplace=True)
    df_formatted['concept'] = df_formatted['concept'].map(concept2id)
    df_formatted = df_formatted[df_formatted['concept'].apply(lambda x: str(x).isdigit())]
    df_formatted['concept'] = df_formatted['concept'].astype(int)
    tuple_list = [tuple(x) for x in df_formatted.values]
    student_hist = {'progress': tuple_list}
    
    output = get_dash_memory(dash, concept_file, student_hist['progress'])
    for i in output:
        k = int(i[1])
        v = float(i[0])
        rec_prob[k] = v
    recall_prob_dict = {id2concept.get(k, k): v for k, v in rec_prob.items()}
    recall_prob_dict = {k.strip(): v for k, v in recall_prob_dict.items()}
    recall_prob_dict = dict(sorted(recall_prob_dict.items(), key=lambda item: item[1]))
    first_25_dict = dict(itertools.islice(recall_prob_dict.items(), 25))
    frequencies = {k: round(1/(v**0.5)) for k,v in recall_prob_dict.items()}

    return frequencies


    


In [18]:
id2concept = {k: c for k, c in enumerate(concept_file.nodes())}
id2concept = {k: v.strip() for k, v in id2concept.items()}
id2concept

{0: 'classification',
 1: 'k-nn',
 2: 'online learning',
 3: 'pca',
 4: 'structured prediction',
 5: 'feature engineering',
 6: 'data cleaning',
 7: 'data cleaning',
 8: 'bias variance tradeoff',
 9: 'active learning',
 10: 'machine learning',
 11: 'automl',
 12: 'supervised learning',
 13: 'feature learning',
 14: 'learning curve',
 15: 'feature engineering',
 16: 'decision trees',
 17: 'artificial neural networks',
 18: 'batch learning',
 19: 'supervised learning',
 20: 'decision threshold',
 21: 'logistic regression',
 22: 'threshold',
 23: 'statistical learning',
 24: 'empirical risk minimization',
 25: 'classification',
 26: 'evaluation metrics',
 27: 'precision and recall',
 28: 'logistic regression',
 29: 'support vector machine (svm)',
 30: 'f-score',
 31: 'customer facing prediction problems',
 32: 'precision',
 33: 'false positives',
 34: 'machine learning evaluation',
 35: 'computational learning theory',
 36: 'evaluation metric',
 37: 'user experience',
 38: 'precision',
 3

In [19]:
{'progress': [(5, 0, '08/08/2023'), (16, 1, '08/09/2023'), (5, 0, '08/10/2023'), (9, 0, '08/11/2023'), (4, 1, '08/12/2023'),
                             (0, 1, '08/13/2023'), (10, 1, '08/14/2023'), (11, 1, '08/15/2023'), (19, 0, '08/16/2023'), (10, 1, '08/17/2023'),
                             (18, 0, '08/18/2023'), (3, 1, '08/19/2023'), (1, 1, '08/20/2023'), (5, 1, '08/21/2023'), (5, 0, '08/22/2023'),
                             (0, 1, '08/23/2023'), (17, 1, '08/24/2023'), (5, 0, '08/25/2023'), (2, 0, '08/26/2023'), (16, 1, '08/27/2023'),
                             (1, 0, '08/28/2023'), (12, 0, '08/29/2023'), (20, 1, '08/30/2023'), (7, 0, '08/31/2023'), (14, 1, '09/01/2023'),
                             (20, 1, '09/02/2023'), (8, 1, '09/03/2023'), (8, 1, '09/04/2023'), (4, 0, '09/05/2023'), (7, 1, '09/06/2023')]
                        }

{'progress': [(5, 0, '08/08/2023'),
  (16, 1, '08/09/2023'),
  (5, 0, '08/10/2023'),
  (9, 0, '08/11/2023'),
  (4, 1, '08/12/2023'),
  (0, 1, '08/13/2023'),
  (10, 1, '08/14/2023'),
  (11, 1, '08/15/2023'),
  (19, 0, '08/16/2023'),
  (10, 1, '08/17/2023'),
  (18, 0, '08/18/2023'),
  (3, 1, '08/19/2023'),
  (1, 1, '08/20/2023'),
  (5, 1, '08/21/2023'),
  (5, 0, '08/22/2023'),
  (0, 1, '08/23/2023'),
  (17, 1, '08/24/2023'),
  (5, 0, '08/25/2023'),
  (2, 0, '08/26/2023'),
  (16, 1, '08/27/2023'),
  (1, 0, '08/28/2023'),
  (12, 0, '08/29/2023'),
  (20, 1, '08/30/2023'),
  (7, 0, '08/31/2023'),
  (14, 1, '09/01/2023'),
  (20, 1, '09/02/2023'),
  (8, 1, '09/03/2023'),
  (8, 1, '09/04/2023'),
  (4, 0, '09/05/2023'),
  (7, 1, '09/06/2023')]}

In [51]:
from itertools import islice
first_20_values = list(islice(id2concept.values(), 20))
first_20_values

#code to get the default progress topics

['classification',
 'k-nn',
 'online learning',
 'pca',
 'structured prediction',
 'feature engineering',
 'data cleaning',
 'data cleaning',
 'bias variance tradeoff',
 'active learning',
 'machine learning',
 'automl',
 'supervised learning',
 'feature learning',
 'learning curve',
 'feature engineering',
 'decision trees',
 'artificial neural networks',
 'batch learning',
 'supervised learning']

In [50]:
frequencies = get_freq(df_feedback, 'WKTGS', concept_file)
frequencies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['correct/incorrect'] = (df['response'] == df['correct_answer']).astype(int)


{'feature engineering': 7,
 'bias variance tradeoff': 7,
 'supervised learning': 7,
 'ensembles bagging boosting random forest': 7,
 'linear regression': 7,
 'overfitting': 7,
 'online learning': 7,
 'batch learning': 7,
 'active learning': 6,
 'decision trees': 6,
 'statistical learning': 6,
 'data cleaning': 6,
 'unsupervised learning': 6,
 'naive bayes': 6,
 'precision and recall': 6,
 'k-nn': 6,
 'evaluation metrics': 6,
 'model selection': 6,
 'scikit-learn': 6,
 'logistic regression': 6,
 'kernel machines': 6,
 'classification': 6,
 'learning curve': 6,
 'regression': 6,
 'support vector machine (svm)': 6,
 'structured prediction': 6,
 'model complexity': 6,
 'anomaly detection': 6,
 'decision threshold': 6,
 'clustering': 6,
 'pca': 6,
 'cross validation': 6,
 'semi-supervised learning': 6,
 'human-in-the-loop': 6,
 'regularization': 6,
 'crowdsourcing': 6,
 'empirical risk minimization': 5,
 'cross-validation': 5,
 'roc curve': 5,
 'dimension reduction': 5,
 'accuracy': 5,
 'ev

In [None]:
def question_selection(frequencies, quest_df):
     # Start with all topic frequencies being the target ones
    unsatisfied_freqs = {k: v for k, v in frequencies.items() if v != 0}
    selected_questions = []

    # Greedy selection of questions
    while unsatisfied_freqs:

        # Calculate the score of each question by conditionally filtering unselected questions, and only if they cover a topic with unsatisfied frequency remaining
        question_scores = {q: sum(unsatisfied_freqs[topic] for topic in topics if topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0)
                          for q, topics in quest_df[quest_df.Selected == False].set_index('Question').Topics.items() if any(topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0 for topic in topics)}

        # If no question can satisfy the remaining unsatisfied frequencies, then break the loop
        if not question_scores:
            print("No more questions can satisfy the remaining topic frequencies.")
            break

        # Select the question with the highest score
        selected_q = max(question_scores, key=question_scores.get)
        selected_questions.append(selected_q)

        # Update the 'Selected' flag for the chosen question
        quest_df.loc[quest_df.Question == selected_q, 'Selected'] = True

        # Update the unsatisfied frequencies
        for topic_list in quest_df.set_index('Question').loc[selected_q].Topics:  # Assume here each topic_list is a list
            for topic in topic_list:  # Iterate over the items in each topic_list
                if topic in unsatisfied_freqs:
                    unsatisfied_freqs[topic] -= 1
                    if unsatisfied_freqs[topic] == 0:
                        unsatisfied_freqs.pop(topic)

    selected_questions_df = pd.DataFrame(selected_questions, columns=['Question'])
    final_df = selected_questions_df.merge(quest_df, on='Question', how='left')
    return final_df


In [None]:
final = question_selection(frequencies, quest_df)
final

In [None]:
duplicates = final['Question ID'].duplicated()
final[duplicates]

In [None]:
# df = df_feedback[df_feedback['student_id']=='WBEGU']

# df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%m/%d/%Y')

# df_new = df[['topics_covered', 'correct/incorrect', 'timestamp']]

# df_new = explode_list(df_new, 'topics_covered', {'correct/incorrect': 'incorrect', 'timestamp': df['timestamp']})
# df_new.columns = ['concept', 'correct/incorrect', 'timestamp']
# df_new.reset_index(drop=True, inplace=True)

# df_new['concept'] = df_new['concept'].str.lower()
# df_new

In [None]:
# id2concept = {k: c for k, c in enumerate(concept_file.nodes())}
# concept2id = {v.strip(): k for k, v in id2concept.items()}
# id2concept

In [None]:
# df_new['concept'] = df_new['concept'].map(concept2id)
# df_new = df_new[df_new['concept'].apply(lambda x: str(x).isdigit())]
# df_new['concept'] = df_new['concept'].astype(int)
# tuple_list = [tuple(x) for x in df_new.values]
# student_hist = {'progress': tuple_list}
# student_hist

In [None]:
# mem = get_dash_memory(dash, concept_file, student_hist['progress'])
# rec_prob = {}
# for i in mem:
#     k = int(i[1])
#     v = float(i[0])
#     rec_prob[k] = v
    
# rec_prob

In [None]:
# recall_prob_dict = {id2concept.get(k, k): v for k, v in rec_prob.items()}
# recall_prob_dict = {k.strip(): v for k, v in recall_prob_dict.items()}
# recall_prob_dict

In [None]:
# recall_prob_dict = dict(sorted(recall_prob_dict.items(), key=lambda item: item[1]))
# frequencies = {k: round(1/(v**0.5)) for k,v in recall_prob_dict.items()}
# frequencies

In [None]:
# # df['topics_covered'] = df['topics_covered'].apply(ast.literal_eval)
# df_new.reset_index(drop=True, inplace=True)
# df_new['concept'] = df_new['concept'].str.lower()
# df_new

# concept2id = concept_dict['siads542']['concept_tree']['concept2id']
# id2concept = {v: k for k, v in concept2id.items()}

# # df_new['concept'] = df_new['concept'].replace(id2concept)

# df_new

# # df_new[df_new['correct/incorrect']==0]

In [None]:
# df_new.concept.unique()

In [None]:
# df_new = df_new[df_new['concept'].apply(lambda x: str(x).isdigit())]
# df_new

In [None]:
# get_dash_memory(app, concepts)


In [None]:
# df_new['concept'] = df_new['concept'].astype(int)
# df_new

In [None]:
# output_dict = df_new.apply(tuple, axis=1).tolist()

# student_hist = {'progress': output_dict}

# print(student_hist)

In [None]:
# with open('dash_params.pk', 'rb') as file:
#     dash = pickle.load(file)
    
# with open('SIADS_542_dep.pk', 'rb') as file:
#     concept_file = pickle.load(file)

In [None]:

# def get_freq(uniqname, data, dash_params, concept_dict):
#     rec_prob = {}
#     id2concept = {k: c for k, c in enumerate(concept_file.nodes())}
#     concept2id = {v.strip(): k for k, v in id2concept.items()}
#     df_new = data[data['student_id']=='uniqname']
#     df_new['concept'] = df_new['concept'].map(concept2id)
#     df_new = df_new[df_new['concept'].apply(lambda x: str(x).isdigit())]
#     df_new['concept'] = df_new['concept'].astype(int)
#     tuple_list = [tuple(x) for x in df_new.values]
#     student_hist = {'progress': tuple_list}
    

In [None]:
# frequencies = get_freq('WBEGU', df_feedback, dash, concept_file)
# frequencies
# # output = test_dash_api(student_hist)
# # output = dict(sorted(output.items(), key=lambda item: item[0]))
# # output['concept'] = sorted(output['concept'], key=lambda x: x[0])
# # output

In [None]:
# filtered_courses = {}
# filtered_courses['concept'] = output['concept'][:25]
# filtered_courses

In [None]:
# freq_dict = {int(pair[1]): pair[0] for pair in filtered_courses['concept']}

# id_to_name = concept_dict['siads542']['concept_tree']['concept2id']
# frequencies = {id_to_name[str(k)]: round(1/(v**0.5)) for k,v in freq_dict.items()}
# frequencies

In [None]:

# # Start with all topic frequencies being the target ones
# unsatisfied_freqs = {k: v for k, v in frequencies.items() if v != 0}
# selected_questions = []

# # Greedy selection of questions
# while unsatisfied_freqs:

#     # Calculate the score of each question by conditionally filtering unselected questions, and only if they cover a topic with unsatisfied frequency remaining
#     question_scores = {q: sum(unsatisfied_freqs[topic] for topic in topics if topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0)
#                       for q, topics in quest_df[quest_df.Selected == False].set_index('Question').Topics.items() if any(topic in unsatisfied_freqs and unsatisfied_freqs[topic] > 0 for topic in topics)}

#     # If no question can satisfy the remaining unsatisfied frequencies, then break the loop
#     if not question_scores:
#         print("No more questions can satisfy the remaining topic frequencies.")
#         break

#     # Select the question with the highest score
#     selected_q = max(question_scores, key=question_scores.get)
#     selected_questions.append(selected_q)

#     # Update the 'Selected' flag for the chosen question
#     quest_df.loc[quest_df.Question == selected_q, 'Selected'] = True
    
#     # Update the unsatisfied frequencies
#     for topic_list in quest_df.set_index('Question').loc[selected_q].Topics:  # Assume here each topic_list is a list
#         for topic in topic_list:  # Iterate over the items in each topic_list
#             if topic in unsatisfied_freqs:
#                 unsatisfied_freqs[topic] -= 1
#                 if unsatisfied_freqs[topic] == 0:
#                     unsatisfied_freqs.pop(topic)


# print(selected_questions)


In [None]:
# selected_questions_df = pd.DataFrame(selected_questions, columns=['Question'])
# selected_questions_df


In [None]:
# final_df = selected_questions_df.merge(quest_df, on='Question', how='left')
# final_df
