In [1]:
import pandas as pd
import numpy as np
import os
# Run the rest of the notebook from the parent directory so that the relative
# "data/..." paths used by later cells resolve.
# NOTE: os.chdir() returns None, so the new working directory has to be read
# back with os.getcwd() for `cwd` to hold a usable path.
# NOTE(review): re-running this cell moves up one more level each time; run it
# once per kernel session.
os.chdir(os.path.join(os.getcwd(), ".."))
cwd = os.getcwd()

In [6]:
# Skill prerequisite graph: each key is a skill name and each value is the list
# of skills that are its direct prerequisites (an empty list marks a root skill).
# Every prerequisite named in a value must itself appear as a key, because the
# rating cell looks each prerequisite up in this same dict.
# Key order matters: the probability cell draws seeded random numbers while
# iterating the dict in insertion order, so reordering entries changes which
# probabilities each skill receives.
Skill_hirereachy ={
    "Data Loading": [],
    "Data Cleaning": ["Data Loading"],
    "Handling Missing Data": ["Data Cleaning"],
    "Feature Scaling": ["Data Cleaning"],
    "Exploratory Data Analysis (EDA)": ["Data Loading"],
    "Statistical Summaries": ["Exploratory Data Analysis (EDA)"],
    "Visualization": ["Exploratory Data Analysis (EDA)"],
    "Correlation Analysis": ["Statistical Summaries", "Visualization"],
    "Data Splitting": [],
    "Train-Test Split": ["Data Splitting"],
    "Model Selection": ["Train-Test Split"],
    "Supervised Learning": ["Model Selection"],
    "Classification Algorithms": ["Supervised Learning"],
    "Hyperparameter Tuning": ["Model Selection"],
    "Evaluation Metrics": [],
    "Confusion Matrix": ["Evaluation Metrics"],
    "Accuracy Metrics": ["Evaluation Metrics"],
    "Precision and Recall": ["Evaluation Metrics"],
    "F1 Score": ["Precision and Recall"],
    "ROC-AUC": ["Evaluation Metrics"],
    "Cost-Sensitive Training": ["Supervised Learning", "Evaluation Metrics"],
    "Dimensionality Reduction": ["Exploratory Data Analysis (EDA)"],
    "Feature Engineering": ["Exploratory Data Analysis (EDA)"],
    "Feature Selection": ["Feature Engineering"],
    "One-Hot Encoding": ["Feature Engineering"],
    "Outlier Detection": ["Data Cleaning"],
    "Model Deployment": ["Model Selection", "Evaluation Metrics"],
    "Prediction": ["Model Selection"],
    "Imbalanced Data": ["Supervised Learning"],
}


In [7]:

from collections import deque


def _rate_skills(hierarchy):
    """Return ``{skill: (rate, prerequisites)}`` for every skill in ``hierarchy``.

    The rate is the depth of a skill in the prerequisite graph: 1 for a skill
    with no prerequisites, otherwise 1 + the largest rate among its
    prerequisites.

    Args:
        hierarchy: dict mapping a skill name either to its list of
            prerequisites (raw form) or to a tuple whose second element is
            that list (already-rated form). Accepting both forms is what
            makes re-running the cell safe.

    Returns:
        A new dict, in the same key order, mapping each skill to
        ``(rate, prerequisites)``. The prerequisite list objects are reused,
        not copied.

    Raises:
        KeyError: a prerequisite is not itself a key of ``hierarchy``.
        ValueError: the prerequisites form a cycle, so no rate can be assigned.
    """
    prerequisites = {
        skill: entry[1] if isinstance(entry, tuple) else entry
        for skill, entry in hierarchy.items()
    }

    unknown = sorted(
        {
            dep
            for deps in prerequisites.values()
            for dep in deps
            if dep not in prerequisites
        }
    )
    if unknown:
        raise KeyError(f"Prerequisites missing from the hierarchy: {unknown}")

    rates = {}
    pending = deque(prerequisites)
    stalled = 0  # consecutive skills pushed back without rating anything
    while pending:
        skill = pending.popleft()
        deps = prerequisites[skill]
        if all(dep in rates for dep in deps):
            # default=0 makes a skill without prerequisites come out at rate 1
            rates[skill] = 1 + max((rates[dep] for dep in deps), default=0)
            stalled = 0
            continue
        # Some prerequisite is not rated yet: retry this skill later.
        pending.append(skill)
        stalled += 1
        if stalled >= len(pending):
            # Every remaining skill was retried with no progress: the queue
            # would rotate forever, which only happens with a cycle.
            raise ValueError(f"Cyclic prerequisites among: {sorted(pending)}")

    return {skill: (rates[skill], prerequisites[skill]) for skill in prerequisites}


# Replace each value of Skill_hirereachy in place with (rate, prerequisites).
# Any extra tuple elements added by later cells are dropped, so re-run those
# cells after re-running this one.
Skill_hirereachy.update(_rate_skills(Skill_hirereachy))

Skill_hirereachy

{'Data Loading': (1, []),
 'Data Cleaning': (2, ['Data Loading']),
 'Handling Missing Data': (3, ['Data Cleaning']),
 'Feature Scaling': (3, ['Data Cleaning']),
 'Exploratory Data Analysis (EDA)': (2, ['Data Loading']),
 'Statistical Summaries': (3, ['Exploratory Data Analysis (EDA)']),
 'Visualization': (3, ['Exploratory Data Analysis (EDA)']),
 'Correlation Analysis': (4, ['Statistical Summaries', 'Visualization']),
 'Data Splitting': (1, []),
 'Train-Test Split': (2, ['Data Splitting']),
 'Model Selection': (3, ['Train-Test Split']),
 'Supervised Learning': (4, ['Model Selection']),
 'Classification Algorithms': (5, ['Supervised Learning']),
 'Hyperparameter Tuning': (4, ['Model Selection']),
 'Evaluation Metrics': (1, []),
 'Confusion Matrix': (2, ['Evaluation Metrics']),
 'Accuracy Metrics': (2, ['Evaluation Metrics']),
 'Precision and Recall': (2, ['Evaluation Metrics']),
 'F1 Score': (3, ['Precision and Recall']),
 'ROC-AUC': (2, ['Evaluation Metrics']),
 'Cost-Sensitive Trainin

In [8]:
# For each skill, draw 4 random numbers [P(L), P(T), P(G), P(S)] and store them
# as the third element of the skill's tuple in Skill_hirereachy.
# - Each draw is uniform on [0, 1) divided by the skill's rate, so a larger
#   rate lowers the upper bound of every probability.
# - The draws are sorted in descending order, so P(L) >= P(T) >= P(G) >= P(S).
# The seed is reset here so the cell always produces the same numbers, and the
# tuple is rebuilt from its first two elements so re-running the cell replaces
# the probabilities instead of appending a second list.
np.random.seed(0)
for key, entry in Skill_hirereachy.items():
    rate, prerequisites = entry[0], entry[1]
    p = np.sort(np.random.rand(4) / rate)[::-1].tolist()
    # Only the value is replaced (no key added or removed), which is safe
    # while iterating over the dict.
    Skill_hirereachy[key] = (rate, prerequisites, p)

Skill_hirereachy

{'Data Loading': (1,
  [],
  [0.7151893663724195,
   0.6027633760716439,
   0.5488135039273248,
   0.5448831829968969]),
 'Data Cleaning': (2,
  ['Data Loading'],
  [0.4458865003910399,
   0.32294705653332806,
   0.21879360563134626,
   0.21182739966945235]),
 'Handling Missing Data': (3,
  ['Data Cleaning'],
  [0.32122092016700976,
   0.26390834602755486,
   0.17629830658430148,
   0.12781383960859258]),
 'Feature Scaling': (3,
  ['Data Cleaning'],
  [0.3085322127642203,
   0.18934818703131076,
   0.02904309990051357,
   0.023678686065962313]),
 'Exploratory Data Analysis (EDA)': (2,
  ['Data Loading'],
  [0.4350060741234096,
   0.416309922773969,
   0.38907837547492524,
   0.01010919872016286]),
 'Statistical Summaries': (3,
  ['Exploratory Data Analysis (EDA)'],
  [0.326206114077588,
   0.2663861880722412,
   0.26017639209548515,
   0.15382645408431062]),
 'Visualization': (3,
  ['Exploratory Data Analysis (EDA)'],
  [0.31488963901652794,
   0.2133070071091746,
   0.0477844291363488

In [9]:
# Save the annotated skill hierarchy as a pickle file.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails part-way.
import pickle

with open("data/Skill_hirereachy.pkl", "wb") as pickle_file:
    pickle.dump(Skill_hirereachy, pickle_file)

In [11]:
# Maps a question ID (as a string) to the list of skills that question
# exercises. Skill names are expected to be keys of Skill_hirereachy; the loop
# right after this dict reports any that are not.
# The IDs are later joined against df_sol['ID'], which is the 1-based row
# position of each question in data/Solutions_v2.xlsx, so key "1" describes the
# first row of that sheet, "2" the second, and so on.
Problem = {
  "1": ["Supervised Learning", "Classification Algorithms"],
  "2": ["Dimensionality Reduction", "Feature Scaling"],
  "3": ["Data Loading"],
  "4": ["Statistical Summaries"],
  "5": ["Feature Engineering"],
  "6": ["Outlier Detection"],
  "7": ["Handling Missing Data"],
  "8": ["Train-Test Split"],
  "9": ["Feature Scaling"],
  "10": ["Model Selection"],
  "11": ["Model Selection"],
  "12": ["Evaluation Metrics"],
  "13": ["Hyperparameter Tuning"],
  "14": ["Evaluation Metrics"],
  "15": ["Hyperparameter Tuning"],
  "16": ["Supervised Learning", "Classification Algorithms"],
  "17": ["Supervised Learning", "Classification Algorithms"],
  "18": ["Visualization"],
  "19": ["Feature Scaling"],
  "20": ["Data Loading"],
  "21": ["Statistical Summaries"],
  "22": ["Outlier Detection"],
  "23": ["Handling Missing Data"],
  "24": ["Feature Selection"],
  "25": ["Feature Selection"],
  "26": ["Feature Engineering"],
  "27": ["One-Hot Encoding"],
  "28": ["Feature Selection"],
  "29": ["Feature Scaling"],
  "30": ["Model Selection"],
  "31": ["ROC-AUC"],
  "32": ["Evaluation Metrics"],
  "33": ["Confusion Matrix"],
  "34": ["Hyperparameter Tuning"],
  "35": ["Precision and Recall"],
  "36": ["Supervised Learning", "Classification Algorithms"],
  "37": ["Data Loading", "Feature Scaling"],
  "38": ["Data Loading"],
  "39": ["Statistical Summaries"],
  "40": ["Handling Missing Data"],
  "41": ["Handling Missing Data"],
  "42": ["One-Hot Encoding"],
  "43": ["Correlation Analysis"],
  "44": ["Feature Selection"],
  "45": ["Exploratory Data Analysis (EDA)"],
  "46": ["Feature Scaling"],
  "47": ["Train-Test Split"],
  "48": ["Confusion Matrix"],
  "49": ["Accuracy Metrics"],
  "50": ["Cost-Sensitive Training"],
  "51": ["Imbalanced Data", "Statistical Summaries"],
  "52": ["Imbalanced Data","Statistical Summaries"],
  "53": ["Evaluation Metrics"],
  "54": ["Prediction"],
}
# Sanity check: every skill referenced by a problem should be a key of the
# skill hierarchy. For each problem, only the first unknown skill is reported.
for required_skills in Problem.values():
    unknown_skill = next(
        (skill for skill in required_skills if skill not in Skill_hirereachy),
        None,
    )
    if unknown_skill is not None:
        print(f"Skill {unknown_skill} is not in the skill hierarchy")

# Tabulate the problem -> skills mapping: one row per problem, with the dict
# key in the 'ID' column and the list of skills in the 'Skill' column.
problem_rows = list(Problem.items())
df_problem = pd.DataFrame(problem_rows, columns=["ID", "Skill"])
print(df_problem.head())

  ID                                             Skill
0  1  [Supervised Learning, Classification Algorithms]
1  2       [Dimensionality Reduction, Feature Scaling]
2  3                                    [Data Loading]
3  4                           [Statistical Summaries]
4  5                             [Feature Engineering]


In [12]:
File_xlsx = "data/Solutions_v2.xlsx"

# Load the solutions sheet and give every row a string ID equal to its
# 1-based index label, matching the string keys used in Problem.
df_sol = pd.read_excel(File_xlsx)
one_based_ids = (df_sol.index + 1).astype(str)
df_sol["ID"] = one_based_ids

print(df_sol.head(2))

   problem_id  question_id question_type                      question_title  \
0           2           23            mc  What is the machine learning task?   
1           2           24            mc       Which import is not required?   

  question_description                            multiple_choice_options  \
0                  NaN  Supervised binary classification, Supervised m...   
1                  NaN  from sklearn.cluster import KMeans # Our clust...   

                                      correct_answer ID  
0                   Supervised binary classification  1  
1  from sklearn.cluster import KMeans # Our clust...  2  


In [13]:
# Attach the skill list of each problem to its solution row (inner join on ID)
# and keep only the columns needed to annotate the answer logs.
kept_columns = ["problem_id", "question_id", "Skill", "correct_answer"]
df1 = df_problem.merge(df_sol, on="ID")[kept_columns]
print(df1.head())

   problem_id  question_id                                             Skill  \
0           2           23  [Supervised Learning, Classification Algorithms]   
1           2           24       [Dimensionality Reduction, Feature Scaling]   
2           2           25                                    [Data Loading]   
3           2           26                           [Statistical Summaries]   
4           2           27                             [Feature Engineering]   

                                      correct_answer  
0                   Supervised binary classification  
1  from sklearn.cluster import KMeans # Our clust...  
2                                        $4 = 'Race'  
3                            The dataset is balanced  
4  TwoHourSerIns value has the least effect on de...  


In [14]:
File_xlsx = "data/KT_logs.xlsx"

# Load the answer logs. The logged answer arrives in a column named
# 'correct_answer'; it is renamed to 'answer' so it does not collide with the
# 'correct_answer' column of the solutions table when the two are merged.
df0 = pd.read_excel(File_xlsx).rename(columns={"correct_answer": "answer"})
# Parse the timestamps so they can be subtracted later.
df0["date_time"] = pd.to_datetime(df0["date_time"])

print(df0.head())
print(len(df0))

  username  problem_id  question_id        details  \
0       a1           2           23   First answer   
1       a1           2           23  Change answer   
2       a1           2           24   First answer   
3       a1           2           24  Change answer   
4       a1           2           24  Change answer   

                                              answer               date_time  
0                   Supervised binary classification 2023-08-03 09:41:32.112  
1              Supervised multi-label classification 2023-08-03 09:45:15.714  
2  from sklearn.cluster import KMeans # Our clust... 2023-08-03 09:47:33.993  
3  from pandas import read_csv\nimport seaborn as... 2023-08-03 09:47:36.525  
4  from sklearn.model_selection import train_test... 2023-08-03 09:47:37.888  
15951


In [15]:
# Annotate every logged answer (df0) with the skills and the correct answer of
# its question (df1), joining on question_id and problem_id.
# The printed length can be compared with len(df0) to confirm that the inner
# join neither dropped nor duplicated log rows.
merge_keys = ["question_id", "problem_id"]
df = df0.merge(df1, on=merge_keys)
print(len(df))
df.head()

15951


Unnamed: 0,username,problem_id,question_id,details,answer,date_time,Skill,correct_answer
0,a1,2,23,First answer,Supervised binary classification,2023-08-03 09:41:32.112,"[Supervised Learning, Classification Algorithms]",Supervised binary classification
1,a1,2,23,Change answer,Supervised multi-label classification,2023-08-03 09:45:15.714,"[Supervised Learning, Classification Algorithms]",Supervised binary classification
2,a2,2,23,First answer,Supervised multi-label classification,2023-08-03 09:39:02.603,"[Supervised Learning, Classification Algorithms]",Supervised binary classification
3,a2,2,23,Change answer,Supervised binary classification,2023-08-03 09:41:01.851,"[Supervised Learning, Classification Algorithms]",Supervised binary classification
4,a3,2,23,First answer,Supervised binary classification,2023-08-03 10:45:40.027,"[Supervised Learning, Classification Algorithms]",Supervised binary classification


In [16]:
# Flag each logged answer: True when it is exactly equal to the correct
# answer, False otherwise.
df["correct"] = df["answer"].eq(df["correct_answer"])
df.head()

Unnamed: 0,username,problem_id,question_id,details,answer,date_time,Skill,correct_answer,correct
0,a1,2,23,First answer,Supervised binary classification,2023-08-03 09:41:32.112,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,True
1,a1,2,23,Change answer,Supervised multi-label classification,2023-08-03 09:45:15.714,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,False
2,a2,2,23,First answer,Supervised multi-label classification,2023-08-03 09:39:02.603,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,False
3,a2,2,23,Change answer,Supervised binary classification,2023-08-03 09:41:01.851,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,True
4,a3,2,23,First answer,Supervised binary classification,2023-08-03 10:45:40.027,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,True


In [17]:
# Elapsed time of every log entry, in minutes, measured from the earliest
# entry of the same (username, problem_id) group.
# NOTE(review): the grouping does not include question_id, so the clock starts
# at the user's first entry for the whole problem - confirm this is intended.
group_start = df.groupby(["username", "problem_id"])["date_time"].transform("min")
df["time"] = (df["date_time"] - group_start).dt.total_seconds() / 60
# Use a lowercase name for the skill column.
df = df.rename(columns={"Skill": "skill"})
df.head()

Unnamed: 0,username,problem_id,question_id,details,answer,date_time,skill,correct_answer,correct,time
0,a1,2,23,First answer,Supervised binary classification,2023-08-03 09:41:32.112,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,True,0.0
1,a1,2,23,Change answer,Supervised multi-label classification,2023-08-03 09:45:15.714,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,False,3.7267
2,a2,2,23,First answer,Supervised multi-label classification,2023-08-03 09:39:02.603,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,False,0.0
3,a2,2,23,Change answer,Supervised binary classification,2023-08-03 09:41:01.851,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,True,1.987467
4,a3,2,23,First answer,Supervised binary classification,2023-08-03 10:45:40.027,"[Supervised Learning, Classification Algorithms]",Supervised binary classification,True,0.0


In [23]:
# Persist only the columns needed downstream as a pickle file.
annotated_columns = ["username", "skill", "correct", "time"]
df[annotated_columns].to_pickle("data/KT_logs_annotated.pkl")