# load some packages

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

#import utils 
#reloads modules automagically, changes
#to code update without having to close the notebook
%load_ext autoreload
%autoreload 2

import os
from bs4 import BeautifulSoup
from snorkel.labeling import labeling_function

import blooms_preprocess_utils
import textstat

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
import pickle
import seaborn as sns

# set seed for reproducibility
# (passed as random_state to train_test_split in the label-model training cell)
seed = 2021

#### Set desired mapping here

The **mapping** specifies which classification task we care about.

For example, below, the mapping has Bloom's Level One (Knowledge) in a class and all other
levels in a different class. 

**Note that when you change the mapping, labeling function parameters need to be changed to reflect the structure you've chosen.**

In this work, we consider a simple binary bloom classification where bloom 1 is one class and bloom 2 - 6 is the other class.

In [None]:
# Value a labeling function returns when it does not vote on a question.
ABSTAIN = -1

# Class ids the labeling functions vote for. In this binary setup only
# KNOWLEDGE has its own class; every other Bloom level shares class 1.
KNOWLEDGE = 0
COMPREHENSION = APPLICATION = ANALYSIS = SYNTHESIS = EVALUATION = 1

# Bloom level -> class group, handed to blooms_preprocess_utils.preprocess_dataset.
# NOTE(review): the groups here are numbered 1/2 while the constants above are
# 0/1 -- presumably the preprocessing step shifts them into `adj_label`; confirm.
mapping = {
    1: 1,
    2: 2,
    3: 2,
    4: 2,
    5: 2,
    6: 2,
}

# load and preprocess dataset
The dataset that we use in this paper is not publicly available.

Your dataset should ideally be a dataframe with at least one column containing the question texts.

To preprocess your own dataset, we refer to the utility file `blooms_preprocess_utils` for detailed preprocessing steps that we performed. Those steps are specific to our dataset, but some utilities might be useful for preprocessing your own dataset.

In [None]:
# Location of the raw question dump; point this at your own CSV.
DATASET_PATH = 'questions_utf8.csv'  # PATH_TO_YOUR_DATASET

df = pd.read_csv(DATASET_PATH)
# Project-specific cleaning, driven by the Bloom-level `mapping` chosen above.
preprocess_df = blooms_preprocess_utils.preprocess_dataset(df, mapping)

#### compute the flesch readability scores of each question
this will be used later for one of our labeling functions

In [None]:
# Flesch reading-ease score of every question, in row order, stored as a new
# `flesch_score` column (read later by the `low_readability` labeling function).
readability = [
    textstat.textstat.flesch_reading_ease(question_text)
    for question_text in preprocess_df['text']
]
preprocess_df['flesch_score'] = readability

# construct labeling functions
here we construct a few labeling functions, one of the key building blocks of our weakly supervised learning framework

In [None]:
from snorkel.labeling import labeling_function
from snorkel.labeling import LabelingFunction

#### labeling function based on question length (number of characters)

In [None]:
@labeling_function()
def short(x):
    """Vote KNOWLEDGE for questions whose text is under 75 characters; abstain otherwise."""
    if len(x.text) < 75:
        return KNOWLEDGE
    return ABSTAIN

#### labeling function based on whether the string `why?` appears in the question (case-insensitive)

In [None]:
@labeling_function()
def why(x):
    """Vote EVALUATION when the lower-cased text contains the literal "why?"; abstain otherwise."""
    if "why?" in x.text.lower():
        return EVALUATION
    return ABSTAIN

#### labeling function based on how many glossary terms the question contains

In [None]:
def glossary_terms(x, low_bin, high_bin, label):
    """Vote `label` when the row's glossary-term count lies in [low_bin, high_bin).

    x: data point exposing a numeric `glossary_terms` attribute.
    low_bin: inclusive lower bound of the bin.
    high_bin: exclusive upper bound of the bin.
    label: class id returned when the count falls inside the bin.

    Returns `label`, or ABSTAIN when the count is outside the bin.
    """
    if low_bin <= x.glossary_terms < high_bin:
        return label
    return ABSTAIN

def make_glossary_lf(low_bin, high_bin, label, name):
    """Build a LabelingFunction that applies `glossary_terms` with a fixed bin.

    The resulting function is named "<name>_glossary" and votes `label` for
    rows whose glossary-term count is in [low_bin, high_bin).
    """
    bin_settings = {'low_bin': low_bin, 'high_bin': high_bin, 'label': label}
    lf_name = name + '_glossary'
    return LabelingFunction(name=lf_name, f=glossary_terms, resources=bin_settings)

# Questions mentioning three or more glossary terms get a COMPREHENSION vote.
glossary_high = make_glossary_lf(
    low_bin=3,
    high_bin=float('inf'),
    label=COMPREHENSION,
    name='comprehension',
)

#### labeling function based on whether the question contains any bloom-specific keywords

In [None]:
# first we define a list of keywords specific to each bloom level
# The lists deliberately overlap: the same verb (e.g. 'describe', 'select',
# 'explain') appears under more than one level.
# NOTE: keyword_lookup below tests `word in text`, i.e. substring containment,
# so short entries such as 'use', 'act' or 'ask' also fire inside longer words.

# Bloom level 1
knowledge_list = ['define','identify','describe','label','list','name','state',
                  'match','recognize','select','examine','locate','memorize',
                  'quote','recall','reproduce','tabulate','tell','copy','discover','duplicate',
                  'enumerate','listen','observe','omit','read','recite','record','repeat','retell','visualize']
# Bloom level 2
comprehension_list = ['explain','describe','interpret','paraphrase','summarize','classify','compare',
                      'differentiate','discuss','distinguish','extend','predict','associate','contrast',
                      'convert','demonstrate','estimate','express','identify','indicate','infer','relate',
                      'restate','select','translate','ask','cite','discover','generalize','group','illustrate','judge',
                      'observe','order','report','represent','research','review','rewrite','show','trace']
# Bloom level 3
application_list = ['solve','apply','illustrate','modify','use','calculate','change','choose',
                    'demonstrate','discover','experiment','relate','show','sketch','complete',
                    'construct','dramatize','interpret','manipulate','paint','prepare','teach',
                    'act','collect','compute','explain','list','operate','practice','simulate',
                    'transfer','write']
# Bloom level 4
analysis_list = ['analyze','compare','classify','contrast','distinguish','infer','separate','explain',
                 'select','categorize','connect','differentiate','divide','order','prioritize','survey',
                 'calculate','conclude','correlate','deduce','devise','diagram','dissect','estimate',
                 'evaluate','experiment','focus','illustrate','organize','outline','plan','question','test']
# Bloom level 5
synthesis_list = ['design','compose','create','plan','combine','formulate','invent','hypothesize','substitute',
                  'write','compile','construct','develop','generalize','integrate','modify','organize','prepare',
                  'produce','rearrange','rewrite','adapt','anticipate','arrange','assemble','choose','collaborate',
                  'facilitate','imagine','intervene','make','manage','originate','propose','simulate','solve',
                  'support','test','validate']
# Bloom level 6 (evaluation) verbs.
# This list used to be a verbatim copy of synthesis_list, which made the
# evaluation keyword labeling function vote on exactly the same questions as
# the synthesis one (two perfectly correlated labeling functions carry no extra
# signal for the label model). It now holds evaluation-level verbs taken from
# standard Bloom's-taxonomy verb charts; 'support', 'test' and 'validate' are
# kept from the previous list because they are evaluation verbs as well.
# Very short, collision-prone verbs such as 'rate' are left out on purpose:
# keywords are matched as substrings, and 'rate' would fire inside 'generate',
# 'demonstrate', 'separate', ...
# Re-run the downstream cells after changing this list; the LF summary and
# accuracy figures depend on it.
evaluation_list = ['appraise','argue','assess','choose','compare','conclude','convince','criticize',
                   'critique','debate','decide','defend','discriminate','estimate','evaluate','judge',
                   'justify','measure','persuade','predict','prioritize','rank','recommend','reframe',
                   'score','select','summarize','support','test','validate']


# then we construct the keyword labeling functions
def keyword_lookup(x, keywords, label):
    """Vote `label` if any keyword occurs in the lower-cased question text.

    x: data point exposing a string `text` attribute.
    keywords: iterable of lower-case keyword strings.
    label: class id returned on the first hit.

    Matching is plain substring containment, not whole-word matching, so a
    keyword also counts when it appears inside a longer word.
    Returns `label`, or ABSTAIN when no keyword is found.
    """
    for keyword in keywords:
        if keyword in x.text.lower():
            return label
    return ABSTAIN

def make_keyword_lf(keywords, label, name):
    """Build a LabelingFunction that applies `keyword_lookup` with a fixed keyword list.

    The resulting function is named "<name>_keywords" and votes `label` for
    rows whose text contains any of `keywords`.
    """
    lookup_settings = {'keywords': keywords, 'label': label}
    lf_name = name + '_keywords'
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lookup_settings)

# One keyword labeling function per Bloom level, each voting for that level's class id.
keyword_knowledge = make_keyword_lf(keywords=knowledge_list, label=KNOWLEDGE, name='knowledge')
keyword_comprehension = make_keyword_lf(keywords=comprehension_list, label=COMPREHENSION, name='comprehension')
keyword_application = make_keyword_lf(keywords=application_list, label=APPLICATION, name='application')
keyword_analysis = make_keyword_lf(keywords=analysis_list, label=ANALYSIS, name='analysis')
keyword_synthesis = make_keyword_lf(keywords=synthesis_list, label=SYNTHESIS, name='synthesis')
keyword_evaluation = make_keyword_lf(keywords=evaluation_list, label=EVALUATION, name='evaluation')

#### labeling function based on the flesch readability score

In [None]:
@labeling_function()
def low_readability(x):
    """Vote KNOWLEDGE when the row's precomputed `flesch_score` is below 50; abstain otherwise."""
    if x.flesch_score < 50:
        return KNOWLEDGE
    return ABSTAIN

#### assemble the labeling functions into a list

In [None]:
# Order matters only for presentation: it fixes the row order (column `j`)
# of the LF analysis summary.
lfs = [
    # surface heuristics
    short,
    why,
    # per-level keyword lookups
    keyword_knowledge,
    keyword_comprehension,
    keyword_application,
    keyword_analysis,
    keyword_synthesis,
    keyword_evaluation,
    # glossary-term count and readability
    glossary_high,
    low_readability,
]

# The label model and the model training utility

In [None]:
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from sklearn.metrics import confusion_matrix
from snorkel.labeling.model import LabelModel


def weak_supervision_experiment(X_train, X_test, y_train, y_test, lfs, analysis = True, datasets = True, save_analysis = False, label_model_seed = None):
    """
    Train a Snorkel label model on X_train with the given labeling functions.

    Parameters
    ----------
    X_train, X_test : DataFrames the labeling functions are applied to.
    y_train : gold training labels; only passed through to the returned list.
    y_test : gold test labels; must expose an `adj_label` column, which is used
        for the confusion matrix.
    lfs : list of labeling functions.
    analysis : if True, display the LF summary, print the label-model accuracy
        and display the confusion matrix on the test set.
    datasets : if True, also predict on the training set and return the weakly
        labeled training data next to the fully supervised splits.
    save_analysis : if True (and `analysis` is True), return the LF summary
        DataFrame immediately instead of displaying anything.
    label_model_seed : seed forwarded to `LabelModel.fit`. Defaults to the
        notebook-level `seed`, so repeated runs train the same label model
        (without an explicit seed Snorkel draws a random one on every fit).

    Returns
    -------
    - the LF summary DataFrame, when `analysis` and `save_analysis` are True;
    - otherwise, when `datasets` is True, the list
      [accuracy, X_train_weak, y_train_weak, X_train, X_test, y_train, y_test];
    - otherwise just `accuracy`.
    `accuracy` is the label-model test accuracy in percent, rounded to 2 decimals.
    """
    if label_model_seed is None:
        # Resolved at call time so the function definition itself does not
        # depend on the config cell having been run.
        label_model_seed = seed

    ## setting applier and applying to data
    applier = PandasLFApplier(lfs=lfs)
    L_train = applier.apply(df=X_train)
    L_test = applier.apply(df=X_test)

    ## fitting model
    label_model = LabelModel(cardinality=2, verbose=True)
    label_model.fit(L_train=L_train, n_epochs=2000, log_freq=100, seed=label_model_seed)

    label_model_acc = label_model.score(L=L_test, Y=y_test, tie_break_policy="random")["accuracy"]
    accuracy = round(label_model_acc * 100, 2)

    ## analysis
    if analysis:
        lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()

        if save_analysis:
            return lf_summary

        display(lf_summary)

        print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

        # predict() is called with its default tie-break policy here, unlike
        # score() above, so the matrix can contain an extra class for
        # abstentions (-1) in addition to the two real classes.
        predictions = label_model.predict(L_test)
        display(confusion_matrix(y_test.adj_label, predictions))

    ## filtered training sets for supervised learning
    if datasets:
        train_predictions = pd.DataFrame(label_model.predict(L_train), columns = ['predictions'])

        # -1 is the value the label model emits for rows it abstains on;
        # those rows carry no weak label and are dropped.
        is_labeled = train_predictions['predictions'] != -1
        y_train_weak = train_predictions[is_labeled]

        # Reset the index so the positional mask lines up with X_train's rows.
        X_train_weak = X_train.reset_index(drop = True)[is_labeled]

        return [accuracy, X_train_weak, y_train_weak, X_train, X_test, y_train, y_test]

    return accuracy
    

# train the label model
The label model is trained on the noisy labels produced by the labeling functions. The resulting label model can then produce "weak" Bloom labels for questions.

Please see `blooms_preprocess_utils.py` for info on the relevant dataframe fields.

`adj_label` is the adjusted label used because we are doing binary classification in this work, i.e., bloom 1 becomes label 0, and bloom 2 - 6 become label 1.

In [None]:
# Columns read by the labeling functions, plus the gold label.
feature_columns = ['text', 'text_length', 'adj_label', 'num_words', 'glossary_terms', 'flesch_score']
X = pd.DataFrame(preprocess_df[feature_columns])
y = pd.DataFrame(preprocess_df['adj_label'])

# 80/20 split, seeded so the same questions land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .2, random_state = seed
)

# Fit the label model, show its analysis, and keep the weak/supervised datasets.
out = weak_supervision_experiment(
    X_train, X_test, y_train, y_test, lfs, analysis = True, datasets = True
)

100%|██████████| 14175/14175 [00:27<00:00, 521.13it/s]
100%|██████████| 3544/3544 [00:07<00:00, 485.85it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
short,0,[0],0.436261,0.326984,0.111323
why,1,[1],0.007125,0.005855,0.003386
knowledge_keywords,2,[0],0.32254,0.309771,0.133545
comprehension_keywords,3,[1],0.219259,0.211993,0.16381
application_keywords,4,[1],0.285079,0.248113,0.185115
analysis_keywords,5,[1],0.165855,0.161975,0.102081
synthesis_keywords,6,[1],0.140529,0.140529,0.091781
evaluation_keywords,7,[1],0.140529,0.140529,0.091781
comprehension_glossary,8,[1],0.104339,0.084656,0.053192
low_readability,9,[0],0.329877,0.270899,0.198377


Label Model Accuracy:     74.4%


array([[   0,    0,    0],
       [  62,  865,  373],
       [ 304,  368, 1572]])

# train a classifier on the "weakly" labeled dataset and compare to fully supervised methods
Here we use a linear SVM; any of the many other classifiers in the scikit-learn package can be used instead.

Text featurization is done via tf-idf. One can also use other text vectorization methods, such as word embeddings or BERT embeddings.

In [None]:
# first define a utility function for the comparison
def supervision_experiment(dataset, preprocess_method, model_names, model_list):
    """
    Compare classifiers trained on gold labels with the same classifiers
    trained on weak labels.

    Parameters
    ----------
    dataset : sequence laid out as
        [X_train_weak, y_train_weak, X_train, X_test, y_train, y_test]
        (i.e. `weak_supervision_experiment(...)[1:]`). The X entries must
        expose a 'text' column; the y entries may be single-column frames or
        1-D arrays.
    preprocess_method : 'tfidf' or 'embedding'.
    model_names : display names, one per entry of `model_list`.
    model_list : scikit-learn style classifiers (`fit` / `score`). Each one is
        fit twice, so on return it holds the weakly supervised fit.

    Returns
    -------
    DataFrame with one row per classifier and the columns "Classifier",
    "Supervised Train Accuracy", "Supervised Test Accuracy",
    "Weak Train Accuracy" and "Weak Test Accuracy".

    Raises
    ------
    ValueError : if `preprocess_method` is not one of the supported values.
    """
    # Fail fast: previously an unknown method only printed a message and the
    # function then crashed later with a NameError on the missing features.
    if preprocess_method not in ('tfidf', 'embedding'):
        raise ValueError(
            "need to set preprocessing method: expected 'tfidf' or 'embedding', "
            f"got {preprocess_method!r}"
        )

    X_train_weak = dataset[0]['text']
    X_train = dataset[2]['text']
    X_test = dataset[3]['text']

    # Labels arrive as single-column frames; flatten them to 1-D so the
    # classifiers do not emit a DataConversionWarning on every fit.
    y_train_weak = np.ravel(dataset[1])
    y_train = np.ravel(dataset[4])
    y_test = np.ravel(dataset[5])

    if preprocess_method == 'tfidf':
        # Each training set gets its own vocabulary; the test set is projected
        # into both feature spaces.
        weak_vectorizer = TfidfVectorizer()
        x_train_weak = weak_vectorizer.fit_transform(X_train_weak.values.astype('U'))
        x_test_weak = weak_vectorizer.transform(X_test.values.astype('U'))

        supervised_vectorizer = TfidfVectorizer()
        x_train = supervised_vectorizer.fit_transform(X_train.values.astype('U'))
        x_test = supervised_vectorizer.transform(X_test.values.astype('U'))

    else:
        # NOTE(review): SentenceTransformer is not imported anywhere in the
        # visible notebook; this branch needs
        # `from sentence_transformers import SentenceTransformer` to run.
        encoder = SentenceTransformer('distilbert-base-nli-mean-tokens')

        x_train_weak = encoder.encode(X_train_weak.values)
        x_test_weak = encoder.encode(X_test.values)

        x_train = encoder.encode(X_train.values)
        x_test = encoder.encode(X_test.values)

    result_columns = ["Classifier",
                      "Supervised Train Accuracy",
                      "Supervised Test Accuracy",
                      "Weak Train Accuracy",
                      "Weak Test Accuracy"]
    rows = []

    for name, clf in zip(model_names, model_list):

        #### supervised portion
        clf.fit(x_train, y_train)
        supervised_train_score = clf.score(x_train, y_train)
        supervised_test_score = clf.score(x_test, y_test)

        #### weak supervision portion (refits the same estimator from scratch)
        clf.fit(x_train_weak, y_train_weak)
        weak_train_score = clf.score(x_train_weak, y_train_weak)
        weak_test_score = clf.score(x_test_weak, y_test)

        rows.append({"Classifier": name,
                     "Supervised Train Accuracy": supervised_train_score,
                     "Supervised Test Accuracy": supervised_test_score,
                     "Weak Train Accuracy": weak_train_score,
                     "Weak Test Accuracy": weak_test_score})

        print("Done with:", name)

    # One row per classifier, indexed 0..n-1 in the order they were given.
    return pd.DataFrame(rows, columns=result_columns)

#### define a classifier and run the comparison

In [None]:
# Linear-kernel SVM with regularization strength C=0.025.
linear_svm_clf = SVC(C=0.025, kernel="linear")

# out[0] is the label-model accuracy; the remaining entries are the datasets
# in the order supervision_experiment expects.
supervision_experiment(
    dataset=out[1:],
    preprocess_method='tfidf',
    model_names=["Linear SVM"],
    model_list=[linear_svm_clf],
)

  y = column_or_1d(y, warn=True)
Done with: Linear SVM


Unnamed: 0,Classifier,Supervised Train Accuracy,Supervised Test Accuracy,Weak Train Accuracy,Weak Test Accuracy
0,Linear SVM,0.795485,0.792325,0.786134,0.787528


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=bcbe6184-b36a-4097-968c-a2ee6ebad722' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>