In [None]:
import pandas as pd
import numpy as np

import os

from bs4 import BeautifulSoup
import blooms_preprocess_utils
import textstat

from snorkel.labeling import labeling_function
from snorkel.labeling.model import LabelModel


from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Persist the two trained linear-SVM classifiers to disk.
#
# Each model is looked up by name *before* its file is opened: opening with
# 'wb' truncates the target immediately, so referencing an undefined model
# inside the `with` body would raise NameError only after the existing pickle
# had already been emptied. On a fresh kernel (the training cell further down
# is commented out) this cell therefore skips the save and leaves any
# previously written weight files intact.
_model_files = {
    "./supervised_model_weights.pkl": "lin_svm_supervised",
    "./weak_model_weights.pkl": "lin_svm_weak",
}

for _weights_path, _model_name in _model_files.items():
    if _model_name not in globals():
        print(f"Skipping {_weights_path}: `{_model_name}` is not defined in this kernel.")
        continue
    with open(_weights_path, 'wb') as file:
        pickle.dump(globals()[_model_name], file)

In [None]:
# Turn the demo questions into TF-IDF features, once per model.
# `v_weak` and `v` must already exist in the kernel; they are fit in the
# commented-out training cell further down the notebook.
demo_qs = pd.read_csv('question_info_2.csv')
demo_question_texts = demo_qs.question_text.values.astype('U')

demo_test_weak = v_weak.transform(demo_question_texts)
demo_test_supervised = v.transform(demo_question_texts)

## Loading things in

First, we load the trained weights of the weak and supervised models. Both models are linear SVMs. Loading a saved model takes only a few lines of code, as shown below.

Then, the weak test and supervised test data are loaded. Note the differing dimensionalities among the datasets; this is a result of the preprocessing method. TF-IDF is used, and in order to avoid leakage a separate TF-IDF vectorizer is fit for each of the respective datasets (weak and fully supervised), so each dataset ends up with its own vocabulary and therefore its own number of feature columns.

In [None]:
# Restore the two pre-trained classifiers from their pickle files.
# NOTE: pickle.load can execute arbitrary code, so only load weight files
# that come from a trusted source.
with open('supervised_model_weights.pkl', 'rb') as supervised_weights_file:
    supervised_model = pickle.load(supervised_weights_file)

with open('weak_model_weights.pkl', 'rb') as weak_weights_file:
    weak_model = pickle.load(weak_weights_file)

In [None]:
def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-computed feature matrices for the demo questions, one per model.
test_weak = _read_pickle('./demo_Xtest_weak.pkl')
test_supervised = _read_pickle('./demo_Xtest_supervised.pkl')

In [None]:
# Compare the shapes of the two feature matrices: the first is printed,
# the second is shown as the cell's output value.
weak_shape = test_weak.shape
print(weak_shape)
test_supervised.shape

(10, 14099)


(10, 14829)

In [None]:
# Predict every demo question in one call per model: weak-model labels are
# displayed first, supervised-model labels are the cell's output value.
weak_predictions = weak_model.predict(test_weak)
display(weak_predictions)
supervised_model.predict(test_supervised)

array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1])

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1])

## Label prediction

We have 10 example questions, which we will predict labels for using both the weakly supervised model and the fully supervised model.

A simple explanation for differences between the models: the weak model and supervised model are trained on **different datasets** - the weak model uses labels ("y_train" data) generated from the weak supervision pipeline. In contrast, the supervised model uses ground truth labels.

Predicted labels correspond to 0 for Lower Order, and 1 for High Order.

In [None]:
# Ten hand-picked question indices; the trailing annotations are kept as the
# author wrote them.
# NOTE(review): a commented-out cell near the end of the notebook lists a
# slightly different set (1867 / 1013 / 2159 in place of 3064 / 62 / 3454) --
# confirm which list matches question_info_2.csv.
q_idx_list = [
    3064, 2293, 444,                   # (weak wrong)
    764, 62, 3454, 1929, 1244, 1830,   # (both wrong),
    3074,
]

In [None]:
# Reload the demo questions, using the first CSV column as the row index.
demo_qs = pd.read_csv("./question_info_2.csv", index_col=0)

In [None]:
# Preview the first five demo questions.
demo_qs.head(n=5)

Unnamed: 0,uid,question_text
0,194@5,What is the principle function or role of trig...
1,13477@6,Define product rule in your own words.
2,4605@4,Which of the following statements about epigen...
3,18625@3,The conservation movement of the early twentie...
4,8049@3,Why is carbon considered a macronutrient?


In [None]:
# Show the full, untruncated text of the question labelled 2.
demo_qs["question_text"][2]

'Which of the following statements about epigenetic regulation is false?'

In [None]:
# Print each demo question followed by the label each model assigns to it.
# The question text is fetched by position (`.iloc`) so that row i of the text
# always lines up with row i of the feature matrices, whatever index labels
# `demo_qs` carries; plain `question_text[i]` is a label lookup and only works
# while the index happens to be exactly 0..n-1.
for i in range(demo_qs.shape[0]):
    print("Question Text:")
    print(demo_qs.question_text.iloc[i])

    print("Supervised Model predicts:")
    print(supervised_model.predict(test_supervised[i, :]))

    print("Weak Model predicts:")
    print(weak_model.predict(test_weak[i, :]))

    print('-'*50)

Question Text:
What is the principle function or role of triglycerides in animals?
Supervised Model predicts:
[1]
Weak Model predicts:
[1]
--------------------------------------------------
Question Text:
Define product rule in your own words.
Supervised Model predicts:
[0]
Weak Model predicts:
[0]
--------------------------------------------------
Question Text:
Which of the following statements about epigenetic regulation is false?
Supervised Model predicts:
[1]
Weak Model predicts:
[0]
--------------------------------------------------
Question Text:
The conservation movement of the early twentieth century echoed the progressive movement’s emphasis on
Supervised Model predicts:
[1]
Weak Model predicts:
[1]
--------------------------------------------------
Question Text:
Why is carbon considered a macronutrient?
Supervised Model predicts:
[1]
Weak Model predicts:
[1]
--------------------------------------------------
Question Text:
Joints are classified based on the material composi

In [None]:
# df = pd.read_csv('questions_utf8.csv')


# mapping = {1:1, 2:2, 3:2, 4:2, 5:2, 6:2}
# preprocess_df = blooms_preprocess_utils.preprocess_dataset(df, mapping);

# readability = []
# for i in range(len(preprocess_df)):
#     readability.append(textstat.textstat.flesch_reading_ease(preprocess_df.iloc[i].text))


In [None]:
# Renumber the test questions from 0 and show eight random ones.
# `X_test` must already exist in the kernel; the only visible assignment is in
# the commented-out data-loading cell further down.
# No random_state is passed, so the sampled rows differ from run to run.
t_df = X_test.reset_index(drop=True)
t_df.sample(n=8)

2058    What is 32.0^{\circ}\text{C} in degrees Fahren...
3240    Mammals, reptiles, and birds have the followin...
2827    \nAfter traveling along a path, a toy car’s fi...
2923    The slogan “Fifty-four Forty or Fight” best de...
778     Which group would most likely oppose a U.S. de...
2950                       Define lens in your own words.
1881    \nThe levels of carbon dioxide levels increase...
2159    Joints are classified based on the material co...
Name: text, dtype: object

In [None]:
# supervised_labels= lin_svm_supervised.predict(x_test)

# weak_labels = lin_svm_weak.predict(x_test_weak)

# supervised_labels.shape


(3544,)

In [None]:
# X_train_weak = pd.read_csv('../edu_research/big_dataset/vectorizer_data/X_train_weak.csv', index_col=0)['text']
# y_train_weak = pd.read_csv('../edu_research/big_dataset/vectorizer_data/y_train_weak.csv', index_col=0)

# X_train = pd.read_csv('../edu_research/big_dataset/vectorizer_data/X_train.csv', index_col=0)['text']
# y_train = pd.read_csv('../edu_research/big_dataset/vectorizer_data/y_train.csv', index_col=0)

# X_test = pd.read_csv('../edu_research/big_dataset/vectorizer_data/X_test.csv', index_col=0)['text']
# y_test = pd.read_csv('../edu_research/big_dataset/vectorizer_data/y_test.csv', index_col=0)


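# NOTE(review): TfidfVectorizer and SVC are not imported in the visible import cell;
# re-enabling this cell presumably also needs
# `from sklearn.feature_extraction.text import TfidfVectorizer` and `from sklearn.svm import SVC`.
# v_weak = TfidfVectorizer()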

# x_train_weak = v_weak.fit_transform(X_train_weak.values.astype('U'))
# x_test_weak = v_weak.transform(X_test.values.astype('U'))

# v = TfidfVectorizer()

# x_train = v.fit_transform(X_train.values.astype('U'))
# x_test = v.transform(X_test.values.astype('U'))


# lin_svm_supervised = SVC(kernel="linear", C=0.025)
# lin_svm_supervised.fit(x_train, y_train);

# #### weak supervision portion
# lin_svm_weak = SVC(kernel="linear", C=0.025)
# lin_svm_weak.fit(x_train_weak, y_train_weak);



In [None]:
# q_idx_list = [3064, 2293, 444, #(weak wrong) 
# 764, 62, 3454, 1929, 1244, 1830, #(both wrong),
#  3074]

In [None]:
# display(preprocess_df[preprocess_df.text == X_test.iloc[q_idx]])

# print(preprocess_df[preprocess_df.text == X_test.iloc[q_idx]]['type'])


# supervised_label = lin_svm_supervised.predict(x_test[q_idx, :])
# weak_label = lin_svm_weak.predict(x_test_weak[q_idx, :])

# print("blooms label:",
#  preprocess_df[preprocess_df.text == X_test.iloc[q_idx]]['adj_label'].values)
# print("supervised model predicts:", supervised_label)
# print("weak model predicts:", weak_label)
# print(X_test.iloc[q_idx])



In [None]:
# q_idx_list = [1867, 2293, 444, #(weak wrong)
# 764, 1013, 2159, 1929, 1244, 1830, #(both wrong),
#  3074]


# uid_list = []
# for i in range(len(q_idx_list)):
#     uid = preprocess_df[preprocess_df.text == X_test.iloc[q_idx_list[i]]].uid
#     text = preprocess_df[preprocess_df.text == X_test.iloc[q_idx_list[i]]].text
#     type = preprocess_df[preprocess_df.text == X_test.iloc[q_idx_list[i]]]['type']

#     print(i)
#     print(q_idx_list[i])
#     print(text.values[0])
#     print(type.values)

#     print(df[df.uid == uid.values[0]].blooms)
#     print(df[df.uid == uid.values[0]])  # NOTE(review): original line ended with a dangling '.'; the intended attribute is unknown
#     print('-'*50)

#     uid_list.append([uid.values[0], text.values[0]])



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=bcbe6184-b36a-4097-968c-a2ee6ebad722' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>