In [2]:
!pip install snorkel datasets scikit-learn pandas

Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.10.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.10.0


In [19]:
from datasets import load_dataset
import pandas as pd
# Load 2000 training and 500 test examples for speed
imdb = load_dataset("imdb")
train = pd.DataFrame(imdb["train"])
test = pd.DataFrame(imdb["test"].select(range(500)))
print("Train size:", len(train), "Test size:", len(test))
train.head()

Train size: 25000 Test size: 500


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [20]:
import re
def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"[^\w\s']", "", text)
    return text.lower()
    train["text"] = train["text"].apply(clean_text)
    test["text"] = test["text"].apply(clean_text)

In [21]:
from snorkel.labeling import labeling_function, LFAnalysis
from snorkel.labeling.model import LabelModel

# Define label constants
ABSTAIN, NEG, POS = -1, 0, 1

# Define keyword sets
positive_words = {"great", "excellent", "amazing", "wonderful", "best", "fantastic"}
negative_words = {"bad", "terrible", "awful", "worst", "boring", "poor"}

# Labeling function for positive sentiment
@labeling_function()
def lf_positive(x):
    return POS if any(w in x.text.split() for w in positive_words) else ABSTAIN

# Labeling function for negative sentiment
@labeling_function()
def lf_negative(x):
    return NEG if any(w in x.text.split() for w in negative_words) else ABSTAIN

# Labeling function for exclamation marks
@labeling_function()
def lf_exclaim(x):
    return POS if x.text.count("!") > 2 else ABSTAIN

# Combine labeling functions into a list
lfs = [lf_positive, lf_negative, lf_exclaim]


In [22]:
from snorkel.labeling import PandasLFApplier
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)
LFAnalysis(L_train, lfs).lf_summary()

100%|██████████| 25000/25000 [00:03<00:00, 7981.57it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
lf_positive,0,[1],0.39768,0.14232,0.11116
lf_negative,1,[0],0.31544,0.13956,0.13956
lf_exclaim,2,[1],0.1132,0.07768,0.04652


In [23]:
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)
# Get probabilistic labels
train_probs = label_model.predict_proba(L_train)

100%|██████████| 500/500 [00:00<00:00, 1212.55epoch/s]


In [24]:
train_preds = label_model.predict(L_train)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Vectorize
vectorizer = TfidfVectorizer(max_features=5_000)
X_train = vectorizer.fit_transform(train["text"])
y_train = train_preds
# Fit classifier
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)
# Evaluate on test set
X_test = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test)
print(classification_report(y_test, preds, target_names=["neg","pos"], labels=[0, 1]))

              precision    recall  f1-score   support

         neg       1.00      0.51      0.68       500
         pos       0.00      0.00      0.00         0

   micro avg       0.82      0.51      0.63       500
   macro avg       0.50      0.26      0.34       500
weighted avg       1.00      0.51      0.68       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
clf_fs = LogisticRegression(max_iter=200)
clf_fs.fit(X_train, train["label"])
fs_preds = clf_fs.predict(X_test)
print("Fully supervised performance:")
print(classification_report(y_test, fs_preds, target_names=["neg","pos"]))

Fully supervised performance:
              precision    recall  f1-score   support

         neg       1.00      0.87      0.93       500
         pos       0.00      0.00      0.00         0

    accuracy                           0.87       500
   macro avg       0.50      0.43      0.46       500
weighted avg       1.00      0.87      0.93       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
