In [155]:
# https://www.snorkel.org/get-started/
pd.set_option("display.max_colwidth", 100)

In [156]:
# Define the label mappings for convenience
ABSTAIN = -1
NOT_SPAM = 0
SPAM = 1

In [157]:
df = pd.read_parquet('sms.parquet')
df['category'] = df.apply(lambda r: SPAM if r['label'] == 'spam' else NOT_SPAM, axis=1)
df.head(1)

Unnamed: 0,label,text,category
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",0


In [158]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[['text']], df[['category']], test_size=0.2, random_state=0)

In [159]:
from snorkel.labeling import labeling_function

@labeling_function()
def lf_keyword_my(x):
    """Many spam comments talk about 'my channel', 'my video', etc."""
    return SPAM if "my" in x.text.lower() else ABSTAIN

In [160]:
import re

@labeling_function()
def lf_regex_check_out(x):
    """Spam comments say 'check out my video', 'check it out', etc."""
    return SPAM if re.search(r"check.*out", x.text, flags=re.I) else ABSTAIN

In [161]:
@labeling_function()
def lf_short_comment(x):
    """Non-spam comments are often short, such as 'cool video!'."""
    return NOT_SPAM if len(x.text.split()) < 5 else ABSTAIN

In [162]:
from textblob import TextBlob

@labeling_function()
def lf_textblob_polarity(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We combine this with the heuristic that non-spam comments are often positive.
    """
    return NOT_SPAM if TextBlob(x.text).sentiment.polarity > 0.3 else ABSTAIN

In [163]:
from snorkel.labeling import LabelModel, PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(X_train[['text']])

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
X_train['label'] = label_model.predict(L=L_train, tie_break_policy="abstain")


  0%|          | 0/4457 [00:00<?, ?it/s][A
  4%|▎         | 158/4457 [00:00<00:02, 1577.09it/s][A
  9%|▊         | 387/4457 [00:00<00:02, 1738.72it/s][A
 13%|█▎        | 596/4457 [00:00<00:02, 1830.92it/s][A
 19%|█▊        | 827/4457 [00:00<00:01, 1952.18it/s][A
 24%|██▍       | 1069/4457 [00:00<00:01, 2071.58it/s][A
 29%|██▉       | 1288/4457 [00:00<00:01, 2103.71it/s][A
 34%|███▍      | 1521/4457 [00:00<00:01, 2165.91it/s][A
 39%|███▉      | 1750/4457 [00:00<00:01, 2200.15it/s][A
 45%|████▍     | 1993/4457 [00:00<00:01, 2264.30it/s][A
 50%|████▉     | 2216/4457 [00:01<00:01, 2230.29it/s][A
 55%|█████▍    | 2443/4457 [00:01<00:00, 2242.02it/s][A
 60%|██████    | 2682/4457 [00:01<00:00, 2283.71it/s][A
 65%|██████▌   | 2919/4457 [00:01<00:00, 2308.03it/s][A
 71%|███████   | 3157/4457 [00:01<00:00, 2329.03it/s][A
 76%|███████▌  | 3397/4457 [00:01<00:00, 2349.20it/s][A
 81%|████████▏ | 3632/4457 [00:01<00:00, 2338.51it/s][A
 87%|████████▋ | 3874/4457 [00:01<00:00, 2361.1

In [164]:
X_train['label'].value_counts()

-1    2695
 0    1386
 1     376
Name: label, dtype: int64

In [165]:
X_train = X_train[X_train.label != ABSTAIN]
y_train = X_train[['label']]
X_train = X_train[['text']]

In [166]:
# From text to a feature vector
from sklearn.feature_extraction import text

# bow_transform = text.CountVectorizer(max_features=500, min_df=0.0, max_df=1.0)
bow_transform = text.CountVectorizer()
X_train_bow = bow_transform.fit_transform(X_train['text'])
X_text_bow = bow_transform.transform(X_test['text'])
len(bow_transform.vocabulary_)

4394

In [167]:
# Feature scaling (TF-IDF & L2 normalization) 
tfidf_trfm = text.TfidfTransformer(norm='l2')
X_train_tfidf = tfidf_trfm.fit_transform(X_train_bow)
X_test_tfidf = tfidf_trfm.transform(X_text_bow)

In [168]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(max_depth=32)
clf.fit(X_train_tfidf, y_train)
clf.score(X_test_tfidf, y_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=32,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

0.7865470852017937