In [13]:
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
from snorkelling.labelling_functions import get_lfs
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
import pandas as pd
import numpy as np

### Define the values for call and not call

In [14]:
# Integer label codes used in this notebook. CALL / NOTCALL are the two classes
# written to the "label" column; ABSTAIN is presumably the "no vote" value
# returned by labelling functions (Snorkel convention) — TODO confirm.
ABSTAIN, NOTCALL, CALL = -1, 0, 1

## Create training and testing split

### Read train and test from pickle

In [15]:
# Load the train and test DataFrames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# come from a trusted source.
train: pd.DataFrame = pd.read_pickle('train-test-splits/train_2.pkl')
test: pd.DataFrame = pd.read_pickle('train-test-splits/test_2.pkl')

# Preview the URL and label columns of each split.
preview_columns = ["url", "label"]
print(train[preview_columns])
print(test[preview_columns])

                                             url  label
521  https://www.youtube.com/watch?v=XQJLkt0GZss      1
737  https://www.youtube.com/watch?v=kvkhr6G92uM      0
660  https://www.youtube.com/watch?v=_k0HW4k-b8Y      0
411  https://www.youtube.com/watch?v=y9zgz-QJqUU      1
678  https://www.youtube.com/watch?v=El06CcMZS60      0
..                                           ...    ...
976  https://www.youtube.com/watch?v=18bovtIlrpI      1
980  https://www.youtube.com/watch?v=_5_hgrVrkTs      1
982  https://www.youtube.com/watch?v=_iwLCKZrMLU      1
992  https://www.youtube.com/watch?v=11fGUPtG_UM      1
996  https://www.youtube.com/watch?v=NFTE5WK_tlc      1

[800 rows x 2 columns]
                                             url  label
610  https://www.youtube.com/watch?v=kJnEkpcqfow      0
103  https://www.youtube.com/watch?v=le71yVPh4uk      0
209  https://www.youtube.com/watch?v=P_GBQrHghcg      1
284  https://www.youtube.com/watch?v=L3JxeoWK2vc      0
662  https://www.youtube

### Create label column

In [16]:
def create_label_column(df: pd.DataFrame):
    """Return a deep copy of ``df`` with a ``label`` column derived from ``Answer_is-a-call_most``.

    Each value of ``Answer_is-a-call_most`` is mapped to ``CALL`` when it is
    truthy and to ``NOTCALL`` otherwise. The input frame is not modified and no
    columns are dropped.

    Args:
        df: Frame containing an ``Answer_is-a-call_most`` column.

    Returns:
        A new DataFrame with the ``label`` column set (overwritten if it
        already existed).
    """
    def to_label(is_call):
        # NOTE(review): NaN is truthy in Python, so a missing answer would be
        # labelled CALL — confirm the column has no missing values.
        if is_call:
            return CALL
        return NOTCALL

    labelled = df.copy(deep=True)
    labelled["label"] = labelled["Answer_is-a-call_most"].apply(to_label)
    return labelled


# Recompute the label column for both splits.
train = create_label_column(train)
test = create_label_column(test)

## Get labelling functions

In [17]:
# get_lfs() returns the labelling functions grouped by key; only the
# 'transcript', 'general' and 'scam_types' groups are used here.
lfs = get_lfs()
selected_lfs = lfs['transcript'] + lfs['general'] + lfs['scam_types']

# Names of labelling functions to exclude (currently none).
remove_these_lfs = []
lf_array = [lf for lf in selected_lfs if lf.name not in remove_these_lfs]

# Get the applier

In [18]:
# Run the selected labelling functions over the train split, then the test
# split. The results are used below as the `L` inputs to LFAnalysis and the
# LabelModel.
applier = PandasLFApplier(lfs=lf_array)
L_train, L_test = [applier.apply(df=split) for split in (train, test)]

100%|██████████| 800/800 [00:11<00:00, 68.94it/s]
100%|██████████| 200/200 [00:02<00:00, 78.19it/s]


## Summarise

In [23]:
# Summarise the labelling functions on the training split. The Y argument
# supplies the ground-truth training labels. The summary is computed once and
# reused for both the LaTeX export and the displayed table.
lf_summary_table = (
    LFAnalysis(L=L_train, lfs=lf_array)
    .lf_summary(Y=np.array(train['label']))
    .round(2)
)
# Export the table for the write-up, then show it as the cell output.
lf_summary_table.to_latex('lf.tex', float_format="%.2f")
lf_summary_table

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
two_hellos,0,[1],0.29,0.29,0.21,138,91,0.6
thanks_for_calling,1,[1],0.08,0.08,0.04,50,11,0.82
only_two_people_speaking,2,[1],0.14,0.14,0.09,75,40,0.65
transcript_sentiment,3,[1],0.74,0.74,0.56,272,317,0.46
anydesk_in_transcript,4,[1],0.05,0.05,0.04,25,17,0.6
secure_server_in_transcript,5,[1],0.02,0.02,0.01,13,5,0.72
pronoun_usage,6,[0],0.54,0.54,0.5,339,93,0.78
sponsor_in_video,7,[0],0.05,0.05,0.05,23,16,0.59
small_video,8,[1],0.35,0.35,0.18,168,110,0.6
hacking,9,[0],0.04,0.04,0.04,24,7,0.77


# Train the LabelModel
See [this article](https://medium.com/sculpt/a-technique-for-building-nlp-classifiers-efficiently-with-transfer-learning-and-weak-supervision-a8e2f21ca9c8) for a walkthrough of how to include ground-truth labels when training the label model.

In [20]:
from snorkel.labeling.model import LabelModel

# Two-class label model (CALL / NOTCALL).
label_model = LabelModel(cardinality=2, verbose=True)

# Training options; the fixed seed keeps the fit reproducible.
fit_options = dict(n_epochs=500, log_freq=100, seed=123)

# Y_dev receives the ground-truth training labels; per the Snorkel docs these
# are used to estimate the class balance — confirm against the installed version.
label_model.fit(L_train=L_train, Y_dev=train['label'].values, **fit_options)
label_model

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|          | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=2.573]
 12%|█▏        | 58/500 [00:00<00:00, 573.50epoch/s]INFO:root:[100 epochs]: TRAIN:[loss=0.020]
 26%|██▌       | 128/500 [00:00<00:00, 646.66epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.019]
 58%|█████▊    | 291/500 [00:00<00:00, 759.89epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.019]
 75%|███████▍  | 374/500 [00:00<00:00, 783.42epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.019]
100%|██████████| 500/500 [00:00<00:00, 688.40epoch/s]
INFO:root:Finished Training


LabelModel()

# Test the LabelModel

In [21]:
# Score the fitted label model against the ground-truth labels of the test split.
score_metrics = ["accuracy", "coverage", "precision", "recall", "f1"]
label_model_acc = label_model.score(
    L=L_test,
    Y=test["label"].values,
    tie_break_policy="random",
    metrics=score_metrics,
)
print(label_model_acc)

{'accuracy': 0.805, 'coverage': 1.0, 'precision': 0.7272727272727273, 'recall': 0.810126582278481, 'f1': 0.7664670658682634}


In [22]:
# Attach the label model's predictions to a copy of the training data so that
# disagreements with the ground-truth labels can be inspected.
train_w_predictions = train.copy(deep=True)

# predict() returns only the predicted labels unless return_probs is set, so
# the probabilities are not requested here.
train_w_predictions['predicted_labels'] = label_model.predict(L=L_train, tie_break_policy="random")

# Build the watch URL from the video id with a vectorised string concatenation.
train_w_predictions['url'] = (
    'https://invidious.perennialte.ch/watch?v=' + train_w_predictions['video_id'].astype(str)
)

# Show every training row whose predicted label differs from its ground-truth label.
is_mismatch = train_w_predictions['label'].ne(train_w_predictions['predicted_labels'])
print(train_w_predictions.loc[is_mismatch, ['url', 'title', 'label', 'predicted_labels']].to_string())

# Count of each predicted label.
print(train_w_predictions['predicted_labels'].value_counts())

                                                      url                                                                                                 title  label  predicted_labels
883  https://invidious.perennialte.ch/watch?v=yx_AtdAzfZs                                                 Found this Scammers 400+ Photos and then Telling her!      0                 1
210  https://invidious.perennialte.ch/watch?v=n3zZNPBDHhk                                                                      Showing A Scammer His Own Photo!      1                 0
986  https://invidious.perennialte.ch/watch?v=pbWHVO7S_u4                                                 The Most "Professional" Scammers I've Met (9 HR Call)      1                 0
621  https://invidious.perennialte.ch/watch?v=CggArLHGHyk  4️⃣2️⃣7️⃣ 🔥 TWITTER SCAMBAITING #133 🔥 A Royal Scammer, an Inappropriate Request, Scammer vs Scammer      0                 1
198  https://invidious.perennialte.ch/watch?v=7mBql2yMkXo                  