# Tweets Labeler

## Setup Snorkel Session and Load Data
Creates a Snorkel session backed by a SQLite database and loads the tweets.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
# Session object passed to the candidate queries and label load/save calls below.
session = SnorkelSession()

In [None]:
from snorkel.models import candidate_subclass

# Candidate type for this task: an `Exercise` candidate with a single `content`
# field, which the labeling functions below read as `c.content`.
Exercise = candidate_subclass('Exercise', ['content'])

In [None]:
import pandas as pd

tweets_file_path = 'data/unlabeled_tweets.tsv'

# Read the tab-separated tweets file and keep every tweet's text, in file order.
tweets = pd.read_csv(tweets_file_path, sep='\t')
docs = tweets['content'].tolist()

train_set, dev_set, test_set = set(), set(), set()

# Positional split: within every run of ten tweets, position 8 goes to dev,
# position 9 goes to test and the remaining eight go to train. The targets are
# sets, so identical tweet texts landing in the same split collapse to one entry.
_split_by_remainder = {8: dev_set, 9: test_set}
for position, text in enumerate(docs):
    _split_by_remainder.get(position % 10, train_set).add(text)

In [None]:
from snorkel.candidates import CandidateExtractor
# Extractor for `Exercise` candidates.
# NOTE(review): the two empty lists are presumably the candidate spaces and
# matchers of upstream Snorkel's CandidateExtractor -- confirm against this fork.
cand_extractor = CandidateExtractor(Exercise, [], [])

In [None]:
%%time
# Extract candidates split by split. The enumerate index is used as the split id,
# so 0 = train, 1 = dev, 2 = test everywhere else in the notebook.
# The loop variable is deliberately NOT called `docs`: that name already holds the
# full list of tweet texts and must not be overwritten by the last split.
for split_id, split_docs in enumerate([train_set, dev_set, test_set]):
    cand_extractor.apply(split_docs, split=split_id)
    print("Number of candidates:", session.query(Exercise).filter(Exercise.split == split_id).count())

In [None]:
# Load Gold Labels
from util import load_external_labels
%time missed = load_external_labels(session, Exercise, annotator_name='gold')

In [None]:
# Load existing dev and test sets
from snorkel.annotations import load_gold_labels

# Gold labels stored under the 'gold' annotator name, one matrix per evaluation
# split: split 1 is the dev set and split 2 the test set (the split ids assigned
# when the candidates were extracted).
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
# Quick sanity check on the size of both label matrices.
print(L_gold_dev.shape, L_gold_test.shape)

## Labeling Functions (LFs)
A labeling function (LF) is a Python function that accepts a tweet candidate and returns 1 if it marks the tweet as relevant, 2 if irrelevant, 3 if junk, or 0 to abstain.

In [None]:
# Looks for a kb phrase in the tweet
from functools import lru_cache

kb = 'data/kb.txt'


@lru_cache(maxsize=None)
def _load_kb_phrases(path):
    """Read the knowledge-base file at *path* and return its phrases as a tuple.

    Each line is stripped of surrounding whitespace and blank lines are dropped:
    an empty phrase is a substring of every string, so keeping it would make
    every tweet look like an exercise tweet.

    The result is cached per path so the file is read once instead of once per
    candidate. Re-run this cell (or call ``_load_kb_phrases.cache_clear()``)
    after editing the file.
    """
    with open(path) as f:
        stripped = (line.strip() for line in f)
        return tuple(phrase for phrase in stripped if phrase)


def is_exercise(c, kb_path=None):
    """Return True if the tweet text contains any knowledge-base phrase.

    Args:
        c: object exposing the tweet text as ``c.content``.
        kb_path: optional path of the phrase file (one phrase per line);
            defaults to the module-level ``kb`` path.

    Returns:
        True when at least one non-empty phrase occurs as a substring of
        ``c.content`` (case-sensitive), otherwise False.
    """
    phrases = _load_kb_phrases(kb if kb_path is None else kb_path)
    return any(phrase in c.content for phrase in phrases)

In [None]:
import spacy
nlp = spacy.load('en')

# Look for person names
def has_person(c):
    """Return True if `nlp` tags at least one entity of `c.content` as 'PERSON'."""
    return any(entity.label_ == 'PERSON' for entity in nlp(c.content).ents)

In [None]:
pronouns_path = 'data/pronouns.tsv'
pronouns = pd.read_csv(pronouns_path, sep='\t')

# Rows whose `category` equals 1 are first-person pronouns; every other row is
# treated as an other-person pronoun.
_is_first_person = pronouns['category'] == 1
first_person = pronouns.loc[_is_first_person, 'pronoun'].tolist()
other_person = pronouns.loc[~_is_first_person, 'pronoun'].tolist()


def LF_1(c):
    """Vote on a tweet using first-person pronouns.

    Returns 3 when the tweet contains no knowledge-base phrase, 1 when it does
    and one of its whitespace-separated tokens is a first-person pronoun, and
    0 (abstain) otherwise.
    """
    if not is_exercise(c):
        return 3
    words = c.content.split()
    if any(pronoun in words for pronoun in first_person):
        return 1
    return 0

In [None]:
def LF_2(c):
    """Vote on a tweet using person names and other-person pronouns.

    Returns 3 when the tweet contains no knowledge-base phrase, 2 when it does
    and either mentions a person entity or has an other-person pronoun among
    its whitespace-separated tokens, and 0 (abstain) otherwise.
    """
    if not is_exercise(c):
        return 3
    # The named-entity check runs first, exactly as before; pronouns are only
    # consulted when no person entity is found.
    if has_person(c):
        return 2
    words = c.content.split()
    return 2 if any(pronoun in words for pronoun in other_person) else 0

In [None]:
# Tweet text -> label lookup, built once from `tweets` when this cell runs, so a
# call no longer scans the whole DataFrame (re-run the cell if `tweets` is
# reloaded). `setdefault` keeps the label of the FIRST row with a given text,
# matching the first-match behaviour of a top-to-bottom scan.
_label_by_content = {}
for _content, _label in zip(tweets['content'], tweets['label']):
    _label_by_content.setdefault(_content, _label)


def LF_3(c):
    """Return the `label` column value of the tweet whose text equals `c.content`.

    Returns 0 (abstain) when no row of `tweets` has that exact text.
    """
    return _label_by_content.get(c.content, 0)

In [None]:
# Use weak classifier
from snorkel.weak_classifier import train_classifier, classify

# Train the weak classifier once, up front; the labeling function below reuses
# the returned pair on every call.
vec, clf = train_classifier()

def LF_weak_classifier(c):
    """Label one tweet with the pre-trained weak classifier.

    The tweet text is passed to `classify` as a one-element list and whatever
    `classify` returns is handed back unchanged.
    """
    return classify(vec, clf, [c.content])

In [None]:
# Group LFs in a list for later use
LFs = [LF_1, LF_2, LF_3, LF_weak_classifier]

In [None]:
# Count the dev-set (split 1) candidates that LF_weak_classifier labels 1 (relevant)
dev_query = session.query(Exercise).filter(Exercise.split == 1)
labeled = [cand for cand in dev_query if LF_weak_classifier(cand) == 1]
print("Number labeled:", len(labeled))

In [None]:
# Apply LFs
from snorkel.annotations import LabelAnnotator
labeler = LabelAnnotator(lfs=LFs)

In [None]:
# Run labeler
import numpy as np
np.random.seed(1701)
%time L_train = labeler.apply(split=0)
L_train

In [None]:
L_train.todense()

In [None]:
# Load the labels as a sparse matrix
%time L_train = labeler.load_matrix(session, split=0)
L_train

In [None]:
# View statistics about the resulting label matrix
L_train.lf_stats(session)

## The Generative Model

Train a model of the LFs to estimate their accuracies, then combine the outputs of the LFs into a noise-aware set of training labels.

In [None]:
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()
# Fit the generative model on the training label matrix. The step size is
# 0.1 divided by the number of rows in L_train (one row per training candidate
# -- TODO confirm), so it shrinks as the training set grows.
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)

In [None]:
gen_model.weights.lf_accuracy

In [None]:
# Apply gen model to the training candidates to get the noise-aware training label set (training marginals)
train_marginals = gen_model.marginals(L_train)

In [None]:
# Distribution of the training marginals
import matplotlib.pyplot as plt
plt.hist(train_marginals, bins=20)
plt.show()

In [None]:
# Learned accuracy parameters, and other statistics about the LFs learned by the generative model
gen_model.learned_lf_stats()

### Iterate on Labeling Functions
Improve the LF set.  First, apply the LFs to the development set:

In [None]:
L_dev = labeler.apply_existing(split=1)

In [None]:
# Get the score of the generative model
correct, incorrect = gen_model.error_analysis(session, L_dev, L_gold_dev)

### Save the training labels

Save the `train_marginals` (**probabilistic training labels**) for later use when training the end extraction model:

In [None]:
from snorkel.annotations import save_marginals
%time save_marginals(session, L_train, train_marginals)

## Training an End Extraction Model
Use the noisy training labels to train the end extraction model (Bi-LSTM, a state-of-the-art deep neural network). 

In [None]:
# Reload the probabilistic training labels
from snorkel.annotations import load_marginals
train_marginals = load_marginals(session, split=0)

In [None]:
# Reload the candidates of each split (0 = train, 1 = dev, 2 = test), ordered by id
def _candidates_in_split(split):
    """Return all Exercise candidates of the given split, ordered by candidate id."""
    return session.query(Exercise).filter(Exercise.split == split).order_by(Exercise.id).all()

train_cands, dev_cands, test_cands = (_candidates_in_split(split) for split in (0, 1, 2))

In [None]:
# Load the gold labels for evaluation
#from snorkel.annotations import load_gold_labels
#L_gold_dev  = load_gold_labels(session, annotator_name='gold', split=1)
#L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

In [None]:
# Get labels of train set
train_candidates = [train_cands[i].content for i in range(len(train_cands))]
Y_train = [t['label'] for c in train_candidates for (i, t) in tweets.iterrows() if c == t['content']] 
Y_train = np.asarray(Y_train)

In [None]:
# Get labels of dev set
dev_candidates = [dev_cands[i].content for i in range(len(dev_cands))]
Y_dev = [t['label'] for c in dev_candidates for (i, t) in tweets.iterrows() if c == t['content']] 
Y_dev = np.asarray(Y_dev)

In [None]:
# Get labels of test set
test_candidates = [test_cands[i].content for i in range(len(test_cands))]
Y_test = [t['label'] for c in test_candidates for (i, t) in tweets.iterrows() if c == t['content']] 
Y_test = np.asarray(Y_test)

In [None]:
# Setup the discriminative model
from snorkel.learning.disc_models.rnn import reRNN

# Training options forwarded to `lstm.train` as keyword arguments.
train_kwargs = dict(
    lr=0.01,
    dim=50,
    n_epochs=10,
    dropout=0.25,
    print_freq=1,
)

# Train on the probabilistic training labels; the dev candidates and their
# labels are supplied as X_dev / Y_dev.
lstm = reRNN(seed=1701, n_threads=None)
lstm.train(train_cands, train_marginals, X_dev=dev_cands, Y_dev=Y_dev, **train_kwargs)

In [None]:
# Score the discriminative model on each split, in train / dev / test order
train_cands_ac, dev_cands_ac, test_cands_ac = (
    lstm.score(cands, gold)
    for cands, gold in ((train_cands, Y_train), (dev_cands, Y_dev), (test_cands, Y_test))
)

print("Accuracy Score:\ntrain = {0:.3f}, dev = {1:.3f}, and test= {2:.3f}".format(train_cands_ac, dev_cands_ac, test_cands_ac))

In [None]:
# Create a cleaned jupyter notebook for version control
import os

# Both paths are resolved against the current working directory: the working
# notebook is read from 'label_tweets.ipynb' and the cleaned copy is written to
# 'tweets_labeler.ipynb'.
notebook_path = os.path.join(os.getcwd(),'label_tweets.ipynb')
cleaned_path = os.path.join(os.getcwd(),'tweets_labeler.ipynb')

# Shell escape: pipe the notebook through `nbstripout` and redirect the result to
# the cleaned path.
# NOTE(review): the paths are interpolated unquoted, so a working directory
# containing spaces would presumably break this command -- confirm.
!cat {notebook_path} | nbstripout > {cleaned_path}