In [131]:
import os
import re
import tweepy as tw
import json
import pandas as pd
from scipy import sparse
import numpy as np
from snorkel.labeling import labeling_function
from snorkel.labeling import LFApplier
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from sklearn import svm
from sklearn.model_selection import train_test_split
import sqlite3

ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1

#Common names that have been found in dataset
l_names_or_places = r"(van dyke|ponce de leon|ponce city|ponce inlet town)" 
@labeling_function()
def names_or_places(tweet):
    return NEGATIVE if re.search(l_names_or_places, tweet.TWEET.lower()) else ABSTAIN

#Common simple insults
l_simple_insults = r"(fucking|disgusting|ugly|bitchy|pathetic) (fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def simple_insults(tweet):
    return POSITIVE if re.search(l_simple_insults, tweet.TWEET.lower()) else ABSTAIN

#Common terms used to identify the subject of a given bad term e.g. "You ***" 
l_term_to_person = r"(that|this|you|shut the fuck up|stfu) (fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def term_to_person(tweet):
    return POSITIVE if re.search(l_term_to_person, tweet.TWEET.lower()) else ABSTAIN

#Using term in a descriptive yet derogatory manner
l_descriptive_bad = r"(piece of|like a) (fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def descriptive_bad(tweet):
    return POSITIVE if re.search(l_descriptive_bad, tweet.TWEET.lower()) else ABSTAIN

#Stating someone on the LGBT spectrum is unnatural
l_against_nature = r"(against|defying) (god|biology|nature)"
@labeling_function()
def against_nature(tweet):
    return POSITIVE if re.search(l_against_nature, tweet.TWEET.lower()) else ABSTAIN


#Contains trigger warning
l_trigger_warnings = r"(\/+ *tw|\/* *trigger warning)"
@labeling_function()
def trigger_warning(tweet):
    return NEGATIVE if re.search(l_trigger_warnings, tweet.TWEET.lower()) else ABSTAIN

#Using term 'Bender' when talking about drunken stints
l_bender_as_drunk = r"(day|on a) bender"
@labeling_function()
def bender_as_drunk(tweet):
    return NEGATIVE if re.search(l_bender_as_drunk, tweet.TWEET.lower()) else ABSTAIN

#Using Bender as a pop culture reference e.g. avatar the last airbender or Futurama reference
l_bender_pop_culture = r"((fender|water|earth|fire|wind|energy) *bender)|futurama.*bender|bender.*futurama|avatar.*bender"
@labeling_function()
def bender_pop_culture(tweet):
    return NEGATIVE if re.search(l_bender_pop_culture, tweet.TWEET.lower()) else ABSTAIN

#Using slang term for using cigarettes
l_slang_using_cigarettes = r"((for|smoke|have|smoking|having) (some|a) fags?)|fag ash"
@labeling_function()
def slang_using_cigarettes(tweet):
    return NEGATIVE if re.search(l_slang_using_cigarettes, tweet.TWEET.lower()) else ABSTAIN

#Slur is part of a mentioned twitter handle
l_handles = r"(\@[a-z0-9_]*)"
@labeling_function()
def slur_in_handles(tweet):
    in_handle = 0
    bad_terms = ['fag', 'faggot', 'fags', 'fudgepacker', 'fudge+packer', 'poofter', 'pansy', 'bender', 'batty+boy', 'ponce', 'dyke', 'rug+muncher', 'lesbo','tranny', 'trannie', 'transvestite', 'ladyboy', 'heshe', 'shemale','switch+hitter', 'gay+for+pay']
    #check for slur in handle
    handles = re.findall(l_handles, tweet.TWEET.lower())
    for handle in handles:
        if any(substring in handle for substring in bad_terms):
            in_handle+=1
    return NEGATIVE if in_handle>0 else ABSTAIN

#User has pronouns in bio
@labeling_function()
def has_pronouns(tweet):
    return NEGATIVE if tweet.HASPRONOUNS==1 else ABSTAIN

def make_Ls_matrix(data, LFs):
    noisy_labels = np.empty((len(data), len(LFs)))
    for i, row in data.iterrows():
        for j, lf in enumerate(LFs):
            noisy_labels[i][j] = lf(row)
    return noisy_labels


In [132]:
#instantiate labelling functions
lfs = [names_or_places,simple_insults,term_to_person,descriptive_bad,against_nature,trigger_warning,bender_as_drunk,bender_pop_culture,slang_using_cigarettes,slur_in_handles,has_pronouns]
applier = LFApplier(lfs=lfs)
pdapplier = PandasLFApplier(lfs=lfs)
#Store dict of tweets to labelling function output for performance analysis
tweet_dict = {}
#Connect to DB
conn = sqlite3.connect("tweets.db")
#Retrieve Tweets from Database
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Obtain all manually labelled tweets from DB
    pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE ISHARASSMENT IS NOT NULL ORDER BY RANDOM()")
#Create train test split
df_train, df_test = train_test_split(pd_data, test_size=0.33, random_state=42)

Y_test = test.ISHARASSMENT

https://www.snorkel.org/use-cases/01-spam-tutorial

In [133]:

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 342/342 [00:00<00:00, 5659.39it/s]


In [134]:
L_train[0]

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [135]:
coverage_names_or_places, coverage_simple_insults,coverage_term_to_person,coverage_descriptive_bad,coverage_against_nature,coverage_trigger_warning,coverage_bender_as_drunk,coverage_bender_pop_culture,coverage_slang_using_cigarettes,coverage_slur_in_handles,coverage_has_pronouns = (L_train!=ABSTAIN).mean(axis=0)
print(f"coverage names or places: { coverage_names_or_places * 100:.14f}%")
print(f"coverage simple insults: { coverage_simple_insults * 100:.14f}%")
print(f"coverage term to person: { coverage_term_to_person * 100:.14f}%")
print(f"coverage descriptive bad term: { coverage_descriptive_bad * 100:.14f}%")
print(f"coverage against nature: { coverage_against_nature * 100:.14f}%")
print(f"coverage trigger warning: { coverage_trigger_warning * 100:.14f}%")
print(f"coverage bender as drunk: { coverage_bender_as_drunk * 100:.14f}%")
print(f"coverage bender in pop culture: { coverage_bender_pop_culture * 100:.14f}%")
print(f"coverage slang using cigarettes: { coverage_slang_using_cigarettes * 100:.14f}%")
print(f"coverage slur in handles: { coverage_slur_in_handles * 100:.14f}%")
print(f"coverage has pronouns in bio: { coverage_has_pronouns* 100:.14f}%")


coverage names or places: 7.60233918128655%
coverage simple insults: 0.87719298245614%
coverage term to person: 2.63157894736842%
coverage descriptive bad term: 0.58479532163743%
coverage against nature: 0.00000000000000%
coverage bender as drunk: 2.04678362573099%
coverage bender in pop culture: 1.16959064327485%
coverage slang using cigarettes: 0.58479532163743%
coverage slur in handles: 15.20467836257310%
coverage has pronouns in bio: 9.35672514619883%


In [136]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.076023,0.002924,0.0
simple_insults,1,[1],0.008772,0.0,0.0
term_to_person,2,[1],0.026316,0.002924,0.002924
descriptive_bad,3,[1],0.005848,0.0,0.0
against_nature,4,[],0.0,0.0,0.0
trigger_warning,5,[0],0.002924,0.0,0.0
bender_as_drunk,6,[0],0.020468,0.002924,0.0
bender_pop_culture,7,[0],0.011696,0.0,0.0
slang_using_cigarettes,8,[0],0.005848,0.0,0.0
slur_in_handles,9,[0],0.152047,0.023392,0.0


In [141]:
df_train.iloc[L_train[:,1] == POSITIVE].sample(3, random_state=1)

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT
67,1365301416873574400,1362080784417497093,Any other Alphas been fucked over by @Football...,1,0,0,0.0,1,
288,1365295656563339265,1325247924,Ready to boycott every remake / reboot movie f...,1,0,0,0.0,1,
74,1371949898208251904,2562590314,If we can have concerts and festivals safely t...,1,0,0,0.0,0,


In [142]:
L_test = applier.apply(df=df_test)

100%|██████████| 169/169 [00:00<00:00, 6290.60it/s]


In [143]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.076023,0.002924,0.0
simple_insults,1,[1],0.008772,0.0,0.0
term_to_person,2,[1],0.026316,0.002924,0.002924
descriptive_bad,3,[1],0.005848,0.0,0.0
against_nature,4,[],0.0,0.0,0.0
trigger_warning,5,[0],0.002924,0.0,0.0
bender_as_drunk,6,[0],0.020468,0.002924,0.0
bender_pop_culture,7,[0],0.011696,0.0,0.0
slang_using_cigarettes,8,[0],0.005848,0.0,0.0
slur_in_handles,9,[0],0.152047,0.023392,0.0


In [144]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [146]:
preds_train

array([-1, -1, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1, -1, -1,  0,  0, -1,
       -1, -1, -1,  0, -1, -1, -1, -1,  0, -1, -1, -1,  1, -1,  0, -1,  0,
       -1, -1,  0, -1, -1,  0, -1, -1,  0, -1,  0, -1,  0,  0, -1, -1,  1,
       -1, -1, -1,  0, -1,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,
        0, -1,  0,  0, -1,  0,  0, -1, -1, -1, -1, -1,  0, -1, -1, -1,  1,
        0, -1,  0, -1, -1,  0, -1, -1, -1, -1,  0, -1, -1,  0, -1, -1, -1,
       -1, -1, -1, -1,  0,  0, -1,  0,  0, -1, -1,  0, -1, -1,  0, -1, -1,
        0, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1, -1, -1,  0,  0, -1, -1,
       -1, -1, -1,  0, -1, -1,  0,  0,  0, -1, -1, -1,  1, -1, -1, -1, -1,
        0,  0, -1,  0, -1, -1, -1, -1, -1, -1,  0, -1,  0,  0, -1, -1,  0,
        1, -1, -1, -1,  0,  0,  0, -1,  1, -1, -1,  1, -1, -1, -1, -1,  0,
       -1,  0, -1, -1,  0,  1, -1, -1,  0, -1, -1, -1,  0, -1,  0, -1, -1,
       -1,  0, -1,  0, -1, -1, -1,  0, -1,  0,  1,  0,  0, -1, -1, -1,  0,
       -1, -1,  0,  0, -1

In [147]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

In [149]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:1f}%")

Majority Vote Accuracy:   55.0%
Label Model Accuracy:     53.254438%


In [152]:
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, Y=probs_train, L=L_train
)

NameError: name 'probs_train' is not defined

In [153]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
X_test = vectorizer.transform(df_test.text.tolist())

NameError: name 'df_train_filtered' is not defined

In [154]:
from snorkel.utils import probs_to_preds

preds_train_filtered = probs_to_preds(prods=probs_train_filtered)

NameError: name 'probs_train_filtered' is not defined

In [156]:
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

NameError: name 'X_train' is not defined

In [157]:
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

NameError: name 'X_test' is not defined