In [1]:
import os
import re
import tweepy as tw
import json
import pandas as pd
from scipy import sparse
import numpy as np
from snorkel.labeling import labeling_function
from snorkel.labeling import LFApplier
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from sklearn import svm
from sklearn.model_selection import train_test_split
import sqlite3

ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1

#Common names that have been found in dataset
l_names_or_places = r"(van dyke|ponce de leon|ponce city|ponce inlet town)" 
@labeling_function()
def names_or_places(tweet):
    return NEGATIVE if re.search(l_names_or_places, tweet.TWEET.lower()) else ABSTAIN

#Common simple insults
l_simple_insults = r"(fucking|disgusting|ugly|bitchy|pathetic) (fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def simple_insults(tweet):
    return POSITIVE if re.search(l_simple_insults, tweet.TWEET.lower()) else ABSTAIN

#Common terms used to identify the subject of a given bad term e.g. "You ***" 
l_term_to_person = r"(that|this|you|shut the fuck up|stfu|shut up) (fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def term_to_person(tweet):
    return POSITIVE if re.search(l_term_to_person, tweet.TWEET.lower()) else ABSTAIN

#Using term in a descriptive yet derogatory manner
l_descriptive_bad = r"(piece of|like a|being a|kind of a?) (fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def descriptive_bad(tweet):
    return POSITIVE if re.search(l_descriptive_bad, tweet.TWEET.lower()) else ABSTAIN

#Stating someone on the LGBT spectrum is unnatural
l_against_nature = r"(against|defying) (god|biology|nature)"
@labeling_function()
def against_nature(tweet):
    return POSITIVE if re.search(l_against_nature, tweet.TWEET.lower()) else ABSTAIN

#Term is used in full caps
l_bad_words_caps = r"(FAGS?|FAGGOTS?|FUDGE ?PACKERS?|POOFTERS?|PANSY|PANSIES|BENDERS?|BATTY BOYS?|PONCE|DYKES?|RUG ?MUNCHERS?|LESBOS?|TRANNY|TRANNIES?|TRANSVESTITES?|LADYBOYS?|HESHES?|SHE ?MALES?|SWITCH ?-?HITTERS?)"
@labeling_function()
def full_caps(tweet):
    return POSITIVE if re.search(l_bad_words, tweet.TWEET) else ABSTAIN

#Contains trigger warning
l_trigger_warnings = r"(\/+ *tw|\/* *trigger warning|tw *\/+)"
@labeling_function()
def trigger_warning(tweet):
    return NEGATIVE if re.search(l_trigger_warnings, tweet.TWEET.lower()) else ABSTAIN

#Using term 'Bender' when talking about drunken stints
l_bender_as_drunk = r"(day|on a) bender"
@labeling_function()
def bender_as_drunk(tweet):
    return NEGATIVE if re.search(l_bender_as_drunk, tweet.TWEET.lower()) else ABSTAIN

#Using Bender as a pop culture reference e.g. avatar the last airbender or Futurama reference
l_bender_pop_culture = r"((fender|water|earth|fire|wind|energy) *bender)|futurama.*bender|bender.*futurama|avatar.*bender"
@labeling_function()
def bender_pop_culture(tweet):
    return NEGATIVE if re.search(l_bender_pop_culture, tweet.TWEET.lower()) else ABSTAIN

#Using slang term for using cigarettes
l_slang_using_cigarettes = r"((for|smoke|have|smoking|having|want) (some|a) fags?)|fag ash"
@labeling_function()
def slang_using_cigarettes(tweet):
    return NEGATIVE if re.search(l_slang_using_cigarettes, tweet.TWEET.lower()) else ABSTAIN

#Slur is part of a mentioned twitter handle
l_handles = r"(\@[a-z0-9_]*)"
@labeling_function()
def slur_in_handles(tweet):
    in_handle = 0
    bad_terms = ['fag', 'faggot', 'fags', 'fudgepacker', 'fudge+packer', 'poofter', 'pansy', 'bender', 'batty+boy', 'ponce', 'dyke', 'rug+muncher', 'lesbo','tranny', 'trannie', 'transvestite', 'ladyboy', 'heshe', 'shemale','switch+hitter', 'gay+for+pay']
    #check for slur in handle
    handles = re.findall(l_handles, tweet.TWEET.lower())
    for handle in handles:
        if any(substring in handle for substring in bad_terms):
            in_handle+=1
    return NEGATIVE if in_handle>0 else ABSTAIN

#User has pronouns in bio
@labeling_function()
def has_pronouns(tweet):
    return NEGATIVE if tweet.HASPRONOUNS==1 else ABSTAIN

def make_Ls_matrix(data, LFs):
    noisy_labels = np.empty((len(data), len(LFs)))
    for i, row in data.iterrows():
        for j, lf in enumerate(LFs):
            noisy_labels[i][j] = lf(row)
    return noisy_labels


In [2]:
#instantiate labelling functions
lfs = [names_or_places,simple_insults,term_to_person,descriptive_bad,against_nature,trigger_warning,bender_as_drunk,bender_pop_culture,slang_using_cigarettes,slur_in_handles,has_pronouns]
applier = LFApplier(lfs=lfs)
pdapplier = PandasLFApplier(lfs=lfs)
#Update Pandas Dataframe so that displayed dataframes are not truncated
pd.set_option('display.max_colwidth', -1)
#Connect to DB
conn = sqlite3.connect("tweets.db")
#Retrieve Tweets from Database
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Obtain all manually labelled tweets from DB
    pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
#Create train test split
df_train, df_test = train_test_split(pd_data, test_size=0.33, random_state=42)
Y_test = df_test.ISHARASSMENT

https://www.snorkel.org/use-cases/01-spam-tutorial

In [3]:

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 342/342 [00:00<00:00, 5901.13it/s]


In [4]:
L_train[0]

array([-1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1])

In [5]:
coverage_names_or_places, coverage_simple_insults,coverage_term_to_person,coverage_descriptive_bad,coverage_against_nature,coverage_trigger_warning,coverage_bender_as_drunk,coverage_bender_pop_culture,coverage_slang_using_cigarettes,coverage_slur_in_handles,coverage_has_pronouns = (L_train!=ABSTAIN).mean(axis=0)
print(f"coverage names or places: { coverage_names_or_places * 100:.14f}%")
print(f"coverage simple insults: { coverage_simple_insults * 100:.14f}%")
print(f"coverage term to person: { coverage_term_to_person * 100:.14f}%")
print(f"coverage descriptive bad term: { coverage_descriptive_bad * 100:.14f}%")
print(f"coverage against nature: { coverage_against_nature * 100:.14f}%")
print(f"coverage trigger warning: { coverage_trigger_warning * 100:.14f}%")
print(f"coverage bender as drunk: { coverage_bender_as_drunk * 100:.14f}%")
print(f"coverage bender in pop culture: { coverage_bender_pop_culture * 100:.14f}%")
print(f"coverage slang using cigarettes: { coverage_slang_using_cigarettes * 100:.14f}%")
print(f"coverage slur in handles: { coverage_slur_in_handles * 100:.14f}%")
print(f"coverage has pronouns in bio: { coverage_has_pronouns* 100:.14f}%")


coverage names or places: 6.14035087719298%
coverage simple insults: 1.16959064327485%
coverage term to person: 2.04678362573099%
coverage descriptive bad term: 1.46198830409357%
coverage against nature: 0.29239766081871%
coverage bender as drunk: 0.58479532163743%
coverage bender in pop culture: 1.75438596491228%
coverage slang using cigarettes: 1.16959064327485%
coverage slur in handles: 16.95906432748538%
coverage has pronouns in bio: 10.23391812865497%


In [6]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.061404,0.002924,0.0
simple_insults,1,[1],0.011696,0.0,0.0
term_to_person,2,[1],0.020468,0.002924,0.002924
descriptive_bad,3,[1],0.01462,0.0,0.0
against_nature,4,[1],0.002924,0.0,0.0
trigger_warning,5,[0],0.005848,0.0,0.0
bender_as_drunk,6,[0],0.005848,0.0,0.0
bender_pop_culture,7,[0],0.017544,0.002924,0.0
slang_using_cigarettes,8,[0],0.011696,0.002924,0.0
slur_in_handles,9,[0],0.169591,0.02924,0.0


In [7]:
df_train.iloc[L_train[:,2] == POSITIVE].sample(7, random_state=0)

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT
466,1371969703027953666,1227977888860798982,This man Will Barton is 6'5 wearing super skinnie jeans with his knee out. Idc about this fags ego the Nuggets need to start MPJ police loving ass,1,0,0,0.0,1,
464,1371970229010534401,1111839485874995200,@horansarizona stfu dyke,1,0,0,1.0,1,
198,1372542942087364609,1050750048475209730,"i remember I was dating this dyke who constantly tweeted about how much she liked big booty girls with tiny waists but could not begin to understand why I felt so offended by it. this is the reason right here, because I’m clearly not your type https://t.co/KP8LqVIgOM",1,0,0,0.0,1,
499,1371949548021690377,1358657061467222018,TG: shut up karkat\nTG: you fag,1,0,0,0.0,1,
353,1371242299519303681,1342695061151166472,"Gibbs said ""I'll never let this industry demasculinize me"" with all this batty boy shit going on in the industry he never had a chance...",0,0,0,0.0,1,
149,1365280595081797633,1195598451729321984,@schizoidfemdog That faggot is so annoying and hot God,1,0,0,0.0,1,
484,1371955536619433985,1237100903347269639,@MrsLadyValkyrie @glowingcanary @CompelLearning @hondo64ou1 @back2monke @LindseyBoylan I probably know more than you pansy ass ever will. I am very knowledgeable that's why I can take the stance I do. From knowledge and experience. And I know bull shit when I see it.,1,0,0,0.0,1,


In [8]:
from snorkel.analysis import get_label_buckets

buckets = get_label_buckets(L_train[:, 0], L_train[:, 2])
df_train.iloc[buckets[(ABSTAIN, POSITIVE)]].sample(7, random_state=1)

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT
466,1371969703027953666,1227977888860798982,This man Will Barton is 6'5 wearing super skinnie jeans with his knee out. Idc about this fags ego the Nuggets need to start MPJ police loving ass,1,0,0,0.0,1,
464,1371970229010534401,1111839485874995200,@horansarizona stfu dyke,1,0,0,1.0,1,
198,1372542942087364609,1050750048475209730,"i remember I was dating this dyke who constantly tweeted about how much she liked big booty girls with tiny waists but could not begin to understand why I felt so offended by it. this is the reason right here, because I’m clearly not your type https://t.co/KP8LqVIgOM",1,0,0,0.0,1,
353,1371242299519303681,1342695061151166472,"Gibbs said ""I'll never let this industry demasculinize me"" with all this batty boy shit going on in the industry he never had a chance...",0,0,0,0.0,1,
484,1371955536619433985,1237100903347269639,@MrsLadyValkyrie @glowingcanary @CompelLearning @hondo64ou1 @back2monke @LindseyBoylan I probably know more than you pansy ass ever will. I am very knowledgeable that's why I can take the stance I do. From knowledge and experience. And I know bull shit when I see it.,1,0,0,0.0,1,
499,1371949548021690377,1358657061467222018,TG: shut up karkat\nTG: you fag,1,0,0,0.0,1,
149,1365280595081797633,1195598451729321984,@schizoidfemdog That faggot is so annoying and hot God,1,0,0,0.0,1,


In [9]:
L_test = applier.apply(df=df_test)

100%|██████████| 169/169 [00:00<00:00, 5845.45it/s]


In [10]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.061404,0.002924,0.0
simple_insults,1,[1],0.011696,0.0,0.0
term_to_person,2,[1],0.020468,0.002924,0.002924
descriptive_bad,3,[1],0.01462,0.0,0.0
against_nature,4,[1],0.002924,0.0,0.0
trigger_warning,5,[0],0.005848,0.0,0.0
bender_as_drunk,6,[0],0.005848,0.0,0.0
bender_pop_culture,7,[0],0.017544,0.002924,0.0
slang_using_cigarettes,8,[0],0.011696,0.002924,0.0
slur_in_handles,9,[0],0.169591,0.02924,0.0


In [11]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [12]:
preds_train

array([ 0,  0,  1, -1,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,  0, -1,  0,
        0, -1, -1,  0, -1, -1,  0, -1, -1, -1, -1,  0,  0, -1, -1, -1, -1,
       -1,  0, -1, -1, -1,  1,  0, -1,  0,  0, -1, -1, -1, -1, -1,  0, -1,
        0,  1,  0, -1, -1,  0, -1, -1, -1,  0, -1, -1,  0, -1,  0, -1,  0,
        0,  0, -1,  0, -1, -1,  0,  0, -1,  0, -1,  1, -1, -1, -1, -1, -1,
        0, -1,  0, -1,  0,  1, -1,  0, -1,  0, -1, -1, -1, -1, -1, -1,  0,
        0, -1,  0,  0,  0,  0,  0, -1, -1,  0,  0, -1, -1, -1, -1, -1, -1,
       -1, -1,  0, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1,  0,  0, -1, -1,
        0, -1, -1, -1, -1,  0,  0,  1,  0, -1,  0, -1, -1, -1, -1,  0, -1,
       -1, -1,  0, -1, -1, -1,  0, -1,  0, -1, -1,  0, -1, -1, -1,  0, -1,
       -1,  0,  0, -1, -1,  0,  0,  1,  0, -1, -1,  0,  0,  0, -1,  0, -1,
       -1, -1, -1, -1, -1, -1, -1,  1, -1,  0,  0, -1, -1, -1,  0,  0, -1,
        0, -1,  1, -1, -1, -1,  0, -1, -1,  0, -1, -1,  0, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [13]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

In [14]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:1f}%")

Majority Vote Accuracy:   59.8%
Label Model Accuracy:     44.970414%


In [15]:
probs_train = label_model.predict_proba(L=L_train)

In [16]:
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.TWEET.tolist())
X_test = vectorizer.transform(df_test.TWEET.tolist())

In [18]:
from snorkel.utils import probs_to_preds

preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [19]:
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

LogisticRegression(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

Test Accuracy: 69.8%


In [21]:
preds_train_filtered

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])