<h1>Data Pipeline Demo</h1>

<h2>Import prerequisites and prelabelled data</h2>

In [42]:
import os
import re
import tweepy as tw
import json
import pandas as pd
from scipy import sparse
import numpy as np
from snorkel.labeling import labeling_function
from snorkel.labeling import LFApplier
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from sklearn import svm
from sklearn.model_selection import train_test_split
import sqlite3

In [43]:
#Update Pandas Dataframe so that displayed dataframes are not truncated
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)

  pd.set_option('display.max_colwidth', -1)


<h3>Retrieve database of tweets</h3>

In [44]:
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/auto_tweets_2.db")
print("Opened Database successfully")
#Obtain all manually labelled tweets from DB
pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
print("Retrieved Tweets Successfully")
#Create train test split
df_train, df_test = train_test_split(pd_data, test_size=0.33, random_state=42)
Y_test = df_test.ISHARASSMENT

Opened Database successfully
Retrieved Tweets Successfully


<h2>Inspect small number of tweets to understand format and identify patterns</h2>

In [45]:
df_train[10:35].TWEET

26     Just wana be brukkinngg it down in a field with a Bacardi breezer in my batty riders üï∫üèº‚òÄÔ∏è                                                                                                                                                                                   
7      Just what I needed today. A batty happily munching a banana. \n\nhttps://t.co/nPU0JFyePI                                                                                                                                                                                    
181    Idk buy a gun and stop being a fag lol https://t.co/rAyexZtlXJ                                                                                                                                                                                                              
464    @ttdagrah Ponce                                                                                                                                            

<h3>Create Label Functions based on Identified patterns</h3>

In [46]:
ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1

#Common names that have been found in dataset
l_names_or_places = r"van dyke" 
@labeling_function()
def names_or_places(tweet):
    return NEGATIVE if re.search(l_names_or_places, tweet.TWEET.lower()) else ABSTAIN

#Phrase identified in dataset to be used in harassment
l_offensive_phrase = r"being a fag(got)?"
@labeling_function()
def offensive_phrase(tweet):
    return POSITIVE if re.search(l_offensive_phrase, tweet.TWEET.lower()) else ABSTAIN

#Premade labelling function for purpose of displaying conflicts
#Term is used in full caps
l_bad_words_caps = r"(FAGS?|FAGGOTS?|FUDGE ?PACKERS?|POOFTERS?|PANSY|PANSIES|BENDERS?|BATTY BOYS?|PONCE|DYKES?|RUG ?MUNCHERS?|LESBOS?|TRANNY|TRANNIES?|TRANSVESTITES?|LADYBOYS?|HESHES?|SHE ?MALES?|SWITCH ?-?HITTERS?)"
@labeling_function()
def full_caps(tweet):
    return POSITIVE if re.search(l_bad_words_caps, tweet.TWEET) else ABSTAIN

In [48]:
#Instantiate functions and appliers
lfs = [names_or_places, offensive_phrase, full_caps]
applier = LFApplier(lfs=lfs)
pdapplier = PandasLFApplier(lfs=lfs)

<h3>Apply functions to dataset</h3>

In [49]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 343/343 [00:00<00:00, 6394.15it/s]


<h3>Inspect to ensure output is as expected</h3>

In [51]:
print(df_train.iloc[12], L_train[12])

TWEETID                1372558262302822405                                           
USERID                 1358165820015280132                                           
TWEET                  Idk buy a gun and stop being a fag lol https://t.co/rAyexZtlXJ
ISTYPEHOMOSEXUAL       1                                                             
ISTYPETRANSGENDER      0                                                             
ISTYPEBISEXUAL         0                                                             
HASPRONOUNS            0.0                                                           
ISHARASSMENT           1.0                                                           
AUTO_ISHARASSMENT      1.0                                                           
PASTEXPERIENCE         0.0                                                           
AUTO_PASTEXPERIENCE    None                                                          
CLEAN_TWEET            ink buy a gun and stop being a 

In [53]:
print(df_train.iloc[28], L_train[28])

TWEETID                1370859019041173510                    
USERID                 369160681                              
TWEET                  @TSleftfoot What happened to Van Dyke ?
ISTYPEHOMOSEXUAL       0                                      
ISTYPETRANSGENDER      0                                      
ISTYPEBISEXUAL         0                                      
HASPRONOUNS            0.0                                    
ISHARASSMENT           0.0                                    
AUTO_ISHARASSMENT      0.0                                    
PASTEXPERIENCE         NaN                                    
AUTO_PASTEXPERIENCE    None                                   
CLEAN_TWEET            tsleftfoot what happened to van dyke   
Name: 321, dtype: object [ 0 -1 -1]


<h3>Analyse coverage of labelling functions</h3>

In [54]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.043732,0.005831,0.005831
offensive_phrase,1,[1],0.005831,0.0,0.0
full_caps,2,[1],0.037901,0.005831,0.005831


Goal is to increase coverage as much as possible and minimise conflicts. Maximising performance may be okay to come to the expense of conflicts, provided that coverage of opposite polarity is high enough

In [56]:
from snorkel.analysis import get_label_buckets
#Split L_train matrix of labels into buckets, with second value being the index of label (e.g. 0 = label names_or_places)
#Retrieve results with specified value, e.g. NEGATIVE on index 0 = return results that match name_or_place LF
buckets = get_label_buckets(L_train[:,0], L_train[:, 2])
df_train.iloc[buckets[(NEGATIVE, POSITIVE)]].sample(2, random_state=4)

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
159,1365276638720782340,1359673537469026305,OH MY GOD WANDAVISION SPOILER SO IF YOU DONT WANNA SEE IT SCROLL\n\nHOW DID I NOT SEE SIMILARITIES WITH THE DICK VAN DYKE SHOW I USED TO WATCH IT WITH GRAMS ALL THE TIME,0,0,0,0.0,0.0,,0.0,,oh my god wandavision spoiler so if you dont wanna see it scrollhow did i not see similarities with the dick van dyke show i used to watch it with grams all the time
115,1365293783949180930,488515078,"The biggest implication of WANDAVISION is that MALCOLM IN THE MIDDLE, I LOVE LUCY, and THE DICK VAN DYKE SHOW are all within the MCU now. Can‚Äôt take that back.",0,0,0,0.0,0.0,,0.0,,the biggest implication of wandavision is that malcolm in the middle i love lucy and the dick van dyke show are all within the mu now cant take that back


<h3>Train label model and evaluate performance</h3>

In [57]:
L_test = applier.apply(df=df_test)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 169/169 [00:00<00:00, 22805.40it/s]


In [58]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train, tie_break_policy="abstain")

In [60]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   83.3%


In [61]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

In [62]:
label_model.get_weights()

array([0.68770626, 0.82507676, 0.65740713])

In [63]:
label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:1f}%")



Label Model Accuracy:     85.714286%


<h2>Apply to unseen data and create predictions

<h3>Retrieve unseen tweets</h3>

In [64]:
#Get unseen data
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/auto_tweets_2.db")
#Retrieve Tweets from Database
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Obtain all manually labelled tweets from DB
    new_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS")

<h3>Apply labelling functions to unseen tweets</h3>

In [65]:
#Apply on unseen data
L_new = applier.apply(df=new_data)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40266/40266 [00:01<00:00, 22055.65it/s]


In [66]:
preds_new = majority_model.predict(L=L_new, tie_break_policy="abstain")

In [68]:
print("Tweet: ", new_data.iloc[176].TWEET)
print("\nAutomated Label: ", preds_new[176])

Tweet:  CROSS THE LINE.  FAGGOT https://t.co/ZJKBATUhiz

Automated Label:  1


In [69]:
print("Tweet: ", new_data.iloc[155].TWEET)
print("\nAutomated Label: ", preds_new[155])

Tweet:  I AM 100% QUEER I AM A FAGGOT

Automated Label:  1


<h1>Import All Labeling functions from end product</h1>

In [70]:
#Common names that have been found in dataset
l_names_or_places = r"(van ?dyke|ponce ?de ?leon|ponce ?city|ponce ?inlet ?town)" 
l_capitalised_names = r"Dyke|Ponce|Bender"
@labeling_function()
def names_or_places(tweet):
    is_name_case = re.search(l_capitalised_names, tweet.TWEET)
    return NEGATIVE if re.search(l_names_or_places, tweet.TWEET.lower()) or is_name_case else ABSTAIN

#Common simple insults
l_simple_insults = r"(fucking|disgusting|ugly|bitchy|pathetic|gay|scamming)(\w|\s){0,20}(fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def simple_insults(tweet):
    return POSITIVE if re.search(l_simple_insults, tweet.CLEAN_TWEET) else ABSTAIN

#Short tweet containing an offensive term and little more
l_slurs = r"(fag|faggot|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)s?"
@labeling_function()
def short_insult_tweet(tweet):
    #Remove handles from tweet
    tweet = re.sub('@[\w]+','',tweet.TWEET)
    return POSITIVE if re.search(l_slurs, tweet.lower()) and len(tweet) < 18 else ABSTAIN

#Common terms used to identify the subject of a given bad term e.g. "You ***" 
l_term_to_person = r"(that|you|shut the fuck up|stfu|shut up|these)(\w|\s){0,5}(fag|faggot|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)s?"
@labeling_function()
def term_to_person(tweet):
    return POSITIVE if re.search(l_term_to_person, tweet.TWEET.lower()) else ABSTAIN

#Using term in a descriptive yet derogatory manner
l_descriptive_bad = r"(piece of|like a|being a|kind of a?|what a|this|(bunch ?(of | ?a?))|some) ?(fag|faggot|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)s?( (bull|dog)?shit)?"
@labeling_function()
def descriptive_bad(tweet):
    return POSITIVE if re.search(l_descriptive_bad, tweet.CLEAN_TWEET) else ABSTAIN

#Stating someone on the LGBT spectrum is unnatural
l_against_nature = r"(against|defying) ?(god|biology|nature)"
@labeling_function()
def against_nature(tweet):
    return POSITIVE if re.search(l_against_nature, tweet.CLEAN_TWEET) else ABSTAIN

#General Transphobic statements e.g. stating someone who is transgender is not a 'real' person of that gender
l_transphobic_statements = r"(false|fake) (advertising|male|man|woman|female|girl|boy)"
@labeling_function()
def transphobic_statements(tweet):
    return POSITIVE if re.search(l_transphobic_statements, tweet.CLEAN_TWEET) else ABSTAIN

#Sending death threats
l_death_threats = r"kill (yo)?urself|kys|neck (it|(yo)?ursel(f|ves)|themsel(f|ves))|needs? to die"
@labeling_function()
def death_threats(tweet):
    return POSITIVE if re.search(l_death_threats, tweet.TWEET.lower()) else ABSTAIN
    
#Stating gay people are more likely to get disease/aids or are a disease themselves
l_disease = r"(gay|dyke|fag|faggot)(\w|\s|\D){0,20}(disease|aids|cancer)"
@labeling_function()
def gay_disease(tweet):
    return POSITIVE if re.search(l_disease, tweet.CLEAN_TWEET) else ABSTAIN

#Term is used in full caps
l_bad_words_caps = r"(FAGS?|FAGGOTS?|FUDGE ?PACKERS?|POOFTERS?|PANSY|PANSIES|BENDERS?|BATTY BOYS?|PONCE|DYKES?|RUG ?MUNCHERS?|LESBOS?|TRANNY|TRANNIES?|TRANSVESTITES?|LADYBOYS?|HESHES?|SHE ?MALES?|SWITCH ?-?HITTERS?)"
@labeling_function()
def full_caps(tweet):
    return POSITIVE if re.search(l_bad_words_caps, tweet.TWEET) else ABSTAIN

#Contains trigger warning
l_trigger_warnings = r"(\/+ *tw|\/* *trigger warning|tw *\/+)"
@labeling_function()
def trigger_warning(tweet):
    return NEGATIVE if re.search(l_trigger_warnings, tweet.TWEET.lower()) else ABSTAIN

#Using term 'Bender' when talking about drunken stints
l_bender_as_drunk = r"(day|on a)? ?bender( weekend)?"
@labeling_function()
def bender_as_drunk(tweet):
    return NEGATIVE if re.search(l_bender_as_drunk, tweet.CLEAN_TWEET) else ABSTAIN

#Using Bender as a pop culture reference e.g. avatar the last airbender or Futurama reference
l_safe_bender = r"((fender|water|earth|fire|wind|energy|mind) *bender)|futurama.*bender|bender.*futurama|avatar.*bender|the last"
@labeling_function()
def safe_bender(tweet):
    return NEGATIVE if re.search(l_safe_bender, tweet.CLEAN_TWEET) else ABSTAIN

#If tweet contains drawing, it may be fan art of one of the previous pop culture references
l_drawing = r"((fan ?)art| art |drawing|i drew|(ha|i'?)ve drawn)"
@labeling_function()
def is_drawing(tweet):
    return NEGATIVE if re.search(l_drawing, tweet.CLEAN_TWEET) else ABSTAIN

#If tweet contains link in form t.co , its likely to be an image and unrelated to harassment. 
l_check_link = r'(https?:\/\/t\.co)' 
@labeling_function()
def has_image(tweet):
    return NEGATIVE if re.search(l_check_link, tweet.TWEET.lower()) else ABSTAIN

#Using slang term for using cigarettes
l_slang_using_cigarettes = r"(for a|smoke a|h?ave?|smoking?|having?|light|pack|get|smell(s|ing)?)(\w|\s){0,25}fags?|fag ash|(un)?lit fags?|want (a|some|a couple (of)?) fags?|fag (in(\w|\s){0,8}hand|pack(et)?)|(\d){1,4} fags|fag (brands?)|brands? (of|a) fags?"
@labeling_function()
def slang_using_cigarettes(tweet):
    return NEGATIVE if re.search(l_slang_using_cigarettes, tweet.CLEAN_TWEET) else ABSTAIN

#Uses term 'batty' by itself, as is insulting only when followed by 'boy'
l_safe_batty = r"batty ?(?!boy|man)"
@labeling_function()
def safe_batty(tweet):
    return NEGATIVE if re.search(l_safe_batty, tweet.CLEAN_TWEET) else ABSTAIN

#Uses term in an innocent question
l_question = r"what( is| does|'s)(\w|\s){0,15}(fag|faggot|fags|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch ?hitter)s?"
@labeling_function()
def innocent_question(tweet):
    return NEGATIVE if re.search(l_question, tweet.CLEAN_TWEET) else ABSTAIN

#Uses term in quotation marks, meaning likely to be talking in third person or about an experience
l_quotations = r'\"(fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"'
@labeling_function()
def term_in_quotations(tweet):
    return NEGATIVE if re.search(l_quotations, tweet.TWEET.lower()) else ABSTAIN

#Uses term in talking about experience, meaning unlikely to be harassment
l_received_term = r"((((call(ed|ing)?|telling|told) ?(me|them|her|him|us|my))|(said|saying))(\w|\s){0,30}(fag|faggot|fags|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch ?hitter)s?)"
@labeling_function()
def received_term(tweet):
    return NEGATIVE if re.search(l_received_term, tweet.CLEAN_TWEET) else ABSTAIN

#Slur is part of a mentioned twitter handle
l_handles = r"(\@[a-z0-9_]*)"
@labeling_function()
def slur_in_handles(tweet):
    in_handle = 0
    bad_terms = ['fag', 'faggot', 'fags', 'fudgepacker', 'poofter', 'pansy', 'bender', 'ponce', 'dyke', 'lesbo','tranny', 'trannie', 'transvestite', 'ladyboy', 'heshe', 'shemale', 'gayforpay']
    #check for slur in handle
    handles = re.findall(l_handles, tweet.TWEET.lower())
    for handle in handles:
        if any(substring in handle for substring in bad_terms):
            in_handle+=1
    return NEGATIVE if in_handle>0 else ABSTAIN

#User has pronouns in bio
@labeling_function()
def has_pronouns(tweet):
    return NEGATIVE if tweet.HASPRONOUNS==1 else ABSTAIN

In [71]:
#instantiate labelling functions
lfs = [names_or_places,simple_insults,short_insult_tweet,term_to_person,descriptive_bad,against_nature,transphobic_statements,death_threats, gay_disease,full_caps,trigger_warning,bender_as_drunk,safe_bender,is_drawing,slang_using_cigarettes, safe_batty, innocent_question,term_in_quotations, received_term, slur_in_handles,has_pronouns]
# has_image
applier = LFApplier(lfs=lfs)
pdapplier = PandasLFApplier(lfs=lfs)

In [72]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 343/343 [00:00<00:00, 2458.65it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 169/169 [00:00<00:00, 2208.51it/s]


In [73]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train, tie_break_policy="abstain")

In [74]:
preds_new = majority_model.predict(L=L_new, tie_break_policy="abstain")

In [75]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")



Majority Vote Accuracy:   88.8%
