In [1]:
import os
import re
import tweepy as tw
import json
import pandas as pd
from scipy import sparse
import numpy as np
from snorkel.labeling import labeling_function
from snorkel.labeling import LFApplier
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from sklearn import svm
from sklearn.model_selection import train_test_split
import sqlite3

ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1

#Common names that have been found in dataset
l_names_or_places = r"(van ?dyke|ponce ?de ?leon|ponce ?city|ponce ?inlet ?town)" 
l_capitalised_names = r"Dyke|Ponce|Bender"
@labeling_function()
def names_or_places(tweet):
    is_name_case = re.search(l_capitalised_names, tweet.TWEET)
    return NEGATIVE if re.search(l_names_or_places, tweet.TWEET.lower()) or is_name_case else ABSTAIN

#Common simple insults
l_simple_insults = r"(fucking|disgusting|ugly|bitchy|pathetic|gay|scamming)(\w|\s){0,20}(fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"
@labeling_function()
def simple_insults(tweet):
    return POSITIVE if re.search(l_simple_insults, tweet.CLEAN_TWEET) else ABSTAIN

#Short tweet containing an offensive term and little more
l_slurs = r"(fag|faggot|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)s?"
@labeling_function()
def short_insult_tweet(tweet):
    #Remove handles from tweet
    tweet = re.sub('@[\w]+','',tweet.TWEET)
    return POSITIVE if re.search(l_slurs, tweet.lower()) and len(tweet) < 18 else ABSTAIN

#Common terms used to identify the subject of a given bad term e.g. "You ***" 
l_term_to_person = r"(that|you|shut the fuck up|stfu|shut up|these)(\w|\s){0,5}(fag|faggot|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)s?"
@labeling_function()
def term_to_person(tweet):
    return POSITIVE if re.search(l_term_to_person, tweet.TWEET.lower()) else ABSTAIN

#Using term in a descriptive yet derogatory manner
l_descriptive_bad = r"(piece of|like a|being a|kind of a?|what a|this|(bunch ?(of | ?a?))|some) ?(fag|faggot|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)s?( (bull|dog)?shit)?"
@labeling_function()
def descriptive_bad(tweet):
    return POSITIVE if re.search(l_descriptive_bad, tweet.CLEAN_TWEET) else ABSTAIN

#Stating someone on the LGBT spectrum is unnatural
l_against_nature = r"(against|defying) ?(god|biology|nature)"
@labeling_function()
def against_nature(tweet):
    return POSITIVE if re.search(l_against_nature, tweet.CLEAN_TWEET) else ABSTAIN

#General Transphobic statements e.g. stating someone who is transgender is not a 'real' person of that gender
l_transphobic_statements = r"(false|fake) (advertising|male|man|woman|female|girl|boy)"
@labeling_function()
def transphobic_statements(tweet):
    return POSITIVE if re.search(l_transphobic_statements, tweet.CLEAN_TWEET) else ABSTAIN

#Sending death threats
l_death_threats = r"kill (yo)?urself|kys|neck (it|(yo)?ursel(f|ves)|themsel(f|ves))|needs? to die"
@labeling_function()
def death_threats(tweet):
    return POSITIVE if re.search(l_death_threats, tweet.TWEET.lower()) else ABSTAIN
    
#Stating gay people are more likely to get disease/aids or are a disease themselves
l_disease = r"(gay|dyke|fag|faggot)(\w|\s|\D){0,20}(disease|aids|cancer)"
@labeling_function()
def gay_disease(tweet):
    return POSITIVE if re.search(l_disease, tweet.CLEAN_TWEET) else ABSTAIN

#Term is used in full caps
l_bad_words_caps = r"(FAGS?|FAGGOTS?|FUDGE ?PACKERS?|POOFTERS?|PANSY|PANSIES|BENDERS?|BATTY BOYS?|PONCE|DYKES?|RUG ?MUNCHERS?|LESBOS?|TRANNY|TRANNIES?|TRANSVESTITES?|LADYBOYS?|HESHES?|SHE ?MALES?|SWITCH ?-?HITTERS?)"
@labeling_function()
def full_caps(tweet):
    return POSITIVE if re.search(l_bad_words_caps, tweet.TWEET) else ABSTAIN

#Contains trigger warning
l_trigger_warnings = r"(\/+ *tw|\/* *trigger warning|tw *\/+)"
@labeling_function()
def trigger_warning(tweet):
    return NEGATIVE if re.search(l_trigger_warnings, tweet.TWEET.lower()) else ABSTAIN

#Using term 'Bender' when talking about drunken stints
l_bender_as_drunk = r"(day|on a)? ?bender( weekend)?"
@labeling_function()
def bender_as_drunk(tweet):
    return NEGATIVE if re.search(l_bender_as_drunk, tweet.CLEAN_TWEET) else ABSTAIN

#Using Bender as a pop culture reference e.g. avatar the last airbender or Futurama reference
l_safe_bender = r"((fender|water|earth|fire|wind|energy|mind) *bender)|futurama.*bender|bender.*futurama|avatar.*bender|the last"
@labeling_function()
def safe_bender(tweet):
    return NEGATIVE if re.search(l_safe_bender, tweet.CLEAN_TWEET) else ABSTAIN

#If tweet contains drawing, it may be fan art of one of the previous pop culture references
l_drawing = r"((fan ?)art| art |drawing|i drew|(ha|i'?)ve drawn)"
@labeling_function()
def is_drawing(tweet):
    return NEGATIVE if re.search(l_drawing, tweet.CLEAN_TWEET) else ABSTAIN

#If tweet contains link in form t.co , its likely to be an image and unrelated to harassment. 
l_check_link = r'(https?:\/\/t\.co)' 
@labeling_function()
def has_image(tweet):
    return NEGATIVE if re.search(l_check_link, tweet.TWEET.lower()) else ABSTAIN

#Using slang term for using cigarettes
l_slang_using_cigarettes = r"(for a|smoke a|h?ave?|smoking?|having?|light|pack|get|smell(s|ing)?)(\w|\s){0,25}fags?|fag ash|(un)?lit fags?|want (a|some|a couple (of)?) fags?|fag (in(\w|\s){0,8}hand|pack(et)?)|(\d){1,4} fags|fag (brands?)|brands? (of|a) fags?"
@labeling_function()
def slang_using_cigarettes(tweet):
    return NEGATIVE if re.search(l_slang_using_cigarettes, tweet.CLEAN_TWEET) else ABSTAIN

#Uses term 'batty' by itself, as is insulting only when followed by 'boy'
l_safe_batty = r"batty ?(?!boy|man)"
@labeling_function()
def safe_batty(tweet):
    return NEGATIVE if re.search(l_safe_batty, tweet.CLEAN_TWEET) else ABSTAIN

#Uses term in an innocent question
l_question = r"what( is| does|'s)(\w|\s){0,15}(fag|faggot|fags|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch ?hitter)s?"
@labeling_function()
def innocent_question(tweet):
    return NEGATIVE if re.search(l_question, tweet.CLEAN_TWEET) else ABSTAIN

#Uses term in quotation marks, meaning likely to be talking in third person or about an experience
l_quotations = r'\"(fag|faggot|fags|fudgepacker|fudge packer|poofter|pansy|bender|batty boy|ponce|dyke|rug muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch hitter)"'
@labeling_function()
def term_in_quotations(tweet):
    return NEGATIVE if re.search(l_quotations, tweet.TWEET.lower()) else ABSTAIN

#Uses term in talking about experience, meaning unlikely to be harassment
l_received_term = r"((((call(ed|ing)?|telling|told) ?(me|them|her|him|us|my))|(said|saying))(\w|\s){0,30}(fag|faggot|fags|fudge ?packer|poofter|pansy|bender|batty ?boy|ponce|dyke|rug ?muncher|lesbo|tranny|trannie|transvestite|ladyboy|heshe|shemale|switch ?hitter)s?)"
@labeling_function()
def received_term(tweet):
    return NEGATIVE if re.search(l_received_term, tweet.CLEAN_TWEET) else ABSTAIN

#Slur is part of a mentioned twitter handle
l_handles = r"(\@[a-z0-9_]*)"
@labeling_function()
def slur_in_handles(tweet):
    in_handle = 0
    bad_terms = ['fag', 'faggot', 'fags', 'fudgepacker', 'poofter', 'pansy', 'bender', 'ponce', 'dyke', 'lesbo','tranny', 'trannie', 'transvestite', 'ladyboy', 'heshe', 'shemale', 'gayforpay']
    #check for slur in handle
    handles = re.findall(l_handles, tweet.TWEET.lower())
    for handle in handles:
        if any(substring in handle for substring in bad_terms):
            in_handle+=1
    return NEGATIVE if in_handle>0 else ABSTAIN

#User has pronouns in bio
@labeling_function()
def has_pronouns(tweet):
    return NEGATIVE if tweet.HASPRONOUNS==1 else ABSTAIN

def make_Ls_matrix(data, LFs):
    noisy_labels = np.empty((len(data), len(LFs)))
    for i, row in data.iterrows():
        for j, lf in enumerate(LFs):
            noisy_labels[i][j] = lf(row)
    return noisy_labels


In [2]:
from spellchecker import SpellChecker
from wordsegment import load, segment
load()

def clean_data(df):
    l_clean_tweets = []
    for i,line in df.iterrows():
        l_clean_tweets.append(preprocess(line.TWEET))
    return l_clean_tweets
        
    
def preprocess(tweet):
    tweet = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0–9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0–9@:%_\+.~#?&//=]*)', '',
                   tweet, flags=re.MULTILINE) # to remove links that start with HTTP/HTTPS in the tweet
    tweet = re.sub(r'[-a-zA-Z0–9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0–9@:%_\+.~#?&//=]*)', '',
                   tweet, flags=re.MULTILINE) # to remove other url links
    tweet = ' '.join(re.sub('([^0-9A-Za-z \t])|(\w+:\/\/\S+)','',tweet).split()) #remove # and emojis
    tweet = re.sub(r"\d", "", tweet)
    spell = SpellChecker()
    tweet = ' '.join([spell.correction(w) for w in tweet.split()]) #correct spelling errors
    #tweet = ' '.join(segment(tweet))
    tweet = tweet.lower()
    return tweet
    

In [3]:
#instantiate labelling functions
lfs = [names_or_places,simple_insults,short_insult_tweet,term_to_person,descriptive_bad,against_nature,transphobic_statements,death_threats, gay_disease,full_caps,trigger_warning,bender_as_drunk,safe_bender,is_drawing,slang_using_cigarettes, safe_batty, innocent_question,term_in_quotations, received_term, slur_in_handles,has_pronouns]
# has_image
applier = LFApplier(lfs=lfs)
pdapplier = PandasLFApplier(lfs=lfs)
#Update Pandas Dataframe so that displayed dataframes are not truncated
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/new_tweets_2.db")
#Retrieve Tweets from Database
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Obtain all manually labelled tweets from DB
    pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
#Create train test split
df_train, df_test = train_test_split(pd_data, test_size=0.33, random_state=42)
Y_test = df_test.ISHARASSMENT

  pd.set_option('display.max_colwidth', -1)


https://www.snorkel.org/use-cases/01-spam-tutorial

In [4]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)

100%|██████████| 343/343 [00:00<00:00, 3783.56it/s]


In [5]:
print(df_train.iloc[13])
print(L_train[13])

TWEETID                1371969863892209668
USERID                 1309799151539040256
TWEET                  @ttdagrah Ponce    
ISTYPEHOMOSEXUAL       0                  
ISTYPETRANSGENDER      0                  
ISTYPEBISEXUAL         0                  
HASPRONOUNS            0.0                
ISHARASSMENT           1                  
AUTO_ISHARASSMENT      None               
PASTEXPERIENCE         NaN                
AUTO_PASTEXPERIENCE    None               
CLEAN_TWEET            ttdagrah ponce     
Name: 464, dtype: object
[ 0 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]


In [6]:
#Obtain tweets without any label at all
i=0
indexes = []
for label in L_train:
    if label.argmin() == label.argmax():
        #print(label)
        temp = df_train.iloc[i]
        indexes.append(temp.name)
    i+=1
new = df_train.filter(items=indexes, axis=0)

In [7]:
new

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
302,1370870722671828993,1253685173398331393,@HPokestar1 @youndlnyctop fag &amp; you a bottom too LMFAOOO i don’t talk i fight so whenever you ready,1,0,0,0.0,1,,0.0,,polestar youndlnyctop fag amp you a bottom too lmfaooo i dont talk i fight so whenever you ready
37,1365266152713314306,1315600916372844544,I am becoming a raypeeest at my house :) BHEWJHKVFHJWV faggot,1,0,0,,1,,,,i am becoming a raypeeest at my house bhewjhkvfhjwv faggot
141,1365284653045854209,244926711,@TruthfulTreason From Minds. I didn’t know “pansy” was so offensive. https://t.co/Aop8U3isWK,1,0,0,0.0,1,,,,truthfultreason from minds i didnt know pansy was so offensive httpstcoaopuiswk
364,1371234730503471105,1362782902388408326,"I can't sleep... Fags write me now, I want to empty wallets, that relaxes me 😈🤑🔥",0,0,0,0.0,0,,,,i cant sleep fags write me now i want to empty wallets that relaxes me
168,1372566027771469824,1653769801,@phibetarage Lmao why was this so calming i would absolutely watch,0,0,0,0.0,0,,,,phibetarage lao why was this so calming i would absolutely watch
81,1365307488518631425,2538970826,"True. I never expected this from him. How can such an educated, aged &amp; experienced person become such a narrow minded man at d fag end of his life?\nAll d good job he did will be negated with this pathetic decision\nWhat growth or devp is this anti-ppl BJP going to give to Kerala https://t.co/yfFR6aowUR",1,0,0,0.0,0,,0.0,,true i never expected this from him how can such an educated aged amp experienced person become such a narrow minded man at i fag end of his life'll i good job he did will be negated with this pathetic decisionwhat growth or deep is this antippl bop going to give to kerala httpstcoyffraowur
426,1371589247539499019,996172824,"@MHanson62 #CollegePrePartyPlaylist I forget, what's a ponce?",1,0,0,0.0,0,,0.0,,manson collegeprepartyplaylist i forget whats a ponce
83,1365305909845635073,1324416520244649985,@Elescorial3 @Otto_English 🤣comparing pensioners and jihadi brides... behave! I paid my taxes here in the U.K! Unlike that little ponce and her family. Scrounging off the tax payer. Wish they’d all leave to be with her. We don’t want them. ⚡️,1,0,0,0.0,1,,,,elescorial ottoenglish comparing pensioners and jihadi brides behave i paid my taxes here in the uk unlike that little ponce and her family scrounging off the tax payer wish theyd all leave to be with her we dont want them
163,1365276206493544451,1292485135934332930,~I don't want him to kiss me\nI want him pray to me~\n\n•Pansy Parkinson https://t.co/iSNUh32nOM,0,0,0,0.0,0,,,,i dont want him to kiss mei want him pray to means parkinson httpstcoisnuhnom
199,1372542392411258885,1208825765258809345,Turned into a country of scum due to the huge melting pot of the United Nations coming here to ponce an easier life! https://t.co/AnFwo187fx,1,0,0,0.0,0,,0.0,,turned into a country of scum due to the huge melting pot of the united nations coming here to ponce an easier life httpstcoanfwofx


In [8]:
coverage_names_or_places, coverage_simple_insults,coverage_short_insult_tweet,coverage_term_to_person,coverage_descriptive_bad,coverage_against_nature,coverage_transphobic_statements,coverage_death_threats, coverage_gay_disease,coverage_full_caps,coverage_trigger_warning,coverage_bender_as_drunk,coverage_safe_bender,coverage_is_drawing,coverage_slang_using_cigarettes,coverage_safe_batty,coverage_innocent_question,coverage_term_in_quotations, coverage_received_term, coverage_slur_in_handles,coverage_has_pronouns = (L_train!=ABSTAIN).mean(axis=0)
print(f"coverage names or places: { coverage_names_or_places * 100:.14f}%")
print(f"coverage simple insults: { coverage_simple_insults * 100:.14f}%")
print(f"coverage short insult tweet: { coverage_short_insult_tweet * 100:.14f}%")
print(f"coverage term to person: { coverage_term_to_person * 100:.14f}%")
print(f"coverage descriptive bad term: { coverage_descriptive_bad * 100:.14f}%")
print(f"coverage against nature: { coverage_against_nature * 100:.14f}%")
print(f"coverage transphobic statements: { coverage_transphobic_statements * 100:.14f}%")
print(f"coverage death threats: { coverage_death_threats * 100:.14f}%")
print(f"coverage gay disease: {coverage_gay_disease * 100:.14f}%")
print(f"coverage full caps: { coverage_full_caps * 100:.14f}%")
print(f"coverage trigger warning: { coverage_trigger_warning * 100:.14f}%")
print(f"coverage bender as drunk: { coverage_bender_as_drunk * 100:.14f}%")
print(f"coverage safe bender: { coverage_safe_bender * 100:.14f}%")
print(f"coverage is drawing: { coverage_is_drawing * 100:.14f}%")
#print(f"coverage has image: { coverage_has_image * 100:.14f}%")
print(f"coverage slang using cigarettes: { coverage_slang_using_cigarettes * 100:.14f}%")
print(f"coverage safe batty: {coverage_safe_batty * 100:.14f}%")
print(f"coverage innocent question: {coverage_innocent_question * 100:.14f}%")
print(f"coverage term in quotations: {coverage_term_in_quotations * 100:.14f}%")
print(f"coverage received term: {coverage_received_term * 100:.14f}%")
print(f"coverage slur in handles: { coverage_slur_in_handles * 100:.14f}%")
print(f"coverage has pronouns in bio: { coverage_has_pronouns* 100:.14f}%")


coverage names or places: 14.86880466472303%
coverage simple insults: 2.33236151603499%
coverage short insult tweet: 4.37317784256560%
coverage term to person: 1.74927113702624%
coverage descriptive bad term: 5.24781341107872%
coverage against nature: 0.29154518950437%
coverage transphobic statements: 0.29154518950437%
coverage death threats: 0.00000000000000%
coverage gay disease: 0.29154518950437%
coverage full caps: 3.79008746355685%
coverage bender as drunk: 16.32653061224490%
coverage safe bender: 4.37317784256560%
coverage is drawing: 1.16618075801749%
coverage slang using cigarettes: 5.53935860058309%
coverage safe batty: 6.70553935860058%
coverage innocent question: 0.00000000000000%
coverage term in quotations: 1.16618075801749%
coverage received term: 7.87172011661808%
coverage slur in handles: 16.90962099125364%
coverage has pronouns in bio: 9.91253644314869%


In [9]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.148688,0.075802,0.008746
simple_insults,1,[1],0.023324,0.005831,0.0
short_insult_tweet,2,[1],0.043732,0.017493,0.008746
term_to_person,3,[1],0.017493,0.002915,0.002915
descriptive_bad,4,[1],0.052478,0.020408,0.017493
against_nature,5,[1],0.002915,0.0,0.0
transphobic_statements,6,[1],0.002915,0.0,0.0
death_threats,7,[],0.0,0.0,0.0
gay_disease,8,[1],0.002915,0.0,0.0
full_caps,9,[1],0.037901,0.017493,0.005831


In [10]:
df_train.iloc[L_train[:,5] == POSITIVE].sample(1, random_state=42)

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
145,1365281122884722688,1242055420883226625,"@Mistberg_ It's not our problem that you love to become a faggot. Honestly, I don't know why people are trying to change their organisms that were created by it. First, it's harmful and it will cause problems in the future. Second, it's against biology. Give up you don't have any come backs",1,0,0,0.0,1,,0.0,,mister its not our problem that you love to become a faggot honestly i dont know why people are trying to change their organisms that were created by it first its harmful and it will cause problems in the future second its against biology give up you dont have any come backs


In [11]:
from snorkel.analysis import get_label_buckets
#Split L_train matrix of labels into buckets, with second value being the index of label (e.g. 0 = label names_or_places)
#Retrieve results with specified value, e.g. NEGATIVE on index 0 = return results that match name_or_place LF
buckets = get_label_buckets(df_train.ISHARASSMENT, L_train[:, 2])
df_train.iloc[buckets[(NEGATIVE, POSITIVE)]].sample(1, random_state=4)

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
239,1370514933230628864,707878637236654080,@9NewsAdel @tomrehn9 3 day bender,1,0,0,0.0,0,,,,newsreel tommen day bender


In [12]:
L_test = applier.apply(df=df_test)

100%|██████████| 169/169 [00:00<00:00, 3871.82it/s]


In [13]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
names_or_places,0,[0],0.148688,0.075802,0.008746
simple_insults,1,[1],0.023324,0.005831,0.0
short_insult_tweet,2,[1],0.043732,0.017493,0.008746
term_to_person,3,[1],0.017493,0.002915,0.002915
descriptive_bad,4,[1],0.052478,0.020408,0.017493
against_nature,5,[1],0.002915,0.0,0.0
transphobic_statements,6,[1],0.002915,0.0,0.0
death_threats,7,[],0.0,0.0,0.0
gay_disease,8,[1],0.002915,0.0,0.0
full_caps,9,[1],0.037901,0.017493,0.005831


In [14]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

In [15]:
preds_train

array([ 0,  0,  0,  0,  1, -1,  0,  0,  0, -1,  0,  0,  1, -1,  0, -1,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,
       -1, -1,  1,  1,  0,  0, -1,  1, -1,  1,  0,  1,  0,  0, -1,  1,  0,
       -1, -1,  0,  1,  0, -1,  0,  0, -1,  0,  0,  0,  0, -1,  0, -1,  0,
        0,  0,  0, -1,  0, -1,  0,  0,  0,  0,  1,  0,  0,  0, -1,  0, -1,
        0, -1,  0,  0,  1,  0,  1, -1,  0, -1,  0,  0, -1,  1, -1,  1, -1,
       -1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1, -1,  0,  0, -1,
        0,  1, -1,  0, -1,  0,  0,  0,  0, -1, -1, -1,  0, -1,  0,  1,  1,
        0,  0,  0, -1,  1, -1,  0,  0,  1,  0,  0, -1,  0,  0,  0,  0,  0,
        0, -1, -1,  0, -1,  0, -1,  0,  0,  0, -1,  0,  0,  0,  1,  0, -1,
       -1, -1,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,
        1,  0,  0,  0, -1,  1, -1,  0,  1, -1,  0,  0,  0,  0, -1,  0,  0,
        0,  0, -1,  1, -1, -1,  1,  0, -1,  0,  0, -1,  0,  0,  0, -1, -1,
        0,  0, -1,  1,  0

In [16]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

In [17]:
label_model.get_weights()

  return np.clip(accs / self.coverage, 1e-6, 1.0)


array([0.88893318, 0.74637356, 0.62913107, 0.79487742, 0.55076104,
       0.85464006, 0.85464006, 1.        , 0.85464006, 0.64864882,
       0.17150001, 1.        , 1.        , 0.58584774, 0.1420844 ,
       0.18974313, 1.        , 0.22663178, 0.16809927, 0.50722477,
       0.38044951])

In [18]:
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain")[
    "accuracy"
]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="abstain")[
    "accuracy"
]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:1f}%")



Majority Vote Accuracy:   88.8%
Label Model Accuracy:     75.454545%


In [19]:
majority_model.score(L_test, Y_test, metrics=["f1_micro"])



{'f1_micro': 0.8878504672897196}

In [20]:
from snorkel.labeling import filter_unlabeled_dataframe
probs_train = label_model.predict_proba(L=L_train)

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [21]:
probs_train_filtered

array([[9.97115305e-01, 2.88469482e-03],
       [2.02871633e-01, 7.97128367e-01],
       [1.40608600e-01, 8.59391400e-01],
       [9.97515873e-01, 2.48412697e-03],
       [1.82371267e-01, 8.17628733e-01],
       [9.96408989e-01, 3.59101082e-03],
       [3.77920426e-01, 6.22079574e-01],
       [9.97701447e-01, 2.29855338e-03],
       [9.99982049e-01, 1.79513359e-05],
       [2.02871633e-01, 7.97128367e-01],
       [2.02871633e-01, 7.97128367e-01],
       [4.08632192e-01, 5.91367808e-01],
       [8.29540317e-01, 1.70459683e-01],
       [9.99753920e-01, 2.46079946e-04],
       [9.99982049e-01, 1.79513359e-05],
       [5.19447019e-01, 4.80552981e-01],
       [9.99982049e-01, 1.79513359e-05],
       [5.19447019e-01, 4.80552981e-01],
       [2.02871633e-01, 7.97128367e-01],
       [8.50439147e-01, 1.49560853e-01],
       [1.71404138e-01, 8.28595862e-01],
       [8.66239181e-02, 9.13376082e-01],
       [9.99982049e-01, 1.79513359e-05],
       [5.19447019e-01, 4.80552981e-01],
       [9.100512

In [22]:
df_train_filtered

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
3,1365275128590917634,1097939009391730688,did i ever post this bender i drew? \nhere's this bender i drew https://t.co/MqlBKAWv93,1,0,0,,0,,0.0,,did i ever post this bender i drew heres this bender i drew httpstcomqlbkawv
234,1370516070268829701,1336494755472142336,(What he said was something she never put thought behind. Batty was so busy beating herself up mentally and emotionally that she never stopped to consider she was a child who didn't deserve half the shit that happened. Batty slowly approached Tcmmy and put her forehead -- https://t.co/Rkab8FXNRJ,0,0,0,0.0,0,,,,what he said was something she never put thought behind batty was so busy beating herself up mentally and emotionally that she never stopped to consider she was a child who didnt deserve half the shit that happened batty slowly approached tummy and put her forehead httpstcorkabfxnrj
382,1371221893219508226,1310639140669714434,"@TonyLea17 @BSnapz2019 Yeah ""Havin a fag"" is not what it sound like to Americans",1,0,0,0.0,0,,,,tonya snap yeah havin a fag is not what it sound like to americans
60,1365315495298269184,1340666050103422976,@MrC_AndTheNews @evelynerskine1 @toryboypierce @NicolaSturgeon @RuthDavidsonMSP I’ve got a faith in my country and countrymen that you just obviously don’t share!\nI believe we will fly when we’re unshackled from the toxic union... but some people have been on a bender knee so long they’re afraid of standing on their own 2 feet again I guess!,1,0,0,0.0,0,,,,mrcandthenews evelynerskine toryboypierce nicolasturgeon ruthdavidsonmsp ive got a faith in my country and countrymen that you just obviously dont share believe we will fly when were unshackled from the toxic union but some people have been on a bender knee so long theyre afraid of standing on their own feet again i guess
110,1365295656563339265,1325247924,Ready to boycott every remake / reboot movie from here on in 😂 45mins into the new Wrong Turn movie &amp; its clear to see directors of horror movies compared to 20years ago clearly have no imagination anymore. And the actors are always nancy fucking pansy ass cheesy yanks.,1,0,0,0.0,1,,,,ready to boycott every remake reboot movie from here on in mins into the new wrong turn movie amp its clear to see directors of horror movies compared to years ago clearly have no imagination anymore and the actors are always nancy fucking pansy ass cheesy yanks
407,1371601805445099520,1270827673179258880,@Lyons32_ @vfxaero @Twirfy_ What a bender LMAO,1,0,0,0.0,1,,0.0,,lyons vfxaero twirly what a bender lao
418,1371591162297999368,726688231,@dyke_gothic I have a instamax mini printer! it's fun !,1,0,0,1.0,0,,,,dykegothic i have a inutama mini printer its fun
29,1365267416800968707,3438620050,@tamaran_bender @cspan @RandPaul You actually have no clue what you’re talking about!,1,0,0,,0,,,,tamaranbender span randall you actually have no clue what youre talking about
274,1370494290615537667,1354226166270455808,@KaylaChowShow The last testicle bender.,1,0,0,0.0,0,,,,kaylachowshow the last testicle bender
26,1365268364344651778,366088429,Just wana be brukkinngg it down in a field with a Bacardi breezer in my batty riders 🕺🏼☀️,1,0,0,,0,,,,just wana be brukkinngg it down in a field with a bacardi breeze in my batty riders


In [23]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.CLEAN_TWEET.tolist())
X_test = vectorizer.transform(df_test.CLEAN_TWEET.tolist())

In [24]:
from snorkel.utils import probs_to_preds

preds_train_filtered = probs_to_preds(probs=probs_train_filtered, tie_break_policy="abstain")

In [25]:
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

LogisticRegression(C=1000.0, solver='liblinear')

In [26]:
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

Test Accuracy: 57.4%


In [27]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
svm_clf.fit(df_train.TWEET, df_train.ISHARASSMENT)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

In [28]:
svm_pred = svm_clf.predict(df_test.TWEET)
print("Accuracy of Support Vector Machine classifier:\t",np.mean(svm_pred == df_test.ISHARASSMENT))

Accuracy of Support Vector Machine classifier:	 0.7928994082840237


In [29]:
preds_train_filtered

array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0])

In [30]:
#Get unseen data
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/new_tweets_2.db")
#Retrieve Tweets from Database
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Obtain all manually labelled tweets from DB
    new_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS")

In [31]:
#Apply on unseen data
L_new = applier.apply(df=new_data)

100%|██████████| 40266/40266 [00:09<00:00, 4427.00it/s]


In [32]:
preds_new = majority_model.predict(L=L_new)

In [33]:
print(new_data.iloc[1])
print(preds_new[1])

TWEETID                1365309571221262339                                                                        
USERID                 2905772875                                                                                 
TWEET                  @trrixzy Yeah scamming faggot been waiting for 2 weeks while everyone else gets there stuff
ISTYPEHOMOSEXUAL       1                                                                                          
ISTYPETRANSGENDER      0                                                                                          
ISTYPEBISEXUAL         0                                                                                          
HASPRONOUNS            NaN                                                                                        
ISHARASSMENT           NaN                                                                                        
AUTO_ISHARASSMENT      None                                                     

In [34]:
import math
rows_without_pred = []
for i in range(len(new_data)):
    if preds_new[i]==0:
        new_data.loc[i, "AUTO_ISHARASSMENT"] = np.float64(0.0)
    elif preds_new[i]==1:
        new_data.loc[i, "AUTO_ISHARASSMENT"] = np.float64(1.0)
    else:
        if math.isnan(new_data.loc[i, "ISHARASSMENT"]):
            rows_without_pred.append(i)
    if i%1000 == 0:
        print(i)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000


In [35]:
#new_data.loc[1]
#preds_new[39756]

In [36]:
#new_data.loc[1,"AUTO_ISHARASSMENT"] = 1

In [37]:
new_data["AUTO_ISHARASSMENT"] = pd.to_numeric(new_data["AUTO_ISHARASSMENT"], downcast="float")

In [38]:
new_data.dtypes

TWEETID                int64  
USERID                 object 
TWEET                  object 
ISTYPEHOMOSEXUAL       int64  
ISTYPETRANSGENDER      int64  
ISTYPEBISEXUAL         int64  
HASPRONOUNS            float64
ISHARASSMENT           float64
AUTO_ISHARASSMENT      float32
PASTEXPERIENCE         float64
AUTO_PASTEXPERIENCE    object 
CLEAN_TWEET            object 
dtype: object

In [39]:
#auto_labelled_data = new_data.drop(rows_without_pred)

In [40]:
print(len(rows_without_pred))

18005


In [41]:
print(len(rows_without_pred), "tweets without predictions")
print(len(new_data) - len(rows_without_pred), "tweets with predictions")
#auto_labelled_data.dtypes

18005 tweets without predictions
22261 tweets with predictions


In [42]:
#conn = sqlite3.connect("auto_tweets.db")
#new_data.to_sql("TWEETS", conn, if_exists="append", index=False)
#print("successfully put new db")

In [43]:
import sqlite3
import pandas as pd
#Retrieve storage to double check if stored correctly
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/auto_tweets.db")
#Retrieve Tweets from Database
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Obtain all manually labelled tweets from DB
    auto_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS")

In [44]:
len(auto_data)

40266

In [45]:
len(new_data)

40266