In [1]:
#Import dependencies
import sqlite3
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
#Connect to DB
conn = sqlite3.connect("tweets.db")

In [3]:
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Count Total amount of Tweets in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS")
    print("Total Tweets in DB: ", cur.fetchall()[0][0])
    #Count amount of manually labelled tweets currently in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
    print("Total Manual Labelled tweets in DB: ", cur.fetchall()[0][0])

Total Tweets in DB:  22597
Total Manual Labelled tweets in DB:  511


In [4]:
#Obtain all manually labelled tweets from DB
cur.execute("SELECT * FROM TWEETS WHERE ISHARASSMENT IS NOT NULL ORDER BY RANDOM()")
tweets = cur.fetchall()

In [5]:
tweets

[(1370875529377804291,
  '1173516925030281217',
  '@Bitboy_Crypto You are a batty boy not bitboy',
  1,
  0,
  0,
  0,
  1,
  None),
 (1365265020863455234,
  '899209774754967552',
  'That moment when you realise that Dick Van Dyke is your longest lasting celebrity crush üò∂ https://t.co/cFnot4I1ei',
  0,
  0,
  0,
  None,
  0,
  None),
 (1372565860750008332,
  '420065645',
  "There are public dinosaurs, and private dinosaurs.  And you're a faggot.",
  1,
  0,
  0,
  0,
  1,
  None),
 (1365299087332024320,
  '816249891173388288',
  'All I need is 18 degrees and I‚Äôm in batty riders.',
  1,
  0,
  0,
  0,
  0,
  None),
 (1371227358636675077,
  '353097785',
  'The pansy pond is looking good and we are in love with this tulip tree! @alyssaharad  #FlowerReport https://t.co/rexiomYyDS',
  1,
  0,
  0,
  0,
  0,
  None),
 (1365317096356409344,
  '1300391751774068736',
  '@bingusboingus Ily too batty ^-^',
  1,
  0,
  0,
  0,
  0,
  None),
 (1371225412710363136,
  '1336915878127005696',
  '@

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [9]:
lemmatize_sentence("This is a test sentence I'm not too sure what this will do")

"This be a test sentence I 'm not too sure what this will do"

In [24]:
count = 0
data = []
data_labels = []
for twid, uid, tweet, is_homophobic, is_transphobic, is_biphobic, has_pronouns, is_harassment, a_is_harassment in tweets:
    data.append(lemmatize_sentence(tweet))
    data_labels.append(is_harassment)

In [25]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = False,
)
features = vectorizer.fit_transform(
    data
)
features_nd = features.toarray() # for easy usage

In [26]:
X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.66, 
        random_state=1234)

In [27]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()

In [28]:
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)



In [29]:
import random
j = random.randint(0,len(X_test)-7)
for i in range(j,j+7):
    print(y_pred[0])
    ind = features_nd.tolist().index(X_test[i].tolist())
    print(data[ind].strip())

0
shall we send this tweet to the gay man wgs professor who say dyke in class today yes or yes http : //t.co/CSW7D1REJg
0
@ ejjallorina @ RighteousLeftst @ teejaymarquez @ ponce_jerome @ RoyceCabrera @ vance_larena @ RegalFilms @ RighteousLeftst @ ejjallorina sense ko i feel it , the # romantic and # hope btween Val & amp ; Jim , Roy & amp ; Ben , be just temporary . Universe will let you cross path with people on purpose : comfort and realize thing , but the ‚ù§Ô∏è will end up whom it should be‚òùüèª # BXJForeverEpisodeThree
0
@ Anotherfknegirl Twink ? Nah , just don ‚Äô t use it as an insult because some people use it to hide behind their use of faggot , you can say it normally tho
0
I ca n't sleep ... Fags write me now , I want to empty wallet , that relax me üòàü§ëüî•
0
TV Talk : # WandaVision If this show get a season 2 , I demand they put Bob Newhart in this show . He 's just as much television royalty as Dick Van Dyke . http : //t.co/RKJibNljhr
0
@ __Hunter234 @ Pignite772 @

In [30]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7528735632183908
