In [1]:
#Import dependencies
import sqlite3
from sklearn import svm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
#Connect to DB
conn = sqlite3.connect("auto_tweets.db")

In [3]:
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Count Total amount of Tweets in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS")
    print("Total Tweets in DB: ", cur.fetchall()[0][0])
    #Count amount of manually labelled tweets currently in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
    print("Total Manual Labelled tweets in DB: ", cur.fetchall()[0][0])

Total Tweets in DB:  40266
Total Manual Labelled tweets in DB:  512


In [4]:
#Obtain all labelled tweets from DB
pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE AUTO_ISHARASSMENT IS NOT NULL OR ISHARASSMENT IS NOT NULL")

In [5]:
#Get sample of data to save memory. Using full dataset causes Kernel to crash from memory overflow on 16GB system
pd_data = pd_data.sample(n=5000, random_state=70824426)

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [7]:
import math

def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])

In [9]:
import sys
print("size of dataframe: ", convert_size(sys.getsizeof(pd_data)))

size of dataframe:  2.76 MB


In [10]:
pd_data

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
13935,1376217439940251651,1221213601559666688,@ilovebram2 nah katara’s the best bender i sai...,1,0,0,0.0,,0.0,,,ilovebram nah katara the best bender i said wh...
7184,1374509846347718658,242759164,@BorisJohnson Hey Doris which you want Restaur...,1,0,0,0.0,,0.0,,,borisjohnson hey doris which you want restaura...
11459,1375428519346380805,1315261675944124416,@Fag_Prince Good morning,0,0,0,0.0,,0.0,,,fagprince good morning
18387,1377751657459589120,1145706546309083136,@fIwrchuu slur // \n\nyou called me a fag,1,0,0,0.0,,0.0,,,fiwrchuu slur you called me a fag
19575,1382479737864396803,773560017945554945,i knew dominic fike was rlly dominic fag,1,0,0,1.0,,0.0,,,i knew dominic like was rlly dominic fag
...,...,...,...,...,...,...,...,...,...,...,...,...
11465,1375428123504881669,1018594013119729665,@Vylocs @Fag_Prince The best of them,0,0,0,0.0,,0.0,,,cylons fagprince the best of them
4219,1372680076282753026,1213342491165908994,@aqua_pansy TAKE CARE KAIRI!!! i’m sorry i did...,1,0,0,1.0,,0.0,,,aquapansy take care uairi im sorry i didnt see...
3473,1371970264741912577,1264981560895381505,@straight_fag I came to the horrid realization...,1,0,0,1.0,,0.0,,,straightway i came to the horrid realization t...
14977,1376610109187444736,18568117,Can the #STLCards back end of the rotation hol...,0,0,0,0.0,,0.0,,,can the stewards back end of the rotation hold...


In [11]:
import math
def get_labels(df):
    labels = []
    for i in range(len(df)):
        if math.isnan(df.iloc[i].ISHARASSMENT):
            labels.append(int(df.iloc[i].AUTO_ISHARASSMENT))
        else:
            labels.append(int(df.iloc[i].ISHARASSMENT))
            print(i, "ISHARASSMENT is", int(df.iloc[i].ISHARASSMENT))
    return labels
labels = get_labels(pd_data)

24 ISHARASSMENT is 0
35 ISHARASSMENT is 1
36 ISHARASSMENT is 0
97 ISHARASSMENT is 0
144 ISHARASSMENT is 0
193 ISHARASSMENT is 0
222 ISHARASSMENT is 1
240 ISHARASSMENT is 0
299 ISHARASSMENT is 0
316 ISHARASSMENT is 0
375 ISHARASSMENT is 0
399 ISHARASSMENT is 0
436 ISHARASSMENT is 0
477 ISHARASSMENT is 0
584 ISHARASSMENT is 1
638 ISHARASSMENT is 0
685 ISHARASSMENT is 0
746 ISHARASSMENT is 1
756 ISHARASSMENT is 0
777 ISHARASSMENT is 1
815 ISHARASSMENT is 0
855 ISHARASSMENT is 0
872 ISHARASSMENT is 1
908 ISHARASSMENT is 0
925 ISHARASSMENT is 0
927 ISHARASSMENT is 0
955 ISHARASSMENT is 0
1116 ISHARASSMENT is 0
1130 ISHARASSMENT is 1
1135 ISHARASSMENT is 0
1184 ISHARASSMENT is 0
1212 ISHARASSMENT is 0
1219 ISHARASSMENT is 0
1221 ISHARASSMENT is 1
1252 ISHARASSMENT is 1
1338 ISHARASSMENT is 0
1352 ISHARASSMENT is 0
1424 ISHARASSMENT is 1
1531 ISHARASSMENT is 0
1582 ISHARASSMENT is 1
1613 ISHARASSMENT is 0
1750 ISHARASSMENT is 0
1761 ISHARASSMENT is 0
1825 ISHARASSMENT is 0
1837 ISHARASSMENT i

In [12]:
np.asarray(labels)

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
tweets_as_list = pd_data["TWEET"].tolist()

In [14]:
X_train, X_test, y_train, y_test  = train_test_split(
        pd_data, 
        np.asarray(labels),
        train_size=0.66, 
        random_state=1234)

In [15]:
X_test

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
1022,1372548037277532162,1276771879521574913,love junkies on a three-day bender,1,0,0,0.0,,0.0,,,love junkies on a thready bender
2474,1371244693435125761,4298426974,@anarcho_fag G-get it?,1,0,0,1.0,,0.0,,,anarchofag gget it
12051,1375344986242215938,2291700865,faggot.,1,0,0,0.0,,1.0,,,faggot
21124,1382823631596621827,1326918557125500934,Just posted a video @ Ponce City Market https:...,0,0,0,0.0,,0.0,,,just posted a video ponce city market httpstco...
635,1365284653045854209,244926711,@TruthfulTreason From Minds. I didn’t know “pa...,1,0,0,0.0,1.0,,,,truthfultreason from minds i didnt know pansy ...
...,...,...,...,...,...,...,...,...,...,...,...,...
15637,1377027944200146947,958422254244810752,@TheVeganLeft @anarcho_fag based take,1,0,0,0.0,,0.0,,,theveganleft anarchofag based take
7754,1374468395932950530,15640446,Two Pirates pitching updates:\n1. RHP James Ma...,0,0,0,0.0,,0.0,,,two pirates pitching updates rip james marvel ...
3006,1371605939783090183,1204950720866344961,@Ziox56771 @straight_fag do you just hammer f5...,1,0,0,1.0,,0.0,,,zion straightway do you just hammer i then
1516,1370513027762548736,879108576018546688,@aqua_pansy THANK YOU AGAIN KAIRIN!!! 🥺💖💕💗,1,0,0,1.0,,0.0,,,aquapansy thank you again karin


Train on 'CLEAN' tweet

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

log_model = LogisticRegression(solver="liblinear", random_state=0)

#Instantiate vectorizer to generate features from text input
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.CLEAN_TWEET)
num_features = len(vectorizer.get_feature_names())
print("Number of features: ", num_features)
#Generate feature set for train and test data
train_tfidf = vectorizer.transform(X_train.CLEAN_TWEET)
test_tfidf =  vectorizer.transform(X_test.CLEAN_TWEET)

#Generate model and fit to training features
log_model = log_model.fit(X=train_tfidf, y=y_train)

#Generate predictions on test features
y_pred = log_model.predict(test_tfidf)

#Generate accuracy score of predictions on our test dataset
print("Accuracy of classifier: ", accuracy_score(y_test, y_pred))

#Generate confusion matrix for performance analysis on our test dataset
print("\nconfusion matrix: \n", confusion_matrix(y_test, y_pred, labels=[0,1]))

#Generate classification report for performance analysis on our test dataset
print("\nclassification report: \n",classification_report(y_test, y_pred))

Number of features:  11994
Accuracy of classifier:  0.8947058823529411

confusion matrix: 
 [[1365   31]
 [ 148  156]]

classification report: 
               precision    recall  f1-score   support

           0       0.90      0.98      0.94      1396
           1       0.83      0.51      0.64       304

    accuracy                           0.89      1700
   macro avg       0.87      0.75      0.79      1700
weighted avg       0.89      0.89      0.88      1700



**Train on Original, non preprocessed, Tweet**

In [24]:
log_model = LogisticRegression(solver="liblinear", random_state=0)

#Instantiate vectorizer to generate features from text input
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.TWEET)
num_features = len(vectorizer.get_feature_names())
print("Number of features: ", num_features)
#Generate feature set for train and test data
train_tfidf = vectorizer.transform(X_train.TWEET)
test_tfidf =  vectorizer.transform(X_test.TWEET)

#Generate model and fit to training features
log_model = log_model.fit(X=train_tfidf, y=y_train)

#Generate predictions on test features
y_pred = log_model.predict(test_tfidf)

#Generate accuracy score of predictions on our test dataset
print("Accuracy of classifier: ", accuracy_score(y_test, y_pred))

#Generate confusion matrix for performance analysis on our test dataset
print("\nconfusion matrix: \n", confusion_matrix(y_test, y_pred, labels=[0,1]))

#Generate classification report for performance analysis on our test dataset
print("\nclassification report: \n",classification_report(y_test, y_pred))

Number of features:  12709
Accuracy of classifier:  0.8923529411764706

confusion matrix: 
 [[1360   36]
 [ 147  157]]

classification report: 
               precision    recall  f1-score   support

           0       0.90      0.97      0.94      1396
           1       0.81      0.52      0.63       304

    accuracy                           0.89      1700
   macro avg       0.86      0.75      0.78      1700
weighted avg       0.89      0.89      0.88      1700



-----------------------------------------------------------------------

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [31]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train.TWEET)
X_train_counts.shape

(3300, 12709)

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(3300, 12709)

In [36]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [40]:
X_test_counts = count_vect.transform(X_test.TWEET)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)

for tweet, label in zip(X_test.TWEET, y_test):
    print('%r => %s' % (tweet, label))

'love junkies on a three-day bender' => 0
'@anarcho_fag G-get it?' => 0
'faggot.' => 1
'Just posted a video @ Ponce City Market https://t.co/1eN5NAriS0' => 0
'@TruthfulTreason From Minds. I didn’t know “pansy” was so offensive. https://t.co/Aop8U3isWK' => 1
'Even if you consider an accident to only be a "fender bender", do not hesitate to contact us if you\'ve been injured. We will review your case and fight for the maximum financial compensation you deserve. Contact us today for a free consultation! https://t.co/zp8dQXhBs5' => 0
'They call that not eating a professional bender Presentation: non-binary people though especially cis. — https://t.co/kS5atIjIXX' => 0
'@hydrated_dyke @chief_beef5 They’re all going to hell' => 0
'@RachelSJohnson @LBC @munirawilson @petercardwell @jamesmatesitv @annehernandez "Spain kicked them out"? gives us a break, They went as it was their choice not to apply for residency, The Guardia wouldn\'t have pounced on them this morning and shoved them on a plane

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
svm_clf.fit(X_train.TWEET, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

In [51]:
from sklearn.naive_bayes import MultinomialNB
naive_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
naive_clf.fit(X_train.TWEET, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [53]:
svm_pred = svm_clf.predict(X_test.TWEET)
print("Accuracy of Support Vector Machine classifier:\t",np.mean(svm_pred == y_test))
naive_pred = naive_clf.predict(X_test.TWEET)
print("Accuracy of Naive Bayes classifier:\t\t",np.mean(naive_pred == y_test))

Accuracy of Support Vector Machine Classifier:	 0.8911764705882353
Accuracy of Naive Bayes classifier:		 0.8247058823529412


In [61]:
from sklearn import metrics
print("\n\nclassification report for Support Vector Machine classifer\n", metrics.classification_report(y_test, svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",metrics.confusion_matrix(y_test, svm_pred))

print("\n\nclassification report for Naive Bayes classifier\n", metrics.classification_report(y_test, naive_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",metrics.confusion_matrix(y_test, naive_pred))



classification report for Support Vector Machine classifer
               precision    recall  f1-score   support

           0       0.94      0.93      0.93      1396
           1       0.68      0.73      0.71       304

    accuracy                           0.89      1700
   macro avg       0.81      0.83      0.82      1700
weighted avg       0.89      0.89      0.89      1700

confusion matrix for Support Vector Machine classifier
 [[1292  104]
 [  81  223]]


classification report for Naive Bayes classifier
               precision    recall  f1-score   support

           0       0.82      1.00      0.90      1396
           1       1.00      0.02      0.04       304

    accuracy                           0.82      1700
   macro avg       0.91      0.51      0.47      1700
weighted avg       0.86      0.82      0.75      1700

confusion matrix for Support Vector Machine classifier
 [[1396    0]
 [ 298    6]]
