In [1]:
#Import dependencies
import sqlite3
from sklearn import svm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/auto_tweets.db")

In [3]:
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Count Total amount of Tweets in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS")
    print("Total Tweets in DB: ", cur.fetchall()[0][0])
    #Count amount of manually labelled tweets currently in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
    print("Total Manual Labelled tweets in DB: ", cur.fetchall()[0][0])

Total Tweets in DB:  40266
Total Manual Labelled tweets in DB:  512


In [4]:
#Obtain all labelled tweets from DB
pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE AUTO_ISHARASSMENT IS NOT NULL OR ISHARASSMENT IS NOT NULL")

In [5]:
#Get sample of data to save memory. Using full dataset causes Kernel to crash from memory overflow on 16GB system
pd_data = pd_data.sample(n=15000, random_state=70824426)

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [7]:
import math

def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])

In [8]:
import sys
print("size of dataframe: ", convert_size(sys.getsizeof(pd_data)))

size of dataframe:  8.31 MB


In [9]:
pd_data

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
13935,1376217439940251651,1221213601559666688,@ilovebram2 nah katara’s the best bender i sai...,1,0,0,0.0,,0.0,,,ilovebram nah katara the best bender i said wh...
7184,1374509846347718658,242759164,@BorisJohnson Hey Doris which you want Restaur...,1,0,0,0.0,,0.0,,,borisjohnson hey doris which you want restaura...
11459,1375428519346380805,1315261675944124416,@Fag_Prince Good morning,0,0,0,0.0,,0.0,,,fagprince good morning
18387,1377751657459589120,1145706546309083136,@fIwrchuu slur // \n\nyou called me a fag,1,0,0,0.0,,0.0,,,fiwrchuu slur you called me a fag
19575,1382479737864396803,773560017945554945,i knew dominic fike was rlly dominic fag,1,0,0,1.0,,0.0,,,i knew dominic like was rlly dominic fag
...,...,...,...,...,...,...,...,...,...,...,...,...
152,1365319227964608518,78631784,@elonmusk Totally Batty,0,0,0,0.0,,0.0,,,elonmusk totally batty
19852,1382454058859036674,17742969,@BettyBathory0 i just read this to alice and s...,0,0,0,1.0,,0.0,,,bettybathory i just read this to alice and she...
1759,1370497145602707458,1366470932601180160,Kill this fag https://t.co/vMGomyFBRY,1,0,0,0.0,,1.0,,,kill this fag httpstcovmgomyfbry
4229,1372679295458451458,1719221334,I was trying to remember how many of my high s...,1,0,0,0.0,,0.0,,,i was trying to remember how many of my high s...


In [10]:
import math
def get_labels(df):
    labels = []
    for i in range(len(df)):
        if math.isnan(df.iloc[i].ISHARASSMENT):
            labels.append(int(df.iloc[i].AUTO_ISHARASSMENT))
        else:
            labels.append(int(df.iloc[i].ISHARASSMENT))
            print(i, "ISHARASSMENT is", int(df.iloc[i].ISHARASSMENT))
    return labels
labels = get_labels(pd_data)

24 ISHARASSMENT is 0
35 ISHARASSMENT is 1
36 ISHARASSMENT is 0
97 ISHARASSMENT is 0
144 ISHARASSMENT is 0
193 ISHARASSMENT is 0
222 ISHARASSMENT is 1
240 ISHARASSMENT is 0
299 ISHARASSMENT is 0
316 ISHARASSMENT is 0
375 ISHARASSMENT is 0
399 ISHARASSMENT is 0
436 ISHARASSMENT is 0
477 ISHARASSMENT is 0
584 ISHARASSMENT is 1
638 ISHARASSMENT is 0
685 ISHARASSMENT is 0
746 ISHARASSMENT is 1
756 ISHARASSMENT is 0
777 ISHARASSMENT is 1
815 ISHARASSMENT is 0
855 ISHARASSMENT is 0
872 ISHARASSMENT is 1
908 ISHARASSMENT is 0
925 ISHARASSMENT is 0
927 ISHARASSMENT is 0
955 ISHARASSMENT is 0
1116 ISHARASSMENT is 0
1130 ISHARASSMENT is 1
1135 ISHARASSMENT is 0
1184 ISHARASSMENT is 0
1212 ISHARASSMENT is 0
1219 ISHARASSMENT is 0
1221 ISHARASSMENT is 1
1252 ISHARASSMENT is 1
1338 ISHARASSMENT is 0
1352 ISHARASSMENT is 0
1424 ISHARASSMENT is 1
1531 ISHARASSMENT is 0
1582 ISHARASSMENT is 1
1613 ISHARASSMENT is 0
1750 ISHARASSMENT is 0
1761 ISHARASSMENT is 0
1825 ISHARASSMENT is 0
1837 ISHARASSMENT i

In [11]:
np.asarray(labels)

array([0, 0, 0, ..., 1, 0, 0])

In [12]:
tweets_as_list = pd_data["TWEET"].tolist()

In [13]:
X_train, X_test, y_train, y_test  = train_test_split(
        pd_data, 
        np.asarray(labels),
        train_size=0.66, 
        random_state=1234)

In [14]:
X_test

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
12339,1375308376490512386,3245379542,@Fag_Prince I can't really say much about how ...,0,0,0,0.0,,0.0,,,fagprince i cant really say much about how any...
7397,1374495081533599747,1295478121429426176,"@anarcho_fag Yes, exactly. I don't know how yo...",1,0,0,1.0,,0.0,,,anarchofag yes exactly i dont know how you man...
15226,1376592792995000321,2557158404,@wavytzi @dumbass__dyke @unh0lypriestess ratio,1,0,0,0.0,,0.0,,,wavytzi dumbassdyke unhlypriestess ratio
6958,1374062563072348161,46594586,@MisterSkeezler Did it collapse when you tried...,1,0,0,0.0,,0.0,,,misterskeezler did it collapse when you tried ...
9968,1375110351604260864,1369049021751427074,"@Its_Josias Yeah, that's the Ponce native Mott...",0,0,0,0.0,,0.0,,,itsjosias yeah thats the ponce native motto ha...
...,...,...,...,...,...,...,...,...,...,...,...,...
6265,1374115662369587208,2265979861,#TeamTati💞\nLista 1⃣ \n@tatysolcito \n@teamdin...,0,0,0,0.0,,0.0,,,teamtatilista tatysolcito teamdinamicas martha...
19713,1382466968146567170,1373152052940058627,@moo0_0moo fag\n\n(PS just in case I become fa...,1,0,0,1.0,,0.0,,,voodoo fags just in case i become famous and y...
20831,1382381723636404227,4742508449,"@rutgerFM @NguyenNgocTan04 Agreed, Bender look...",0,0,0,0.0,,0.0,,,rutgers nguyenngoctan agreed bender looks good
18544,1377739640061448203,1246449609741938689,@oscahss @fabbgrat she’ll always be that girl ...,1,0,0,1.0,,0.0,,,oscar's fatbrat shell always be that girl when...


Train on 'CLEAN' tweet

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

log_model = LogisticRegression(solver="liblinear", random_state=0)

#Instantiate vectorizer to generate features from text input
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.CLEAN_TWEET)
num_features = len(vectorizer.get_feature_names())
print("Number of features: ", num_features)
#Generate feature set for train and test data
train_tfidf = vectorizer.transform(X_train.CLEAN_TWEET)
test_tfidf =  vectorizer.transform(X_test.CLEAN_TWEET)

#Generate model and fit to training features
log_model = log_model.fit(X=train_tfidf, y=y_train)

#Generate predictions on test features
y_pred = log_model.predict(test_tfidf)

#Generate accuracy score of predictions on our test dataset
print("Accuracy of classifier: ", accuracy_score(y_test, y_pred))

#Generate confusion matrix for performance analysis on our test dataset
print("\nconfusion matrix: \n", confusion_matrix(y_test, y_pred, labels=[0,1]))

#Generate classification report for performance analysis on our test dataset
print("\nclassification report: \n",classification_report(y_test, y_pred))

Number of features:  25173
Accuracy of classifier:  0.9027450980392157

confusion matrix: 
 [[4058  137]
 [ 359  546]]

classification report: 
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      4195
           1       0.80      0.60      0.69       905

    accuracy                           0.90      5100
   macro avg       0.86      0.79      0.82      5100
weighted avg       0.90      0.90      0.90      5100



**Train on Original, non preprocessed, Tweet**

In [16]:
log_model = LogisticRegression(solver="liblinear", random_state=0)

#Instantiate vectorizer to generate features from text input
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.TWEET)
num_features = len(vectorizer.get_feature_names())
print("Number of features: ", num_features)
#Generate feature set for train and test data
train_tfidf = vectorizer.transform(X_train.TWEET)
test_tfidf =  vectorizer.transform(X_test.TWEET)

#Generate model and fit to training features
log_model = log_model.fit(X=train_tfidf, y=y_train)

#Generate predictions on test features
y_pred = log_model.predict(test_tfidf)

#Generate accuracy score of predictions on our test dataset
print("Accuracy of classifier: ", accuracy_score(y_test, y_pred))

#Generate confusion matrix for performance analysis on our test dataset
print("\nconfusion matrix: \n", confusion_matrix(y_test, y_pred, labels=[0,1]))

#Generate classification report for performance analysis on our test dataset
print("\nclassification report: \n",classification_report(y_test, y_pred))

Number of features:  27121
Accuracy of classifier:  0.9035294117647059

confusion matrix: 
 [[4055  140]
 [ 352  553]]

classification report: 
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      4195
           1       0.80      0.61      0.69       905

    accuracy                           0.90      5100
   macro avg       0.86      0.79      0.82      5100
weighted avg       0.90      0.90      0.90      5100



-----------------------------------------------------------------------

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
svm_clf.fit(X_train.TWEET, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

In [18]:
from sklearn.naive_bayes import MultinomialNB
naive_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
naive_clf.fit(X_train.TWEET, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [19]:
svm_pred = svm_clf.predict(X_test.TWEET)
print("Accuracy of Support Vector Machine classifier:\t",np.mean(svm_pred == y_test))
naive_pred = naive_clf.predict(X_test.TWEET)
print("Accuracy of Naive Bayes classifier:\t\t",np.mean(naive_pred == y_test))

Accuracy of Support Vector Machine classifier:	 0.9107843137254902
Accuracy of Naive Bayes classifier:		 0.8254901960784313


In [20]:
from sklearn import metrics
print("\n\nclassification report for Support Vector Machine classifer\n", metrics.classification_report(y_test, svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",metrics.confusion_matrix(y_test, svm_pred))

print("\n\nclassification report for Naive Bayes classifier\n", metrics.classification_report(y_test, naive_pred, target_names=["0","1"]))
print("confusion matrix for Naive Bayes classifier\n",metrics.confusion_matrix(y_test, naive_pred))



classification report for Support Vector Machine classifer
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      4195
           1       0.77      0.70      0.74       905

    accuracy                           0.91      5100
   macro avg       0.86      0.83      0.84      5100
weighted avg       0.91      0.91      0.91      5100

confusion matrix for Support Vector Machine classifier
 [[4009  186]
 [ 269  636]]


classification report for Naive Bayes classifier
               precision    recall  f1-score   support

           0       0.83      1.00      0.90      4195
           1       0.94      0.02      0.03       905

    accuracy                           0.83      5100
   macro avg       0.88      0.51      0.47      5100
weighted avg       0.85      0.83      0.75      5100

confusion matrix for Naive Bayes classifier
 [[4194    1]
 [ 889   16]]


In [21]:
from sklearn.neural_network import MLPClassifier
neural_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier()),
])
neural_clf.fit(X_train.TWEET, y_train)
neural_pred = neural_clf.predict(X_test.TWEET)
print("Accuracy of Neural Network classifier:\t\t",np.mean(neural_pred == y_test))
print("\n\nclassification report for Neural Network classifier\n", metrics.classification_report(y_test, neural_pred, target_names=["0","1"]))
print("confusion matrix for Neural Network classifier\n",metrics.confusion_matrix(y_test, neural_pred))

Accuracy of Neural Network classifier:		 0.8911764705882353


classification report for Neural Network classifier
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      4195
           1       0.71      0.65      0.68       905

    accuracy                           0.89      5100
   macro avg       0.82      0.80      0.81      5100
weighted avg       0.89      0.89      0.89      5100

confusion matrix for Neural Network classifier
 [[3957  238]
 [ 317  588]]


In [22]:
from sklearn.svm import SVC
svc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])
svc_clf.fit(X_train.TWEET, y_train)
svc_pred = svc_clf.predict(X_test.TWEET)
print("Accuracy of SVC classifier:\t\t",np.mean(svc_pred == y_test))
print("\n\nclassification report for SVC classifier:\n", metrics.classification_report(y_test, svc_pred, target_names=["0","1"]))
print("confusion matrix for SVC classifier\n",metrics.confusion_matrix(y_test, svc_pred))

Accuracy of SVC classifier:		 0.9103921568627451


classification report for SVC classifier:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      4195
           1       0.81      0.65      0.72       905

    accuracy                           0.91      5100
   macro avg       0.87      0.81      0.83      5100
weighted avg       0.91      0.91      0.91      5100

confusion matrix for SVC classifier
 [[4058  137]
 [ 320  585]]


In [23]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])
forest_clf.fit(X_train.TWEET, y_train)
forest_pred = forest_clf.predict(X_test.TWEET)
print("Accuracy of Random Forest classifier:\t\t",np.mean(forest_pred == y_test))
print("\n\nclassification report for Random Forest classifier:\n", metrics.classification_report(y_test, forest_pred, target_names=["0","1"]))
print("confusion matrix for Random Forest classifier\n",metrics.confusion_matrix(y_test, forest_pred))

Accuracy of Random Forest classifier:		 0.8911764705882353


classification report for Random Forest classifier:
               precision    recall  f1-score   support

           0       0.90      0.97      0.94      4195
           1       0.81      0.51      0.62       905

    accuracy                           0.89      5100
   macro avg       0.85      0.74      0.78      5100
weighted avg       0.88      0.89      0.88      5100

confusion matrix for Random Forest classifier
 [[4085  110]
 [ 445  460]]
