In [1]:
#Import dependencies
import sqlite3
from sklearn import svm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/auto_tweets_2.db")

In [3]:
with conn:
    cur = conn.cursor()
    #Construct SQL Queries
    #Count Total amount of Tweets in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS")
    print("Total Tweets in DB: ", cur.fetchall()[0][0])
    #Count amount of manually labelled tweets currently in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
    print("Total Manual Labelled tweets in DB: ", cur.fetchall()[0][0])
    #Count amount of automatically labelled tweets currently in Database
    cur.execute("SELECT COUNT(USERID) FROM TWEETS WHERE AUTO_ISHARASSMENT IS NOT NULL")
    print("Total Manual Labelled tweets in DB: ", cur.fetchall()[0][0])

Total Tweets in DB:  120798
Total Manual Labelled tweets in DB:  1536
Total Manual Labelled tweets in DB:  66321


In [4]:
#Obtain all labelled tweets from DB
pd_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE AUTO_ISHARASSMENT IS NOT NULL OR ISHARASSMENT IS NOT NULL")

In [5]:
len(pd_data)

22261

In [6]:
#Get sample of data to save memory. Using full dataset causes Kernel to crash from memory overflow on 16GB system
pd_data = pd_data.sample(n=22000, random_state=70824426)

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [8]:
import math

def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])

In [9]:
import sys
print("size of dataframe: ", convert_size(sys.getsizeof(pd_data)))

size of dataframe:  12.2 MB


In [10]:
pd_data

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
13935,1376217439940251651,1221213601559666688,@ilovebram2 nah kataraâ€™s the best bender i sai...,1,0,0,0.0,,0.0,,,ilovebram nah katara the best bender i said wh...
7184,1374509846347718658,242759164,@BorisJohnson Hey Doris which you want Restaur...,1,0,0,0.0,,0.0,,,borisjohnson hey doris which you want restaura...
11459,1375428519346380805,1315261675944124416,@Fag_Prince Good morning,0,0,0,0.0,,0.0,,,fagprince good morning
18387,1377751657459589120,1145706546309083136,@fIwrchuu slur // \n\nyou called me a fag,1,0,0,0.0,,0.0,,,fiwrchuu slur you called me a fag
19575,1382479737864396803,773560017945554945,i knew dominic fike was rlly dominic fag,1,0,0,1.0,,0.0,,,i knew dominic like was rlly dominic fag
...,...,...,...,...,...,...,...,...,...,...,...,...
17277,1377372355886870533,2564305129,@5MIl3MUSIC Meet me in the pit you pansy https...,1,0,0,0.0,,1.0,,,milmusic meet me in the pit you pansy httpstco...
13914,1376219361808166912,1171577501946142735,@tjamesgaralt I'll do it again faggot https://...,1,0,0,1.0,,0.0,,,tjamesgaralt ill do it again faggot httpstcogp...
13088,1376287255212347398,2419976203,@shadowsoobin did u just call me an ugly dyke?...,1,0,0,1.0,,0.0,,,shadowsoobin did u just call me an ugly dyke d...
12842,1376304929250418689,1367989557879275520,Got my hair tied back like van dyke rn https:/...,1,0,0,0.0,,0.0,,,got my hair tied back like van dyke in httpstc...


In [11]:
import math
def get_labels(df):
    labels = []
    for i in range(len(df)):
        if math.isnan(df.iloc[i].ISHARASSMENT):
            labels.append(int(df.iloc[i].AUTO_ISHARASSMENT))
        else:
            labels.append(int(df.iloc[i].ISHARASSMENT))
            print(i, "ISHARASSMENT is", int(df.iloc[i].ISHARASSMENT))
    return labels
labels = get_labels(pd_data)

24 ISHARASSMENT is 0
35 ISHARASSMENT is 1
36 ISHARASSMENT is 0
97 ISHARASSMENT is 0
144 ISHARASSMENT is 0
193 ISHARASSMENT is 0
222 ISHARASSMENT is 1
240 ISHARASSMENT is 0
299 ISHARASSMENT is 0
316 ISHARASSMENT is 0
375 ISHARASSMENT is 0
399 ISHARASSMENT is 0
436 ISHARASSMENT is 0
477 ISHARASSMENT is 0
584 ISHARASSMENT is 1
638 ISHARASSMENT is 0
685 ISHARASSMENT is 0
746 ISHARASSMENT is 1
756 ISHARASSMENT is 0
777 ISHARASSMENT is 1
815 ISHARASSMENT is 0
855 ISHARASSMENT is 0
872 ISHARASSMENT is 1
908 ISHARASSMENT is 0
925 ISHARASSMENT is 0
927 ISHARASSMENT is 0
955 ISHARASSMENT is 0
1116 ISHARASSMENT is 0
1130 ISHARASSMENT is 1
1135 ISHARASSMENT is 0
1184 ISHARASSMENT is 0
1212 ISHARASSMENT is 0
1219 ISHARASSMENT is 0
1221 ISHARASSMENT is 1
1252 ISHARASSMENT is 1
1338 ISHARASSMENT is 0
1352 ISHARASSMENT is 0
1424 ISHARASSMENT is 1
1531 ISHARASSMENT is 0
1582 ISHARASSMENT is 1
1613 ISHARASSMENT is 0
1750 ISHARASSMENT is 0
1761 ISHARASSMENT is 0
1825 ISHARASSMENT is 0
1837 ISHARASSMENT i

15790 ISHARASSMENT is 1
15875 ISHARASSMENT is 1
15886 ISHARASSMENT is 0
15980 ISHARASSMENT is 0
15982 ISHARASSMENT is 1
16042 ISHARASSMENT is 0
16154 ISHARASSMENT is 0
16170 ISHARASSMENT is 1
16179 ISHARASSMENT is 1
16283 ISHARASSMENT is 0
16296 ISHARASSMENT is 0
16399 ISHARASSMENT is 0
16418 ISHARASSMENT is 0
16437 ISHARASSMENT is 1
16460 ISHARASSMENT is 0
16490 ISHARASSMENT is 0
16502 ISHARASSMENT is 0
16527 ISHARASSMENT is 0
16532 ISHARASSMENT is 0
16576 ISHARASSMENT is 0
16604 ISHARASSMENT is 0
16642 ISHARASSMENT is 0
16673 ISHARASSMENT is 0
16686 ISHARASSMENT is 1
16716 ISHARASSMENT is 0
16735 ISHARASSMENT is 1
16802 ISHARASSMENT is 0
16835 ISHARASSMENT is 0
16844 ISHARASSMENT is 0
16877 ISHARASSMENT is 0
16891 ISHARASSMENT is 0
16959 ISHARASSMENT is 0
16974 ISHARASSMENT is 0
17021 ISHARASSMENT is 1
17047 ISHARASSMENT is 1
17051 ISHARASSMENT is 0
17125 ISHARASSMENT is 0
17139 ISHARASSMENT is 0
17175 ISHARASSMENT is 0
17233 ISHARASSMENT is 0
17283 ISHARASSMENT is 0
17288 ISHARASSME

In [12]:
np.asarray(labels)

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
tweets_as_list = pd_data["TWEET"].tolist()

In [14]:
X_train, X_test, y_train, y_test  = train_test_split(
        pd_data, 
        np.asarray(labels),
        train_size=0.66, 
        random_state=1234)

In [15]:
X_test

Unnamed: 0,TWEETID,USERID,TWEET,ISTYPEHOMOSEXUAL,ISTYPETRANSGENDER,ISTYPEBISEXUAL,HASPRONOUNS,ISHARASSMENT,AUTO_ISHARASSMENT,PASTEXPERIENCE,AUTO_PASTEXPERIENCE,CLEAN_TWEET
5767,1373759723900604418,1106283738239885312,yall i drove myself home today iâ€™m beating the...,1,0,0,0.0,,1.0,,,yall i drove myself home today im beating the ...
21271,1382814325899989002,1206123732802134016,@cayon4299454775 @i_destroy_Fags @BootlegChanc...,0,0,0,0.0,,0.0,,,canyon idestroyfags bootlegchancla brioniago i...
11036,1375008807034753026,1342187600627556352,retweet and follow to win with me on my twitte...,0,0,0,0.0,,0.0,,,retweet and follow to win with me on my twitte...
20520,1382406785789784065,1045717980364099584,@fag_femdom @Visa @Mastercard Maybe they need ...,1,0,0,0.0,,0.0,,,fagfemdom visa mastercard maybe they need to t...
11250,1374974266085535744,813528827607584772,suck my dick and call me a faggot,1,0,0,0.0,,0.0,,,suck my dick and call me a faggot
...,...,...,...,...,...,...,...,...,...,...,...,...
2924,1371610522588745731,807750294,@brexitblog_info @PatWilliams1944 @IainDale @B...,1,0,0,0.0,,0.0,,,brexitbloginfo patwilliams andale barrier unfo...
16523,1376960820433846272,1244013545320316930,@LemonLimeFrog ty bby appreciate it &lt;3 /gen...,0,0,0,1.0,,0.0,,,lemonlimefrog ty by appreciate it it gen is re...
6530,1374095716969295880,1346880669033635845,@Femboy_AgeRe I think thatâ€™s kinda like gays a...,1,0,0,0.0,,1.0,,,femboyagere i think thats kinda like gays and ...
15653,1377027211451043842,1148841399472840705,If you a dyke just say that https://t.co/BCvf7...,1,0,0,0.0,,1.0,,,if you a dyke just say that httpstcobcvfbugiy


Train on 'CLEAN' tweet

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

log_model = LogisticRegression(solver="liblinear", random_state=0)

#Instantiate vectorizer to generate features from text input
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.CLEAN_TWEET)
num_features = len(vectorizer.get_feature_names())
print("Number of features: ", num_features)
#Generate feature set for train and test data
train_tfidf = vectorizer.transform(X_train.CLEAN_TWEET)
test_tfidf =  vectorizer.transform(X_test.CLEAN_TWEET)

#Generate model and fit to training features
log_model = log_model.fit(X=train_tfidf, y=y_train)

#Generate predictions on test features
y_pred = log_model.predict(test_tfidf)

#Generate accuracy score of predictions on our test dataset
print("Accuracy of classifier: ", accuracy_score(y_test, y_pred))

#Generate confusion matrix for performance analysis on our test dataset
print("\nconfusion matrix: \n", confusion_matrix(y_test, y_pred, labels=[0,1]))

#Generate classification report for performance analysis on our test dataset
print("\nclassification report: \n",classification_report(y_test, y_pred))

Number of features:  32556
Accuracy of classifier:  0.9106951871657754

confusion matrix: 
 [[5992  191]
 [ 477  820]]

classification report: 
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      6183
           1       0.81      0.63      0.71      1297

    accuracy                           0.91      7480
   macro avg       0.87      0.80      0.83      7480
weighted avg       0.91      0.91      0.91      7480



**Train on Original, non preprocessed, Tweet**

In [17]:
log_model = LogisticRegression(solver="liblinear", random_state=0)

#Instantiate vectorizer to generate features from text input
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.TWEET)
num_features = len(vectorizer.get_feature_names())
print("Number of features: ", num_features)
#Generate feature set for train and test data
train_tfidf = vectorizer.transform(X_train.TWEET)
test_tfidf =  vectorizer.transform(X_test.TWEET)

#Generate model and fit to training features
log_model = log_model.fit(X=train_tfidf, y=y_train)

#Generate predictions on test features
y_pred = log_model.predict(test_tfidf)

#Generate accuracy score of predictions on our test dataset
print("Accuracy of classifier: ", accuracy_score(y_test, y_pred))

#Generate confusion matrix for performance analysis on our test dataset
print("\nconfusion matrix: \n", confusion_matrix(y_test, y_pred, labels=[0,1]))

#Generate classification report for performance analysis on our test dataset
print("\nclassification report: \n",classification_report(y_test, y_pred))

Number of features:  35243
Accuracy of classifier:  0.9127005347593583

confusion matrix: 
 [[5983  200]
 [ 453  844]]

classification report: 
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      6183
           1       0.81      0.65      0.72      1297

    accuracy                           0.91      7480
   macro avg       0.87      0.81      0.83      7480
weighted avg       0.91      0.91      0.91      7480



-----------------------------------------------------------------------

https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
svm_clf.fit(X_train.TWEET, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier())])

In [19]:
from sklearn.naive_bayes import MultinomialNB
naive_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
naive_clf.fit(X_train.TWEET, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [20]:
svm_pred = svm_clf.predict(X_test.TWEET)
print("Accuracy of Support Vector Machine classifier:\t",np.mean(svm_pred == y_test))
naive_pred = naive_clf.predict(X_test.TWEET)
print("Accuracy of Naive Bayes classifier:\t\t",np.mean(naive_pred == y_test))

Accuracy of Support Vector Machine classifier:	 0.9181818181818182
Accuracy of Naive Bayes classifier:		 0.8334224598930481


In [21]:
from sklearn import metrics
print("\n\nclassification report for Support Vector Machine classifer\n", metrics.classification_report(y_test, svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",metrics.confusion_matrix(y_test, svm_pred))

print("\n\nclassification report for Naive Bayes classifier\n", metrics.classification_report(y_test, naive_pred, target_names=["0","1"]))
print("confusion matrix for Naive Bayes classifier\n",metrics.confusion_matrix(y_test, naive_pred))



classification report for Support Vector Machine classifer
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      6183
           1       0.80      0.71      0.75      1297

    accuracy                           0.92      7480
   macro avg       0.87      0.83      0.85      7480
weighted avg       0.92      0.92      0.92      7480

confusion matrix for Support Vector Machine classifier
 [[5952  231]
 [ 381  916]]


classification report for Naive Bayes classifier
               precision    recall  f1-score   support

           0       0.83      1.00      0.91      6183
           1       0.98      0.04      0.08      1297

    accuracy                           0.83      7480
   macro avg       0.91      0.52      0.49      7480
weighted avg       0.86      0.83      0.76      7480

confusion matrix for Naive Bayes classifier
 [[6182    1]
 [1245   52]]


In [22]:
from sklearn.neural_network import MLPClassifier
neural_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier()),
])
neural_clf.fit(X_train.TWEET, y_train)
neural_pred = neural_clf.predict(X_test.TWEET)
print("Accuracy of Neural Network classifier:\t\t",np.mean(neural_pred == y_test))
print("\n\nclassification report for Neural Network classifier\n", metrics.classification_report(y_test, neural_pred, target_names=["0","1"]))
print("confusion matrix for Neural Network classifier\n",metrics.confusion_matrix(y_test, neural_pred))

Accuracy of Neural Network classifier:		 0.8986631016042781


classification report for Neural Network classifier
               precision    recall  f1-score   support

           0       0.93      0.95      0.94      6183
           1       0.73      0.67      0.70      1297

    accuracy                           0.90      7480
   macro avg       0.83      0.81      0.82      7480
weighted avg       0.90      0.90      0.90      7480

confusion matrix for Neural Network classifier
 [[5854  329]
 [ 429  868]]


In [23]:
from sklearn.svm import SVC
svc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])
svc_clf.fit(X_train.TWEET, y_train)
svc_pred = svc_clf.predict(X_test.TWEET)
print("Accuracy of SVC classifier:\t\t",np.mean(svc_pred == y_test))
print("\n\nclassification report for SVC classifier:\n", metrics.classification_report(y_test, svc_pred, target_names=["0","1"]))
print("confusion matrix for SVC classifier\n",metrics.confusion_matrix(y_test, svc_pred))

Accuracy of SVC classifier:		 0.9185828877005348


classification report for SVC classifier:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      6183
           1       0.82      0.68      0.74      1297

    accuracy                           0.92      7480
   macro avg       0.88      0.82      0.85      7480
weighted avg       0.92      0.92      0.92      7480

confusion matrix for SVC classifier
 [[5987  196]
 [ 413  884]]


In [24]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])
forest_clf.fit(X_train.TWEET, y_train)
forest_pred = forest_clf.predict(X_test.TWEET)
print("Accuracy of Random Forest classifier:\t\t",np.mean(forest_pred == y_test))
print("\n\nclassification report for Random Forest classifier:\n", metrics.classification_report(y_test, forest_pred, target_names=["0","1"]))
print("confusion matrix for Random Forest classifier\n",metrics.confusion_matrix(y_test, forest_pred))

Accuracy of Random Forest classifier:		 0.8989304812834225


classification report for Random Forest classifier:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      6183
           1       0.83      0.53      0.64      1297

    accuracy                           0.90      7480
   macro avg       0.87      0.75      0.79      7480
weighted avg       0.89      0.90      0.89      7480

confusion matrix for Random Forest classifier
 [[6042  141]
 [ 615  682]]
