<h1>Classifying Harassment Demo</h1>

 <h2>Required imports</h2>

In [20]:
#Import dependencies
import sqlite3
from sklearn import svm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import math
import warnings
warnings.filterwarnings("ignore")

In [21]:
def get_labels(df):
    labels = []
    for i in range(len(df)):
        if math.isnan(df.iloc[i].ISHARASSMENT):
            labels.append(int(df.iloc[i].AUTO_ISHARASSMENT))
        else:
            labels.append(int(df.iloc[i].ISHARASSMENT))
    return labels

<h2>Retrieving Tweet datasets</h2>

In [22]:
#Connect to DB
conn = sqlite3.connect("../../etc/database_store/auto_tweets.db")
#Obtain all labelled tweets from DB
auto_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE AUTO_ISHARASSMENT IS NOT NULL OR ISHARASSMENT IS NOT NULL")
man_data = pd.read_sql(con=conn,sql="SELECT * FROM TWEETS WHERE ISHARASSMENT IS NOT NULL")
print("number of automatically labelled tweets: ", len(auto_data))
print("number of manually labelled tweets: ", len(man_data))

number of automatically labelled tweets:  22261
number of manually labelled tweets:  512


In [23]:
auto_labels = get_labels(auto_data)
man_labels = get_labels(man_data)

In [24]:
auto_X_train, auto_X_test, auto_y_train, auto_y_test  = train_test_split(auto_data,np.asarray(auto_labels),train_size=0.66,random_state=1234)
man_X_train, man_X_test, man_y_train, man_y_test  = train_test_split(man_data, np.asarray(man_labels),train_size=0.66,random_state=3)

<h2>Creating classifier pipelines to later apply on datasets</h2>

In [25]:
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

In [26]:
svc_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

<h2>Train on both datasets and produce predictions</h2>

<h3>Performance of classifiers trained on Full dataset</h3>

In [27]:
#Full dataset with automated labels
svm_clf.fit(auto_X_train.TWEET, auto_y_train)
auto_svm_pred = svm_clf.predict(auto_X_test.TWEET)
svc_clf.fit(auto_X_train.TWEET, auto_y_train)
auto_svc_pred = svc_clf.predict(auto_X_test.TWEET)

In [28]:
print("\n\nclassification report for Support Vector Machine classifer\n", classification_report(auto_y_test, auto_svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",confusion_matrix(auto_y_test, auto_svm_pred))
print("\n\nclassification report for SVC classifier:\n", classification_report(auto_y_test, auto_svc_pred, target_names=["0","1"]))
print("confusion matrix for SVC classifier\n",confusion_matrix(auto_y_test, auto_svc_pred))



classification report for Support Vector Machine classifer
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      6262
           1       0.76      0.69      0.72      1307

    accuracy                           0.91      7569
   macro avg       0.85      0.82      0.83      7569
weighted avg       0.91      0.91      0.91      7569

confusion matrix for Support Vector Machine classifier
 [[5982  280]
 [ 409  898]]


classification report for SVC classifier:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95      6262
           1       0.78      0.66      0.72      1307

    accuracy                           0.91      7569
   macro avg       0.86      0.81      0.83      7569
weighted avg       0.91      0.91      0.91      7569

confusion matrix for SVC classifier
 [[6025  237]
 [ 447  860]]


<h3>Performance of classifiers trained on manually labelled dataset</h3>

In [29]:
#Only manually labelled tweets
svm_clf.fit(man_X_train.TWEET, man_y_train)
man_svm_pred = ssvm_clf.predict(man_X_test.TWEET)
svc_clf.fit(man_X_train.TWEET, man_y_train)
man_svc_pred = svc_clf.predict(man_X_test.TWEET)

NameError: name 'ssvm_clf' is not defined

In [None]:
print("\n\nclassification report for Support Vector Machine classifer\n", classification_report(man_y_test, man_svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",confusion_matrix(man_y_test, man_svm_pred))
print("\n\nclassification report for SVC classifier:\n", classification_report(man_y_test, man_svc_pred, target_names=["0","1"]))
print("confusion matrix for SVC classifier\n",confusion_matrix(man_y_test, man_svc_pred))

<h3>Giving the classifier trained on a smaller amount of data the benefit of the doubt, make predictions on the larger dataset

In [None]:
bigman_svm_pred = svm_clf.predict(auto_X_test.TWEET)
bigman_svc_pred = svc_clf.predict(auto_X_test.TWEET)

In [None]:
print("\n\nclassification report for Support Vector Machine classifer\n", classification_report(auto_y_test, bigman_svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",confusion_matrix(auto_y_test, bigman_svm_pred))
print("\n\nclassification report for SVC classifier:\n", classification_report(auto_y_test, bigman_svc_pred, target_names=["0","1"]))
print("confusion matrix for SVC classifier\n",confusion_matrix(auto_y_test, bigman_svc_pred))

<h3>Testing Classifier trained on larger dataset on manual label dataset</h3>

In [14]:
#Only manually labelled tweets
svm_clf.fit(auto_X_train.TWEET, auto_y_train)
autosmall_svm_pred = svm_clf.predict(man_X_test.TWEET)
svc_clf.fit(auto_X_train.TWEET, auto_y_train)
autosmall_svc_pred = svc_clf.predict(man_X_test.TWEET)

In [15]:
print("\n\nclassification report for Support Vector Machine classifer\n", classification_report(man_y_test, autosmall_svm_pred, target_names=["0","1"]))
print("confusion matrix for Support Vector Machine classifier\n",confusion_matrix(man_y_test, autosmall_svm_pred))
print("\n\nclassification report for SVC classifier:\n", classification_report(man_y_test, autosmall_svc_pred, target_names=["0","1"]))
print("confusion matrix for SVC classifier\n",confusion_matrix(man_y_test, autosmall_svc_pred))



classification report for Support Vector Machine classifer
               precision    recall  f1-score   support

           0       0.87      0.95      0.91       131
           1       0.81      0.59      0.68        44

    accuracy                           0.86       175
   macro avg       0.84      0.77      0.80       175
weighted avg       0.86      0.86      0.86       175

confusion matrix for Support Vector Machine classifier
 [[125   6]
 [ 18  26]]


classification report for SVC classifier:
               precision    recall  f1-score   support

           0       0.89      0.98      0.93       131
           1       0.93      0.64      0.76        44

    accuracy                           0.90       175
   macro avg       0.91      0.81      0.85       175
weighted avg       0.90      0.90      0.89       175

confusion matrix for SVC classifier
 [[129   2]
 [ 16  28]]


NameError: name 'labeling_function' is not defined

In [None]:
#instantiate labelling functions
lfs = [names_or_places,simple_insults,short_insult_tweet,term_to_person,descriptive_bad,against_nature,transphobic_statements,death_threats, gay_disease,full_caps,trigger_warning,bender_as_drunk,safe_bender,is_drawing,slang_using_cigarettes, safe_batty, innocent_question,term_in_quotations, received_term, slur_in_handles,has_pronouns]
# has_image
applier = LFApplier(lfs=lfs)
pdapplier = PandasLFApplier(lfs=lfs)