In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import tensorflow as tf

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from time import time

^ Imports

Upload dataset

In [None]:
from google.colab import files

uploaded = files.upload()

Saving hatespeech.csv to hatespeech.csv


Import dataset

In [None]:
dataset = pd.read_csv('hatespeech.csv')
dataset["label"] = dataset["label"].astype('category') # Convert to category
print(dataset.dtypes) # Check data types
dataset.head() # Print first few rows
print("\n")
print(dataset.head()) # Printing first few lines of file
print("\n")
count = dataset['label'].value_counts() # Running count of type freqency
print('Frequency of Hate Speech (1) and No Hate Speech (0):\n\n', count) 

Unnamed: 0       int64
text            object
label         category
dtype: object


   Unnamed: 0                                               text label
0           0  It should be legal for a husband to rape his wife     1
1           1                            dalits are lowlife scum     1
2           2                                dalits are lowlives     1
3           3  It was a better world when women dare not ques...     1
4           4  I don't work this hard so that those immigrant...     1


Frequency of Hate Speech (1) and No Hate Speech (0):

 0    48219
1    24366
Name: label, dtype: int64


In [None]:
dataset.shape

(72585, 3)

In [None]:
dataset.index

RangeIndex(start=0, stop=72585, step=1)

In [None]:
dataset.columns

Index(['Unnamed: 0', 'text', 'label'], dtype='object')

Check for missing values

In [None]:
dataset.isnull().sum()

Unnamed: 0    0
text          0
label         0
dtype: int64

Split data into training and testing data

In [None]:
x = dataset.text
y = dataset.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, train_size = 0.8, random_state = 4)

In [None]:
x_train.shape

(58068,)

In [None]:
x_test.shape

(14517,)

Vectorize text

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopWords = set(stopwords.words('english'))

labels = y_train
true_k = np.unique(labels).shape[0]

t0 = time()

vectorizer = TfidfVectorizer(stop_words=stopWords, 
                     max_features=1000, ngram_range=(1,2))
# Fitting and transforming training data
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test) # Transforming testing data

print(x_test.shape) # shape of test

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % x_train.shape)
print()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
(14517, 1000)
done in 3.408312s
n_samples: 58068, n_features: 1000



**Kmeans Clustering**

Parse arguments to display options in commandline

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

op = OptionParser()
op.add_option("--lsa",
              dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions)"
                   " to extract from text.")
op.add_option("--verbose",
              action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm.")

print(__doc__)
op.print_help()


def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')

if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --lsa=N_COMPONENTS    Preprocess documents with latent semantic analysis.
  --no-minibatch        Use ordinary k-means algorithm (in batch mode).
  --no-idf              Disable Inverse Document Frequency feature weighting.
  --use-hashing         Use a hashing feature vectorizer
  --n-features=N_FEATURES
                        Maximum number of features (dimensions) to extract
                        from text.
  --verbose             Print progress reports inside k-means algorithm.


Account for Jupyter or IPython console

In [None]:
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

Vectorizer results are normalized, which makes KMeans behave as spherical k-means for better results. Since LSA/SVD results are not normalized, we have to redo the normalization.

In [None]:
if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    x_train = lsa.fit_transform(x_train)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()


**Actual Clustering**

Clustering Statistics 

In [None]:
if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(x_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(x_train, km.labels_, sample_size=1000))

print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
                init_size=1000, max_iter=100, max_no_improvement=10,
                n_clusters=2, n_init=1, random_state=None,
                reassignment_ratio=0.01, tol=0.0, verbose=False)
done in 0.097s

Homogeneity: 0.010
Completeness: 0.020
V-measure: 0.013
Adjusted Rand-Index: 0.043
Silhouette Coefficient: 0.015



Display Clusters of words

In [None]:
if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()


Top terms per cluster:
Cluster 0: user love women fucking user user like black day happy one
Cluster 1: people black people black think like world many people think wrong everyone


**ROC Curve**

Find the "True Positive" and "False Positive" rates and plot them

In [1]:
#generate no skill prediction
worst_probs = [0 for _ in range(len(y_test))]
#Prediction probability matrix
km_probs = km.predict_proba(x_test)
# Keep probabilities from positive outcomes
km_probs = km_probs[:, 1] 

# compute AUROC:
worst_auc = roc_auc_score(y_test, worst_probs)
km_auc = roc_auc_score(y_test, km_probs)

# print AUROC Scores:
print('Random chance predicion aka worst scenario: AUROC = %.3f' % (worst_auc))
print('Random MLP: AUROC = %.3f' % (km_auc))

# calculate roc curves
worst_fpr, worst_tpr, _ = roc_curve(y_test, worst_probs)
km_fpr, km_tpr, _ = roc_curve(y_test, km_probs)

# plot roc curves
plt.plot(worst_fpr, worst_tpr, linestyle='--', label='Random Prediction (AUROC = %0.3f)' % worst_auc)
plt.plot(km_fpr, km_tpr, marker='.', label='Multilayer Perceptron (AUROC = %0.3f)' % km_auc)
plt.title('ROC Plot') # Title
plt.xlabel('False Positive Rate') # X-axis label
plt.ylabel('True Positive Rate') # Y-axis label
plt.legend() # Show Legend
plt.show() # Show Plot