# Imports

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Reading Dataset

In [2]:
import pandas as pd
df = pd.read_csv("traindata.csv")
df["tweet_content"].head()

0    pics mexican school girl naked melanie safka f...
1    RT shereiqns Having curly hair is a gotdamn ch...
2     BreakingNews 7 Unexplained Prisoners Deaths M...
3     girl dirty quarterback latino pokemon young h...
4     miliondollameat sexting n ngirl daddy n nme l...
Name: tweet_content, dtype: object

In [3]:
df["tweet_content"] = df["tweet_content"].str.lower()
df["tweet_content"] = df["tweet_content"].str.replace(r"[^a-z]+"," ")
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_content,harassment,IndirectH,PhysicalH,SexualH
0,0,pics mexican school girl naked melanie safka f...,1,0,0,1
1,1,rt shereiqns having curly hair is a gotdamn ch...,0,0,0,0
2,2,breakingnews unexplained prisoners deaths mur...,0,0,0,0
3,3,girl dirty quarterback latino pokemon young h...,1,0,0,1
4,4,miliondollameat sexting n ngirl daddy n nme l...,1,0,0,1


## Store our dataset into a list

In [4]:
data = list(df["tweet_content"])
len(data)

6374

## Store our labels into a list

In [5]:
labels = list(df["harassment"])
len(labels)

6374

# Tokenizing our data

In [6]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(data)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'x': 2, 't': 3, 'a': 4, 'i': 5, 'amp': 6, 'co': 7, 'the': 8, 'slut': 9, 'rt': 10, 'to': 11, 's': 12, 'you': 13, 'and': 14, 'ava': 15, 'bitch': 16, 'is': 17, 'in': 18, 'for': 19, 'of': 20, 'my': 21, 'me': 22, 'she': 23, 'it': 24, 'that': 25, 'n': 26, 'm': 27, 'this': 28, 'girl': 29, 'girls': 30, 'on': 31, 'her': 32, 'like': 33, 'with': 34, 'd': 35, 'sassy': 36, 'fucking': 37, 'fuck': 38, 'all': 39, 'but': 40, 'be': 41, 'can': 42, 'so': 43, 'just': 44, 'are': 45, 'don': 46, 'about': 47, 'porn': 48, 'who': 49, 'if': 50, 'f': 51, 'was': 52, 'pussy': 53, 'what': 54, 'have': 55, 'out': 56, 'up': 57, 'we': 58, 'people': 59, 'your': 60, 'not': 61, 'when': 62, 'he': 63, 'being': 64, 'video': 65, 'at': 66, 're': 67, 'do': 68, 'cumshot': 69, 'they': 70, 'u': 71, 'little': 72, 'chopped': 73, 'ass': 74, 'love': 75, 'one': 76, 'as': 77, 'really': 78, 'shaming': 79, 'get': 80, 'sex': 81, 'or': 82, 'y': 83, 'horny': 84, 'because': 85, 'xxx': 86, 'videos': 87, 'no': 88, 'how': 89, 'their':

In [7]:
len(set(word_index.keys()))

17479

# Sequencing our data

In [8]:
sequences = tokenizer.texts_to_sequences(data)
print(sequences)

[[453, 1375, 151, 29, 98, 6341, 6342, 551, 133, 628, 347, 9, 299, 1817, 1228, 48, 728, 3816, 75], [10, 6343, 210, 6344, 477, 17, 4, 6345, 6346, 68, 61, 182, 8, 6347, 2217, 13, 516], [6348, 6349, 6350, 3817, 1229, 6, 6351, 6352, 6353, 6354, 6355, 6356], [29, 288, 2796, 2797, 2218, 154, 598, 1818, 338, 105, 2, 3, 7, 6357, 2219, 3818], [6358, 329, 26, 6359, 198, 26, 651, 454, 16, 156, 206, 115, 54], [1376, 1376, 2220, 2220, 17, 3819, 1018, 17, 821, 23, 117, 517, 11, 395, 19, 4, 266, 11, 552, 6, 12], [2798, 9, 50, 13, 108, 1819, 58, 42, 68, 24, 110, 5, 149, 518], [10, 3820, 6360, 5, 55, 3821, 652, 685, 9, 3822, 16, 629, 2799, 2221, 310, 68, 13, 55, 54, 24, 630, 42, 13, 330, 57, 11, 21], [129, 362, 599, 28, 76, 316, 3, 99, 66, 246, 6361, 6, 2222, 6362, 6, 6363], [10, 3823, 435, 6364, 930, 19, 64, 21, 255, 278, 6, 553, 1564, 18, 8, 74, 13, 67, 142, 11, 68, 272, 436, 14, 5, 27, 631, 11], [6365, 42, 2, 3, 2223, 1102, 3, 7, 3824, 3825], [3826, 6366, 686, 1377, 2800, 2, 12, 3827, 5, 2, 27, 37, 3

# Padding our sequences
## to make all sequences with same length and ready for our NN model

In [9]:
padded = pad_sequences(sequences, padding="post")
print(padded)

[[  453  1375   151 ...     0     0     0]
 [   10  6343   210 ...     0     0     0]
 [ 6348  6349  6350 ...     0     0     0]
 ...
 [   18   188    34 ...     0     0     0]
 [17473 17474 17475 ...     0     0     0]
 [  128   124    65 ...     0     0     0]]


# Splitting our padded dataset into 2 segments (Train set and Validation set)

In [31]:
train_data = padded[: -int(0.1*(len(padded)))]
test_data = padded[-int(0.1*(len(padded))):]
train_labels = labels[: -int(0.1*(len(padded)))]
test_labels = labels[-int(0.1*(len(padded))):]

len(train_data), len(test_data), len(train_labels), len(test_labels)

(5737, 637, 5737, 637)

# Creating our Keras model

In [32]:
vocab_size = 17000
max_len = 103
embedding_dim = 8

binary_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# Compiling the model

In [33]:
binary_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Model Summary

In [34]:
binary_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 103, 8)            136000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 824)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 4950      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 140,957
Trainable params: 140,957
Non-trainable params: 0
_________________________________________________________________


# Training our model

In [35]:
num_epochs = 10
binary_model.fit(train_data, train_labels, epochs=num_epochs, validation_data=(test_data, test_labels))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 5737 samples, validate on 637 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7faad3d20c50>

In [36]:
from sklearn.metrics import classification_report

In [37]:
preds = binary_model.predict_classes(train_data)
preds

array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int32)

In [38]:
print(classification_report(y_pred=preds, y_true=train_labels))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3303
           1       0.99      1.00      1.00      2434

   micro avg       1.00      1.00      1.00      5737
   macro avg       1.00      1.00      1.00      5737
weighted avg       1.00      1.00      1.00      5737



In [39]:
def classify_binary(sentence):
    classes = {0: "non-harassment", 1: "harassment"}
    seq = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, padding="post", maxlen=103)
    return classes[binary_model.predict_classes(padded)[0][0]]

In [52]:
classify_binary("fuck you fucking dumb")

'harassment'

In [53]:
classify_binary("I love you")

'non-harassment'

# Preparing our harassment data for Categorical Classification

In [54]:
X = []
y = []

for sentence, h, indirect, physical, sexual in zip(df["tweet_content"], df["harassment"],df["IndirectH"],df["PhysicalH"], df["SexualH"]):
    if h:
        X.append(sentence)
        if indirect:
            y.append(0)
        elif physical:
            y.append(1)
        else:
            y.append(2)

In [55]:
len(X), len(y)

(2713, 2713)

## One Hot Encoding our labels

In [56]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
y = np.array(y).reshape(-1,1)
enc.fit(y)
y = enc.transform(y).toarray()

In [57]:
y

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

## Tokenizing our data

In [58]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'slut': 2, 't': 3, 'a': 4, 'x': 5, 'bitch': 6, 'i': 7, 'co': 8, 'you': 9, 'the': 10, 'rt': 11, 'to': 12, 'and': 13, 's': 14, 'girl': 15, 'me': 16, 'girls': 17, 'is': 18, 'for': 19, 'my': 20, 'amp': 21, 'she': 22, 'that': 23, 'in': 24, 'porn': 25, 'like': 26, 'n': 27, 'pussy': 28, 'of': 29, 'fucking': 30, 'it': 31, 'm': 32, 'her': 33, 'fuck': 34, 'cumshot': 35, 'with': 36, 'this': 37, 'on': 38, 'all': 39, 'ass': 40, 'video': 41, 'be': 42, 'but': 43, 'if': 44, 'xxx': 45, 'can': 46, 'horny': 47, 'sex': 48, 'ava': 49, 'videos': 50, 'naked': 51, 'who': 52, 'your': 53, 'just': 54, 'shaming': 55, 'don': 56, 'are': 57, 'so': 58, 'd': 59, 'u': 60, 'do': 61, 'bitches': 62, 'not': 63, 'sassy': 64, 'he': 65, 'have': 66, 'about': 67, 'whore': 68, 'being': 69, 'was': 70, 're': 71, 'when': 72, 'nude': 73, 'dick': 74, 'what': 75, 'get': 76, 'we': 77, 'up': 78, 'or': 79, 'porno': 80, 'nudes': 81, 'little': 82, 'out': 83, 'no': 84, 'women': 85, 'free': 86, 'hot': 87, 'f': 88, 'more': 89, 'c

In [59]:
len(set(word_index.keys()))

8273

## Sequencing our data

In [60]:
sequences = tokenizer.texts_to_sequences(X)
print(sequences)

[[314, 1056, 152, 15, 51, 2859, 2860, 452, 73, 422, 190, 2, 205, 890, 680, 25, 522, 1745, 97], [15, 158, 1746, 1304, 1057, 107, 315, 1305, 170, 146, 5, 3, 8, 2861, 1306, 2862], [2863, 350, 27, 2864, 142, 27, 681, 453, 6, 154, 215, 171, 75], [1747, 2, 44, 9, 108, 1748, 77, 46, 61, 31, 132, 7, 159, 891], [11, 2865, 2866, 7, 66, 1749, 560, 454, 2, 1750, 6, 682, 1307, 1751, 238, 61, 9, 66, 75, 31, 523, 46, 9, 283, 78, 12, 20], [1752, 2867, 776, 1058, 1753, 5, 14, 1754, 7, 5, 32, 30, 1755, 316, 2868, 5], [777, 16, 38, 395, 13, 215, 16, 91, 2869, 1308, 73, 81, 395, 17, 103, 28, 47], [11, 1059, 65, 149, 16, 12, 297, 172, 74, 111, 297, 74, 31, 14, 20, 892, 2, 40, 74, 109, 196, 317, 170, 298], [11, 366, 351, 455, 7, 128, 61, 31, 19, 9, 133, 185, 216, 165, 217, 250, 12, 218, 299, 12, 17, 2], [11, 173, 62, 380, 53, 272, 82, 1060, 38, 166, 1756, 524, 53, 109, 2870, 115, 2871, 166, 1061, 13, 210, 524, 9, 1757, 2872], [2873, 46, 3, 6, 3, 8, 1758], [683, 2874, 318, 27, 3, 8, 2875, 2876, 684, 35, 205,

## Padding our Sequences

In [61]:
padded = pad_sequences(sequences, padding="post")
print(padded)

[[ 314 1056  152 ...    0    0    0]
 [  15  158 1746 ...    0    0    0]
 [2863  350   27 ...    0    0    0]
 ...
 [  37    6  131 ...    0    0    0]
 [  24  470   36 ...    0    0    0]
 [ 303   86   41 ...    0    0    0]]


## Splitting our dataset into Train set & Validation set

In [62]:
train_data = padded[: -int(0.15*(len(padded)))]
test_data = padded[-int(0.15*(len(padded))):]
train_labels = y[: -int(0.15*(len(padded)))]
test_labels = y[-int(0.15*(len(padded))):]

len(train_data), len(test_data), len(train_labels), len(test_labels)

(2307, 406, 2307, 406)

In [63]:
len(padded[0])

32

## Creating our Model for Categorical Classification

In [64]:
vocab_size = 8000
max_len = 32
embedding_dim = 8

categorical_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(3, activation="softmax")
])

## Compiling our model

In [65]:
categorical_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [66]:
num_epochs = 10
categorical_model.fit(train_data, train_labels, epochs=num_epochs, validation_data=(test_data, test_labels))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2307 samples, validate on 406 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fab023c5cc0>

## Categorical Classification

In [67]:
def classify_categorical(sentence):
    classes = {0:"Indirect Harassment",1:"Physical Harassment", 2:"Sexual Harassment"}
    seq = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, padding="post", maxlen=32)
    return classes[categorical_model.predict_classes(padded)[0]]

In [68]:
classify_categorical("fuck you")

'Sexual Harassment'

In [73]:
classify_categorical("The woman brutally tortured minor servant starved MAJOR AMMARA")

'Physical Harassment'

## Full classification

In [79]:
def classify(sentence):
    h = classify_binary(sentence)
    if h == "harassment":
        h_type = classify_categorical(sentence)
        return h + ": " + h_type
    else:
        return h

In [80]:
sample = "I hate you and I will kill you with my gun and bury your body"
classify(sample)

'harassment: Sexual Harassment'

In [83]:
sample = "too much love"
classify(sample)

'non-harassment'

## Saving predictions to CSV

In [84]:
def get_preds(sentence):
    pred = []
    classes = {"Indirect Harassment":[1,0,0], "Physical Harassment":[0,1,0], "Sexual Harassment":[0,0,1]}
    h = classify_binary(sentence)
    if h == "harassment":
        pred += [sentence, 1]
        h_type = classify_categorical(sentence)
        pred += classes[h_type]
    else:
        pred = [sentence,0,0,0,0]
    return pred

In [85]:
get_preds("too much love")

['too much love', 0, 0, 0, 0]

In [87]:
get_preds("I will screw you")

['I will screw you', 1, 0, 0, 1]

In [90]:
preds = []

for sent in df["tweet_content"]:
    preds.append(get_preds(sent))

In [91]:
import csv

with open('results.csv', 'w') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(preds)