In [None]:
#!pip install pandas
#!pip install nltk
#!pip install tokenizer

In [2]:
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
comments = pd.read_csv('../../data/attack_annotated_comments.tsv', sep='\t')
labels = pd.read_csv('../../data/attack_annotations.tsv', sep='\t')

In [6]:
comments = comments.drop(columns=['year','logged_in','ns','sample','split'])

In [None]:
comments

In [7]:
labels = labels.drop(columns=['worker_id','quoting_attack','recipient_attack','third_party_attack','other_attack'])

In [None]:
labels



In [8]:
res = labels.groupby('rev_id')['attack'].mean().reset_index()

In [None]:
res

In [9]:
data = comments.join(res.set_index('rev_id'), on='rev_id')
data['attack'] = data['attack'].round(0)

In [None]:
data

In [None]:
data.info()
data.isnull().values.any()

In [10]:
# In out data are newlines, tabulators and quotions marks replaced, so we need to delete this replacement
data['comment'] = data['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN",""))
data['comment'] = data['comment'].apply(lambda x: x.replace("TAB_TOKEN",""))
data['comment'] = data['comment'].apply(lambda x: x.replace("``",'"'))

In [None]:
# Source: https://www.kaggle.com/prashantkikani/pooled-gru-with-preprocessing
repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "'s" : " is",
    "'s" : " is",
    "`d": " would",
    "that's" : "that is",
    "weren't" : "were not",
    "e.g" : "eg",
}

In [None]:
#splitting comments by spaces
comments = data['comment'].apply(lambda x: x.split())

In [None]:
import re
abbr = [i for i in repl.keys()]
result = []
for index in range(len(comments)):
    new_comment = ""
    for word in comments[index]:
        word = word.lower()
        word = re.sub(r"n't", " not", word)
        word = re.sub(r"\ 's", " not", word)
        word = re.sub(r"\ 've", " not", word)
        word = re.sub(r"\ 'd", " not", word)
        word = re.sub(r"\ ll", " not", word)
        if re.search("^http.+|^www.+",word): # deleting links
            continue
        elif word in abbr:
            new_comment += repl[word]
            new_comment += " "
            word = repl[word]
        elif re.search("[^a-zA-Z ]+",word): # only alphabet
            new_comment += re.sub("[^a-zA-Z ]+"," ",word)
            new_comment += " "
        else:
            new_comment += word  
            new_comment += " "
            
    new_comment = " ".join(new_comment.split())        
   # print("Old = ", comments[index])
   # print("New = ", new_comment)
    result.append(new_comment)
data["comment"] = result

In [None]:
pd.options.display.max_rows = 999
data

In [None]:
# replacing empty strings with nan
data['comment'].replace('', np.nan, inplace=True)

In [None]:
# checking nan values on data
data.info()
data.isnull().values.any()

In [None]:
# drop nan rows
data.dropna(subset=['comment'], inplace=True)
data = data.reset_index(drop=True)

# Stemming

In [None]:
from nltk.stem import SnowballStemmer

def stemmer(comment):
    stemmer = SnowballStemmer('english')
    words = comment.split()
    stemmed = [stemmer.stem(word) for word in words]
    stemmed
    return " ".join(stemmed)

t = data['comment'].map(lambda x: stemmer(x))

In [None]:
data['stemmed_comments'] = t

In [None]:
# saving cleaned data
data.to_csv('../../data/preprocessed_data.csv', encoding='utf-8')

# Tokenizing

In [2]:
data= pd.read_csv('../../data/preprocessed_data.csv', index_col=0)

In [3]:
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

In [4]:
tokenizer = Tokenizer()

# Training of our tokenizer
tokenizer.fit_on_texts(data['stemmed_comments'])
#tokenizer.fit_on_texts(data['comment'])

# Number of unique words in our data
vocab_len = len(tokenizer.word_index) + 1

#converting comment into numeric form, each unique word has a number, so comment will be rewrite into numbers
embedded = tokenizer.texts_to_sequences(data['stemmed_comments'])

[13, 8, 11, 2516, 161, 17, 1, 1681, 437, 6, 1, 270, 3234, 5, 1934, 19, 565, 653, 2, 2134, 26, 4, 21, 11, 224, 9, 727, 1208, 397, 3, 42, 227, 83, 439, 402, 1761, 5, 11568, 3963, 5, 93, 10, 42, 15, 1137, 2, 224, 84, 1934, 5, 3234, 17, 209, 5, 84, 256, 209, 34, 2143, 1, 465, 4, 378, 8, 800, 417, 4, 54, 17, 11, 1310, 24, 1, 240, 561, 6, 1790, 173, 309, 556, 7220, 19, 1864, 12, 2673, 4219, 1284, 1, 3234, 610, 47, 1, 439, 402, 1761, 1479, 9, 1271, 1, 3224, 34, 15, 1875, 33, 1, 598, 26, 9, 8, 30, 2001, 727, 3, 43, 546, 9, 2, 290, 29, 6230, 3896, 254, 94, 27, 2516, 39, 10, 3, 318, 1859, 6, 724, 24, 4, 104, 1, 55, 22, 3025, 470, 711, 83, 1768, 5, 45, 8, 67, 52, 6, 2935, 105, 2935, 26, 3, 92, 2, 653, 23, 509, 3, 162, 15, 294, 2935, 2143, 2134, 2, 60, 14, 11, 1375, 2, 7, 5663, 3593, 20, 389, 39, 10, 8, 668, 5, 14, 1140, 1, 15135, 45, 8, 7, 1377, 509, 12, 23, 36, 26, 10, 189, 34, 68, 228, 200, 236, 47, 1313, 1074, 27, 318, 7, 162, 19, 888, 12, 4, 47, 10, 103, 26, 1, 437, 8, 588, 1059, 12, 5, 88, 6

In [5]:
longest_sentence_len = len(max(embedded, key= len))

#Adding zeroes to end of eat embeded sentece to len of the longest sentence
padded_comments = pad_sequences(embedded, longest_sentence_len, padding='post')

# Glove embeddings

In [6]:
glove = open('../../data/glove.6B.50d.txt', encoding="utf8")
from numpy import array
from numpy import asarray

In [7]:
# Train and test data
size_train = round(len(padded_comments)* 0.8)

train_padded = padded_comments[:size_train]
train_attack =asarray(data['attack'][:size_train], dtype='int32')

test_padded = padded_comments[size_train + 1:]
test_attack = asarray(data['attack'][size_train + 1:], dtype='int32')

In [8]:
# Preparing glove dictonary
glove_dict = dict()

#according to glove file, first is word and next are vectors
for line in glove:
    splitted = line.split()
    word = splitted[0]
    vectors = asarray(splitted[1:], dtype='float32')
    glove_dict [word] = vectors 


glove.close()

In [9]:
from numpy import zeros
zle = 0
# create dict with only words in out data
word_matrix = zeros((vocab_len, 50))
# iterate throuh all words in our data and find vector for them
for word, index in tokenizer.word_index.items():
    vectors = glove_dict.get(word)
    if vectors is not None:
        word_matrix[index] = vectors # pretrained word embedings with words from our comments
        

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense
from tensorflow.keras.initializers import Constant

def GRU_model():
    model = Sequential()
    embedding_layer = Embedding(vocab_len,50,embeddings_initializer = Constant(word_matrix),input_length=longest_sentence_len,trainable=False)
    model.add(embedding_layer)
    model.add(GRU(units = 1, dropout=0.2, recurrent_dropout = 0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [11]:
model = GRU_model()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2831, 50)          5603050   
_________________________________________________________________
gru (GRU)                    (None, 1)                 159       
_________________________________________________________________
dense (Dense)                (None, 1)                 2         
Total params: 5,603,211
Trainable params: 161
Non-trainable params: 5,603,050
_________________________________________________________________


In [None]:
#Not working in notebook and docker, trained in google colab, screen of results in model
model.fit(train_padded,train_attack,batch_size=128, epochs=3, validation_data=(test_padded,test_attack),verbose=1)

In [None]:
model.save('GRU_model_1') 