In [1]:
#@title Load libraries

import os
import pandas as pd
import numpy as np
import re
import tensorflow as tf
import keras.backend as K
from tensorflow import keras


from json import loads
from cleantext import clean
from hazm import Normalizer, Lemmatizer, word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Embedding, Dropout, BatchNormalization, Flatten
from tensorflow.keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, LSTM, GRU, Bidirectional, SimpleRNN
from tensorflow.keras.layers import multiply, Input, Concatenate
from tensorflow.keras.optimizers import Adam, schedules
from tensorflow.keras.regularizers import l2
from tensorflow.keras import losses, metrics

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [18]:
import tensorflow as tf
import numpy as np
import re
from tensorflow.keras.layers import TextVectorization
from cleantext import clean
from hazm import Normalizer, Lemmatizer, word_tokenize
import pandas as pd
import string

In [19]:
test_dir = 'nicknames.csv'
train_dir = 'data_with_embed.csv'

df_train = pd.read_csv(train_dir)
df_test = pd.read_csv(test_dir)

df_test = df_test.rename(columns={"nick_name":"text"})

In [29]:
raw_train_ds = df_train.apply(tuple, axis=1).tolist()

In [20]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def cleaning(text):

    # regular cleaning
    text = clean(text,
      fix_unicode=True,
      to_ascii=False,
      lower=True,
      no_line_breaks=True,
      no_urls=True,
      no_emails=True,
      no_phone_numbers=True,
      no_numbers=False,
      no_digits=False,
      no_currency_symbols=True,
      no_punct=False,
      replace_with_url="",
      replace_with_email="",
      replace_with_phone_number="",
      replace_with_number="",
      replace_with_digit="0",
      replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    normalizer = Normalizer()
    text = normalizer.normalize(text)

    # removing wierd patterns
    wierd_pattern = re.compile("["
      u"\U0001F600-\U0001F64F"  # emoticons
      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
      u"\U0001F680-\U0001F6FF"  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
      u"\U00002702-\U000027B0"
      u"\U000024C2-\U0001F251"
      u"\U0001f926-\U0001f937"
      u'\U00010000-\U0010ffff'
      u"\u200d"
      u"\u2640-\u2642"
      u"\u2600-\u2B55"
      u"\u23cf"
      u"\u23e9"
      u"\u231a"
      u"\u3030"
      u"\ufe0f"
      u"\u2069"
      u"\u2066"
      # u"\u200c"
      u"\u2068"
      u"\u2067"
      "]+", flags=re.UNICODE)

    text = wierd_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)

    return text

In [21]:
text_ds = df_train['text'].tolist()

In [22]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )


# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

# Now that we have our custom standardization, we can instantiate our text
# vectorization layer. We are using this layer to normalize, split, and map
# strings to integers, so we set our 'output_mode' to 'int'.
# Note that we're using the default split function,
# and the custom standardization defined above.
# We also set an explicit maximum sequence length, since the CNNs later in our
# model won't support ragged sequences.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Now that the vocab layer has been created, call `adapt` on a text-only
# dataset to create the vocabulary. You don't have to batch, but for very large
# datasets this means you're not keeping spare copies of the dataset in memory.

# Let's make a text-only dataset (no labels):
text_ds = raw_train_ds.map(lambda x, y: x)
# Let's call `adapt`:
vectorize_layer.adapt(text_ds)

In [31]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# Vectorize the data.
train_ds = raw_train_ds
# val_ds = raw_val_ds.map(vectorize_text)
# test_ds = raw_test_ds.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
# val_ds = val_ds.cache().prefetch(buffer_size=10)
# test_ds = test_ds.cache().prefetch(buffer_size=10)


AttributeError: 'list' object has no attribute 'cache'

In [None]:
from tensorflow.keras import layers

# A integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [4]:
test_dir = 'nicknames.csv'
train_dir = 'data_with_embed.csv'

df_train = pd.read_csv(train_dir)
df_test = pd.read_csv(test_dir)

df_test = df_test.rename(columns={"nick_name":"text"})

In [6]:
df_train

Unnamed: 0,is_offensive,text,embed
0,0,Then go to the village pump and suggest they c...,"[0.012799851, -0.005207469, 0.043878723, 0.015..."
1,1,ANTI GREEK NATIONALIS -WIKIPEDIA \n\nHi Alexik...,"[-0.04229796, 0.017758716, 0.03235738, -0.0264..."
2,1,Dis hoe wasnt dis violent on Lottery Ticket 😂😂,"[0.04530848, 0.017994655, 0.0020758153, 0.0274..."
3,0,It is better for Atabay not helping the banned...,"[-0.032195956, 0.04678882, 0.036112364, 0.0550..."
4,0,"""is in CamelCase. """"SiCKO"""" is not CamelCase,...","[0.04661622, 0.0054911007, 0.05259348, -0.0218..."
...,...,...,...
221202,1,wnker,"[0.008764465, 0.003481107, -0.01818306, -0.028..."
221203,1,wop,"[0.04365152, 0.022549827, -0.021462098]"
221204,1,wophead,"[0.02341805, 0.024824617, -0.0039961617, -0.01..."
221205,1,zip in the wire,"[-0.018007062, -0.011607238, -0.076187894, 0.0..."


In [7]:
df_offensive = df_train[df_train['is_offensive'] == 1]

In [8]:
df_offensive.shape

(43654, 3)

In [11]:
df_not_offensive = df_train[df_train['is_offensive'] == 0][:43654]

In [12]:
df_not_offensive.shape

(43654, 3)

In [13]:
df_train = pd.concat([df_offensive, df_not_offensive], ignore_index=True)

In [15]:
df_train = df_train.sample(frac=1).reset_index(drop=True)

In [16]:
df_train

Unnamed: 0,is_offensive,text,embed
0,1,fuckknob,"[0.037752267, -0.0022094315, 0.014026786, -0.0..."
1,0,yo \n\nyou could at least reply to my message...,"[-0.039693847, 0.025858853, -0.010963443, 0.00..."
2,1,Fuck bitches get money mission complete!,"[0.097139135, -0.053507056, 0.06915419, 0.0040..."
3,1,Can I have a bad bitch without no flaws?\nCome...,"[-0.030658739, -0.01667491, 0.024269698, -0.01..."
4,0,This article and the information that it conta...,"[0.02879249, 0.009228323, -0.017284226, -0.006..."
...,...,...,...
87303,1,"How do you go to sleep at night bitch, if I'm ...","[-0.03246857, -0.036685046, 0.019301916, -0.01..."
87304,0,I suppose that current Wikimedia RU's represen...,"[0.034710474, 0.043651644, 0.07749149, -0.0219..."
87305,1,"Black Kite, fuck you embesil. What vandalisati...","[-0.0010581572, 0.01095422, 0.035655998, -0.01..."
87306,1,بپرسرش,"[0.0069294553, -0.03663931, -0.045679964, -0.0..."


In [28]:
df_train['embed'] = df_train['embed'].astype(object)

In [37]:
df_train['embed'][20]

'[-0.019798907, 0.06992075, 0.03906029, -0.010753411, -0.020354515, -0.050790906, -0.08109843, 0.025596768, -0.05479451, 0.059618037, 0.019742906, 0.011374324, -0.039813988, 0.004966542, 0.032846, 0.04050419, 0.055378612, -0.013902562, -0.012338051, 0.007263977, 0.032679245, -0.019076856, 0.021666853, 0.01541229, 0.039753374, 0.024381626, -0.020589316, -0.009773094, -0.06343269, 0.018820241, 0.025423925, -0.10522496, -0.05043286, 0.07265367, -0.054062605, 0.039737824, -0.08000645, 0.042770747, -0.015547166, 0.018783294, 0.040337183, 0.041852232, -0.05955996, 0.02838242, -0.023051413, -0.028263032, -0.03970317, 0.0096473675, 0.057353202, -0.04391071, 0.03188073, -0.0023838321, 0.01218581, -0.09811522, -0.003161731, -0.009085218, -0.012346759, -0.07657337, -0.008177076, 0.043865263, 0.081729956, 0.0004152057, 0.006713048, 0.015672114, -0.07011687]'

In [34]:
import ast
x=ast.literal_eval(df_train['embed'][0])
x

[0.037752267,
 -0.0022094315,
 0.014026786,
 -0.066830605,
 -0.07979133,
 -0.009551473,
 0.017095307,
 -0.068690866]

In [33]:
len(x)

8

In [17]:
samples = df_train['embed'].tolist()
labels = df_train['is_offensive'].tolist()

In [18]:
validation_split = 0.1
num_validation_samples = int(validation_split * len(samples))
train_samples = samples[:-num_validation_samples]
val_samples = samples[-num_validation_samples:]
train_labels = labels[:-num_validation_samples]
val_labels = labels[-num_validation_samples:]

array('[0.037752267, -0.0022094315, 0.014026786, -0.066830605, -0.07979133, -0.009551473, 0.017095307, -0.068690866]',
      dtype='<U109')

In [7]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

In [8]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'to', 'you']

In [9]:
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

array([   2, 2087, 6091,   15,    2,    1])

In [10]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [11]:
fastTextDir = '.'
fastText_fa_path = os.path.join(fastTextDir, 'cc.fa.300.vec')
fastText_en_path = os.path.join(fastTextDir, 'cc.en.300.vec')

In [12]:
embeddings_index = {}

with open(fastText_fa_path) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

with open(fastText_en_path) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 3745941 word vectors.


In [13]:
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18496 words (1504 misses)


In [14]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)


In [15]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(2, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 300)         6000600   
                                                                 
 conv1d (Conv1D)             (None, None, 128)         192128    
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 128)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, None, 128)        0         
 1D)                                                         

In [16]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)


In [18]:
x_train.shape

(72251, 200)

In [20]:
y_train.shape

(72251,)

In [36]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7faf7e8a7cd0>

In [45]:
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probabilities = end_to_end_model.predict(
    [["کیرم دهنت"]]
)

np.argmax(probabilities[0])



0

In [5]:
df_train['text'] = df_train['text'].astype(str)
df_test['text'] = df_test['text'].astype(str)

df_train['text'] = df_train['text'].apply(cleaning)
df_test['text'] = df_test['text'].apply(cleaning)

train_data, val_data = train_test_split(df_train, test_size=0.2)

In [6]:
def get_dict(df):
    wordDict = {}
    for idx, row in enumerate(df.text):
        row = re.split(r'([a-zA-Z]+)', row)
        row = " ".join(str(item) for item in row)
        words = row.split()
        for wrd in words:
            if wrd in wordDict:
                wordDict[wrd] += 1
            else:
                wordDict[wrd] = 1
    return wordDict

train_wordDict = get_dict(df_train)
test_wordDict = get_dict(df_test)

In [7]:
fastTextDir = '.'
fastText_fa_path = os.path.join(fastTextDir, 'cc.fa.300.vec')
fastText_en_path = os.path.join(fastTextDir, 'cc.en.300.vec')

def get_embedding(wordDict):
    embeddings_index = {}
    with open(fastText_fa_path, encoding='utf8') as infile:
        for line in infile:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except:
                print("Warnning"+str(values)+" in" + str(line))
            if word in wordDict:
                embeddings_index[word] = coefs

    with open(fastText_en_path, encoding='utf8') as infile:
        for line in infile:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except:
                print("Warnning"+str(values)+" in" + str(line))
            if word in wordDict:
                embeddings_index[word] = coefs
    return embeddings_index


In [8]:
train_embeddings = get_embedding(train_wordDict)
test_embeddings = get_embedding(test_wordDict)

In [9]:
MAX_NB_WORDS = 5500
MAX_SEQUENCE_LENGTH = 350

content_train = df_train['text']
content_test = df_test['text']

y_train = np.array(df_train['is_offensive'])

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(content_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(content_train)
test_sequences = tokenizer.texts_to_sequences(content_test)

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [10]:
def prepare_test(sentence):
    test_sequences = tokenizer.texts_to_sequences([sentence])
    test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return test_data

In [11]:
EMBEDDING_DIM = 300
embeddings_index = train_embeddings
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [12]:
nClasses = 2

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ))
embedded_sequences = embedding_layer(sequence_input)

x = BatchNormalization()(embedded_sequences)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(256, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Flatten()(x)
x = Dense(256, activation='relu')(x)
preds = Dense(nClasses, activation='softmax')(x)
model = Model(sequence_input, preds)

optimizer = Adam(learning_rate=5e-3, beta_1=0.9, beta_2=0.999, 
                epsilon=1e-07, amsgrad=False)

model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer='rmsprop',
                metrics=metrics.SparseCategoricalAccuracy('accuracy'))

In [13]:
BATCH_SIZE = 64
EPOCHS =   20 


model.fit(train_data, y_train.astype(float),
        validation_split=0.2, 
        epochs=EPOCHS,
        batch_size=BATCH_SIZE)
model.save(('nickname.h5'))

Epoch 1/20


  output, from_logits = _get_logits(


Epoch 2/20
Epoch 3/20

KeyboardInterrupt: 