In [1]:
# Import the dependencies
import os
import pandas as pd
import tensorflow as tf
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the three raw CSV files (training set, test comments, test labels)
# into separate DataFrames, in that order.
df_train, df_test, df_test_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/train.csv",
        "../Data/test.csv",
        "../Data/test_labels.csv",
    )
)

In [3]:
# Preview the first rows of the training frame to check the column layout.
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
# Attach the test labels to the test comments side by side (rows are aligned on
# the index, exactly as a plain axis=1 concat does).
# Columns that df_test already has are left out of the right-hand side, so the
# result carries no duplicate column names and re-running this cell is a no-op
# instead of appending the label columns a second time.
# NOTE(review): this still assumes both CSVs list their rows in the same order — confirm.
label_only_cols = [col for col in df_test_labels.columns if col not in df_test.columns]
df_test = pd.concat([df_test, df_test_labels[label_only_cols]], axis=1)

In [3]:
from tensorflow.keras.layers import TextVectorization

In [4]:
# Separate the raw comment text (model input) from the label columns (targets).
# Everything from the third column onward is treated as a label.
label_frame = df_train.iloc[:, 2:]
X = df_train['comment_text']
Y = label_frame.to_numpy()

In [5]:
# Number of words in the vocabulary (upper bound on distinct tokens kept).
MAX_FEATURES = 200_000

In [6]:
# Text -> integer token-id layer. The vocabulary is capped at MAX_FEATURES
# tokens and every comment is emitted as a fixed-length sequence of 3000 ids.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=3000,
)

In [7]:
# Learn the vocabulary from the training comments.
vectorizer.adapt(X.to_numpy())

In [8]:
# Convert every training comment into its integer token-id sequence.
vectorized_text = vectorizer(X.to_numpy())

In [9]:
# Build the input pipeline: (token ids, labels) pairs -> cache -> shuffle ->
# batches of 16 -> prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, Y))
dataset = dataset.cache()
# Shuffle once and keep that order for every later pass over the data.
# With the default reshuffle_each_iteration=True the order changes each time the
# dataset is iterated, so the take()/skip() train/val/test split derived from it
# would select different examples every epoch and validation/test examples would
# leak into training.
# Trade-off: the training order is now identical in every epoch.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
# Keep up to 8 batches prepared ahead of the consumer.
dataset = dataset.prefetch(8)

In [10]:
# Pull a single (inputs, labels) batch as NumPy arrays for a quick inspection.
first_batch = next(dataset.as_numpy_iterator())
batch_X, batch_Y = first_batch

In [11]:
# Partition the batched dataset into three contiguous slices: 70% / 20% / rest.
# The boundaries are computed once and reused, so the slices line up exactly.
# Truncating each boundary independently (int(n*.7), int(n*.2), int(n*.9),
# int(n*.1)) leaves a gap between val and test and drops the trailing batches.
# NOTE: the slices are only disjoint if `dataset` yields its elements in the
# same order every time it is iterated.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)

# First 70% of the batches
train = dataset.take(n_train)

# Next 20% of the batches
val = dataset.skip(n_train).take(n_val)

# Everything that remains (about 10%), including the final partial batch
test = dataset.skip(n_train + n_val)

In [12]:
# Iterator over the training split that yields each batch as NumPy arrays.
# It is not referenced by the other cells shown in this section.
train_generator = train.as_numpy_iterator()

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [14]:
# Multi-label classifier: embedding -> bidirectional LSTM -> dense head.
model = Sequential([
    # Embedding table with MAX_FEATURES+1 rows, each a 32-dimensional vector.
    Embedding(MAX_FEATURES+1, 32),
    # tanh is kept on purpose: TensorFlow's GPU-accelerated LSTM path requires it.
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature extractor.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Six sigmoid outputs, each an independent value in [0, 1].
    Dense(6, activation='sigmoid'),
])

In [15]:
# Configure training: Adam optimizer with binary cross-entropy loss.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [18]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [19]:
# Train on the `train` split for 20 epochs, evaluating on `val` after each epoch.
# The object returned by fit() is stored in `history`.
history = model.fit(
    train,
    validation_data=val,
    epochs=20,
)

Epoch 1/20
  20/6981 [..............................] - ETA: 3:00:29 - loss: 0.4810

KeyboardInterrupt: 