In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
ls drive/MyDrive/jigsaw-toxic-comment-classification-challenge/

[0m[01;34msample_submission.csv[0m/     test.csv.zip         train.csv.zip
sample_submission.csv.zip  [01;34mtest_labels.csv[0m/     X_test.pickle
simple_model.h5            test_labels.csv.zip  X_train.pickle
[01;34mtest.csv[0m/                  [01;34mtrain.csv[0m/


In [3]:
%cd drive/MyDrive/jigsaw-toxic-comment-classification-challenge/

/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge


In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
train = pd.read_csv('train.csv/train.csv')


In [6]:
test = pd.read_csv('test.csv/test.csv')

In [7]:
test_labels = pd.read_csv('test_labels.csv/test_labels.csv')

In [8]:
train.shape

(159571, 8)

In [9]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Check for Missing values - no missing values found

In [10]:
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [11]:
X_train = train.iloc[:,1]
Y_train = train.iloc[:,2:]

In [12]:
X_test = test.iloc[:,1]
Y_test = test_labels.iloc[:,1:]

In [13]:
sentences = np.asarray(X_train)

**STEP 1: REMOVING CONTRACTIONS**

In [14]:
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
}

In [15]:
def expand_contractions(sentences):
  contractions_re = re.compile('(%s)'%'|'.join(CONTRACTION_MAP.keys()))
  def exp_cont(s, contractions_dict=CONTRACTION_MAP):
    def replace(match):
      return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)
  for i in range(len(sentences)):
    sentences[i] = exp_cont(sentences[i])


In [16]:
sentences

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [17]:
# expand_contractions(sentences)

In [18]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')
word_tokenize(sentences[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Explanation',
 'Why',
 'the',
 'edits',
 'made',
 'under',
 'my',
 'username',
 'Hardcore',
 'Metallica',
 'Fan',
 'were',
 'reverted',
 '?',
 'They',
 'were',
 "n't",
 'vandalisms',
 ',',
 'just',
 'closure',
 'on',
 'some',
 'GAs',
 'after',
 'I',
 'voted',
 'at',
 'New',
 'York',
 'Dolls',
 'FAC',
 '.',
 'And',
 'please',
 'do',
 "n't",
 'remove',
 'the',
 'template',
 'from',
 'the',
 'talk',
 'page',
 'since',
 'I',
 "'m",
 'retired',
 'now.89.205.38.27']

**STEP 2 : REMOVE NEWLINES AND TABS**

In [19]:
def remove_newlines_and_tabs(sentences):
  
  for i in range(len(sentences)):
    sentences[i] = sentences[i].replace('\n',' ').replace('\t',' ').replace('\\', ' ')

In [20]:
remove_newlines_and_tabs(sentences)

In [21]:
sentences[0]

"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

**STEP 3: REMOVE ALL STOPWORDS**

In [22]:
stoplist = set(stopwords.words('english'))

In [23]:
stoplist

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [24]:
def remove_stopwords(sentences):
  for i in range(len(sentences)):
    tokens = word_tokenize(sentences[i])
    
    filtered_tokens = [token for token in tokens if token.lower() not in stoplist]
    sentences[i] = " ".join(filtered_tokens)

In [25]:
# remove_stopwords(sentences)

In [26]:
sentences[0]

"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [27]:
X_train[0]

"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

**STEP 4: LEMMETIZATION**

In [28]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [29]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [30]:
def lemmetization(sentences):
  for i in range(len(sentences)):
    lemma = [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(sentences[i])]

    sentences[i] = " ".join(lemma)

In [31]:
# lemmetization(sentences)

In [32]:
sentences[0]

"Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

**COMPLETE PREPROCESSING**

In [33]:
def preprocess(sentences):
  expand_contractions(sentences)
  remove_newlines_and_tabs(sentences)
  remove_stopwords(sentences)
  lemmetization(sentences)


In [34]:
X_train = np.asarray(X_train)


In [35]:
X_test = np.asarray(X_test)

In [36]:
# try:
#   file = open('X_train.pickle')
#   X_train = pickle.load(file)
# except:
#   preprocess(X_train)

In [37]:
# try:
#   file = open('X_test.pickle')
#   X_test = pickle.load(file)
# except:
#   preprocess(X_test)

In [38]:
preprocess(X_train)


In [39]:
preprocess(X_test)

KeyboardInterrupt: ignored

In [None]:
len(X_train)

In [None]:
len(Y_train)

In [None]:
X_train[:10]

In [None]:
X_train[:10]

In [None]:
import pickle

# Store data (serialize)
with open('X_train.pickle', 'wb') as handle:
    pickle.dump(X_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Store data (serialize)
with open('X_test.pickle', 'wb') as handle:
    pickle.dump(X_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

**STEP 5: TOKENIZATION**

In [61]:
tokenizer = Tokenizer()

In [62]:
tokenizer.fit_on_texts(X_train)

In [63]:
X_train

array(["Explanation edit make username Hardcore Metallica Fan revert ? vandalisms , closure GAs vote New York Dolls FAC . please remove template talk page since 'm retire now.89.205.38.27",
       "D'aww ! match background colour 'm seemingly stick . Thanks . ( talk ) 21:51 , January 11 , 2016 ( UTC )",
       "Hey man , 'm really try edit war . 's guy constantly remove relevant information talk edit instead talk page . seem care format actual info .",
       ...,
       'Spitzer Umm , theres actual article prostitution ring . - Crunch Captain .',
       'look like actually put speedy first version delete look .',
       "`` ... really think understand . come idea bad right away . kind community go `` '' bad ideas '' '' go away , instead help rewrite them. ``"],
      dtype=object)

In [64]:
X_train_tokenized = tokenizer.texts_to_sequences(X_train)

In [None]:
X_train_tokenized

**STEP 6: FIND MAX LENGTH OF SENTENCES**

In [45]:
max_len = 0
test = ""
j=0
for i,sentence in enumerate(X_train_tokenized):
  length = len(sentence)
  if length>max_len:
    j=i
    max_len = length
    test = sentence

In [46]:
max_len

1348

**STEP 7 : PAD SEQUENCES**

In [47]:
X_train_processed = pad_sequences(X_train_tokenized,maxlen=max_len,padding = 'post')

In [48]:
X_train_processed

array([[  562,     7,    10, ...,     0,     0,     0],
       [86373,   934,   431, ...,     0,     0,     0],
       [  305,   312,    25, ...,     0,     0,     0],
       ...,
       [27845,  6291,  4403, ...,     0,     0,     0],
       [   41,    13,   139, ...,     0,     0,     0],
       [   74,    14,   124, ...,     0,     0,     0]], dtype=int32)

**SIMPLE NN MODEL**

Test

In [53]:
max_features=100000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)


In [54]:
maxpadlen = 200
X_t=pad_sequences(list_tokenized_train, maxlen=maxpadlen, padding = 'post')


In [55]:
X_t

array([[  562,     7,    10, ...,     0,     0,     0],
       [86373,   934,   431, ...,     0,     0,     0],
       [  305,   312,    25, ...,     0,     0,     0],
       ...,
       [27845,  6291,  4403, ...,     0,     0,     0],
       [   41,    13,   139, ...,     0,     0,     0],
       [   74,    14,   124, ...,     0,     0,     0]], dtype=int32)

In [56]:

from sklearn.model_selection import train_test_split
# x_train, x_val, y_train, y_val = train_test_split(X_train_processed, Y_train, test_size=0.2)
lstm_model = keras.Sequential([
    keras.layers.Embedding(max_features+1,32) ,          
    keras.layers.Bidirectional(keras.layers.LSTM(32, activation='tanh')) ,         
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(6, activation="sigmoid")
])
lstm_model.compile(loss="BinaryCrossentropy", optimizer="Adam", metrics=["accuracy"])
model_history = lstm_model.fit(X_t, Y_train, epochs=1)



In [145]:
res = lstm_model.predict(np.expand_dims(X_t[15],axis=0))

(res > 0.5).astype(int)

array([[0, 0, 0, 0, 0, 0]])

In [142]:
Y_train.iloc[12]

toxic            1
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 12, dtype: int64

In [160]:
input_text = 'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [158]:
def predict_using_simple_model(text):
  sentences =[text]
  expand_contractions(sentences)
  remove_newlines_and_tabs(sentences)
  remove_stopwords(sentences)
  lemmetization(sentences)
  print(sentences)
  tokenized_text = tokenizer.texts_to_sequences(sentences)
  padded_text = pad_sequences(tokenized_text,maxlen=maxpadlen,padding = 'post')
  print(lstm_model.predict(padded_text))
  return lstm_model.predict(padded_text)

In [159]:
res = predict_using_simple_model(input_text)
(res > 0.5).astype(int)

['HATE BLACK']
[[0.8108128  0.04237914 0.459502   0.02853931 0.3863017  0.10296743]]


array([[1, 0, 0, 0, 0, 0]])

**Test 2**

In [59]:
from sklearn.model_selection import train_test_split
# x_train, x_val, y_train, y_val = train_test_split(X_train_processed, Y_train, test_size=0.2)
lstm_model_2 = keras.Sequential([
    keras.layers.Embedding(max_features+1,32) ,          
    keras.layers.Bidirectional(keras.layers.LSTM(32, activation='tanh')) ,         
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(256, activation="relu"),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(6, activation="sigmoid")
])
lstm_model_2.compile(loss="BinaryCrossentropy", optimizer="Adam", metrics=["accuracy"])
model_history = lstm_model_2.fit(X_train_processed, Y_train, epochs=1)



In [60]:
res = lstm_model_2.predict(np.expand_dims(X_train_processed[6],axis=0))

(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [68]:
text = 'Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!'

In [72]:
def predict_using_text(text):
  sentences = [text]
  expand_contractions(sentences)
  remove_newlines_and_tabs(sentences)
  remove_stopwords(sentences)
  lemmetization(sentences)
  tokenized = tokenizer.texts_to_sequences(sentences)
  padded = pad_sequences(tokenized,maxlen=max_len,padding = 'post')
  res = lstm_model_2.predict(padded)
  print((res > 0.5).astype(int))

In [73]:
predict_using_text(text)

[[1 0 1 0 1 0]]


In [76]:
lstm_model_2.save('comment_toxicity_model.h5')

**Gradio**

In [74]:

!pip install gradio jinja2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.0.20-py3-none-any.whl (5.1 MB)
[K     |████████████████████████████████| 5.1 MB 4.1 MB/s 
Collecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)
Collecting paramiko
  Downloading paramiko-2.11.0-py2.py3-none-any.whl (212 kB)
[K     |████████████████████████████████| 212 kB 53.9 MB/s 
[?25hCollecting python-multipart
  Downloading python-multipart-0.0.5.tar.gz (32 kB)
Collecting fastapi
  Downloading fastapi-0.78.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 3.1 MB/s 
[?25hCollecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 49.7 MB/s 
Collecting uvicorn
  Downloading uvic

In [75]:
import tensorflow as tf
import gradio as gr

In [81]:
def score_comment(comment):
    sentences = [comment]
    expand_contractions(sentences)
    remove_newlines_and_tabs(sentences)
    remove_stopwords(sentences)
    lemmetization(sentences)
    tokenized = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(tokenized,maxlen=max_len,padding = 'post')
    results = lstm_model_2.predict(padded)
    
    text = ''
    for idx, col in enumerate(train.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    print(text)
    return text

In [82]:
score_comment(text)

toxic: True
severe_toxic: False
obscene: True
threat: False
insult: True
identity_hate: False



'toxic: True\nsevere_toxic: False\nobscene: True\nthreat: False\ninsult: True\nidentity_hate: False\n'

In [83]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')



In [84]:
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://53293.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<gradio.routes.App at 0x7fe3a02b58d0>,
 'http://127.0.0.1:7860/',
 'https://53293.gradio.app')