**Approach**- We will start with basic data sanity check and preprocessing. Then we will move on to preparing data for the model. In this step we will create two columns one with preprocessed tweets and the other one that will tell whether it has obscene word in it or not.

Now for modeling we are doing-> embedding-> dropout-conv1d-lstm-dense- droupout-dense- output.
After that we move on to inference with some tweets.

In [None]:
#Download Data
#Data Source- https://github.com/jerrytigerxu/Twitter-Sentiment-Analysis/tree/master/data

!wget https://raw.githubusercontent.com/jerrytigerxu/Twitter-Sentiment-Analysis/master/data/train.csv
!wget https://raw.githubusercontent.com/jerrytigerxu/Twitter-Sentiment-Analysis/master/data/test.csv

In [7]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [8]:
#Read Data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [9]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [10]:
print(len(train))
print(len(test))

31962
17197


In [11]:
#Obscene WordList

toxic_words = ["anal",
"anus",
"ballsack",
"blowjob",
"blow job",
"boner",
"clitoris",
"cock",
"cunt",
"dick",
"dildo",
"dyke",
"fag",
"fuck",
"jizz",
"labia",
"muff",
"naked",
"nigger",
"nigga",
"penis",
"piss",
"porn"
"pussy",
"scrotum",
"sex",
"shit",
"slut",
"smegma",
"spunk",
"twat",
"vagina",
"wank",
"whore",
"sexy" ,"piss","porn","topless","hardcore","xxx","redtube","boob"
]

In [12]:
#Stemming to avoid different forms of a word.
from nltk.stem.porter import *
stemmer = PorterStemmer()
toxic_words_stemmed = []
for word in toxic_words:
  toxic_words_stemmed.append(stemmer.stem(word))

In [13]:
data = train.append(test , ignore_index = True)

In [14]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [15]:
data = data.drop(['id','label'] , axis=1)

In [16]:
data.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


In [17]:
#remove the Twitter handles from our tweets

def remove_pattern(input_txt, pattern):
  r = re.findall(pattern, input_txt)
  for i in r:
    input_txt = re.sub(i, '', input_txt)

  return input_txt

data['tidy_tweet'] = np.vectorize(remove_pattern)(data['tweet'], "@[\w]*")

In [18]:
# Remove special characters, numbers, punctuation
data['tidy_tweet'] = data['tidy_tweet'].str.replace("[^a-zA-Z]", " " )

# Removing short words
data['tidy_tweet'] = data['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))


In [19]:
data.head()

Unnamed: 0,tweet,tidy_tweet
0,@user when a father is dysfunctional and is s...,when father dysfunctional and selfish drags hi...
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit can use cause they don ...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model love take with all the time
4,factsguide: society now #motivation,factsguide society now motivation


In [21]:
#Tokenize the input
tokenized_tweet = data['tidy_tweet'].apply(lambda x : x.split())
tokenized_tweet.head()

0    [when, father, dysfunctional, and, selfish, dr...
1    [thanks, for, lyft, credit, can, use, cause, t...
2                              [bihday, your, majesty]
3            [model, love, take, with, all, the, time]
4               [factsguide, society, now, motivation]
Name: tidy_tweet, dtype: object

In [None]:
tokenized_tweet = tokenized_tweet.apply(lambda x  : [stemmer.stem(i) for i in x])

In [None]:
tokenized_tweet.head()

0    [when, father, dysfunct, and, selfish, drag, h...
1    [thank, for, lyft, credit, can, use, caus, the...
2                              [bihday, your, majesti]
3            [model, love, take, with, all, the, time]
4                     [factsguid, societi, now, motiv]
Name: tidy_tweet, dtype: object

In [None]:
for i in range(len(tokenized_tweet)):
  tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

data['tidy_tweet'] = tokenized_tweet

In [23]:
#Adding a column to count no. of toxic words in it
def f(t):
  count_slur = 0
  for slur in toxic_words_stemmed:
    if slur in t:
      count_slur +=1
  return count_slur

data['toxic_count'] = data['tidy_tweet'].apply(f)

In [24]:
data.head()

Unnamed: 0,tweet,tidy_tweet,toxic_count
0,@user when a father is dysfunctional and is s...,when father dysfunctional and selfish drags hi...,0
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit can use cause they don ...,0
2,bihday your majesty,bihday your majesty,0
3,#model i love u take with u all the time in ...,model love take with all the time,0
4,factsguide: society now #motivation,factsguide society now motivation,0


In [25]:
data[data['toxic_count']>=1]

Unnamed: 0,tweet,tidy_tweet,toxic_count
21,sad little dude.. #badday #coneofshame #cats...,sad little dude badday coneofshame cats pissed...,2
43,my mom shares the same bihday as @user bihda...,mom shares the same bihday bihday snake see yo...,1
70,@user # if you #luv #hottweets like this from...,you luv hottweets like this from venusexchange,1
72,so much stuff happening in florida! first #orl...,much stuff happening florida first orlando sho...,1
97,couple having sex fat naked japanese girls,couple having sex fat naked japanese girls,2
...,...,...,...
49123,@user fuck yes!! @user mr money in the bank ð...,fuck yes money the bank dam proud mitb ambrose...,1
49129,people do anything for fucking attention nowad...,people anything for fucking attention nowadays,1
49134,in life u will grow to learn some pple will wo...,life will grow learn some pple will work fuck ...,1
49147,today is a good day for excercise #imready #so...,today good day for excercise imready sofuckenr...,1


Turns out we have 2102 such inputs which have toxic words

**Strategy-** Instead of total toxic word count we will convert them into 0 and 1. O means no toxic word, 1 it has toxic words. Reason being less/not much amount of  "tweet" for varied amount of toxic words.

In [27]:
data['toxicity'] = data['toxic_count'] > 0

In [28]:
data.head()

Unnamed: 0,tweet,tidy_tweet,toxic_count,toxicity
0,@user when a father is dysfunctional and is s...,when father dysfunctional and selfish drags hi...,0,False
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit can use cause they don ...,0,False
2,bihday your majesty,bihday your majesty,0,False
3,#model i love u take with u all the time in ...,model love take with all the time,0,False
4,factsguide: society now #motivation,factsguide society now motivation,0,False


In [29]:
data  = data[['tidy_tweet' , 'toxicity']]

In [30]:
data.head()

Unnamed: 0,tidy_tweet,toxicity
0,when father dysfunctional and selfish drags hi...,False
1,thanks for lyft credit can use cause they don ...,False
2,bihday your majesty,False
3,model love take with all the time,False
4,factsguide society now motivation,False


In [31]:
data['toxicity'] = data['toxicity'].apply(lambda  x : int(x))
data.head()

Unnamed: 0,tidy_tweet,toxicity
0,when father dysfunctional and selfish drags hi...,0
1,thanks for lyft credit can use cause they don ...,0
2,bihday your majesty,0
3,model love take with all the time,0
4,factsguide society now motivation,0


In [33]:
data.toxicity.unique()

array([0, 1])

In [34]:
#Train-Test-Split
from sklearn.model_selection import train_test_split
train , test = train_test_split(data , test_size = 0.1 , random_state = 1)

In [35]:
#Tokenize
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['tidy_tweet'])

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)

Vocabulary Size : 45423


In [36]:
 tokenizer.texts_to_sequences(['what the fuck are you doing'])

[[22, 1, 387, 11, 2, 309]]

In [37]:
train_Seq = tokenizer.texts_to_sequences(train['tidy_tweet'])

In [38]:
#Finding maxlen for padding
train_lengths = [len(seq) for seq in train_Seq]
np.percentile(train_lengths , 99)

20.0

In [39]:
#Padding
from keras.preprocessing.sequence import pad_sequences

maxlen = 20
x_train = pad_sequences(tokenizer.texts_to_sequences(train['tidy_tweet']),
                        maxlen = maxlen)
x_test = pad_sequences(tokenizer.texts_to_sequences(test['tidy_tweet']),
                       maxlen = maxlen)

print("Training X Shape:",x_train.shape)
print("Testing X Shape:",x_test.shape)

Training X Shape: (44243, 20)
Testing X Shape: (4916, 20)


In [40]:
#Using GloVe 6B 300d embedding
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2021-12-23 11:04:15--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-12-23 11:04:15--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-12-23 11:04:15--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-1

In [42]:
GLOVE_EMB = '/content/glove.6B.300d.txt'
EMBEDDING_DIM = 300
LR = 1e-3
BATCH_SIZE = 1024
EPOCHS = 10
MODEL_PATH = '/best_model.hdf5'

In [43]:
embeddings_index = {}

f = open(GLOVE_EMB)
for line in f:
  values = line.split()
  word = value = values[0]
  coefs = np.asarray(values[1:], dtype='float32')
  embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' %len(embeddings_index))

Found 400000 word vectors.


In [48]:
#creating embedding matrix
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

MODEL-

In [49]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint

In [50]:
sequence_input = Input(shape=(maxlen,), dtype='int32')
embedding_layer = tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix],input_length=maxlen,trainable=False)
embedding_sequences = embedding_layer(sequence_input)
x = SpatialDropout1D(0.2)(embedding_sequences)
x = Conv1D(64, 5, activation='relu')(x) #kernal slides along 1D
x = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)



In [51]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 20, 300)           13626900  
                                                                 
 spatial_dropout1d (SpatialD  (None, 20, 300)          0         
 ropout1D)                                                       
                                                                 
 conv1d (Conv1D)             (None, 16, 64)            96064     
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dense (Dense)               (None, 512)               66048 

In [52]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

model.compile(optimizer=Adam(learning_rate=LR), loss='binary_crossentropy',
              metrics=['accuracy'])
ReduceLROnPlateau = ReduceLROnPlateau(factor=0.1,
                                     min_lr = 0.01,
                                     monitor = 'val_loss',
                                     verbose = 1)

In [53]:
y_train = train['toxicity']
y_test = test['toxicity']

In [54]:
history = model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
                    validation_data=(x_test, y_test), callbacks=[ReduceLROnPlateau])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [57]:
#Expanding dimension
x_test[0].reshape(1,-1).shape

(1, 20)

In [58]:
model.predict(x_test[0].reshape(1,-1))

array([[0.00568464]], dtype=float32)

**Inference-**

In [59]:
def process_tweet(tweet):
  tweet = re.sub('@[\w]*','',tweet)
  tweet = re.sub('[^a-zA-Z]',' ',tweet)
  tweet = " ".join([token for token in tweet.split() if len(token)>2])
  tweet = " ".join([stemmer.stem(token) for token in tweet.split() ])

  tweet =  pad_sequences(tokenizer.texts_to_sequences([tweet]),
                        maxlen = maxlen)

  return tweet

def inference(tweet):
  tweet_seq = process_tweet(tweet)
  tweet_seq = tweet_seq.reshape(1,-1)
  return model.predict(tweet_seq)[0][0]

In [62]:
tweets = ['jackblair - na: #horny #hot #naughty #nasty   #slut #young #shy #wet #nude #xxx #sexy #porn #kinky #snapshot ',
         'haileysporn - na: #sexy #shy #porn #horny #nasty #naughty #slut #hot #xxx #nude #wet #kinky #young  ',
         'sex videos sexy lady fucked hard pics',
         '@user   friskyfriday to all of you sexy people!!! ð¥ððð#hotwife #milf #sexylatina #bbg #rwsw ',
         'thank you all for the lovely followers.  glad i can share my fight with depression with you all. #keefighting  ']

for tweet in tweets:
  print(inference(tweet))

0.9999914
0.999998
1.0
0.011772362
0.00032603266
