In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 1.7 MB/s eta 0:00:00
Collecting tqdm
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp38-cp38-win_amd64.whl (274 kB)
     -------------------------------------- 274.4/274.4 KB 1.9 MB/s eta 0:00:00
Collecting click
  Using cached click-8.1.2-py3-none-any.whl (96 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.2 nltk-3.7 regex-2022.3.15 tqdm-4.64.0


In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv('dataset/train_preprocessed.csv')
data.dropna(inplace = True)

In [4]:
MAX_NB_WORDS = 11000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Found 6612 unique tokens.


In [5]:
word_index

{'information': 1,
 'use': 2,
 'service': 3,
 'site': 4,
 'third': 5,
 'personal': 6,
 'party': 7,
 'privacy': 8,
 'collect': 9,
 'u': 10,
 'provide': 11,
 'address': 12,
 'website': 13,
 'policy': 14,
 'user': 15,
 'web': 16,
 'email': 17,
 'personally': 18,
 'services': 19,
 'identifiable': 20,
 'cooky': 21,
 'share': 22,
 'online': 23,
 'com': 24,
 'data': 25,
 'account': 26,
 'access': 27,
 'time': 28,
 'also': 29,
 'advertising': 30,
 'company': 31,
 'e': 32,
 'contact': 33,
 'request': 34,
 'opt': 35,
 'please': 36,
 'collected': 37,
 'device': 38,
 'certain': 39,
 'browser': 40,
 'content': 41,
 'including': 42,
 'receive': 43,
 'product': 44,
 'mail': 45,
 'visit': 46,
 'us': 47,
 'name': 48,
 'page': 49,
 'used': 50,
 'number': 51,
 'provider': 52,
 'order': 53,
 'law': 54,
 'change': 55,
 'ad': 56,
 'marketing': 57,
 'and': 58,
 'purpose': 59,
 'you': 60,
 'or': 61,
 'automatically': 62,
 'cookies': 63,
 'business': 64,
 'mobile': 65,
 'using': 66,
 'disclose': 67,
 'help': 6

In [6]:
X = tokenizer.texts_to_sequences(data['text'].values)
print(len(X[20]))
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

35
Shape of data tensor: (19357, 100)


In [7]:
Y = pd.get_dummies(data['class']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (19357, 10)


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42, stratify=Y)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(15485, 100) (15485, 10)
(3872, 100) (3872, 10)


In [9]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1100000   
                                                                 
 lstm (LSTM)                 (None, 50)                30200     
                                                                 
 dense (Dense)               (None, 10)                510       
                                                                 
Total params: 1,130,710
Trainable params: 1,130,710
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
epochs = 5
batch_size = 64

history = model.fit(
    X_train, Y_train, 
    epochs=epochs, batch_size=batch_size,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.682
  Accuracy: 0.800
