In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 1.7 MB/s eta 0:00:00
Collecting tqdm
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp38-cp38-win_amd64.whl (274 kB)
     -------------------------------------- 274.4/274.4 KB 1.9 MB/s eta 0:00:00
Collecting click
  Using cached click-8.1.2-py3-none-any.whl (96 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.2 nltk-3.7 regex-2022.3.15 tqdm-4.64.0


In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('dataset/train_preprocessed.csv')
data.dropna(inplace = True)

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(data['text'], data['class'], test_size = 0.2, random_state = 42, stratify=data['class'])
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(15485,) (15485,)
(3872,) (3872,)


In [7]:
print(X_train[:10])
print('\n')
print(Y_train[:10])

11075    1. scope consent use online services, contract...
3460     browser scripting disabled, need opt out, onli...
11584    accessing record vision center, pharmacy, fina...
4068     another organization (e.g., course transaction...
11407    collect technical information visit website ty...
1385     possible served cooky u help u deliver adverti...
6271     medianews us pin ersonalize content advertisin...
18416    share medical data healthcare provider choose ...
6965     personally identifiable information collected ...
18447    emergency pose threat health and/or safety you...
Name: text, dtype: object


11075    0
3460     0
11584    7
4068     5
11407    2
1385     2
6271     2
18416    5
6965     2
18447    5
Name: class, dtype: int64


In [8]:
MAX_NB_WORDS = 11000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(X_train.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Found 6315 unique tokens.


In [9]:
word_index

{'information': 1,
 'use': 2,
 'service': 3,
 'site': 4,
 'third': 5,
 'personal': 6,
 'party': 7,
 'privacy': 8,
 'collect': 9,
 'u': 10,
 'provide': 11,
 'address': 12,
 'website': 13,
 'policy': 14,
 'user': 15,
 'email': 16,
 'web': 17,
 'personally': 18,
 'services': 19,
 'identifiable': 20,
 'cooky': 21,
 'share': 22,
 'online': 23,
 'time': 24,
 'data': 25,
 'access': 26,
 'account': 27,
 'com': 28,
 'advertising': 29,
 'also': 30,
 'e': 31,
 'company': 32,
 'contact': 33,
 'request': 34,
 'opt': 35,
 'certain': 36,
 'please': 37,
 'collected': 38,
 'including': 39,
 'content': 40,
 'browser': 41,
 'device': 42,
 'receive': 43,
 'product': 44,
 'mail': 45,
 'name': 46,
 'visit': 47,
 'us': 48,
 'page': 49,
 'used': 50,
 'number': 51,
 'provider': 52,
 'marketing': 53,
 'law': 54,
 'change': 55,
 'order': 56,
 'purpose': 57,
 'you': 58,
 'ad': 59,
 'and': 60,
 'automatically': 61,
 'or': 62,
 'business': 63,
 'disclose': 64,
 'using': 65,
 'mobile': 66,
 'help': 67,
 'cookies': 6

In [10]:
X = tokenizer.texts_to_sequences(X_train.values)
print(len(X[20]))
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

31
Shape of data tensor: (15485, 100)


In [11]:
test_input = tokenizer.texts_to_sequences(X_test.values)
print(len(test_input[20]))
test_input = pad_sequences(test_input, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', test_input.shape)

6
Shape of data tensor: (3872, 100)


In [12]:
Y = pd.get_dummies(Y_train).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (15485, 10)


In [13]:
test_output = pd.get_dummies(Y_test).values
print('Shape of label tensor:', test_output.shape)

Shape of label tensor: (3872, 10)


In [14]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1100000   
                                                                 
 lstm (LSTM)                 (None, 50)                30200     
                                                                 
 dense (Dense)               (None, 10)                510       
                                                                 
Total params: 1,130,710
Trainable params: 1,130,710
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
epochs = 15
batch_size = 64

history = model.fit(
    X, Y, 
    epochs=epochs, batch_size=batch_size,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15


In [16]:
accr = model.evaluate(test_input, test_output)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.710
  Accuracy: 0.802
