In [1]:
!pip install nltk



You should consider upgrading via the 'd:\projects\python projects\pythonproject-3.8\venv\scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv('dataset/train_preprocessed.csv')
data.dropna(inplace = True)

In [8]:
test_data = pd.read_csv('dataset/test_preprocessed.csv')
test_data.dropna(inplace = True)

In [9]:
X_train = data['text']
Y_train = data['class']
print(X_train.shape,Y_train.shape)

(13549,) (13549,)


In [10]:
X_test = test_data['text']
Y_test = test_data['class']
print(X_test.shape,Y_test.shape)

(5808,) (5808,)


In [11]:
print(X_train[:10])
print('\n')
print(Y_train[:10])

0    unless request information withheld comply fer...
1    none collect personal information computer e m...
2    elect location based search saved history stor...
3    subsidiary corporate affiliate including enfor...
4    use service view content provided google autom...
5    share information reason described policy tell...
6    parent company affiliate us described personal...
7    receive information sources add account inform...
8    share entry information public connection admi...
9    use disclose iii perform certain data aggregat...
Name: text, dtype: object


0    5
1    5
2    6
3    5
4    2
5    5
6    5
7    2
8    5
9    5
Name: class, dtype: int64


In [12]:
MAX_NB_WORDS = 11000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(X_train.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Found 5935 unique tokens.


In [13]:
word_index

{'information': 1,
 'use': 2,
 'service': 3,
 'site': 4,
 'third': 5,
 'personal': 6,
 'party': 7,
 'privacy': 8,
 'collect': 9,
 'u': 10,
 'provide': 11,
 'address': 12,
 'website': 13,
 'policy': 14,
 'user': 15,
 'email': 16,
 'web': 17,
 's': 18,
 'personally': 19,
 'services': 20,
 'cooky': 21,
 'identifiable': 22,
 'share': 23,
 'online': 24,
 'com': 25,
 'account': 26,
 'time': 27,
 'access': 28,
 'data': 29,
 'also': 30,
 'contact': 31,
 'advertising': 32,
 'company': 33,
 'e': 34,
 'browser': 35,
 'opt': 36,
 'device': 37,
 'request': 38,
 'collected': 39,
 'certain': 40,
 'content': 41,
 'please': 42,
 'including': 43,
 'product': 44,
 'us': 45,
 'mail': 46,
 'name': 47,
 'receive': 48,
 'visit': 49,
 'page': 50,
 'number': 51,
 'used': 52,
 'provider': 53,
 'change': 54,
 'order': 55,
 'ad': 56,
 'purpose': 57,
 'law': 58,
 'and': 59,
 'marketing': 60,
 'business': 61,
 'you': 62,
 'or': 63,
 'automatically': 64,
 'cookies': 65,
 'disclose': 66,
 'mobile': 67,
 'child': 68,


In [14]:
X = tokenizer.texts_to_sequences(X_train.values)
print(len(X[20]))
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

7
Shape of data tensor: (13549, 100)


In [15]:
test_input = tokenizer.texts_to_sequences(X_test.values)
print(len(test_input[20]))
test_input = pad_sequences(test_input, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', test_input.shape)

8
Shape of data tensor: (5808, 100)


In [16]:
Y = pd.get_dummies(Y_train).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (13549, 10)


In [17]:
test_output = pd.get_dummies(Y_test).values
print('Shape of label tensor:', test_output.shape)

Shape of label tensor: (5808, 10)


In [18]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1100000   
                                                                 
 lstm (LSTM)                 (None, 50)                30200     
                                                                 
 dense (Dense)               (None, 10)                510       
                                                                 
Total params: 1,130,710
Trainable params: 1,130,710
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
epochs = 15
batch_size = 64

history = model.fit(
    X, Y, 
    epochs=epochs, batch_size=batch_size,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15


In [20]:
accr = model.evaluate(test_input, test_output)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.811
  Accuracy: 0.785
