<a href="https://colab.research.google.com/github/mralamdari/NLP-Text_Classification/blob/main/NLP_Text_Classification_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install wget
!pip install ktrain
!pip install pytorch_pretrained_bert pytorch-nlp
!pip install tqdm

In [32]:
import io
import os
import sys
import tqdm
import wget
import nltk
import torch
import ktrain
import tarfile
import warnings
import numpy as np
import pandas as pd
from ktrain import text
import tensorflow as tf
from zipfile import ZipFile
from sklearn import model_selection
from sklearn import preprocessing
warnings.filterwarnings('ignore')

# Data
###IMDB Dataset

In [None]:
!wget -P DATAPATH http://nlp.stanford.edu/data/glove.6B.zip
!unzip DATAPATH/glove.6B.zip -d DATAPATH/glove.6B

!wget -P DATAPATH http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf DATAPATH/aclImdb_v1.tar.gz -C DATAPATH
BASE_DIR = 'Data'

In [4]:
BASE_DIR = '/content/DATAPATH'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TRAIN_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/train')
TEST_DATA_DIR = os.path.join(BASE_DIR, 'aclImdb/test')

In [5]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [6]:
def get_data(data_dir):
  texts = []
  labels_index = {'pos': 1, 'neg': 0}
  labels = []
  for name in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, name)
    if os.path.isdir(path):
      if name=='pos' or name=='neg':
        label_id = labels_index[name]
        for fname in sorted(os.listdir(path)):
          fpath = os.path.join(path, fname)
          text = open(fpath, encoding='utf8').read()
          texts.append(text)
          labels.append(label_id)
  return texts, labels

In [7]:
train_texts, train_labels = get_data(TRAIN_DATA_DIR)
test_texts, test_labels = get_data(TEST_DATA_DIR)

labels_index = {'pos': 1, 'neg': 0}

In [8]:
train_texts[20]

"The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Auteuil."

#Models

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index

In [None]:
np.asarray(train_sequences).shape

(25000,)

In [None]:
print(len(train_sequences[0]))
print(train_sequences[0])

109
[62, 4, 3, 129, 34, 44, 7576, 1414, 15, 3, 4252, 514, 43, 16, 3, 633, 133, 12, 6, 3, 1301, 459, 4, 1751, 209, 3, 10785, 7693, 308, 6, 676, 80, 32, 2137, 1110, 3008, 31, 1, 929, 4, 42, 5120, 469, 9, 2665, 1751, 1, 223, 55, 16, 54, 828, 1318, 847, 228, 9, 40, 96, 122, 1484, 57, 145, 36, 1, 996, 141, 27, 676, 122, 1, 13886, 411, 59, 94, 2278, 303, 772, 5, 3, 837, 11037, 20, 3, 1755, 646, 42, 125, 71, 22, 235, 101, 16, 46, 49, 624, 31, 702, 84, 702, 378, 3493, 12997, 2, 16816, 8422, 67, 27, 107, 3348]


In [None]:
print(len(test_sequences[0]))
print(test_sequences[0])

164
[277, 171, 440, 11801, 44, 3318, 43, 3, 17, 15, 227, 1203, 71, 1668, 1209, 36, 1, 1301, 2016, 2225, 842, 4, 60, 47, 23, 52, 168, 10, 40, 119, 21, 456, 41, 98, 4, 1, 102, 88, 4, 175, 25, 2750, 8, 1, 4229, 2, 106, 23, 1704, 399, 20, 2, 92, 1547, 363, 73, 300, 31, 60, 55, 10, 119, 21, 456, 1, 106, 72, 141, 63, 456, 41, 6, 3, 52, 9290, 13323, 15894, 1, 436, 6, 26, 263, 122, 14, 550, 34, 1287, 237, 125, 71, 256, 331, 184, 87, 2, 284, 54, 4084, 4, 3, 4229, 24, 61, 12103, 735, 5, 27, 1573, 117, 11801, 414, 51, 72, 23, 70, 498, 1, 317, 93, 210, 4, 11, 4228, 11801, 713, 175, 29, 41, 2750, 72, 23, 576, 135, 15894, 6, 2163, 5, 27, 1, 115, 16, 54, 2593, 16291, 39, 12063, 54, 1233, 130, 9, 13, 29, 10, 97, 78, 5, 398, 36, 1583, 9, 122, 32, 531, 8]


In [None]:
len(word_index)

88582

In [None]:
for i, j in word_index.items():
  print(i, j)
  if j == 20:
    break

the 1
and 2
a 3
of 4
to 5
is 6
br 7
in 8
it 9
i 10
this 11
that 12
was 13
as 14
for 15
with 16
movie 17
but 18
film 19
on 20


In [None]:
train_valid_data = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_valid_labels = tf.keras.utils.to_categorical(np.asarray(train_labels))
test_labels = tf.keras.utils.to_categorical(np.asarray(test_labels))

In [None]:
print(train_valid_data.shape)

(25000, 1000)


In [None]:
train_valid_data[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [None]:
print(test_data.shape)

(25000, 1000)


In [None]:
train_valid_labels.shape

(25000, 2)

In [None]:
indices =np.arange(train_valid_data.shape[0])
np.random.shuffle(indices)
train_valid_data = train_valid_data[indices]
train_valid_labels = train_valid_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * train_valid_data.shape[0])
print(num_validation_samples)

5000


In [None]:
x_train = train_valid_data[:-num_validation_samples]
y_train = train_valid_labels[:-num_validation_samples]

x_test = train_valid_data[-num_validation_samples:]
y_test = train_valid_labels[-num_validation_samples:]

In [None]:
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

In [None]:
embeddings_index.get('attention')

array([-3.3414e-01,  4.6667e-01,  5.3744e-01,  5.7743e-02,  2.9642e-01,
        2.5224e-01, -6.5586e-01, -4.1668e-01,  2.1959e-01, -4.9413e-01,
       -2.1816e-01, -9.0227e-02, -3.5179e-02, -2.7279e-01, -1.2343e-01,
        1.6808e-01, -5.0623e-01, -4.0497e-01, -1.6763e-01,  4.9066e-01,
       -8.8020e-02, -1.2339e-01, -3.8436e-01, -2.7766e-01, -1.3403e-01,
        1.4342e-01, -2.9177e-01, -2.1146e-02,  5.2180e-01, -2.1213e-01,
        3.0860e-02,  1.0402e-01, -1.6807e-01,  4.6170e-01, -5.4806e-01,
       -6.6849e-02, -3.3180e-01,  3.7257e-01, -7.4962e-01,  6.2741e-01,
       -4.9500e-01, -4.0996e-01, -1.4686e-01, -2.7166e-01, -7.7093e-02,
       -2.8342e-01,  6.3663e-02, -1.5734e-01,  6.9649e-01, -9.6694e-01,
        4.4510e-01, -2.4521e-01, -4.8447e-01,  1.1957e+00,  2.9929e-02,
       -2.0425e+00, -2.8603e-01, -3.9043e-01,  1.2197e+00, -4.7760e-01,
       -2.1191e-02,  9.3080e-01, -1.8173e-01, -7.5721e-02,  1.1242e+00,
       -8.2276e-02,  5.7149e-02, -2.3585e-01,  3.5901e-01,  6.92

In [None]:
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
  if i > MAX_NUM_WORDS:
    continue
  
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [None]:
embedding_layer = tf.keras.layers.Embedding(num_words, 
                                           EMBEDDING_DIM,
                                           embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                           input_length=MAX_SEQUENCE_LENGTH,
                                           trainable=False)

###1D CNN

In [None]:
cnn_model = tf.keras.models.Sequential()
cnn_model.add(embedding_layer)
cnn_model.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
cnn_model.add(tf.keras.layers.MaxPool1D(5))
cnn_model.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
cnn_model.add(tf.keras.layers.MaxPool1D(5))
cnn_model.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
cnn_model.add(tf.keras.layers.GlobalMaxPooling1D())
cnn_model.add(tf.keras.layers.Dense(128, activation='relu'))
cnn_model.add(tf.keras.layers.Dense(len(labels_index), activation='relu'))

In [None]:
cnn_model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

In [None]:
cnn_model.fit(x_train, y_train,
              batch_size=32,
              epochs=2,
              validation_data=(x_test, y_test))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f817b0809d0>

In [None]:
score, acc = cnn_model.evaluate(test_data, test_labels)
print(acc)

0.5


###LSTM Model

####With Our Own Embedding Layer
#####it takes more than 40 minutes to complete

In [None]:
rnn_model = tf.keras.models.Sequential()
rnn_model.add(tf.keras.layers.Embedding(MAX_NUM_WORDS, 128))
rnn_model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnn_model.add(tf.keras.layers.Dense(2, activation='sigmoid'))
rnn_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
rnn_model.fit(x_train, y_train,
              batch_size=32, epochs=1,
              validation_data=(x_test, y_test))

In [None]:
score, acc = cnn_model.evaluate(test_data, test_labels)
print(acc)

####With Pre-trained Embedding Layer
#####it takes more than 20 minutes to complete

In [None]:
rnn_model = tf.keras.models.Sequential()
rnn_model.add(embedding_layer)
rnn_model.add(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnn_model.add(tf.keras.layers.Dense(2, activation='sigmoid'))
rnn_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
rnn_model.fit(x_train, y_train,
              batch_size=32, epochs=1,
              validation_data=(x_test, y_test))



<keras.callbacks.History at 0x7f817d2a6110>

In [None]:
score, acc = cnn_model.evaluate(test_data, test_labels)
print(acc)

0.5


#BERT Sentiment Classification
## IMDB ktrain

In [None]:
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/'
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip \*.zip && rm *.zip

In [26]:
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from pytorch_pretrained_bert import BertTokenizer, BertConfig
# from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
df = pd.read_csv("IMDB Dataset.csv",engine='python', error_bad_lines=False)

In [31]:
df.tail(10)

Unnamed: 0,review,sentiment
49990,"Lame, lame, lame!!! A 90-minute cringe-fest th...",negative
49991,"Les Visiteurs, the first movie about the medie...",negative
49992,John Garfield plays a Marine who is blinded by...,positive
49993,Robert Colomb has two full-time jobs. He's kno...,negative
49994,This is your typical junk comedy.<br /><br />T...,negative
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [None]:
df.