<a href="https://colab.research.google.com/github/omkarwazulkar/NaturalLanguageProcessing/blob/main/N_Gram_CNN_Model_Pro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import re
from os import listdir
from nltk.corpus import stopwords
from pickle import dump
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Loading Doc into Memory
def load_doc(filename):
  """Read an entire text file into memory.

  Args:
    filename: Path of the text file to read.

  Returns:
    The complete contents of the file as one string.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r') as file:
    return file.read()

In [3]:
# Forming Clean Tokens
def clean_doc(doc):
  """Reduce raw document text to a space-joined string of cleaned tokens.

  The text is split on whitespace and every punctuation character is removed
  from each piece. A piece is kept only if it is then purely alphabetic, is
  not in NLTK's English stop-word list, and is longer than one character.

  Args:
    doc: Raw document text.

  Returns:
    The surviving tokens joined by single spaces.
  """
  pieces = doc.split()
  punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
  english_stops = set(stopwords.words('english'))
  kept = []
  for piece in pieces:
    word = punctuation_pattern.sub('', piece)
    if not word.isalpha():
      continue
    if word in english_stops:
      continue
    if len(word) <= 1:
      continue
    kept.append(word)
  return ' '.join(kept)

In [4]:
# Loading all documents in directory
def process_docs(directory, is_train):
  """Load and clean the documents of one split from a directory.

  Files whose name starts with 'cv9' form the test split; every other file
  belongs to the training split.

  Args:
    directory: Directory containing the document files.
    is_train: Truthy to collect the training files, falsy for the test files.

  Returns:
    A list with one cleaned token string per selected file.
  """
  cleaned = []
  for name in listdir(directory):
    is_held_out = name.startswith('cv9')
    # Training takes everything except the 'cv9' files; testing takes only them.
    if is_train:
      wanted = not is_held_out
    else:
      wanted = is_held_out
    if wanted:
      text = load_doc(directory + '/' + name)
      cleaned.append(clean_doc(text))
  return cleaned

In [5]:
# Load and Clean
def load_clean_dataset(is_train, data_dir='/content/drive/MyDrive/ReviewPopularity/txt_sentoken'):
  """Load the cleaned negative and positive reviews for one split.

  Args:
    is_train: Truthy for the training split, falsy for the test split
      (forwarded to process_docs).
    data_dir: Directory holding the 'neg' and 'pos' sub-directories. Defaults
      to the Google Drive location this notebook was written against, so
      existing calls behave as before; pass another path to run elsewhere.

  Returns:
    A tuple (docs, labels): the negative documents followed by the positive
    ones, and a parallel list of labels with 0 for negative and 1 for positive.
  """
  neg = process_docs(data_dir + '/neg', is_train)
  pos = process_docs(data_dir + '/pos', is_train)
  docs = neg + pos
  labels = [0] * len(neg) + [1] * len(pos)
  return docs, labels

In [6]:
# Saving to file
def save_dataset(dataset, filename):
  """Pickle a dataset to disk and report the file name.

  Args:
    dataset: Any picklable object (this notebook passes [docs, labels]).
    filename: Destination path; an existing file is overwritten.
  """
  # Close the handle deterministically so the pickle is fully flushed before
  # a later cell reads it back.
  with open(filename, 'wb') as handle:
    dump(dataset, handle)
  print('Saved: %s' % filename)
# Build both splits: is_train=True gives the training documents and labels,
# is_train=False gives the held-out test documents and labels.
train_docs, ytrain = load_clean_dataset(True)
test_docs, ytest = load_clean_dataset(False)

In [7]:
# Persist each split as a pickled [documents, labels] pair so the later
# sections can reload the cleaned data from 'train.pkl' / 'test.pkl'.
save_dataset([train_docs, ytrain], 'train.pkl')
save_dataset([test_docs, ytest], 'test.pkl')

Saved: train.pkl
Saved: test.pkl


# **Developing Multi Channel Model**

1.   Encoding
2.   Defining Model



In [8]:
from pickle import load
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate

In [9]:
# Load Clean Dataset
def load_dataset(filename):
  """Load a pickled dataset from disk.

  Args:
    filename: Path of a pickle file (this notebook stores [docs, labels]).

  Returns:
    The unpickled object.
  """
  # NOTE: unpickling can execute arbitrary code; only load files this
  # notebook (or another trusted source) produced.
  with open(filename, 'rb') as handle:
    return load(handle)

In [10]:
# Fitting Tokenizer
def create_tokenizer(lines):
  """Create a Keras Tokenizer and fit it on the given texts.

  Args:
    lines: Texts to fit the tokenizer on.

  Returns:
    The fitted Tokenizer instance.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

In [11]:
# Maximum Length of Document
def max_length(lines):
  """Return the largest whitespace-separated word count among the documents.

  Args:
    lines: Iterable of document strings.

  Returns:
    The word count of the longest document.
  """
  word_counts = (len(text.split()) for text in lines)
  return max(word_counts)

1. Encoding

In [12]:
# Encoding Lines
def encode_text(tokenizer, lines, length):
  """Convert texts to integer sequences padded to a common length.

  Args:
    tokenizer: Fitted tokenizer providing texts_to_sequences().
    lines: Texts to encode.
    length: Target sequence length passed to pad_sequences as maxlen.

  Returns:
    The result of pad_sequences, with padding appended after each sequence
    (padding='post').
  """
  sequences = tokenizer.texts_to_sequences(lines)
  return pad_sequences(sequences, maxlen=length, padding='post')

In [13]:
# Defining Model
def _build_channel(length, vocab_size, kernel_size):
  """Build one input -> embedding -> Conv1D -> dropout -> max-pool -> flatten branch.

  Returns:
    A tuple (inputs, flat): the branch's Input tensor and its flattened output.
  """
  inputs = Input(shape=(length,))
  embedding = Embedding(vocab_size, 100)(inputs)
  conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
  drop = Dropout(0.5)(conv)
  pool = MaxPooling1D(pool_size=2)(drop)
  flat = Flatten()(pool)
  return inputs, flat


def define_model(length, vocab_size):
  """Define and compile the three-channel CNN classifier.

  Each channel has its own input and embedding and differs only in the
  Conv1D kernel size (4, 6 and 8). The flattened channel outputs are
  concatenated and fed through a 10-unit ReLU layer to a single sigmoid unit.

  Args:
    length: Length of every input sequence.
    vocab_size: Size of the embedding vocabulary.

  Returns:
    The compiled Keras Model, which expects a list of three inputs.

  Side effects:
    Prints the model summary and writes a diagram to 'model.png' via
    plot_model.
  """
  # Channels are built in kernel-size order, matching the order of the
  # model's three inputs.
  channels = [_build_channel(length, vocab_size, size) for size in (4, 6, 8)]
  channel_inputs = [inputs for inputs, _ in channels]
  channel_outputs = [flat for _, flat in channels]

  merged = concatenate(channel_outputs)

  dense1 = Dense(10, activation='relu')(merged)
  outputs = Dense(1, activation='sigmoid')(dense1)
  model = Model(inputs=channel_inputs, outputs=outputs)

  # Compiling
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  plot_model(model, show_shapes=True, to_file='model.png')
  return model

# **Training and Saving Model**

In [14]:
# Load Train Data
# 'train.pkl' holds the [documents, labels] pair written by save_dataset.
trainLines, trainLabels = load_dataset('train.pkl')

In [15]:
# Tokenizer
# Fit on the training documents only.
tokenizer = create_tokenizer(trainLines)

In [16]:
# Max Doc Length
# Word count of the longest training document; used as the padded sequence
# length and the model's input length.
length = max_length(trainLines)
print('Max document length: %d' % length)

Max document length: 1380


In [17]:
# Vocabulary Size
# One more than the number of distinct words the tokenizer saw.
# NOTE(review): the +1 presumably accounts for index 0, which Keras' Tokenizer
# does not assign to any word — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 44277


In [18]:
# Encode Data
# Integer-encode and pad the training documents to `length`, and convert the
# label list to a NumPy array for Keras.
trainX = encode_text(tokenizer, trainLines, length)
train_Labels = np.array(trainLabels)

In [19]:
# Define Model
# Builds and compiles the network; define_model also prints the summary shown
# below and writes 'model.png'.
model = define_model(length, vocab_size)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1380)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1380)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 1380)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1380, 100)    4427700     ['input_1[0][0]']                
                                                                                              

In [20]:
# Fit Model
# The same encoded matrix feeds all three channels. Pass the inputs as a list,
# the documented container for multi-input models and the form the evaluate()
# calls in this notebook already use.
model.fit([trainX, trainX, trainX], train_Labels, epochs=7, batch_size=16)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7fec811fe1d0>

In [21]:
# Save Model
# Written to 'model.h5' so the evaluation section can reload it with load_model.
model.save('model.h5')

# **Evaluating Model**

In [22]:
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

In [23]:
# Load Clean Data
def load_dataset(filename):
  """Load a pickled dataset from disk.

  Args:
    filename: Path of a pickle file (this notebook stores [docs, labels]).

  Returns:
    The unpickled object.
  """
  # NOTE: unpickling can execute arbitrary code; only load files this
  # notebook (or another trusted source) produced.
  with open(filename, 'rb') as handle:
    return load(handle)

In [24]:
# Fitting Tokenizer
def create_tokenizer(lines):
  """Create a Keras Tokenizer and fit it on the given texts.

  Args:
    lines: Texts to fit the tokenizer on.

  Returns:
    The fitted Tokenizer instance.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

In [25]:
# Maximum Length of Document
def max_length(lines):
  """Return the largest whitespace-separated word count among the documents.

  Args:
    lines: Iterable of document strings.

  Returns:
    The word count of the longest document.
  """
  word_counts = (len(text.split()) for text in lines)
  return max(word_counts)

In [26]:
# Encoding Lines
def encode_text(tokenizer, lines, length):
  """Convert texts to integer sequences padded to a common length.

  Args:
    tokenizer: Fitted tokenizer providing texts_to_sequences().
    lines: Texts to encode.
    length: Target sequence length passed to pad_sequences as maxlen.

  Returns:
    The result of pad_sequences, with padding appended after each sequence
    (padding='post').
  """
  sequences = tokenizer.texts_to_sequences(lines)
  return pad_sequences(sequences, maxlen=length, padding='post')

In [27]:
# Load Train and Test Data
# Each pickle holds a [documents, labels] pair.
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')

In [28]:
# Tokenizer
# Re-fit on the training documents only, as was done before training.
tokenizer = create_tokenizer(trainLines)

In [29]:
# Max Doc Length
# Derived from the training documents so it matches the length the model was
# built with.
length = max_length(trainLines)
print('Max document length: %d' % length)

Max document length: 1380


In [30]:
# Vocabulary Size
# One more than the number of distinct words the tokenizer saw.
# NOTE(review): vocab_size is only printed in this section; the model is
# loaded from disk rather than rebuilt.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

Vocabulary size: 44277


In [31]:
# Encode Data
# Both splits use the tokenizer and pad length derived from the training set.
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)

In [32]:
# Loading Model
# Reload the trained network from the 'model.h5' file saved earlier.
model = load_model('model.h5')

In [33]:
# Evaluate Model on Train Data
# NOTE(review): `np` comes from the import cell of the model-building section;
# the evaluation section's own import cell does not import numpy, so this cell
# fails if the evaluation section is run on its own.
train_Labels = np.array(trainLabels)
# The same matrix is supplied once per input channel; evaluate returns
# [loss, accuracy] and only the accuracy is reported.
_, acc = model.evaluate([trainX,trainX,trainX], train_Labels, verbose=0)
print('Train Accuracy: %.2f' % (acc*100))

Train Accuracy: 100.00


In [34]:
# Evaluate Model on Test Dataset 
# Held-out 'cv9' documents: the same matrix is supplied once per input channel
# and only the accuracy from [loss, accuracy] is reported.
test_Labels = np.array(testLabels)
_, acc = model.evaluate([testX,testX,testX], test_Labels, verbose=0)
print('Test Accuracy: %.2f' % (acc*100))

Test Accuracy: 90.50
