<a href="https://colab.research.google.com/github/nahbos/AUT-Pattern-Recognition/blob/main/FinalProj/CNN_Rand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###**Copyright (C) 2022 Sobhan Moradian Daghigh**
######**Date: 2/10/2022**

###**Preparing the pre-trained GloVe model**

**Downloading pre-trained GloVe**

In [1]:
import io, os, zipfile
import requests

zip_file_url = "http://nlp.stanford.edu/data/glove.840B.300d.zip"
# Name of the text file the loading cell further down opens.
glove_txt_path = "./glove.840B.300d.txt"

# Re-running the notebook should not fetch the archive again when the vectors
# are already on disk. Delete the file to force a fresh download.
if not os.path.exists(glove_txt_path):
    r = requests.get(zip_file_url)
    # Fail here with the HTTP error instead of handing an error page to ZipFile.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall()

**Import libraries**

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.utils.vis_utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.layers import BatchNormalization
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping

import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from tqdm import tqdm
import codecs

# Seed NumPy's global RNG; the np.random.choice / np.random.uniform calls later
# in this notebook draw from it.
# NOTE(review): no TensorFlow seed is set anywhere in the visible cells.
np.random.seed(0)

# Vocabulary cap: passed as num_words to the Keras Tokenizer and used as the
# upper bound on the number of embedding-matrix rows.
MAX_NB_WORDS = 100000

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Loading the pre-trained GloVe vectors into a word → vector lookup table**

In [3]:
print('Loading word embeddings..')

# Maps the first space-separated field of every line (the token) to a float32
# array built from the remaining fields.
embeddings_index = {}

# The context manager closes the file even when a malformed line makes the
# float conversion below raise.
with codecs.open('./glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split on single spaces only (not on arbitrary whitespace).
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

Loading word embeddings..


2196018it [04:23, 8342.06it/s]


**Preprocess the comments to improve quality and remove unnecessary words**
1. convert all words to lowercase
2. remove punctuation
3. tokenize words
4. remove stopwords
5. apply the Porter stemmer to reduce words to their stems

In [4]:
def nlprocess(dataset):
    """Normalize every text in the first column of ``dataset``.

    Each text is lowercased, stripped of the characters in
    ``string.punctuation``, tokenized with ``word_tokenize``, filtered against
    the English stop-word list and stemmed with ``PorterStemmer``. The
    surviving stems are joined back together with single spaces.

    Args:
        dataset: pandas DataFrame whose first column holds the raw strings.

    Returns:
        list[str]: one preprocessed string per row, in row order.
    """
    # Loop-invariant helpers are built once; sets give O(1) membership tests.
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    preprocessed_comments = []
    # Select the first column by position explicitly: integer keys on a
    # label-indexed row Series (row[0]) are deprecated in pandas.
    for text in dataset.iloc[:, 0]:
        text = text.lower()
        text = "".join(char for char in text if char not in punctuation)
        tokens = word_tokenize(text)
        stems = [porter.stem(word) for word in tokens if word not in stop_words]
        preprocessed_comments.append(' '.join(stems))

    return preprocessed_comments

In [5]:
def spliter(datasets):
    """Preprocess every (train, test) pair and build its label vectors.

    Args:
        datasets: sequence of pairs; element 0 is the train DataFrame and
            element 1 the test DataFrame of one domain.

    Returns:
        Four parallel lists (train texts, test texts, train labels, test
        labels). Texts come from ``nlprocess``. Each label vector is a float
        array whose first ``len // 2`` entries are 1 and whose remaining
        entries are 0.
    """
    def _first_half_ones(count):
        # Same values and dtype as zeros(count) with the leading half set to 1.
        return np.concatenate([np.ones(count // 2), np.zeros(count - count // 2)])

    x_train_all, x_test_all, y_train_all, y_test_all = [], [], [], []
    print('(train, test):')
    for position, pair in enumerate(datasets):
        train_frame, test_frame = pair[0], pair[1]
        n_train, n_test = len(train_frame), len(test_frame)

        x_train_all.append(nlprocess(train_frame))
        x_test_all.append(nlprocess(test_frame))
        y_train_all.append(_first_half_ones(n_train))
        y_test_all.append(_first_half_ones(n_test))
        print(' |_ {}: ({}, {})'.format(datasets_name[position], n_train, n_test))
    return x_train_all, x_test_all, y_train_all, y_test_all

In [7]:
datasets_name = ['Books      ', 'DVD        ', 'Electronics', 'Kitchen    ']


def _read_reviews(path):
    """Read one tab-separated review file into a DataFrame with the column name 'text'."""
    return pd.read_csv(path, names=['text'], sep='\t')


# Load data
books_train, books_test = _read_reviews('./Bookstrain.txt'), _read_reviews('./Bookstest.txt')
dvd_train, dvd_test = _read_reviews('./Dvdtrain.txt'), _read_reviews('./Dvdtest.txt')
electronics_train, electronics_test = _read_reviews('./Electronicstrain.txt'), _read_reviews('./Electronicstest.txt')
kitchen_train, kitchen_test = _read_reviews('./Kitchentrain.txt'), _read_reviews('./Kitchentest.txt')

# One [train, test] pair per domain, in the same order as datasets_name.
datasets = [
    [books_train, books_test],
    [dvd_train, dvd_test],
    [electronics_train, electronics_test],
    [kitchen_train, kitchen_test],
]


x_train_datasets, x_test_datasets, y_train_datasets, y_test_datasets = spliter(datasets)

(train, test):
 |_ Books      : (1600, 400)
 |_ DVD        : (1600, 400)
 |_ Electronics: (1600, 400)
 |_ Kitchen    : (1600, 400)


**Tokenizing the data with the Keras `Tokenizer`**

In [8]:
def tokenizer(x_train_datasets, x_test_datasets):
    """Turn each dataset's preprocessed texts into padded index sequences.

    For every (train, test) pair the texts are re-tokenized with
    ``word_tokenize``, a Keras ``Tokenizer`` is fitted on train and test
    documents together, and both splits are padded/truncated to a common
    length. That length is the rounded mean plus standard deviation of the
    space-separated word counts of the train texts.

    Args:
        x_train_datasets: list of train text lists, one per dataset.
        x_test_datasets: list of test text lists, one per dataset.

    Returns:
        Four parallel lists: word-index dictionaries, padded train sequences,
        padded test sequences, and the sequence length used for each dataset.
    """
    word_indices, word_seq_trains, word_seq_tests, max_seq_lens = [], [], [], []

    print("Dictionary size:")
    for idx, (train_texts, test_texts) in enumerate(zip(x_train_datasets, x_test_datasets)):
        train_frame = pd.DataFrame(train_texts, columns=['text'])
        test_frame = pd.DataFrame(test_texts, columns=['text'])
        train_raw = train_frame.text.tolist()
        test_raw = test_frame.text.tolist()

        # Normalize spacing by tokenizing and re-joining with single spaces.
        train_docs = [" ".join(word_tokenize(doc)) for doc in tqdm(train_raw)]
        test_docs = [" ".join(word_tokenize(doc)) for doc in tqdm(test_raw)]

        keras_tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
        keras_tokenizer.fit_on_texts(train_docs + test_docs)
        train_ids = keras_tokenizer.texts_to_sequences(train_docs)
        test_ids = keras_tokenizer.texts_to_sequences(test_docs)

        vocabulary = keras_tokenizer.word_index
        word_indices.append(vocabulary)
        print(" |_ {}: {}".format(datasets_name[idx], len(vocabulary)))

        # Length statistics come from the train texts as given, not the re-joined docs.
        doc_lens = train_frame.text.apply(lambda words: len(words.split(' ')))
        seq_len = np.round(doc_lens.mean() + doc_lens.std()).astype(int)
        max_seq_lens.append(seq_len)
        word_seq_trains.append(sequence.pad_sequences(train_ids, maxlen=seq_len))
        word_seq_tests.append(sequence.pad_sequences(test_ids, maxlen=seq_len))

    return word_indices, word_seq_trains, word_seq_tests, max_seq_lens

In [9]:
# Build, per dataset: the word -> index dictionary, the padded train and test
# index sequences, and the sequence length they were padded to.
word_indices, word_seq_trains, word_seq_tests, max_seq_lens = tokenizer(x_train_datasets, x_test_datasets)

Dictionary size:


100%|██████████| 1600/1600 [00:00<00:00, 1687.11it/s]
100%|██████████| 400/400 [00:00<00:00, 1768.94it/s]


 |_ Books      : 20293


100%|██████████| 1600/1600 [00:00<00:00, 1722.84it/s]
100%|██████████| 400/400 [00:00<00:00, 1831.83it/s]


 |_ DVD        : 21192


100%|██████████| 1600/1600 [00:00<00:00, 2832.74it/s]
100%|██████████| 400/400 [00:00<00:00, 2683.15it/s]


 |_ Electronics: 11679


100%|██████████| 1600/1600 [00:00<00:00, 3389.07it/s]
100%|██████████| 400/400 [00:00<00:00, 3397.30it/s]


 |_ Kitchen    : 9011


###**Shape and Train with CNN-Rand**

**Defining the variables used for training**

In [52]:
# Training params
# Mini-batch size.
batch_size = 256 
# Upper bound on training epochs; read by glove_trainer when it calls fit().
num_epochs = 20

# Model params
# NOTE(review): num_filters is not referenced by any code visible in this notebook.
num_filters = 64 
# Width of every word vector; embd_matrix and glove_trainer size their matrices with it.
embed_dim = 300 
# NOTE(review): weight_decay is not referenced by any code visible in this notebook.
weight_decay = 1e-4

**Words that are not in the pre-trained GloVe model are assigned an all-zero vector. These are mostly names, which matter little to the pattern, so it is simpler to give them a weight of 0.**

In [11]:
# Embedding matrix
def embd_matrix(word_indices):
    """Build one embedding matrix per dataset from ``embeddings_index``.

    Row ``i`` of a matrix holds the vector of the word whose index is ``i``.
    Words missing from ``embeddings_index`` (or stored with an empty vector)
    keep an all-zero row, as does row 0, which no word index maps to here.
    Words whose index is at or beyond the row count are skipped.

    Args:
        word_indices: list of ``{word: index}`` dictionaries, one per dataset.

    Returns:
        tuple: ``(nb_words_list, embedding_matrices)`` where ``nb_words_list``
        holds each matrix's row count, ``min(MAX_NB_WORDS, len(word_index) + 1)``,
        and ``embedding_matrices`` the ``(nb_words, embed_dim)`` arrays.
    """
    nb_words_list = []
    embedding_matrices = []
    print('Preparing embedding matrix..')
    print(' Number of null word embeddings:')
    for j, word_index in enumerate(word_indices):
        words_not_found = []
        nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
        nb_words_list.append(nb_words)
        embedding_matrix = np.zeros((nb_words, embed_dim))

        for word, i in word_index.items():
            if i >= nb_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None and len(embedding_vector) > 0:
                embedding_matrix[i] = embedding_vector
            else:
                words_not_found.append(word)

        embedding_matrices.append(embedding_matrix)

        # Count rows that are entirely zero. Testing the row sum instead would
        # also count a non-zero vector whose components cancel out.
        null_rows = int(np.count_nonzero(~embedding_matrix.any(axis=1)))
        print('  |_ {}: {}'.format(datasets_name[j], null_rows))

        # np.random.choice raises ValueError on an empty list, so only sample
        # when at least one word was not found.
        if words_not_found:
            print("     some of not-found words: {}".format(np.random.choice(words_not_found, 3)))
        else:
            print("     some of not-found words: []")

    return nb_words_list, embedding_matrices

In [12]:
# Build the per-dataset row counts and embedding matrices from the tokenizer vocabularies.
nb_words_list, embedding_matrices = embd_matrix(word_indices)

Preparing embedding matrix..
 Number of null word embeddings:
  |_ Books      : 8904
     some of not-found words: ['ellroy' 'facad' 'lovemap']
  |_ DVD        : 9368
     some of not-found words: ['insteadaft' 'doofusesand' 'gravelli']
  |_ Electronics: 5029
     some of not-found words: ['nonintuit' 'soundgrant' 'lowerneed']
  |_ Kitchen    : 3420
     some of not-found words: ['quicklyy' 'mistur' 'misquot']


**Cool :)))**

**Now, let's start training!**

In [61]:
def glove_trainer(word_indices, nb_words_list, max_seq_lens, y_train_datasets, word_seq_trains):
    """Train one 1-D CNN text classifier per dataset and return the models.

    Each model is: a frozen embedding layer initialised with values drawn
    uniformly from [-1, 1) (despite the function name, no GloVe vectors are
    used here), Conv1D(128, kernel 5, ReLU), global max pooling, Dense(10,
    ReLU) and a single sigmoid output, compiled with binary cross-entropy and
    Adam.

    Args:
        word_indices: per-dataset word -> index dictionaries. Only used to
            keep the five argument lists aligned.
        nb_words_list: per-dataset number of embedding rows.
        max_seq_lens: per-dataset padded sequence length.
        y_train_datasets: per-dataset training labels.
        word_seq_trains: per-dataset padded training sequences.

    Returns:
        list: the fitted models, in dataset order.
    """
    models = []
    per_dataset = zip(word_indices, nb_words_list, max_seq_lens, y_train_datasets, word_seq_trains)
    for i, (_word_index, nb_words, max_seq_len, y_train, word_seq_train) in enumerate(per_dataset):

        model = tf.keras.Sequential()
        # Size the random table with nb_words, the same row count given to
        # Embedding, so the two agree even when the vocabulary was capped.
        random_embeddings = np.random.uniform(-1, 1, (nb_words, embed_dim))

        # NOTE(review): the random embeddings are frozen (trainable=False), so
        # they never adapt to the task. Confirm this is intended.
        model.add(Embedding(nb_words, embed_dim, input_length=max_seq_len, weights=[random_embeddings], trainable=False))
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))

        print('\n\n****************************   {}   ************************'.format(datasets_name[i]))
        model.summary()

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # fit() receives no validation data, so 'val_loss' is never produced
        # and a callback watching it never fires. Monitor the training loss.
        es_callback = EarlyStopping(monitor='loss', patience=3)
        # NOTE(review): if y_train is still ordered as spliter builds it (ones
        # first, then zeros), shuffle=False yields mostly single-class batches.
        # Confirm this is intended.
        model.fit(word_seq_train, y_train, batch_size=batch_size, epochs=num_epochs, callbacks=[es_callback], shuffle=False)

        models.append(model)
        keras.backend.clear_session()
    return models

In [62]:
# Train one model per dataset; the list order follows datasets_name.
models = glove_trainer(word_indices, nb_words_list, max_seq_lens, y_train_datasets, word_seq_trains)



****************************   Books         ************************
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 323, 300)          6088200   
                                                                 
 conv1d_1 (Conv1D)           (None, 319, 128)          192128    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 6,281,629
Trainable params: 193,42

In [64]:
# Score each trained model on its own test split: sigmoid outputs above 0.5
# become label 1, everything else label 0.
print("Accuracy with GloVe and CNN-Rand:")
for idx, (net, true_labels, padded_test) in enumerate(zip(models, y_test_datasets, word_seq_tests)):
    probabilities = net.predict(padded_test)
    hard_labels = (probabilities > 0.5).astype(int)
    score = accuracy_score(true_labels, hard_labels)
    print(' |_  {}: {:.2f}'.format(datasets_name[idx], score))

Accuracy with GloVe and CNN-Rand:
 |_  Books      : 0.65
 |_  DVD        : 0.56
 |_  Electronics: 0.78
 |_  Kitchen    : 0.71


**Finito :)**