In [1]:
from comet_ml import Experiment
import re
import numpy as np
import os
import time
import datetime
import data_helpers
import nltk
from collections import defaultdict

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

from sklearn.model_selection import train_test_split

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet');

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# use nltk to remove stopwords and lemmatize
# you might need to run: nltk.download() to fetch the stopword package in "all packages"
# you might also need to run ntlk.download("punkt")

english_stopwords = set(stopwords.words("english"))
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [5]:
def clean_input_text(text):
    clean_text = []
    for sent in text:
        clean_sent = ""
        sent_tokens = word_tokenize(sent)
        for token in sent_tokens:
            clean_sent += wordnet_lemmatizer.lemmatize(token) + " " if token not in english_stopwords else ""
        clean_text.append(clean_sent)
    return clean_text

In [6]:
def load_reviews_dataset():
    base_path = "./data/customer_reviews/"
    products = [base_path +"Apex AD2600 Progressive-scan DVD player.txt"
    , base_path +"Canon G3.txt"
    , base_path + "Creative Labs Nomad Jukebox Zen Xtra 40GB.txt"
    , base_path + "Nikon coolpix 4300.txt", base_path +"Nokia 6610.txt"]
    examples = []
    for product in products:
        examples += list(open(product, "r", encoding="utf-8").readlines())
    
    # for every examples, keep the one starting with a ranking
    x_text, y = [],[]
    for example in examples:
        final_label = 0
        temp_split = example.split("##")
        # don't consider unlabeled sentences
        if len(temp_split) <= 1:
            continue
        temp_label, temp_sentence = temp_split
        # parse the temp_label to find positive or negative
        positive_label = temp_label.split("+")
        #print("len positive label: {}".format(len(positive_label)))
        if len(positive_label) > 1:
            final_label = 1
        
        # so the final_label is either 0 or 1, 0 if negative, 1 if positive
        final_sentence = clean_str(temp_sentence.strip())
        x_text.append(final_sentence)
        y.append(final_label)
    return x_text, y
        
        
x_text, y = load_reviews_dataset()
x_text = clean_input_text(x_text)

In [7]:
# hyperparameters for the cnn dealing with the movie dataset
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.2
NUMBER_DIFFERENT_OUTPUTS =2

In [8]:
# load pretrain glove word2vec instance for preprocessing
filename = './data/glove.6B.300d.txt'

print('Indexing word vectors.')

embeddings_index = {}
with open(filename, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [9]:
# vectorize the input text (both negative and positive )
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(x_text)
sequences = tokenizer.texts_to_sequences(x_text)
word_index = tokenizer.word_index
print(len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(y))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
labels[0]

4572
Shape of data tensor: (3944, 1000)
Shape of label tensor: (3944, 2)


array([1., 0.], dtype=float32)

In [10]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
len(y_val)

788

In [11]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [13]:
experiment = Experiment(api_key="PqrK4iPuQntpHwzb6SvJuXbdh",
                        project_name="COMP 551", workspace="mattesko")

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(NUMBER_DIFFERENT_OUTPUTS, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/mattesko/comp-551/46b352ef78f346c580f0925113499588



Instructions for updating:
Colocations handled automatically by placer.
Train on 3156 samples, validate on 788 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1bc2e2a8b70>

In [14]:
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png')

# NB

In [30]:
y_train_nb = y_train[:,0]

In [54]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline  

pipeline = Pipeline([('clf',       MultinomialNB())])
parameters = { 
    'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001) ,
    'clf__fit_prior': (True, False)
}
grid_search_nb = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=2)  
grid_search_nb.fit(x_train, y_train_nb)
best_parameters = grid_search_nb.best_estimator_.get_params()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.9s finished


In [47]:
best_parameters

{'memory': None,
 'steps': [('clf', MultinomialNB(alpha=1, class_prior=None, fit_prior=True))],
 'clf': MultinomialNB(alpha=1, class_prior=None, fit_prior=True),
 'clf__alpha': 1,
 'clf__class_prior': None,
 'clf__fit_prior': True}

In [53]:
predicted = grid_search_nb.predict(x_val)
accuracy = np.mean(predicted == y_val[:,0])
f1 = f1_score(y_val[:,0], predicted)
print('Accuracy:', accuracy, '\tF1 Score:', f1)

Accuracy: 0.5685279187817259 	F1 Score: 0.6792452830188679


# SVM

In [64]:
from sklearn.svm import LinearSVC

penalty=['l2']
loss=['hinge', 'squared_hinge']
dual=[True]
class_weight=[None, 'balanced']
tol=np.array([1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.0000001, 0.00000001])
max_iter =[1000]
C=[1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.0000001, 0.00000001]
fit_intercept=[True,False]
parameters = {'C': C, 'penalty': penalty, 'loss':loss, 'dual':dual,
              'class_weight':class_weight, 'tol':tol, 'max_iter':max_iter,
              'fit_intercept':fit_intercept}
grid_search = GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=2)  
grid_search.fit(x_train, y_train_nb)
best_parameters = grid_search.best_estimator_.get_params()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 704 candidates, totalling 2112 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2112 out of 2112 | elapsed:  1.4min finished


In [65]:
predicted = grid_search.predict(x_val)
accuracy = np.mean(predicted == y_val[:,0])
f1 = f1_score(y_val[:,0], predicted)
print('Accuracy:', accuracy, '\tF1 Score:', f1)
print('Best Parameters:', best_parameters)

Accuracy: 0.7411167512690355 	F1 Score: 0.8513119533527697
Best Parameters: {'C': 1e-08, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 1.0, 'verbose': 0}


In [68]:
penalty=['l2']
loss=['hinge']
dual=[True]
class_weight=[None]
tol=np.array([10,9,8,7,6,5,4,3,2,1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1])
max_iter =[1000,10000]
C=[10e-8,9e-8,8e-8,7e-8,6e-8,5e-8,4e-8,3e-8,2e-8,1e-8,9e-9,8e-9,7e-9,6e-9,5e-9,4e-9,3e-9,2e-9,1e-9]
fit_intercept=[True]
parameters = {'C': C, 'penalty': penalty, 'loss':loss, 'dual':dual,
              'class_weight':class_weight, 'tol':tol, 'max_iter':max_iter,
              'fit_intercept':fit_intercept}
grid_search = GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=2)  
grid_search.fit(x_train, y_train_nb)
best_parameters = grid_search.best_estimator_.get_params()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 722 candidates, totalling 2166 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:   33.5s
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 2166 out of 2166 | elapsed:   50.0s finished


In [69]:
predicted = grid_search.predict(x_val)
accuracy = np.mean(predicted == y_val[:,0])
f1 = f1_score(y_val[:,0], predicted)
print('Accuracy:', accuracy, '\tF1 Score:', f1)
print('Best Parameters:', best_parameters)

Accuracy: 0.7411167512690355 	F1 Score: 0.8513119533527697
Best Parameters: {'C': 5e-09, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 6.0, 'verbose': 0}


In [73]:
from sklearn.svm import SVC

gamma=['auto','scale']
kernel=['linear', 'poly', 'rbf', 'sigmoid']
class_weight=[None, 'balanced']
tol=np.array([1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.0000001, 0.00000001])
max_iter =[1000]
C=[1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.0000001, 0.00000001]
shrinking=[True]
probability=[False]
parameters = {'C': C, 'gamma': gamma, 'kernel':kernel, 'class_weight':class_weight, 'tol':tol, 'max_iter':max_iter,
              'shrinking':shrinking, 'probability':probability}
grid_search = GridSearchCV(SVC(), parameters, n_jobs=-1, verbose=2)  
grid_search.fit(x_train, y_train_nb)
best_parameters = grid_search.best_estimator_.get_params()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1408 candidates, totalling 4224 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 28.5min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 43.8min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 61.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 83.1min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 109.5min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 143.2min
[Parallel(n_jobs=-1)]: Done 4224 out of 4224 | elapsed: 151.5min finished


In [74]:
predicted = grid_search.predict(x_val)
accuracy = np.mean(predicted == y_val[:,0])
f1 = f1_score(y_val[:,0], predicted)
print('Accuracy:', accuracy, '\tF1 Score:', f1)
print('Best Parameters:', best_parameters)

Accuracy: 0.7449238578680203 	F1 Score: 0.8525311812179016
Best Parameters: {'C': 1, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': 1000, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 1.0, 'verbose': False}
