# Sentiment Analysis with Scikit-Learn Naive Bayes

In [None]:
import csv

import pandas as pd

# Let DataFrame output show up to 200 characters of each cell.
pd.set_option('display.max_colwidth', 200)
DATA_DIR = '../data/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']


def _read_reviews(path):
    """Load one tab-separated review file into a Review/Sentiment DataFrame.

    Quote processing is disabled so that a double-quote character inside a
    review is kept as plain text. With the default quoting, an unbalanced
    quote makes the parser treat everything up to the next quote as one
    field, which can silently merge several lines into a single row.

    NOTE(review): assumes each line is raw `text<TAB>label` with no CSV-style
    quoting -- confirm against the data files.
    """
    return pd.read_table(path, names=COLUMN_NAMES, quoting=csv.QUOTE_NONE)


yelp_reviews = _read_reviews(YELP_DATA_FILE)
amazon_reviews = _read_reviews(AMAZON_DATA_FILE)
imdb_reviews = _read_reviews(IMDB_DATA_FILE)
# Stack the three sources into one frame with a fresh 0..n-1 index.
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews], ignore_index=True)

In [None]:
# Eyeball ten randomly chosen rows of the combined data set.
review_data.sample(n=10)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch both NLTK resources this cell relies on: the Punkt models used by
# word_tokenize and the stop-word lists read by stopwords.words.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Build the stop-word set once; set membership is O(1) per token.
english_stopwords = set(stopwords.words('english'))

# One lower-cased token per row; the index still points at the source review.
tokens = review_data.Review.apply(func=word_tokenize).explode().str.lower()
# Drop stop words, and NaN placeholders that explode() emits for empty token lists.
tokens = tokens[tokens.notna() & ~tokens.isin(english_stopwords)]
# Re-join the surviving tokens per review. A review made up only of stop words
# has no rows left in the groupby result, so reindex back onto the full index
# and give such reviews an empty string instead of letting them become NaN.
review_data.Review = (
    tokens.groupby(level=0)
          .agg(' '.join)
          .reindex(review_data.index, fill_value='')
)
review_data.Review


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB 

# Hold out 30% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    review_data['Review'],
    review_data['Sentiment'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Peek at the first five training reviews.
X_train.head(n=5)

In [None]:
import re

def clean(text):
    """Normalize a review string.

    The text is lower-cased, every run of non-word characters is collapsed
    into a single space, and the split contractions "hadn t", "wasn t" and
    "didn t" (what is left of "hadn't" etc. after the apostrophe is replaced)
    are expanded to their "... not" forms.

    Args:
        text: The raw review string.

    Returns:
        The normalized string.
    """
    normalized = re.sub(r'[\W]+', ' ', text.lower())
    for contraction, expansion in (
        ('hadn t', 'had not'),
        ('wasn t', 'was not'),
        ('didn t', 'did not'),
    ):
        normalized = normalized.replace(contraction, expansion)
    return normalized

# Work on a copy so the cleaned text does not overwrite review_data.
# NOTE(review): the later cells visible here train on review_data, not on
# review_model_data -- confirm whether the cleaned copy is meant to be used.
review_model_data = review_data.copy()
review_model_data['Review'] = review_model_data['Review'].map(clean)

In [None]:
# Pipelining
# Chain TF-IDF feature extraction with a multinomial Naive Bayes classifier so
# both steps are fitted and applied together.
tfidf = TfidfVectorizer()
clf = MultinomialNB()
nb_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', clf),
])

In [None]:
# NB applied
# Fit the pipeline on the training split, then measure accuracy on the held-out split.
train_texts, train_labels = X_train.values, y_train.values
test_texts, test_labels = X_test.values, y_test.values
nb_tfidf.fit(train_texts, train_labels)
test_accuracy = nb_tfidf.score(test_texts, test_labels)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

# Keras Embeddings

In [None]:
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense
# Define 10 restaurant reviews
reviews = ['Never coming back!', 'horrible service', 'rude waitress', 'cold food',
           'horrible food!', 'awesome', 'awesome services!', 'rocks', 'poor work',
           'couldn\'t have done better']

# Define labels: 1 = negative review, 0 = positive review.
# 'poor work' (index 8) is a negative review, so it is labelled 1 like the
# other negative examples rather than 0.
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 1, 0])

Vocab_size = 50
# Hash every word to an integer index below Vocab_size. The default hash
# function is Python's built-in hash, which is not stable across interpreter
# runs; 'md5' makes the encoding reproducible.
encoded_reviews = [one_hot(d, Vocab_size, hash_function='md5') for d in reviews]
print(f'encoded reviews: {encoded_reviews}')

In [None]:
# Bring every encoded review to exactly four indices, appending zeros at the
# end of the shorter ones.
max_length = 4
padded_reviews = pad_sequences(
    sequences=encoded_reviews,
    maxlen=max_length,
    padding='post',
)
print(padded_reviews)

In [None]:
from tensorflow.keras import Input

# Small binary classifier: embedding lookup -> flatten -> single sigmoid unit.
model = Sequential()
# Declare the input shape explicitly so the model is built before summary()
# is called. This replaces Embedding's `input_length` argument, which current
# Keras releases deprecate.
model.add(Input(shape=(max_length,)))
embedding_layer = Embedding(input_dim=Vocab_size, output_dim=8)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit an extra "None" line).
model.summary()

In [None]:
# Train silently for 100 epochs, then inspect the learned embedding matrix
# (one row per vocabulary index, one column per embedding dimension).
model.fit(padded_reviews, labels, epochs=100, verbose=0)
learned_embeddings = embedding_layer.get_weights()[0]
print(learned_embeddings.shape) # (50, 8)
learned_embeddings

### Neural Network

In [None]:
import pandas as pd, numpy as np, tensorflow as tf
from sklearn.model_selection import train_test_split

# data
# Missing reviews become empty strings; a bare astype(str) would turn NaN into
# the literal token "nan" and feed that to the vectorizer.
X = review_data["Review"].fillna("").astype(str).values
# sort=True ties the integer codes to the sorted label values instead of the
# order in which labels first appear, so the label -> code mapping does not
# depend on row order.
y, classes = pd.factorize(review_data["Sentiment"], sort=True)
# Stratify so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# vectorizer
# Map each review to a fixed-length sequence of 60 integer token ids; the
# vocabulary is learned from the training split only.
vec = tf.keras.layers.TextVectorization(max_tokens=10000, output_mode="int", output_sequence_length=60)
vec.adapt(X_train)
vocab_size = len(vec.get_vocabulary())
# Two classes -> single sigmoid output; otherwise one softmax unit per class.
binary = (len(np.unique(y_train)) == 2)

# model
inp = tf.keras.Input(shape=(1,), dtype=tf.string)
x = vec(inp)
# mask_zero=True so the pooling layer ignores padded positions (id 0).
x = tf.keras.layers.Embedding(vocab_size, 32, mask_zero=True)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(16, activation="relu",kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
x = tf.keras.layers.Dropout(0.1)(x)
out = tf.keras.layers.Dense(1 if binary else len(classes), activation="sigmoid" if binary else "softmax")(x)
model = tf.keras.Model(inp, out)
model.compile(optimizer="adam",
              loss="binary_crossentropy" if binary else "sparse_categorical_crossentropy",
              metrics=["accuracy"])

# train & evaluate
# A tenth of the training split is held out for validation during fitting.
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.1, verbose=1)
# evaluate() returns [loss, accuracy]; keep the accuracy.
acc = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"Test accuracy: {acc:.2%}")