# 1. Import Libraries

In [None]:
from __future__ import print_function, division

from urllib.request import urlopen

import nltk
import numpy as np
from bs4 import BeautifulSoup
from future.utils import iteritems
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mrmhm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 2. Prepare Project

In [None]:
# Lemmatizer instance shared by the tokenizer defined further down.
wordnet_lemmatizer = WordNetLemmatizer()

# Stop-word list: a plain-text file with one word per line, hosted on GitHub.
# The file must actually be downloaded and split into lines -- iterating over
# the URL string itself only yields its individual characters, which leaves
# the stop-word filter with nothing useful to match against.
STOPWORDS_URL = 'https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/refs/heads/master/nlp_class/stopwords.txt'
with urlopen(STOPWORDS_URL) as response:
    stopwords = set(
        line.strip()
        for line in response.read().decode('utf-8').splitlines()
        if line.strip()  # ignore blank lines
    )


def _load_review_texts(path):
    """Return all <review_text> elements found in the review file at `path`.

    The file is closed as soon as it has been read, and the parser is named
    explicitly so BeautifulSoup does not pick whichever parser happens to be
    installed (which also triggers a warning).
    """
    with open(path) as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')
    return soup.find_all('review_text')


# Positive reviews
positive_reviews = _load_review_texts(r'dataset/sorted_data_acl/electronics/positive.review')

# Negative reviews
negative_reviews = _load_review_texts(r'dataset/sorted_data_acl/electronics/negative.review')

In [None]:
# Shuffle the positive reviews, then keep at most as many of them as there are
# negative reviews.
# The RNG is seeded first so the same reviews (in the same order) are selected
# on every Restart & Run All; without it the vocabulary indices and the final
# score change from run to run.
np.random.seed(42)
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [None]:
def tokenizer(s):
    """Turn a raw review string into a list of normalised word tokens.

    Steps, in order:
      1. lower-case the text and split it with NLTK's word tokenizer;
      2. drop tokens of one or two characters and tokens in `stopwords`;
      3. lemmatize the remaining tokens with `wordnet_lemmatizer`;
      4. drop lemmas that are in `stopwords`.

    Stop words are filtered both before and after lemmatization. Filtering
    only afterwards lets words through whose lemma is no longer a stop word:
    this notebook's recorded output contains "wa" and "doe", the lemmatizer's
    results for "was" and "does".

    Parameters
    ----------
    s : str
        Raw review text.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order (duplicates kept).
    """
    tokens = nltk.tokenize.word_tokenize(s.lower())
    # Short tokens are mostly punctuation and particles; remove them together
    # with the surface-form stop words before lemmatizing.
    tokens = [t for t in tokens if len(t) > 2 and t not in stopwords]
    lemmas = (wordnet_lemmatizer.lemmatize(t) for t in tokens)
    # Second pass catches tokens that only become stop words once lemmatized.
    return [t for t in lemmas if t not in stopwords]

In [None]:
word_index_map = {}      # token -> column index in the feature vector
positive_tokenized = []  # token lists, one per positive review
negative_tokenized = []  # token lists, one per negative review
orig_reviews = []        # raw review texts, positives first, then negatives


def _index_reviews(reviews, tokenized_out):
    """Tokenize every review and register unseen tokens in `word_index_map`.

    For each review the raw text is appended to `orig_reviews` and its token
    list to `tokenized_out`; every token not yet in `word_index_map` is given
    the next free index.
    """
    for review in reviews:
        text = review.text
        orig_reviews.append(text)
        words = tokenizer(text)
        tokenized_out.append(words)
        for word in words:
            # The next free index is always the current vocabulary size.
            word_index_map.setdefault(word, len(word_index_map))


# Positive reviews first, then negative ones, so indices are assigned in the
# order tokens are first encountered across both sets.
_index_reviews(positive_reviews, positive_tokenized)
_index_reviews(negative_reviews, negative_tokenized)

# Next unused index; equal to the vocabulary size once both passes are done.
current_index = len(word_index_map)

print("len(word_index_map):", len(word_index_map))

len(word_index_map): 11249


In [None]:
# Convert tokens to vectors
def tokens_to_vectors(tokens, label, index_map=None):
    """Build one feature row: normalised token frequencies plus the label.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review; each must be a key of the index map.
    label : int or float
        Class label, stored in the last element of the returned vector.
    index_map : dict, optional
        Mapping token -> column index. Defaults to the module-level
        `word_index_map` when omitted.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(index_map) + 1``. The first
        ``len(index_map)`` entries are token counts divided by the total
        number of tokens; the last entry is `label`.

    Raises
    ------
    KeyError
        If a token is not present in the index map.
    """
    if index_map is None:
        index_map = word_index_map
    x = np.zeros(len(index_map) + 1)
    for t in tokens:
        x[index_map[t]] += 1
    total = x.sum()
    # A review whose tokens were all filtered out has a zero total; dividing
    # would fill the row with NaN, so leave it as all zeros instead.
    if total > 0:
        x = x / total
    x[-1] = label
    return x

In [None]:
# Pair every tokenized review with its class label: 1 = positive, 0 = negative.
labelled_docs = [(doc, 1) for doc in positive_tokenized]
labelled_docs += [(doc, 0) for doc in negative_tokenized]

N = len(labelled_docs)  # Total number of reviews

# One row per review; the extra last column holds the label.
data = np.zeros((N, len(word_index_map) + 1))
for row, (doc, doc_label) in enumerate(labelled_docs):
    data[row, :] = tokens_to_vectors(doc, doc_label)

# 3. Modelling

In [None]:
# Shuffle the rows in place so positive and negative examples (stored one
# class after the other) are mixed before the train/test split.
# Seeding first keeps the split, and therefore the reported score, reproducible.
np.random.seed(42)
np.random.shuffle(data)

In [None]:
# Separate the feature columns (everything but the last) from the label column.
X, y = data[:, :-1], data[:, -1]

In [None]:
# Hold out the final rows as the test set; everything before them is training data.
n_test = 100
X_train, X_test = X[:-n_test], X[-n_test:]
y_train, y_test = y[:-n_test], y[-n_test:]

In [None]:
# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the held-out rows
test_accuracy = model.score(X_test, y_test)
print("Classification RatE: ", test_accuracy)

Classification RatE:  0.71


In [None]:
# Inspect the learned weight of each word and print only those whose
# magnitude exceeds the threshold (positive weight -> positive sentiment).
# Try it with different threshold values!
threshold = 0.5
coefficients = model.coef_[0]  # one weight per vocabulary index
for word, index in iteritems(word_index_map):
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

and 1.488517233432227
will -0.6662897273680768
cable 0.5711923763996952
for 1.9417257380884105
that -0.6421632129921748
are 1.0110388370109558
the -0.743824292784933
used 0.6663702371082781
month -0.5177913408473829
they -0.5397257762919564
good 1.4710519178850066
sound 0.7546356657879958
you 0.8593521783482226
n't -1.5187528386267717
easy 0.9401641317318584
get -0.817692708713708
use 1.138059331881418
quality 0.9579229588757723
but -0.644113150970597
best 0.6896325298616318
item -0.5983899209160026
very 1.0025820911343333
well 0.644135280463187
with 1.0656965531267084
out -0.7397697272660526
wa -0.9752808015840501
perfect 0.5914611716105562
fast 0.5372774836698725
have 0.5611432498519805
price 1.6712771540090017
great 2.7391949938314886
money -0.6055016664281294
memory 0.5409267841390364
buy -0.5712719454550148
after -1.173190416121651
not -3.1205878918624625
doe -0.6285856497408113
highly 0.6313710933233982
excellent 0.8605200838065036
love 0.610432716788226
thing -0.6275095449365159