## Stentiment analysis for Amazon product reviews.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import random
import nltk
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np

Since we are working with textual input data, we need to convert the data over to a pandas dataframe from the current format. 

In [2]:
path = "./sentiment-labelled-sentences/sentiment labelled sentences/amazon_cells_labelled.txt"
f = open(path, "r")

data =[]
# Converting it to pandas dataframe
for line in f:
    review = line[:len(line) - 2]
    sentiment = "neg" if line[len(line)-2] == "0" else "pos"
    row = [review, sentiment]
    data.append(row)

df = pd.DataFrame(data, columns = ['reviews', 'sentiment'])
df.head()

Unnamed: 0,reviews,sentiment
0,So there is no way for me to plug it in here i...,neg
1,"Good case, Excellent value.\t",pos
2,Great for the jawbone.\t,pos
3,Tied to charger for conversations lasting more...,neg
4,The mic is great.\t,pos


In [3]:
df.shape

(1000, 2)

With our dataframe made, we now need to clean it before analyzing. Apply the `remove_punctuation()` and `remove_stopwords()` functions on our dataset to clean it. This reduces the size of the data and thus helps in faster operations.

In [4]:
#Uncomment the line below if you need to download the stopwords
#nltk.download('stopwords')
stop = stopwords.words('english')

def remove_punctuation(text):
    translator = str.maketrans('','', string.punctuation)
    return text.translate(translator)

def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(text)

df['reviews'] = df['reviews'].apply(remove_punctuation).apply(remove_stopwords)
df.head()

Unnamed: 0,reviews,sentiment
0,way plug us unless go converter,neg
1,good case excellent value,pos
2,great jawbone,pos
3,tied charger conversations lasting 45 minutesm...,neg
4,mic great,pos


We need to adjust our data slightly before using LDA. In the cell below, use the `CountVectorizer()` function. Then, use `fit_transform()` with `df['reviews']` as a parameter

In [5]:
vect = CountVectorizer(max_features = 5000, max_df=.15)
X = vect.fit_transform(df['reviews'])

Using the `LatenDirichletAllocation()` function below, we want to pass it 10 components. 

In [6]:
lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=25, random_state=0) 
document_topics = lda.fit_transform(X)

In [7]:
print(lda.components_.shape)

(10, 1788)


In [8]:
document_topics

array([[0.87141694, 0.0142878 , 0.01428764, ..., 0.01428817, 0.01428655,
        0.01428601],
       [0.02000096, 0.0200025 , 0.02000158, ..., 0.02000055, 0.02000212,
        0.81998241],
       [0.03334133, 0.03333333, 0.03333919, ..., 0.03333333, 0.03333333,
        0.29768756],
       ...,
       [0.02500393, 0.02500102, 0.025     , ..., 0.02500125, 0.025001  ,
        0.025005  ],
       [0.01428746, 0.01428571, 0.01428599, ..., 0.01428603, 0.01428745,
        0.87142414],
       [0.21770012, 0.01666792, 0.01666897, ..., 0.01666814, 0.01667184,
        0.01666977]])

And finally' let's see the results! Call the `print_topics()` function below, passing in `feature_names` and `sorting`.

In [9]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
print(len(sorting))
print(sorting)

10
[[1767  697  717 ...  668 1592 1392]
 [1469 1261 1238 ... 1072 1592 1392]
 [ 697 1667 1274 ...  515  668 1392]
 ...
 [ 461  998 1709 ... 1592 1524 1392]
 [1764  893  439 ...  515 1072 1392]
 [ 697  547 1238 ... 1592  515 1392]]


In [10]:
feature_names = np.array(vect.get_feature_names_out())
print(len(feature_names))
print(feature_names)

1788
['10' '100' '11' ... 'youll' 'z500a' 'zero']


In [11]:
def print_topics(topics, feature_names, sorting, topics_per_chunk, n_words):
    for i in range(0, len(topics), topics_per_chunk):
        chunk_indices = range(i, min(i + topics_per_chunk, len(topics)))
        print(*chunk_indices)
        print(' '.join(map(str, chunk_indices)))
        print(' '.join('{:<14}'.format('topic ' + str(i)) for i in chunk_indices))
        print(' '.join('{:<14}'.format('--------') for _ in chunk_indices))

        for j in range(n_words):
            words = []
            for topic_idx in chunk_indices:
                words.append(feature_names[sorting[topic_idx, j]])
            print(' '.join('{:<14}'.format(word) for word in words))
        print('\n')
print_topics(topics=range(10), feature_names=feature_names, sorting=sorting, topics_per_chunk=5, n_words=10)

0 1 2 3 4
0 1 2 3 4
topic 0        topic 1        topic 2        topic 3        topic 4       
--------       --------       --------       --------       --------      
works          sound          great          recommend      battery       
great          really         use            would          good          
happy          quality        reception      service        horrible      
easy           good           make           customer       software      
battery        headset        car            one            life          
junk           product        like           highly         also          
use            bad            new            ear            product       
piece          well           working        right          cell          
cheap          bluetooth      product        stay           never         
item           service        light          terrible       nice          


5 6 7 8 9
5 6 7 8 9
topic 5        topic 6        topic 7        topic 8      

In [12]:
import string
import random
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Load the datasets
path1 = "./sentiment-labelled-sentences/sentiment labelled sentences/amazon_cells_labelled.txt"
path2 = "./sentiment-labelled-sentences/sentiment labelled sentences/imdb_labelled.txt"
path3 = "./sentiment-labelled-sentences/sentiment labelled sentences/yelp_labelled.txt"

Lemmatization is the process of reducing a word to its base or root form, called a lemma, by removing suffixes and prefixes. Unlike stemming, lemmatization ensures that the resulting word is a valid word in the language, often using a vocabulary and morphological analysis.

In [14]:
# Function to clean text
lemmer = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [15]:
# Load and clean the data
sent_tokens = []

# Read data from files and tokenize
for path in [path1, path2, path3]:
    with open(path, "r") as file:
        for line in file:
            review = line.strip()[:-2]  # Remove trailing newline and sentiment label
            sent_tokens.append(LemNormalize(review))

In [16]:
sent_tokens

[['so',
  'there',
  'is',
  'no',
  'way',
  'for',
  'me',
  'to',
  'plug',
  'it',
  'in',
  'here',
  'in',
  'the',
  'u',
  'unless',
  'i',
  'go',
  'by',
  'a',
  'converter'],
 ['good', 'case', 'excellent', 'value'],
 ['great', 'for', 'the', 'jawbone'],
 ['tied',
  'to',
  'charger',
  'for',
  'conversation',
  'lasting',
  'more',
  'than',
  '45',
  'minutesmajor',
  'problem'],
 ['the', 'mic', 'is', 'great'],
 ['i',
  'have',
  'to',
  'jiggle',
  'the',
  'plug',
  'to',
  'get',
  'it',
  'to',
  'line',
  'up',
  'right',
  'to',
  'get',
  'decent',
  'volume'],
 ['if',
  'you',
  'have',
  'several',
  'dozen',
  'or',
  'several',
  'hundred',
  'contact',
  'then',
  'imagine',
  'the',
  'fun',
  'of',
  'sending',
  'each',
  'of',
  'them',
  'one',
  'by',
  'one'],
 ['if', 'you', 'are', 'razr', 'owneryou', 'must', 'have', 'this'],
 ['needle', 'to', 'say', 'i', 'wasted', 'my', 'money'],
 ['what', 'a', 'waste', 'of', 'money', 'and', 'time'],
 ['and', 'the', 'so

In [17]:
# Default greeting messages
GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greeting(sentence):
    # Check if any greeting word is present in the user's input
    for word in GREETING_INPUTS:
        if word.lower() in sentence.lower():
            return random.choice(GREETING_RESPONSES)
    return None  # No greeting word found

In [19]:
import warnings
warnings.filterwarnings('ignore')

# Implement the chatbot
def response(user_response):
    robo_response = ''
    sent_tokens.append(LemNormalize(user_response))
    tfidfvec = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, stop_words='english')
    tfidf = tfidfvec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)
    idx = vals.argsort()[0][-2]
    flat = vals.flatten()
    flat.sort()
    similarity = flat[-2]
    if similarity == 0:
        robo_response = robo_response + "I am sorry! I don't understand you"
        return robo_response
    else:
        robo_response = robo_response + ' '.join(sent_tokens[idx])
        return robo_response

# Initialize the chatbot
flag = True
print("ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type 'bye'!")

while flag:
    user_response = input()
    user_response = user_response.lower()
    if user_response != 'bye':
        if user_response == 'thanks' or user_response == 'thank you':
            flag = False
            print("ROBO: You are welcome..")
        else:
            print("ROBO: ", end="")
            print(response(user_response))
            sent_tokens.remove(LemNormalize(user_response))
    else:
        flag = False
        print("ROBO: Bye! take care..")

ROBO: My name is Robo. I will answer your queries about Chatbots. If you want to exit, type 'bye'!
ROBO: not even a hello we will be right with you
ROBO: Bye! take care..
