In [26]:
import spacy
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pickle
import tensorflow_hub as hub
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
import numpy as np
nlp = spacy.load("en_core_web_lg")
import json
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [27]:


# getAspectDescription(text: string) => [{aspect: string, description: string}]

def getAspects(text):
    aspects = []
    doc = nlp(text)
    for sent in doc.sents:
        target = []
        for token in sent:
            if (token.dep_ == 'nsubj' or token.dep_ == 'dobj') and token.pos_ == 'NOUN' and token.ent_type_ == '':
                target.append(token.text)
        if target:
            aspects.extend(target)
    return aspects



In [39]:
def getSentiment(texts):
    model = pickle.load(open('SentimentModel/bigdata2modelNB.pkl', 'rb'))

    aspect_sentiments = []
    for text in texts:
        """
        function getSentiment(raw_text: string) -> (output: string, prediction: (polarity, subjectivity))

        This function takes in a string of raw text and performs sentiment analysis to determine whether the text is positive or negative. It returns a tuple consisting of the sentiment label and the positive probability of the prediction.

        Args:
            raw_text (str): The raw text to analyze.

        Returns:
            tuple: A tuple consisting of the sentiment label and the positive probability of the prediction.

        Example:
            >>> raw_text = "This product is amazing! I love it so much."
            >>> getSentiment(raw_text)
            ('Positive', 0.00819811, 0.99180189))
        """

        # Instantiate PorterStemmer
        p_stemmer = PorterStemmer()

        # Remove HTML
        review_text = BeautifulSoup(text).get_text()

        # Remove non-letters
        letters_only = re.sub("[^a-zA-Z]", " ", review_text)

        # Convert words to lower case and split each word up
        words = letters_only.lower().split()

        # Convert stopwords to a set
        stops = set(stopwords.words('english'))

        # Adding on stopwords that were appearing frequently in both positive and negative reviews
        stops.update(['app','shopee','shoppee','item','items','seller','sellers','bad'])

        # Remove stopwords
        meaningful_words = [w for w in words if w not in stops]

        # Stem words
        meaningful_words = [p_stemmer.stem(w) for w in meaningful_words]

        # Join words back into one string, with a space in between each word
        final_text = pd.Series(" ".join(meaningful_words))

        # Generate predictions
        pred = model.predict(final_text)[0]
        positive_prob = model.predict_proba([pd.Series.to_string(final_text)])

        pd.options.display.max_colwidth = None
        pd.options.display.max_rows = None
        print("TEXT: ", pd.Series.to_string(final_text))

        if pred == 1:
            output = "Negative"
        else:
            output = "Postive"
    
        aspect_sentiments.append([text, output, positive_prob])

    return aspect_sentiments
    #return output, positive_prob

In [42]:
model = pickle.load(open('SentimentModel/bigdata2modelNB.pkl', 'rb'))
model.predict_proba(['batteri charg quickli support wireless charg ad conveni'])

array([[0.47806649, 0.52193351]])

In [41]:
getSentiment(['The screen has a high resolution for sharp and clear images.', 'The battery charges quickly and supports wireless charging for added convenience.'])

TEXT:  0    screen high resolut sharp clear imag
TEXT:  0    batteri charg quickli support wireless charg ad conveni


[['The screen has a high resolution for sharp and clear images.',
  'Postive',
  array([[0.50733507, 0.49266493]])],
 ['The battery charges quickly and supports wireless charging for added convenience.',
  'Negative',
  array([[0.47806649, 0.52193351]])]]

In [15]:
def groupAspects(aspect_list):
    # Load pre-trained Word2Vec model
    word_model = KeyedVectors.load_word2vec_format('Aspect-Extraction\GoogleNews-vectors-negative300.bin', 
                                                binary=True, limit=500000)

    # Convert aspects to word vectors
    aspect_vectors = [word_model[aspect] for aspect in aspect_list]

    # Cluster word vectors using k-means
    kmeans = KMeans(n_clusters=7)
    kmeans.fit(aspect_vectors)
    clusters = kmeans.predict(aspect_vectors)

    # Find representative label for each cluster
    labels = []
    grouped_aspects = {}
    for i in range(kmeans.n_clusters):
        cluster_vectors = [aspect_vectors[j] for j in range(len(aspect_vectors)) if clusters[j] == i]
        centroid = np.mean(cluster_vectors, axis=0)
        distances = cdist([centroid], cluster_vectors, metric='cosine')
        closest_index = np.argmin(distances)
        label = aspect_list[np.where(clusters == i)[0][closest_index]]
        labels.append(label)
        grouped_aspects[label] = [aspect_list[j] for j in range(len(aspect_list)) if clusters[j] == i]

    return grouped_aspects

from sklearn.metrics.pairwise import cosine_similarity

def group_sentiments(aspect_sentiments, grouped_aspects, embedder):
    result = {}
    for label, aspects in grouped_aspects.items():
        result[label] = []
        aspect_embeddings = embedder(aspects)
        for text, output, positive_prob in aspect_sentiments:
            text_embedding = embedder([text])[0]
            similarities = cosine_similarity(text_embedding.reshape(1,-1), aspect_embeddings)
            if max(similarities[0]) > 0.289:
                result[label].append((text, output, positive_prob))
    return result

def embedder(texts):
    return embed(texts).numpy()

In [16]:
sentences = [
    "The device has a large and vibrant display that makes everything look great.",
    "Its camera takes stunning photos with vivid colors and sharp details.",
    "The battery life is impressive and lasts all day with heavy use.",
    "The size is perfect for one-handed use and fits comfortably in a pocket.",
    "The screen is responsive and easy to navigate with intuitive gestures.",
    "The pictures captured by the camera are of professional quality.",
    "The life of the device is extended by its durable construction and regular software updates.",
    "The colors on the display are accurate and true to life.",
    "The performance is smooth and fast, even when running multiple apps at once.",
    "The design is sleek and modern, with a premium feel.",
    "The display is protected by scratch-resistant glass for added durability.",
    "The camera has advanced features such as portrait mode and night mode for stunning photos in any lighting condition.",
    "The battery charges quickly and supports wireless charging for added convenience.",
    "The size of the device is perfect for watching videos and playing games.",
    "The screen has a high resolution for sharp and clear images."
]

In [17]:
new_text = ' '.join(sentences)
aspect_list = getAspects(new_text)
group_aspects = groupAspects(aspect_list)
print(group_aspects)



{'camera': ['camera', 'camera'], 'display': ['display', 'life', 'colors', 'performance', 'design', 'features', 'charging'], 'apps': ['device', 'apps'], 'size': ['size', 'size'], 'screen': ['screen', 'screen'], 'photos': ['photos', 'pictures', 'videos'], 'resolution': ['resolution']}


In [30]:
model = pickle.load(open('SentimentModel/bigdata2modelNB.pkl', 'rb'))
model.predict_proba(['The battery charges quickly and supports wireless charging for added convenience.'])

array([[0.58097448, 0.41902552]])

In [18]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Instantiate the sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# Print a negative review in the training set
print('The screen has a high resolution for sharp and clear images.')

# VADER's polarity scores for the negative review
sia.polarity_scores('The screen has a high resolution for sharp and clear images.')

The screen has a high resolution for sharp and clear images.


{'neg': 0.0, 'neu': 0.794, 'pos': 0.206, 'compound': 0.3818}

In [19]:
getSentiment(['The screen has a high resolution for sharp and clear images.', 'The battery charges quickly and supports wireless charging for added convenience.'])

[['The screen has a high resolution for sharp and clear images.',
  'Postive',
  array([[0.50733507, 0.49266493]])],
 ['The battery charges quickly and supports wireless charging for added convenience.',
  'Negative',
  array([[0.366848, 0.633152]])]]

In [20]:
sentiments = getSentiment(sentences)
print(embedder(sentences))
group_sentences = group_sentiments(sentiments, group_aspects, embedder)
for label, sentences in group_sentences.items():
    print(f"{label}:")
    for sentence in sentences:
        print(f"  {sentence}")

[[-0.04116534  0.03324282 -0.03086034 ...  0.03107716  0.01527658
   0.03016139]
 [-0.02420701  0.04582939  0.01411261 ...  0.01786607 -0.02259821
  -0.02726042]
 [-0.03270246 -0.01961841 -0.00080166 ...  0.02664647  0.00477472
   0.02659776]
 ...
 [ 0.02352348 -0.0133323  -0.00828289 ...  0.04613282 -0.07519539
   0.05771908]
 [-0.0283571  -0.03624243 -0.03549223 ... -0.00527885 -0.00512973
   0.01658252]
 [-0.01287641  0.02537684  0.00207782 ...  0.03250806  0.00827598
  -0.01529129]]
camera:
  ('Its camera takes stunning photos with vivid colors and sharp details.', 'Postive', array([[0.58097448, 0.41902552]]))
  ('The pictures captured by the camera are of professional quality.', 'Postive', array([[0.64088682, 0.35911318]]))
display:
  ('The device has a large and vibrant display that makes everything look great.', 'Postive', array([[0.68497571, 0.31502429]]))
  ('The performance is smooth and fast, even when running multiple apps at once.', 'Postive', array([[0.67877738, 0.3212226

In [21]:
print(group_aspects)
for item in group_sentences:
    print(item)
for sentiment in getSentiment(sentences):
    positive_prob = sentiment[2]
    print(type(positive_prob))

{'camera': ['camera', 'camera'], 'display': ['display', 'life', 'colors', 'performance', 'design', 'features', 'charging'], 'apps': ['device', 'apps'], 'size': ['size', 'size'], 'screen': ['screen', 'screen'], 'photos': ['photos', 'pictures', 'videos'], 'resolution': ['resolution']}
camera
display
apps
size
screen
photos
resolution


TypeError: expected string or bytes-like object, got 'tuple'