In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
%matplotlib widget

In [None]:
# Load the review table from the CSV in the working directory. Later cells
# read its "Product Name", "Reviews" and "Rating" columns.
df = pd.read_csv("filtered_df.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

def get_sentence_embedding(sentence, model, tokenizer):
    """Embed ``sentence`` as the last-layer hidden state at token position 0.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model`` with gradient tracking disabled, and the hidden state at the
    first sequence position (the CLS slot) is returned.

    Args:
        sentence: Text to embed; handed to ``tokenizer`` unchanged.
        model: Callable whose output exposes ``last_hidden_state``. If it has
            a ``device`` attribute, the tokenized inputs are moved to that
            device before the forward pass.
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            invoked with ``return_tensors="pt"``.

    Returns:
        numpy.ndarray: 2-D array obtained by slicing ``last_hidden_state`` as
        ``[:, 0, :]``; the first axis is the model's batch axis.
    """
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)

    # Inputs must live on the same device as the model's weights, otherwise
    # the forward pass fails for a model that was moved to an accelerator.
    device = getattr(model, "device", None)
    if device is not None:
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**tokens)

    cls_embedding = outputs.last_hidden_state[:, 0, :]

    # .numpy() only works on CPU tensors; .cpu() is a no-op when already there.
    return cls_embedding.cpu().numpy()

# Load the pre-trained DistilBERT checkpoint and its matching tokenizer.
# Both are module-level names reused by later cells.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Sample sentence used to smoke-test the embedding helper.
example_sentence = "This mobile phone was very good"

# Embed the sample sentence with the helper defined above.
sentence_embedding = get_sentence_embedding(example_sentence, model, tokenizer)

# get_sentence_embedding already returns a NumPy array, so no conversion
# happens here; this only binds the same array to a second name.
sentence_embedding_np = sentence_embedding


In [None]:
# Review counts per product, most-reviewed first (value_counts sorts descending).
product_counts = df["Product Name"].value_counts()
counts = product_counts.reset_index()

# Take the product name from the value_counts index rather than from a column
# of the reset frame: on pandas < 2.0 reset_index() names the label column
# "index" and the count column "Product Name", so counts.iloc[100]["Product Name"]
# would return a count instead of a name there.
target_product = product_counts.index[100]

# .copy() gives an independent frame, so the column assignments on `subset`
# do not write into a slice of `df` (avoids SettingWithCopyWarning).
subset = df[df["Product Name"] == target_product].copy()

# One embedding per review; this runs the transformer once per row.
subset["embedding"] = subset["Reviews"].apply(get_sentence_embedding, args=(model, tokenizer))
subset

In [None]:

# Fixed seed so cluster labels (and every plot / term list derived from them)
# are the same after Restart & Run All.
RANDOM_STATE = 42

# Stack the per-review embedding arrays row-wise into one matrix.
embed = np.vstack(subset["embedding"].to_numpy())

# 3-component projection, used only for the 3-D scatter plot.
pca = PCA(n_components=3, random_state=RANDOM_STATE)
project = pca.fit_transform(embed)

# Clustering runs on the full embeddings, not on the PCA projection.
# n_init is set explicitly because its default differs between scikit-learn
# releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(embed)
subset["cluster"] = clusters

In [None]:
# 3-D scatter of the PCA projection, one point per review, coloured by the
# k-means cluster label.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(projection="3d")
ax.set(xlabel='X Axis', ylabel='Y Axis', zlabel='Z Axis')
ax.scatter3D(*(project[:, axis] for axis in range(3)), c=clusters)


In [None]:
# Stacked histogram of ratings, split by k-means cluster.
plt.figure(figsize=(10, 6))
sns.histplot(
    subset,
    x='Rating',
    hue='cluster',
    multiple="stack",
    bins=5,
    ax=plt.gca(),  # the figure created just above
)
plt.show()

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import nltk
# Fetch the NLTK resources used in this cell: the stop-word lists and the
# Punkt tokenizer data behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt data from the 'punkt_tab' package
# instead of 'punkt'; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')

def preprocess(doc):
    """Turn one review into a list of stemmed, lower-cased content words.

    Tokens are kept only if they are purely alphabetic and, once lower-cased,
    are not in NLTK's English stop-word list; the survivors are Porter-stemmed.

    Args:
        doc: Review text. Any non-string value (e.g. ``None`` or the float
            NaN pandas uses for a missing cell) is treated as an empty review.

    Returns:
        list[str]: Stems in their original order; ``[]`` for non-string input.
    """
    # Missing reviews arrive as NaN/None; word_tokenize would raise on them.
    if not isinstance(doc, str):
        return []

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Lower-case once, then filter and stem.
    lowered = (token.lower() for token in word_tokenize(doc) if token.isalpha())
    return [stemmer.stem(token) for token in lowered if token not in stop_words]

def calculate_term_frequency(documents):
    """Count how often each term occurs across a collection of documents.

    Args:
        documents: Iterable of documents; each one is fed to
            ``Counter.update`` (in this notebook, a list of tokens).

    Returns:
        collections.Counter: Total occurrences of every term over all
        documents; empty when ``documents`` is empty.
    """
    totals = Counter()
    for tokens in documents:
        # update() adds to the running tallies rather than replacing them.
        totals.update(tokens)
    return totals

# Tokenize, filter and stem every review.
subset["preprocessed"] = subset["Reviews"].apply(preprocess)

# Derive the cluster labels from the data instead of hard-coding 3, so this
# cell keeps working if the number of clusters is changed.
cluster_ids = sorted(subset["cluster"].unique())

# One term-frequency Counter per cluster, in label order.
tf_group = [
    calculate_term_frequency(subset.loc[subset["cluster"] == cluster_id, "preprocessed"])
    for cluster_id in cluster_ids
]

# For each cluster: the (term, count) pairs whose count is strictly greater
# than in every other cluster, sorted by count, highest first.
# A Counter returns 0 for a term it has never seen, so unseen terms compare as 0.
terms_group_more = [
    sorted(
        (
            (term, count)
            for term, count in own.items()
            if all(count > other[term] for j, other in enumerate(tf_group) if j != i)
        ),
        key=lambda item: item[1],
        reverse=True,
    )
    for i, own in enumerate(tf_group)
]

In [None]:
# (term, count) pairs for the first cluster: terms counted more often there
# than in either other cluster, highest count first.
terms_group_more[0]

In [None]:
import plotly.express as px

# Hover labels: up to five top terms of the first cluster, rendered as
# "term (count)" strings instead of raw (term, count) tuples.
labels = [f"{term} ({count})" for term, count in terms_group_more[0][0:5]]
n_points = len(labels)

# Placeholder coordinates for the demo. They are trimmed to the number of
# labels because plotly express requires all columns to have equal length,
# and the cluster may have fewer than five distinctive terms.
data = {
    'x': [1, 2, 3, 4, 5][:n_points],
    'y': [2, 3, 5, 4, 1][:n_points],
    'z': [3, 1, 2, 4, 5][:n_points],
    'labels': labels
}

# Create a 3D scatter plot with hover text
fig = px.scatter_3d(data, x='x', y='y', z='z', hover_name='labels')

# Set layout options for better readability
fig.update_layout(
    title="Interactive 3D Scatter Plot with Hover Text",
    scene=dict(
        xaxis_title="X-axis",
        yaxis_title="Y-axis",
        zaxis_title="Z-axis"
    )
)

# Show the plot
fig.show()