# Parameters

In [None]:
# Upper bound on the number of records read from the source file
max_records_to_read = 1_000_000

# Reading Data

In [None]:
import os
from google.colab import drive

# Mounting drive: attach Google Drive to the Colab filesystem at /content/drive
drive.mount('/content/drive')

# Changing directory: later cells open files with paths relative to this folder
# NOTE(review): Colab usually exposes personal files under
# '/content/drive/MyDrive/...' — confirm this path resolves in the target runtime.
project_path = '/content/drive/Aspect Based Sentiment Analysis'
os.chdir(project_path)

# Echo the working directory so a wrong path is noticed immediately
print("Current working directory:", os.getcwd())

In [13]:
import json
from tqdm import tqdm

all_records = []

# Fields that are not used downstream and are removed from every record
fields_to_drop = ("images", "user_id", "asin", "parent_asin")

# Opening file: stream the JSON Lines file one record at a time.
# The encoding is pinned so parsing does not depend on the platform default.
file_path = "Data/Cell_Phones_and_Accessories.jsonl"
with open(file_path, 'r', encoding='utf-8') as fp:

    for line in tqdm(fp):

        # Checking if max records reached
        if len(all_records) >= max_records_to_read:
            break

        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)

        # Dropping unnecessary information; the default avoids a KeyError
        # when a record does not carry one of the fields
        for field in fields_to_drop:
            record.pop(field, None)

        # Storing record
        all_records.append(record)

print(f"Total records: {len(all_records)}")

1000000it [00:17, 58303.10it/s]

Total records: 1000000





In [15]:
import pandas as pd

# Build a DataFrame from the list of record dicts and preview its first rows
df = pd.DataFrame(all_records)
print(df.head(n=5).to_markdown())

# The raw list is not needed once the DataFrame exists; drop the name to save memory
del all_records

|    |   rating | title                               | text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [16]:
# Display the DataFrame dimensions as (rows, columns)
df.shape

(1000000, 6)

In [17]:
# Persist the loaded records as CSV, without writing the row index
newfile_path = "Data/loaded_data.csv"
df.to_csv(path_or_buf=newfile_path, index=False)
print(f"DataFrame saved to {newfile_path}")

DataFrame saved to Data/loaded_data.csv


# Data Preparation

In [None]:
# Checking data types and null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n\n\n")

# Statistical summary, rendered as a markdown table
print(df.describe().to_markdown())

In [None]:
# Discard rows without review text, then keep the first row per distinct text
df.dropna(subset=['text'], inplace=True)
df.drop_duplicates(subset=['text'], inplace=True)

# Timestamp: milliseconds -> whole seconds -> datetime
df['timestamp'] = pd.to_datetime(df['timestamp'] // 1000, unit='s')

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used for tokenizing, stop-word removal and lemmatizing
import nltk
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Shared helpers used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw string for the downstream text analysis.

    Steps: lowercase, remove every character that is neither a word character
    nor whitespace, tokenize, drop English stop words, lemmatize the rest.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or the float ``NaN``
            that pandas uses for missing cells) are treated as empty text.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when the input is not a string.
    """
    # Missing cells reach .apply() as None/NaN, which have no .lower();
    # return empty text instead of aborting the whole column transform.
    if not isinstance(text, str):
        return ''

    # Lowercase, then remove punctuation and special characters
    normalized = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize, remove stopwords and lemmatize
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(normalized)
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Run the same preprocessing routine over the title and the review text
for raw_column, cleaned_column in (('title', 'cleaned_title'), ('text', 'cleaned_text')):
    df[cleaned_column] = df[raw_column].apply(preprocess_text)

# One field holding the cleaned title followed by the cleaned text
df['combined_text'] = df['cleaned_title'] + ' ' + df['cleaned_text']

In [None]:
# Remove every row that still has a missing value in any column
df.dropna(axis=0, how='any', inplace=True)

# Data Analysis

## a) Sentiment Analysis

In [None]:
from textblob import TextBlob

# Sentiment labelling helper
def get_sentiment(text):
    """Label ``text`` as 'positive', 'negative' or 'neutral'.

    The label follows the sign of the TextBlob polarity score: above zero is
    positive, below zero is negative, anything else is neutral.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

# Label every review from its combined cleaned text, then preview the result
# NOTE(review): combined_text has had stop words removed by preprocess_text;
# NLTK's English list appears to include negations such as "not", so polarity
# may be computed without them — confirm this is intended.
df['sentiment'] = df['combined_text'].apply(get_sentiment)
sentiment_preview = df[['sentiment', 'combined_text']].head(n=5)
print(sentiment_preview.to_markdown())

In [None]:
import matplotlib.pyplot as plt

sentiment_counts = df['sentiment'].value_counts()

# value_counts() orders labels by frequency, so a positional colour list would
# attach colours to whichever label happens to rank first. Colours are keyed
# by label instead so each sentiment keeps the same colour on every run.
sentiment_colors = {
    'positive': 'lightgreen',
    'negative': 'lightcoral',
    'neutral': 'skyblue',
}
slice_colors = [sentiment_colors.get(label, 'lightgrey') for label in sentiment_counts.index]

# Create the pie chart
plt.figure(figsize=(5, 5))
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=0, colors=slice_colors)
plt.title('Sentiment Distribution', fontsize=16)
plt.axis('equal')
plt.show()

In [None]:
import seaborn as sns

# Mean rating for each sentiment label, as a two-column frame
sentiment_rating = df.groupby('sentiment')['rating'].mean().reset_index()

# Bar chart of the average rating per sentiment
rating_fig, rating_ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=sentiment_rating, x='sentiment', y='rating', palette='viridis', ax=rating_ax)
rating_ax.set_title('Average Rating by Sentiment', fontsize=16)
rating_ax.set_xlabel('Sentiment', fontsize=12)
rating_ax.set_ylabel('Average Rating', fontsize=12)
plt.tight_layout()
plt.show()

# Box plot of how ratings are spread within each sentiment
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='sentiment', y='rating', palette='viridis', ax=box_ax)
box_ax.set_title('Rating Distribution Across Sentiments', fontsize=16)
box_ax.set_xlabel('Sentiment', fontsize=12)
box_ax.set_ylabel('Rating', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Mean number of helpful votes for each sentiment label
helpful_votes_sentiment = df.groupby('sentiment')['helpful_vote'].mean().reset_index()

votes_fig, votes_ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=helpful_votes_sentiment, x='sentiment', y='helpful_vote', palette='viridis', ax=votes_ax)
votes_ax.set_title('Average Helpful Votes by Sentiment', fontsize=16)
votes_ax.set_xlabel('Sentiment', fontsize=12)
votes_ax.set_ylabel('Average Helpful Votes', fontsize=12)
plt.tight_layout()
plt.show()

# Review counts per (year, sentiment): years as rows, one column per sentiment
df['year'] = df['timestamp'].dt.year
sentiment_year = df.groupby(['year', 'sentiment']).size().unstack(fill_value=0)

# One line per sentiment over the years
year_ax = sentiment_year.plot.line(figsize=(10, 5), colormap='viridis', marker='o')
year_ax.set_title('Sentiment Distribution Over Time (Years)', fontsize=16)
year_ax.set_xlabel('Year', fontsize=12)
year_ax.set_ylabel('Number of Reviews', fontsize=12)
year_ax.legend(title='Sentiment')
plt.tight_layout()
plt.show()


In [None]:
from nltk.corpus import wordnet, sentiwordnet as swn, stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

# Fetch the NLTK resources used for POS tagging and SentiWordNet lookups
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'sentiwordnet',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Map a POS tag onto a WordNet part-of-speech constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching the first letter of ``treebank_tag``.

    Tags starting with 'J', 'V', 'N' and 'R' map to the adjective, verb, noun
    and adverb constants respectively; any other tag yields ``None``.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return None

# Filter words based on emotional content
def filter_emotional_words(texts):
    """Collect the sentiment-bearing words found in ``texts``.

    Each text is lowercased, tokenized and POS-tagged. Every token whose tag
    maps to a WordNet part of speech is looked up in SentiWordNet by its first
    sense (``<word>.<pos>.01``). Words with an objectivity score of at most
    0.49 are kept and assigned to the side whose score is larger; words with
    equal positive and negative scores are dropped.

    Args:
        texts: Iterable of strings.

    Returns:
        Tuple ``(positive_words, negative_words)`` of lists. A word is added
        once per qualifying occurrence, so duplicates are preserved.
    """
    positive_words = []
    negative_words = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        for word, pos_tag in nltk.pos_tag(tokens):
            pos = get_wordnet_pos(pos_tag)
            if not pos:
                continue

            # Only the "unknown to WordNet" case is skipped. A bare except here
            # would also swallow KeyboardInterrupt and missing-corpus errors.
            try:
                synset = swn.senti_synset(f"{word}.{pos}.01")
            except nltk.corpus.reader.wordnet.WordNetError:
                continue

            # senti_synset returns None when WordNet knows the sense but
            # SentiWordNet has no scores for it.
            if synset is None:
                continue

            # Filter out neutral/objective words
            if synset.obj_score() > 0.49:
                continue

            positive_score = synset.pos_score()
            negative_score = synset.neg_score()
            if positive_score > negative_score:
                positive_words.append(word)
            elif negative_score > positive_score:
                negative_words.append(word)
    return positive_words, negative_words

# Separate texts by sentiment
sentiment_texts = {
    sentiment: df.loc[df['sentiment'] == sentiment, 'combined_text'].tolist()
    for sentiment in df['sentiment'].unique()
}

# The stop-word set does not change between panels; build it once
wordcloud_stopwords = set(stopwords.words('english'))

# Prepare word clouds.
# squeeze=False always yields a 2-D array of Axes, so the loop also works when
# only one sentiment label is present (plt.subplots would otherwise return a
# single Axes object that cannot be zipped).
fig, axes = plt.subplots(
    1, len(sentiment_texts), figsize=(18, 6), sharex=True, sharey=True, squeeze=False
)

for ax, (sentiment, texts) in zip(axes.ravel(), sentiment_texts.items()):
    # Focus on positive words for now
    positive_words = filter_emotional_words(texts)[0]

    if positive_words:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis',
            stopwords=wordcloud_stopwords
        ).generate(' '.join(positive_words))
        ax.imshow(wordcloud, interpolation='bilinear')
    else:
        # WordCloud.generate() raises ValueError when given no words; show a
        # placeholder so one empty group does not abort the whole figure.
        ax.text(0.5, 0.5, 'No emotional words found', ha='center', va='center',
                transform=ax.transAxes)

    ax.axis('off')
    ax.set_title(f"Sentiment: {sentiment.capitalize()}", fontsize=14)

plt.tight_layout()
plt.show()


# Topic Modeling Analysis

In [None]:
from bertopic import BERTopic

# BERTopic
# The documents are passed as a plain list of strings rather than a pandas
# Series whose index has gaps after the earlier row drops.
topic_model = BERTopic(verbose=True, nr_topics=5)
topics, probs = topic_model.fit_transform(df['combined_text'].tolist())

# Name of the topics: the three highest-ranked words of each topic.
# Iterating over the topics the model actually produced (including the outlier
# topic -1 when present) replaces a hard-coded id range, which breaks when an
# id is missing because get_topic() then returns False instead of a word list.
topic_names = {}
for topic_id, topic_words in topic_model.get_topics().items():
    top_words = [word for word, score in topic_words[:3]]
    topic_names[topic_id] = " ".join(top_words).strip()

# Print the topic names
print(topic_names)

# Add the topics back to the DataFrame; `topics` is in the same order as the
# documents passed to fit_transform, so positional assignment lines up.
df['Topic'] = topics
df['Dominant_Topic'] = df.Topic.map(topic_names)
df.drop(columns=['Topic'], inplace=True)

In [None]:
# Bar charts of the leading words for the top five topics
print("\nVisualizing Top Topics...")
barchart_figure = topic_model.visualize_barchart(top_n_topics=5)
barchart_figure.show()

# Plot how the topics relate to one another
topics_figure = topic_model.visualize_topics()
topics_figure.show()

In [None]:
# Count reviews for every (topic, sentiment) pair, then pivot so that topics
# are rows and sentiments are columns; absent pairs are filled with 0
topic_sentiment_counts = df.groupby(['Dominant_Topic', 'sentiment']).size()
sentiment_summary = topic_sentiment_counts.unstack(fill_value=0)

print("\nSentiment Summary by Topic:")
print(sentiment_summary.to_markdown())

In [None]:
# Stacked bars: one bar per topic, segmented by sentiment.
# Despite its name, `fig` holds the Axes object returned by DataFrame.plot.
fig = sentiment_summary.plot(kind='bar', rot=90, figsize=(8, 6), stacked=True, colormap='viridis')
fig.set_title('Sentiment Distribution by Topics')
fig.set_ylabel('Number of Reviews')
fig.set_xlabel('Topics')
plt.tight_layout()
plt.show()

# Machine Learning Model

## a) Unbalanced Data Modelling

In [None]:
# Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Converting text to numbers using TFIDF.
# The matrix is kept in the sparse format that fit_transform returns. A dense
# copy stores rows x 5000 floats, which can exhaust memory at the row counts
# this notebook loads; train_test_split and the scikit-learn estimators
# imported above accept sparse input directly.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(df['combined_text'])
y = df['sentiment']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers, keyed by the display name used in the reports
models = {}
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Logistic Regression"] = LogisticRegression(random_state=42, max_iter=1000)
models["Naive Bayes"] = MultinomialNB()
models["K-Nearest Neighbors"] = KNeighborsClassifier()

# Evaluation function with confusion matrix plotting
def evaluate_model(name, model, X_test, y_test):
    """Score a fitted classifier on held-out data, print a report and plot its confusion matrix.

    Args:
        name: Display name used in the printed report and the plot title.
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix of the evaluation samples.
        y_test: True labels of the evaluation samples.

    Returns:
        dict with the keys ``model``, ``accuracy``, ``precision``, ``recall``
        and ``f1_score``; precision, recall and F1 use weighted averaging.
    """
    # Predict once; the same predictions feed every metric and the plot
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")

    # Print metrics
    print(f"Model: {name}")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Plot confusion matrix from the predictions already computed;
    # from_estimator would run model.predict(X_test) a second time.
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='viridis')
    plt.title(f'Confusion Matrix for {name}')
    plt.show()

    return {"model": name, "accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}

# Fit each candidate on the training split and collect its test-set metrics
results = []
for name in models:
    model = models[name]
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    model_metrics = evaluate_model(name, model, X_test, y_test)
    results.append(model_metrics)