# Parameters

In [None]:
# Upper bound on the number of JSONL records loaded by the reading cell
max_records_to_read = 1_000_000

# Reading Data

In [None]:
import os
from google.colab import drive

# Mounting drive
# Makes Google Drive available inside the Colab runtime under /content/drive.
drive.mount('/content/drive')

# Changing directory
# Every later relative path ("Data/...") is resolved against this folder.
# NOTE(review): Colab normally exposes personal Drive files under
# /content/drive/MyDrive/... — confirm that this path exists after mounting.
project_path = '/content/drive/Aspect Based Sentiment Analysis'
os.chdir(project_path)

# Echo the working directory so a wrong path is noticed immediately
print("Current working directory:", os.getcwd())

In [13]:
import json
from tqdm import tqdm

all_records = []

# Fields that are not needed downstream and are removed from every record
fields_to_drop = ("images", "user_id", "asin", "parent_asin")

# Opening file (JSON Lines: one JSON object per line).
# The encoding is explicit so the result does not depend on the platform default.
file_path = "Data/Cell_Phones_and_Accessories.jsonl"
with open(file_path, 'r', encoding='utf-8') as fp:

    for line in tqdm(fp):

        # Checking if max records reached
        if len(all_records) >= max_records_to_read:
            break

        # Skipping blank lines (e.g. a trailing newline), which json.loads rejects
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)

        # Dropping unnecessary information; the default keeps a record that
        # lacks one of these keys from raising KeyError
        for field in fields_to_drop:
            record.pop(field, None)

        # Storing record
        all_records.append(record)

print(f"Total records: {len(all_records)}")

1000000it [00:17, 58303.10it/s]

Total records: 1000000





In [15]:
import pandas as pd

# Build one DataFrame from the collected record dicts and preview the first rows
df = pd.DataFrame(all_records)
print(df.head().to_markdown())

# The DataFrame now holds the data, so release the raw list to free memory
del all_records

|    |   rating | title                               | text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [16]:
# Display the DataFrame dimensions as (rows, columns)
df.shape

(1000000, 6)

In [17]:
# Saving the DataFrame to a CSV
# index=False keeps the row index out of the file.
newfile_path = "Data/loaded_data.csv"
df.to_csv(newfile_path, index=False)
print(f"DataFrame saved to {newfile_path}")

DataFrame saved to Data/loaded_data.csv


# Data Preparation

In [None]:
# Checking data types and null values
# df.info() prints its report itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df.info()
print("\n\n\n")

# Statistical summary
print(df.describe().to_markdown())

In [None]:
# Dropping rows with null values
df.dropna(subset=['text'], inplace=True)

# Converting the epoch timestamp from milliseconds to whole seconds and then
# to datetime. The dtype guard makes the cell safe to re-run: once the column
# is already datetime, the integer division would raise a TypeError.
if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
    df['timestamp'] = pd.to_datetime(df['timestamp'] // 1000, unit='s')

# Dropping duplicates (rows sharing the same review text; the first is kept)
df.drop_duplicates(subset=['text'], inplace=True)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed for tokenizing, stopword removal and lemmatizing
import nltk
for resource_name in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name)

# Shared helpers used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a piece of review text for downstream analysis.

    Lowercases the text, removes every character that is neither a word
    character nor whitespace, tokenizes it, drops English stopwords and
    lemmatizes the remaining tokens.

    Args:
        text: The raw text. Non-string values (e.g. None or NaN coming from
            a missing title) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; '' for non-string input.
    """
    # Missing values are not strings and would crash on .lower()
    if not isinstance(text, str):
        return ''

    # Lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    # NOTE(review): the NLTK English stopword list presumably includes negations
    # such as "not"/"no"; confirm that dropping them is acceptable for the
    # polarity scoring performed later on the cleaned text.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean the title and the body of every review into cleaned_<column>
for source_column in ('title', 'text'):
    df['cleaned_' + source_column] = df[source_column].apply(preprocess_text)

# Title and body joined by a single space form the text used for analysis
df['combined_text'] = df['cleaned_title'] + ' ' + df['cleaned_text']

In [None]:
# Dropping NAN if any
# No subset is given, so a row is removed when ANY of its columns is null.
df.dropna(inplace=True)

# Data Analysis

## a) Sentiment Analysis

In [None]:
from textblob import TextBlob

# Sentiment labelling helper
def get_sentiment(text):
    """Label text as 'positive', 'negative' or 'neutral'.

    The label is the sign of the TextBlob polarity score: above zero is
    positive, below zero is negative, anything else is neutral.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'

# Label every review from its combined cleaned text, then preview a few rows
df['sentiment'] = df['combined_text'].map(get_sentiment)
print(df.head()[['sentiment', 'combined_text']].to_markdown())

In [None]:
import matplotlib.pyplot as plt

sentiment_counts = df['sentiment'].value_counts()

# value_counts() orders labels by frequency, so colours are looked up per label
# rather than by position; otherwise a label could end up with another
# sentiment's colour whenever the frequency ranking changes.
sentiment_colors = {'positive': 'lightgreen', 'negative': 'lightcoral', 'neutral': 'skyblue'}
pie_colors = [sentiment_colors.get(label, 'lightgray') for label in sentiment_counts.index]

# Create the pie chart
plt.figure(figsize=(5, 5))
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=0, colors=pie_colors)
plt.title('Sentiment Distribution', fontsize=16)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
import seaborn as sns

# Grouping data by sentiment and calculating the mean rating for each sentiment category
sentiment_rating = df.groupby('sentiment')['rating'].mean().reset_index()

# Plotting average rating for each sentiment
# seaborn 0.13 deprecates `palette` without `hue`; assigning the x variable to
# hue with legend=False keeps the same per-category colouring.
plt.figure(figsize=(8, 6))
sns.barplot(data=sentiment_rating, x='sentiment', y='rating', hue='sentiment', palette='viridis', legend=False)
plt.title('Average Rating by Sentiment', fontsize=16)
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Average Rating', fontsize=12)
plt.tight_layout()
plt.show()

# Analyzing rating distribution across sentiments
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='sentiment', y='rating', hue='sentiment', palette='viridis', legend=False)
plt.title('Rating Distribution Across Sentiments', fontsize=16)
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Rating', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Analyze and plot helpful votes by sentiment
helpful_votes_sentiment = df.groupby('sentiment')['helpful_vote'].mean().reset_index()

# seaborn 0.13 deprecates `palette` without `hue`; assigning the x variable to
# hue with legend=False keeps the same per-category colouring.
plt.figure(figsize=(8, 6))
sns.barplot(data=helpful_votes_sentiment, x='sentiment', y='helpful_vote', hue='sentiment', palette='viridis', legend=False)
plt.title('Average Helpful Votes by Sentiment', fontsize=16)
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Average Helpful Votes', fontsize=12)
plt.tight_layout()
plt.show()

# Yearly review counts per sentiment: one row per year, one column per label
df['year'] = df['timestamp'].dt.year
sentiment_year = df.groupby(['year', 'sentiment']).size().unstack(fill_value=0)

# One line per sentiment label, drawn on the axes returned by pandas
ax = sentiment_year.plot.line(figsize=(10, 5), colormap='viridis', marker='o')
ax.set_title('Sentiment Distribution Over Time (Years)', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Reviews', fontsize=12)
ax.legend(title='Sentiment')
plt.tight_layout()
plt.show()
