In [None]:
# Project: Book Review Sentiment Analysis
# Phase 1: Data Acquisition and Exploration

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 2. Load the Dataset
# The path is relative to the working directory the notebook is run from.
df = pd.read_csv('data/raw/all_kindle_reviews.csv')

# 3. Initial Data Exploration
print("--- Initial Data Exploration ---")

# 3.1. Check the first few rows to see what the data looks like
print("\nFirst few rows of the dataset:")
print(df.head())

# 3.2. Summarise the columns, dtypes and non-null counts
print("\nGet a summary of the dataset:")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# 3.3. Check for duplicates (rows that repeat an earlier row in every column)
print("\nNumber of duplicate reviews:")
print(df.duplicated().sum())

# 3.4. Check for missing review text
print("\nMissing values in 'reviewText' column:")
print(df['reviewText'].isnull().sum())

# 3.5. Check the rating distribution (counts per distinct rating value)
print("\nDistribution of Ratings:")
print(df['rating'].value_counts())

In [None]:
print("--- Exploratory Data Analysis (EDA) ---")

# 4.1. Create the 'images' directory if it does not exist yet
# exist_ok=True makes this one idempotent call: no separate existence check,
# and no error if the directory is already there when the cell is re-run.
os.makedirs('images', exist_ok=True)

# 4.2. Visualise the rating distribution
# One colour per rating level.
# NOTE(review): assumes 'rating' has five distinct values -- confirm against the data.
custom_palette = ['#e6194b', '#f58231', '#ffe119', '#3cb44b', '#4363d8']

plt.figure(figsize=(8, 5))
# hue='rating' is what lets the palette colour each bar individually.
ax = sns.countplot(x='rating', data=df, hue='rating', palette=custom_palette)

# The hue legend only repeats the x-axis labels, so drop it. Use the public
# accessor and guard against None: if no legend was drawn there is nothing to
# remove, and calling .remove() on None would raise AttributeError.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.title('Distribution of Book Ratings')
plt.xlabel('Rating (Stars)')
plt.ylabel('Number of Reviews')
plt.savefig('images/ph1_rating_distribution.png')
# plt.show() # uncomment this line to display this visual while running the notebook