In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Amazon product listing (CSV obtained from Kaggle). The path is
# relative, so the file must sit in the notebook's working directory.
df = pd.read_csv("amazon.csv")
# Preview the first rows only; a sample is enough to check that the load
# worked, and the overview cell reports the full shape and dtypes.
print(df.head())


In [None]:
# Clean data
def _to_float(column, *symbols):
    """Return ``column`` as floats after removing each literal symbol.

    A column that is already numeric is cast straight to float. This keeps
    the cell re-runnable: ``df`` is overwritten in place below, so on a second
    run the columns are floats and the ``.str`` accessor would raise.

    Parameters
    ----------
    column : pandas.Series
        Text (or already numeric) column to convert.
    *symbols : str
        Literal substrings to delete before parsing.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a float after stripping.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    for symbol in symbols:
        # regex=False: the symbols are literal characters, not patterns.
        column = column.str.replace(symbol, '', regex=False)
    return column.astype(float)


# Strip the rupee sign and the comma separators before parsing the prices.
df['discounted_price'] = _to_float(df['discounted_price'], '₹', ',')
df['actual_price'] = _to_float(df['actual_price'], '₹', ',')
# Strip the percent sign from the discount.
df['discount_percentage'] = _to_float(df['discount_percentage'], '%')
# Any rating that does not parse as a number becomes NaN.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
# Missing rating counts pass through as NaN; only the commas are removed.
df['rating_count'] = _to_float(df['rating_count'], ',')
# Keep only the rows that have no missing value in any column.
df_cleaned = df.dropna()

In [None]:
# Data overview: shape, column names, per-column missing-value counts and
# dtypes of `df`, printed one heading/value pair at a time.
overview = (
    ("Dataset Shape:", df.shape),
    ("\nColumn Names:\n", df.columns.tolist()),
    ("\nMissing Values:\n", df.isna().sum()),
    ("\nData Types:\n", df.dtypes),
)
for heading, value in overview:
    print(heading, value)


In [None]:
# Five most frequent values of `category` among the cleaned rows.
top_categories = df_cleaned['category'].value_counts().head(5)
print("\nTop 5 Product Categories:")
print(top_categories)


In [None]:
# Price range: show every row whose discounted price equals the column's
# maximum, then every row whose discounted price equals its minimum.
price_columns = ['product_name', 'discounted_price', 'actual_price']
discounted = df_cleaned['discounted_price']

print("\nMost Expensive Product:")
print(df_cleaned.loc[discounted == discounted.max(), price_columns])

print("\nCheapest Product:")
print(df_cleaned.loc[discounted == discounted.min(), price_columns])


In [None]:
# Discount analysis: mean discount, then the rows with the largest discounts.
average_discount = df_cleaned['discount_percentage'].mean()
print(f"\nAverage Discount Percentage: {average_discount:.2f}%")

# Sort the whole frame by discount (largest first) and keep the default
# five-row head of the two columns of interest.
by_discount = df_cleaned.sort_values(by='discount_percentage', ascending=False)
print("\nTop 5 Most Discounted Products:")
print(by_discount[['product_name', 'discount_percentage']].head())


In [None]:
# Rating statistics: summary of `rating`, then samples of the rows rated
# exactly 5 and of the rows at the minimum rating.
ratings = df_cleaned['rating']

print("\nRating Summary:")
print(ratings.describe())

print("\nProducts with Perfect 5-Star Rating:")
print(df_cleaned.loc[ratings == 5, ['product_name', 'rating', 'rating_count']].head())

print("\nProducts with Lowest Ratings:")
print(df_cleaned.loc[ratings == ratings.min(), ['product_name', 'rating']].head())


In [None]:
# Plot 1: Top 10 categories
# Horizontal bar chart of the ten most frequent categories in the cleaned data.
category_counts = df_cleaned['category'].value_counts().nlargest(10)
fig, ax = plt.subplots(figsize=(12, 6))
# Colour the bars through `hue` rather than a bare `palette`: seaborn 0.13
# deprecated passing `palette` without `hue` and emits a FutureWarning.
# dodge=False keeps one full-width bar per category on releases where hue
# would otherwise split each row into offset bars.
sns.barplot(
    x=category_counts.values,
    y=category_counts.index,
    hue=category_counts.index,
    palette="viridis",
    dodge=False,
    ax=ax,
)
# hue repeats the y axis, so any legend seaborn added carries no information.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Top 10 Product Categories by Frequency")
ax.set_xlabel("Number of Products")
ax.set_ylabel("Category")
fig.tight_layout()
plt.show()

In [None]:
# Plot 2: Price Distribution
# Histogram (40 bins) of discounted prices with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['discounted_price'], bins=40, kde=True, color='blue', ax=ax)
ax.set_title("Distribution of Discounted Prices")
ax.set_xlabel("Discounted Price (INR)")
ax.set_ylabel("Number of Products")
fig.tight_layout()
plt.show()

In [None]:
# Plot 3: Rating Distribution
# One bar per distinct rating value, counting the cleaned rows that carry it.
# NOTE(review): seaborn 0.13+ warns when `palette` is given without `hue`;
# confirm against the installed version before changing the call.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='rating', data=df_cleaned, palette='coolwarm', ax=ax)
ax.set_title("Product Rating Distribution")
ax.set_xlabel("Rating")
ax.set_ylabel("Number of Products")
fig.tight_layout()
plt.show()

In [None]:
# Plot 4: Discount vs Rating
# Scatter of discount percentage against rating; points are coloured by
# rating and drawn semi-transparent so overlapping points stay visible.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='discount_percentage',
    y='rating',
    data=df_cleaned,
    hue='rating',
    palette='viridis',
    alpha=0.6,
    ax=ax,
)
ax.set_title("Discount Percentage vs. Rating")
ax.set_xlabel("Discount Percentage")
ax.set_ylabel("Rating")
fig.tight_layout()
plt.show()

In [None]:
# Plot 5: Correlation Heatmap
# Pairwise correlations between the five numeric columns, annotated to two
# decimal places.
numeric_columns = ['discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count']
numerical_data = df_cleaned[numeric_columns]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(numerical_data.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
fig.tight_layout()
plt.show()


