# Data Exploration and Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# Global plotting defaults: ggplot style sheet, seaborn "Set2" palette,
# and a default figure size / font size applied through rcParams.
plt.style.use('ggplot')
sns.set_palette("Set2")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

def _print_overview(header, frame, **describe_kwargs):
    """Print a textual overview of *frame*.

    Emits, in order: ``header`` (verbatim), the shape, the first five rows,
    the column dtypes, per-column null counts and ``frame.describe()``.
    Any ``describe_kwargs`` are forwarded unchanged to ``describe``.
    """
    print(header)
    print(f"Shape: {frame.shape}")
    print("\nFirst 5 rows:")
    print(frame.head())
    print("\nData types:")
    print(frame.dtypes)
    print("\nMissing values:")
    print(frame.isnull().sum())
    print("\nSummary statistics:")
    print(frame.describe(**describe_kwargs))


# Read both CSV files from the current working directory.
retail_df = pd.read_csv('retail_sales_data.csv')
reviews_df = pd.read_csv('product_reviews.csv')

# Overview of each dataset; the reviews summary also covers non-numeric columns.
_print_overview("=== Retail Sales Data Overview ===", retail_df)
_print_overview("\n\n=== Product Reviews Data Overview ===", reviews_df, include='all')

# Parse the 'date' column of both datasets into datetime values (in place).
for _frame in (retail_df, reviews_df):
    _frame['date'] = pd.to_datetime(_frame['date'])

# Derive calendar features for the retail data from the parsed dates.
_retail_dt = retail_df['date'].dt
retail_df['year'] = _retail_dt.year
retail_df['month'] = _retail_dt.month
retail_df['day_of_week'] = _retail_dt.dayofweek
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend;
# the flag is stored as an integer 0/1.
retail_df['is_weekend'] = retail_df['day_of_week'].isin([5, 6]).astype(int)

# Retail data: basic charts
print("\n\n=== Retail Sales Analysis ===")

# 1. Summed total_sales per category, largest first, as a bar chart.
category_sales = (
    retail_df.groupby('category')['total_sales']
    .sum()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_sales.index, y=category_sales.values, ax=ax)
ax.set_title('Total Sales by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Total Sales')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('total_sales_by_category.png')
plt.close(fig)

# 2. Online vs In-store sales by category (stacked bars)
category_online = retail_df.groupby('category')['online_sales'].sum()
category_instore = retail_df.groupby('category')['in_store_sales'].sum()
category_data = pd.DataFrame({
    'Online Sales': category_online,
    'In-Store Sales': category_instore
})

# DataFrame.plot() opens a brand-new figure unless it is handed an Axes.
# Calling plt.figure() first therefore left an empty, never-closed figure
# behind and the requested 10x6 size was not applied to the saved chart.
# Create the figure once and draw into its Axes explicitly.
fig, ax = plt.subplots(figsize=(10, 6))
category_data.plot(kind='bar', stacked=True, rot=45, ax=ax)
ax.set_title('Online vs In-Store Sales by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Sales')
fig.tight_layout()
fig.savefig('online_vs_instore_sales.png')
plt.close(fig)

# 3. Mean total_sales per weather condition, highest first.
weather_sales = (
    retail_df.groupby('weather')['total_sales']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=weather_sales.index, y=weather_sales.values, ax=ax)
ax.set_title('Average Sales by Weather Condition')
ax.set_xlabel('Weather')
ax.set_ylabel('Average Sales')
fig.tight_layout()
fig.savefig('sales_by_weather.png')
plt.close(fig)

# 4. Mean total_sales per promotion type, highest first.
promo_sales = (
    retail_df.groupby('promotion')['total_sales']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=promo_sales.index, y=promo_sales.values, ax=ax)
ax.set_title('Average Sales by Promotion Type')
ax.set_xlabel('Promotion')
ax.set_ylabel('Average Sales')
fig.tight_layout()
fig.savefig('sales_by_promotion.png')
plt.close(fig)

# 5. Mean total_sales per dominant age group, highest first.
age_sales = (
    retail_df.groupby('dominant_age_group')['total_sales']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=age_sales.index, y=age_sales.values, ax=ax)
ax.set_title('Average Sales by Dominant Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Average Sales')
fig.tight_layout()
fig.savefig('sales_by_age_group.png')
plt.close(fig)

# 6. Monthly sales trend
# One row per (year, month) with the summed total_sales for that month.
monthly_sales = retail_df.groupby(['year', 'month'])['total_sales'].sum().reset_index()

# Label each point with its real calendar month instead of a bare row index.
# The previous axis title hard-coded "2023" even though the data is grouped
# by year as well, so the chart was mislabelled for any other/multi-year range.
month_labels = [
    f"{int(year)}-{int(month):02d}"
    for year, month in zip(monthly_sales['year'], monthly_sales['month'])
]

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=monthly_sales.index, y='total_sales', data=monthly_sales, marker='o', ax=ax)
ax.set_xticks(list(monthly_sales.index))
ax.set_xticklabels(month_labels, rotation=45, ha='right')
ax.set_title('Monthly Sales Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Total Sales')
fig.tight_layout()
fig.savefig('monthly_sales_trend.png')
plt.close(fig)

# 7. Mean return_rate per category, highest first.
return_rate = (
    retail_df.groupby('category')['return_rate']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=return_rate.index, y=return_rate.values, ax=ax)
ax.set_title('Average Return Rate by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Return Rate')
fig.tight_layout()
fig.savefig('return_rate_by_category.png')
plt.close(fig)

# 8. Store performance comparison
# Summed total_sales per store, best-selling store first.
store_sales = retail_df.groupby('store_id')['total_sales'].sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 6))
# Pass the bar order explicitly: when `order` is omitted seaborn sorts numeric
# category levels by value, which would discard the descending-sales ordering
# if store_id is numeric (dtype not visible here). With string ids this is a no-op.
sns.barplot(
    x=store_sales.index,
    y=store_sales.values,
    order=store_sales.index.tolist(),
    ax=ax,
)
ax.set_title('Total Sales by Store')
ax.set_xlabel('Store')
ax.set_ylabel('Total Sales')
fig.tight_layout()
fig.savefig('sales_by_store.png')
plt.close(fig)

# Product reviews: basic charts
print("\n\n=== Product Reviews Analysis ===")

# 1. Number of reviews per rating value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='rating', data=reviews_df, ax=ax)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig('rating_distribution.png')
plt.close(fig)

# 2. Share of each sentiment label, drawn as a pie chart.
sentiment_counts = reviews_df['sentiment'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Sentiment Distribution')
# Equal axis scaling so the pie is drawn as a circle.
ax.axis('equal')
fig.tight_layout()
fig.savefig('sentiment_distribution.png')
plt.close(fig)

# 3. Mean rating per product category, highest first.
category_rating = (
    reviews_df.groupby('category')['rating']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_rating.index, y=category_rating.values, ax=ax)
ax.set_title('Average Rating by Product Category')
ax.set_xlabel('Category')
ax.set_ylabel('Average Rating')
fig.tight_layout()
fig.savefig('rating_by_category.png')
plt.close(fig)

# 4. The ten most frequent values of 'feature_mentioned', as horizontal bars.
feature_counts = reviews_df['feature_mentioned'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=feature_counts.values, y=feature_counts.index, ax=ax)
ax.set_title('Top 10 Mentioned Features')
ax.set_xlabel('Count')
ax.set_ylabel('Feature')
fig.tight_layout()
fig.savefig('top_features.png')
plt.close(fig)

# 5. The ten most frequent values of 'attribute_mentioned', as horizontal bars.
attribute_counts = reviews_df['attribute_mentioned'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=attribute_counts.values, y=attribute_counts.index, ax=ax)
ax.set_title('Top 10 Mentioned Attributes')
ax.set_xlabel('Count')
ax.set_ylabel('Attribute')
fig.tight_layout()
fig.savefig('top_attributes.png')
plt.close(fig)

# 6. Sentiment by product category
# Row-normalised cross-tab: each category's sentiment shares sum to 1.
sentiment_by_category = pd.crosstab(reviews_df['category'], reviews_df['sentiment'], normalize='index')

# DataFrame.plot() creates its own figure when no Axes is supplied, so the
# earlier plt.figure(figsize=(10, 6)) call produced an empty figure that was
# never closed and the size was not applied. Draw into an explicit Axes instead.
fig, ax = plt.subplots(figsize=(10, 6))
sentiment_by_category.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Sentiment Distribution by Product Category')
ax.set_xlabel('Category')
ax.set_ylabel('Proportion')
ax.legend(title='Sentiment')
fig.tight_layout()
fig.savefig('sentiment_by_category.png')
plt.close(fig)

# 7. Mean rating per calendar month.
# Adds a monthly Period column ('month_year') to reviews_df and groups on it.
reviews_df['month_year'] = reviews_df['date'].dt.to_period('M')
rating_trend = reviews_df.groupby('month_year')['rating'].mean()

fig, ax = plt.subplots(figsize=(14, 6))
rating_trend.plot.line(marker='o', ax=ax)
ax.set_title('Average Rating Trend Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Average Rating')
fig.tight_layout()
fig.savefig('rating_trend.png')
plt.close(fig)

# Pairwise correlations between selected retail columns, drawn as an annotated heatmap.
print("\n\n=== Correlation Analysis for Retail Data ===")
numeric_cols = [
    'num_customers',
    'total_sales',
    'online_sales',
    'in_store_sales',
    'avg_transaction',
    'return_rate',
    'is_weekend',
]
corr = retail_df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix for Retail Data')
fig.tight_layout()
fig.savefig('retail_correlation.png')
plt.close(fig)

# Headline figures for both datasets, printed as two numbered lists.
print("\n\n=== Key Insights ===")

print("Retail Sales Insights:")
print(f"1. Total number of records: {len(retail_df)}")
_retail_dates = retail_df['date']
print(f"2. Date range: {_retail_dates.min()} to {_retail_dates.max()}")
print(f"3. Number of stores: {retail_df['store_id'].nunique()}")
print(f"4. Number of categories: {retail_df['category'].nunique()}")
_mean_sales = retail_df['total_sales'].mean()
print(f"5. Average sales: ${_mean_sales:.2f}")
# Overall online share: summed online sales over summed total sales, in percent.
_online_share_pct = retail_df['online_sales'].sum() / retail_df['total_sales'].sum() * 100
print(f"6. Average online sales proportion: {_online_share_pct:.2f}%")
# mode() can return several values on ties; [0] takes the first one.
print(f"7. Most common weather condition: {retail_df['weather'].mode()[0]}")
print(f"8. Most common promotion type: {retail_df['promotion'].mode()[0]}")
print(f"9. Most common dominant age group: {retail_df['dominant_age_group'].mode()[0]}")

print("\nProduct Reviews Insights:")
print(f"1. Total number of reviews: {len(reviews_df)}")
print(f"2. Number of unique products: {reviews_df['product'].nunique()}")
print(f"3. Number of categories: {reviews_df['category'].nunique()}")
_mean_rating = reviews_df['rating'].mean()
print(f"4. Average rating: {_mean_rating:.2f}")
print(f"5. Most common sentiment: {reviews_df['sentiment'].mode()[0]}")
print(f"6. Most mentioned feature: {reviews_df['feature_mentioned'].mode()[0]}")
print(f"7. Most mentioned attribute: {reviews_df['attribute_mentioned'].mode()[0]}")
_review_dates = reviews_df['date']
print(f"8. Date range: {_review_dates.min()} to {_review_dates.max()}")

# Final cell expression: previews of both frames for further analysis.
retail_df.head(), reviews_df.head()