# In-Depth Clothing Brand Sentiment Analysis
## Comprehensive Exploratory Data Analysis

**Objective:** Deep dive into clothing brand customer reviews to understand:
- Product category performance
- Sentiment trends over time
- Key themes in positive vs negative reviews
- Customer satisfaction patterns
- Detailed text analysis and insights

---

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn grid theme first, then the
# Matplotlib size/font overrides applied in a single update on top of it.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 7),
    'font.size': 11,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

print("‚úì Libraries imported successfully")

## 1. Load Clothing Reviews Dataset

In [None]:
# Read the pre-split training and test review files
train_df = pd.read_csv('../data/splits/clothing_reviews_train.csv')
test_df = pd.read_csv('../data/splits/clothing_reviews_test.csv')

# Stack both splits into one frame (with a fresh 0..n-1 index) so the
# exploratory analysis below covers every review
df = pd.concat([train_df, test_df], ignore_index=True)

# Row counts are reused for the absolute numbers and the split percentages
n_train, n_test, n_total = len(train_df), len(test_df), len(df)

print(f"üìä Dataset Overview:")
print(f"   Total Reviews: {n_total:,}")
print(f"   Training Set: {n_train:,} ({n_train/n_total*100:.1f}%)")
print(f"   Test Set: {n_test:,} ({n_test/n_total*100:.1f}%)")
print(f"\n   Columns: {df.columns.tolist()}")

# min/max are taken on 'created_at' exactly as loaded from the CSV
# (the column is only parsed to datetimes later in the notebook)
created_at = df['created_at']
print(f"\n   Date Range: {created_at.min()} to {created_at.max()}")

In [None]:
# Preview the first five rows; as the cell's last expression the frame
# is rendered by the notebook rather than printed.
print("\nüìù Sample Customer Reviews:\n")
df.head()

In [None]:
# Data quality check: column dtypes / non-null counts, missing values per
# column, and summary statistics for the numeric columns.
print("üîç Data Quality Check:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nüìä Missing Values:")
print(df.isna().sum())
print("\nüìä Summary Statistics:")
print(df.describe())

## 2. Product Category Analysis

In [None]:
# Number of reviews per product category (the 'topic' column), most
# frequent category first
category_counts = df['topic'].value_counts()
total_reviews = len(df)

print("üè∑Ô∏è  Product Categories Distribution:\n")
for category, n_reviews in category_counts.items():
    share = (n_reviews / total_reviews) * 100
    print(f"   {category:20s}: {n_reviews:5,} reviews ({share:5.2f}%)")

print(f"\n   Total Categories: {len(category_counts)}")

In [None]:
# Visualize top product categories: a ranked horizontal bar chart (left) and
# a share-of-total pie chart (right). The figure is saved under ../reports/.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart of the 15 most-reviewed categories (fewer if the data has fewer)
top_15 = category_counts.head(15)
colors = plt.cm.Set3(np.linspace(0, 1, len(top_15)))

axes[0].barh(range(len(top_15)), top_15.values, color=colors, edgecolor='black', linewidth=1.2)
axes[0].set_yticks(range(len(top_15)))
axes[0].set_yticklabels(top_15.index)
axes[0].set_xlabel('Number of Reviews', fontweight='bold')
axes[0].set_title('Top 15 Product Categories by Review Volume', fontweight='bold', fontsize=14)
axes[0].invert_yaxis()  # largest category at the top
axes[0].grid(axis='x', alpha=0.3)

# Add value labels just past the end of each bar. The gap is relative to the
# longest bar so it stays sensible for both small and large datasets.
label_offset = top_15.max() * 0.01
for i, v in enumerate(top_15.values):
    axes[0].text(v + label_offset, i, f'{v:,}', va='center', fontweight='bold')

# Pie chart: top 10 categories plus everything else lumped into 'Others'
top_10 = category_counts.head(10)
# .iloc makes the positional slice explicit regardless of the index dtype
other_count = category_counts.iloc[10:].sum()

pie_data = list(top_10.values)
pie_labels = list(top_10.index)
# Only add the 'Others' wedge when there is something in it; a zero-sized
# wedge would still draw an overlapping label and a '0.0%' annotation.
if other_count > 0:
    pie_data.append(other_count)
    pie_labels.append('Others')

wedges, texts, autotexts = axes[1].pie(pie_data, labels=pie_labels, autopct='%1.1f%%',
                                        startangle=90,
                                        colors=plt.cm.Set3(np.linspace(0, 1, len(pie_data))))
axes[1].set_title('Product Category Distribution (Top 10 + Others)', fontweight='bold', fontsize=14)

# Make percentage text bold
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(9)

plt.tight_layout()
plt.savefig('../reports/clothing_category_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Sentiment Distribution Analysis

In [None]:
# Overall sentiment distribution, always reported in the fixed order
# negative -> neutral -> positive.
# fill_value=0 keeps the counts integral when a class is absent from the
# data; a plain reindex would insert NaN and turn every count into a float.
sentiment_counts = df['label'].value_counts().reindex(
    ['negative', 'neutral', 'positive'], fill_value=0)

print("üí≠ Overall Sentiment Distribution:\n")
for sent, count in sentiment_counts.items():
    pct = (count / len(df)) * 100
    print(f"   {sent.capitalize():12s}: {count:5,} reviews ({pct:5.2f}%)")

# The positive:negative ratio is undefined when there are no negative reviews
negative_total = sentiment_counts['negative']
if negative_total > 0:
    imbalance = f"{sentiment_counts['positive'] / negative_total:.2f}:1"
else:
    imbalance = "n/a"
print(f"\n   ‚ö†Ô∏è  Class Imbalance Ratio: {imbalance} (Positive:Negative)")

In [None]:
# Overall sentiment shown two ways: absolute counts as bars (left panel) and
# proportions as a donut (right panel). Saved under ../reports/.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, donut_ax = axes

# red / orange / green, matching the negative / neutral / positive order
colors = ['#ef5350', '#ffa726', '#66bb6a']

# Left panel: bar chart of review counts
bar_ax.bar(sentiment_counts.index, sentiment_counts.values, color=colors,
           edgecolor='black', linewidth=1.5, width=0.6)
bar_ax.set_xlabel('Sentiment', fontweight='bold')
bar_ax.set_ylabel('Number of Reviews', fontweight='bold')
bar_ax.set_title('Clothing Reviews Sentiment Distribution', fontweight='bold', fontsize=14)
bar_ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its count and its share of all reviews
total_reviews = len(df)
for position, (sentiment_name, n_reviews) in enumerate(sentiment_counts.items()):
    share = (n_reviews / total_reviews) * 100
    bar_ax.text(position, n_reviews + 200, f'{n_reviews:,}\n({share:.1f}%)',
                ha='center', va='bottom', fontweight='bold', fontsize=11)

# Right panel: a pie with wedge width 0.5 renders as a donut
donut_wedge_style = dict(width=0.5, edgecolor='black', linewidth=1.5)
wedges, texts, autotexts = donut_ax.pie(sentiment_counts.values, labels=sentiment_counts.index,
                                        autopct='%1.1f%%', startangle=90, colors=colors,
                                        wedgeprops=donut_wedge_style)

donut_ax.set_title('Sentiment Proportion', fontweight='bold', fontsize=14)

# Style the percentage annotations inside the wedges
for pct_label in autotexts:
    pct_label.set_color('white')
    pct_label.set_fontweight('bold')
    pct_label.set_fontsize(12)

plt.tight_layout()
plt.savefig('../reports/clothing_sentiment_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Sentiment by Product Category

In [None]:
# Row-normalised cross-tabulation: for each product category, the percentage
# of its reviews that fall into each sentiment class.
sentiment_columns = ['negative', 'neutral', 'positive']
category_sentiment = (
    pd.crosstab(df['topic'], df['label'], normalize='index')
    .mul(100)
    .reindex(columns=sentiment_columns)
)

# Rank categories from the highest to the lowest share of positive reviews
category_sentiment_sorted = category_sentiment.sort_values(by='positive', ascending=False)

print("üèÜ Product Categories by Positive Sentiment (Top 15):\n")
print(category_sentiment_sorted.head(15).round(2))

In [None]:
# Grouped bar chart: sentiment percentages for the 12 most-reviewed categories
top_categories = category_counts.head(12).index
category_sent_top = category_sentiment.loc[top_categories]

fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(category_sent_top))
width = 0.25

# One bar series per sentiment, shifted left / centred / right of each tick:
# (horizontal shift, data column, legend label, fill colour)
series_specs = [
    (-width, 'negative', 'Negative', '#ef5350'),
    (0, 'neutral', 'Neutral', '#ffa726'),
    (width, 'positive', 'Positive', '#66bb6a'),
]
bars1, bars2, bars3 = [
    ax.bar(x + shift, category_sent_top[column], width,
           label=legend_label, color=fill, edgecolor='black', linewidth=1)
    for shift, column, legend_label, fill in series_specs
]

ax.set_title('Sentiment Distribution Across Top 12 Product Categories', fontweight='bold', fontsize=14)
ax.set_xlabel('Product Category', fontweight='bold')
ax.set_ylabel('Percentage of Reviews (%)', fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(category_sent_top.index, rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
ax.legend(loc='upper right')

plt.tight_layout()
plt.savefig('../reports/clothing_sentiment_by_category.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Temporal Analysis - Sentiment Trends Over Time

In [None]:
# Parse 'created_at' into datetimes and derive the calendar columns used by
# the temporal analysis (monthly period, year, month number).
df['date'] = pd.to_datetime(df['created_at'])
df['year_month'] = df['date'].dt.to_period('M')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Compute the covered span once and reuse it for every line of the report
first_review, last_review = df['date'].min(), df['date'].max()
span_days = (last_review - first_review).days

print("üìÖ Temporal Coverage:")
print(f"   Date Range: {first_review.date()} to {last_review.date()}")
print(f"   Duration: {span_days} days")
# When every review falls within a single day the span is 0 days; treat it
# as one day so the average does not raise ZeroDivisionError.
print(f"   Average Reviews per Day: {len(df) / max(span_days, 1):.1f}")

In [None]:
# Monthly sentiment trends: review volume per month (top panel) and the
# percentage of each sentiment per month (bottom panel). Saved under ../reports/.
monthly_sentiment = df.groupby(['year_month', 'label']).size().unstack(fill_value=0)

# Make sure each plotted sentiment has a column even if it never occurs in
# the data; otherwise the column lookups below would raise KeyError.
for required_label in ('negative', 'neutral', 'positive'):
    if required_label not in monthly_sentiment.columns:
        monthly_sentiment[required_label] = 0

monthly_sentiment_pct = monthly_sentiment.div(monthly_sentiment.sum(axis=1), axis=0) * 100

fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Months are drawn at consecutive integer positions and labelled with their
# period strings; both panels share the same positions and labels.
monthly_total = monthly_sentiment.sum(axis=1)
month_positions = range(len(monthly_total))
month_labels = [str(m) for m in monthly_total.index]

# Review volume over time
axes[0].plot(month_positions, monthly_total.values, marker='o', linewidth=2,
            color='#2196f3', markersize=6)
axes[0].fill_between(month_positions, monthly_total.values, alpha=0.3, color='#2196f3')
axes[0].set_ylabel('Number of Reviews', fontweight='bold')
axes[0].set_title('Monthly Review Volume Over Time', fontweight='bold', fontsize=14)
axes[0].grid(alpha=0.3)
axes[0].set_xticks(month_positions)
axes[0].set_xticklabels(month_labels, rotation=45, ha='right')

# Sentiment percentage over time: (data column, legend label, colour, marker)
trend_styles = [
    ('positive', 'Positive', '#66bb6a', 'o'),
    ('neutral', 'Neutral', '#ffa726', 's'),
    ('negative', 'Negative', '#ef5350', '^'),
]
for column, legend_label, line_color, marker in trend_styles:
    axes[1].plot(month_positions, monthly_sentiment_pct[column].values,
                marker=marker, linewidth=2.5, label=legend_label,
                color=line_color, markersize=7)

axes[1].set_xlabel('Month', fontweight='bold')
axes[1].set_ylabel('Percentage (%)', fontweight='bold')
axes[1].set_title('Sentiment Trends Over Time (Monthly)', fontweight='bold', fontsize=14)
axes[1].legend(loc='best', fontsize=11)
axes[1].grid(alpha=0.3)
axes[1].set_xticks(month_positions)
axes[1].set_xticklabels(month_labels, rotation=45, ha='right')

plt.tight_layout()
plt.savefig('../reports/clothing_temporal_trends.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Review Text Analysis

In [None]:
# Derive two size features per review: character count and
# whitespace-separated word count.
review_text = df['text']
df['text_length'] = review_text.str.len()
df['word_count'] = review_text.str.split().str.len()

# Descriptive statistics of both features within each sentiment class
length_features = ['text_length', 'word_count']
print("üìä Text Characteristics by Sentiment:\n")
text_stats = df.groupby('label')[length_features].describe()
print(text_stats)

In [None]:
# Visualize review size by sentiment on a 2x2 grid: overlaid histograms on
# the top row and box plots on the bottom row; the left column shows
# character length, the right column word count. Saved under ../reports/.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

sentiment_order = ['negative', 'neutral', 'positive']
sentiment_colors = ['#ef5350', '#ffa726', '#66bb6a']

# (feature column, axis label, histogram title, box-plot title) per grid column
panel_specs = [
    ('text_length', 'Text Length (characters)',
     'Review Length Distribution by Sentiment', 'Text Length Distribution (Box Plot)'),
    ('word_count', 'Word Count',
     'Word Count Distribution by Sentiment', 'Word Count Distribution (Box Plot)'),
]

box_plots = []
for col, (feature, axis_label, hist_title, box_title) in enumerate(panel_specs):
    # Reviews with missing text have NaN lengths; drop them so the box-plot
    # quartiles are computed on real values only.
    samples = [df.loc[df['label'] == s, feature].dropna() for s in sentiment_order]

    # Top row: overlaid histograms, one per sentiment
    hist_ax = axes[0, col]
    for s, color, values in zip(sentiment_order, sentiment_colors, samples):
        hist_ax.hist(values, bins=50, alpha=0.6, label=s.capitalize(),
                     color=color, edgecolor='black', linewidth=0.5)
    hist_ax.set_xlabel(axis_label, fontweight='bold')
    hist_ax.set_ylabel('Frequency', fontweight='bold')
    hist_ax.set_title(hist_title, fontweight='bold', fontsize=13)
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

    # Bottom row: box plots (outliers hidden). Tick labels are set on the
    # axis afterwards instead of through boxplot's `labels=` keyword, which
    # is deprecated in recent Matplotlib releases.
    box_ax = axes[1, col]
    bp = box_ax.boxplot(samples, patch_artist=True, showfliers=False)
    box_ax.set_xticklabels([s.capitalize() for s in sentiment_order])
    for patch, color in zip(bp['boxes'], sentiment_colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    box_ax.set_ylabel(axis_label, fontweight='bold')
    box_ax.set_title(box_title, fontweight='bold', fontsize=13)
    box_ax.grid(axis='y', alpha=0.3)
    box_plots.append(bp)

# Keep the original handles available: bp1 = text length, bp2 = word count
bp1, bp2 = box_plots

plt.tight_layout()
plt.savefig('../reports/clothing_text_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Word Clouds - Key Themes by Sentiment

In [None]:
# One word cloud per sentiment class, stacked vertically in a single figure
fig, axes = plt.subplots(3, 1, figsize=(16, 18))

# (label value, Matplotlib colormap, panel title) for each panel, top to bottom
sentiments = [
    ('positive', 'Greens', 'üéâ Positive Reviews'),
    ('neutral', 'Oranges', 'üòê Neutral Reviews'),
    ('negative', 'Reds', 'üòû Negative Reviews')
]

# Rendering options shared by all three clouds; only the colormap varies
cloud_options = dict(width=1400, height=400,
                     background_color='white',
                     max_words=100,
                     relative_scaling=0.5,
                     min_font_size=10)

for panel, (label, colormap, title) in zip(axes, sentiments):
    # Join every review of this sentiment into one document; missing
    # texts are replaced by empty strings first
    texts = df.loc[df['label'] == label, 'text']
    text = ' '.join(texts.fillna(''))

    wordcloud = WordCloud(colormap=colormap, **cloud_options).generate(text)

    panel.imshow(wordcloud, interpolation='bilinear')
    panel.set_title(f'{title} - Most Common Words', fontsize=16, fontweight='bold', pad=20)
    panel.axis('off')

plt.tight_layout()
plt.savefig('../reports/clothing_wordclouds_by_sentiment.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Top Categories - Detailed Breakdown

In [None]:
# Text report for the five most-reviewed categories: volume, sentiment mix
# and average review size (uses the text_length / word_count columns).
top_5_cats = category_counts.head(5).index

print("üîç Deep Dive: Top 5 Product Categories\n")
print("="*80)

for cat in top_5_cats:
    cat_df = df.loc[df['topic'] == cat]
    # Share of each sentiment within this category, as percentages
    sent_dist = cat_df['label'].value_counts(normalize=True).mul(100)

    print(f"\nüì¶ {cat.upper()}")
    print(f"   Total Reviews: {len(cat_df):,}")
    print(f"   Sentiment Breakdown:")
    for sent in ['positive', 'neutral', 'negative']:
        # A sentiment that never occurs in this category is simply omitted
        if sent not in sent_dist.index:
            continue
        print(f"      {sent.capitalize():10s}: {sent_dist[sent]:5.1f}%")

    avg_chars = cat_df['text_length'].mean()
    avg_words = cat_df['word_count'].mean()
    print(f"   Avg Review Length: {avg_chars:.1f} characters")
    print(f"   Avg Word Count: {avg_words:.1f} words")
    print("-" * 80)

## 9. Heatmap - Category vs Sentiment

In [None]:
# Heatmap of sentiment percentages (columns) for the 15 most-reviewed
# categories (rows). Saved under ../reports/.
top_15_cats = category_counts.head(15).index
heatmap_data = category_sentiment.loc[top_15_cats]

fig, ax = plt.subplots(figsize=(10, 10))

# Cell annotations show one decimal; the colour scale is pinned to 0-100 %
heatmap_style = {
    'annot': True,
    'fmt': '.1f',
    'cmap': 'RdYlGn',
    'linewidths': 1,
    'linecolor': 'black',
    'cbar_kws': {'label': 'Percentage (%)'},
    'vmin': 0,
    'vmax': 100,
}
sns.heatmap(heatmap_data, ax=ax, **heatmap_style)

ax.set_title('Sentiment Distribution Heatmap - Top 15 Categories',
            fontweight='bold', fontsize=14, pad=20)
ax.set_xlabel('Sentiment', fontweight='bold', fontsize=12)
ax.set_ylabel('Product Category', fontweight='bold', fontsize=12)
# Keep the category names horizontal so they stay readable
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

plt.tight_layout()
plt.savefig('../reports/clothing_sentiment_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Summary Insights

In [None]:
# Print a five-point summary built from the objects computed in earlier
# cells (sentiment_counts, category_sentiment, and the text_length /
# word_count / year_month columns of df).
print("\n" + "="*80)
print("üìä KEY INSIGHTS - CLOTHING BRAND SENTIMENT ANALYSIS")
print("="*80)

# 1. Overall sentiment
# NOTE(review): the "High customer satisfaction" wording is printed
# regardless of the value of pos_pct.
pos_pct = (sentiment_counts['positive'] / len(df)) * 100
print(f"\n1Ô∏è‚É£  OVERALL SENTIMENT:")
print(f"   {pos_pct:.1f}% of reviews are POSITIVE - High customer satisfaction")
print(f"   Imbalance ratio: {sentiment_counts['positive'] / sentiment_counts['negative']:.1f}:1")

# 2. Top performing category: highest share of positive reviews
best_cat = category_sentiment.sort_values('positive', ascending=False).head(1)
print(f"\n2Ô∏è‚É£  BEST PERFORMING CATEGORY:")
print(f"   '{best_cat.index[0]}' - {best_cat['positive'].values[0]:.1f}% positive")

# 3. Category needing attention: highest share of negative reviews
worst_cat = category_sentiment.sort_values('negative', ascending=False).head(1)
print(f"\n3Ô∏è‚É£  NEEDS ATTENTION:")
print(f"   '{worst_cat.index[0]}' - {worst_cat['negative'].values[0]:.1f}% negative")

# 4. Review characteristics
negative_avg_words = df.loc[df['label'] == 'negative', 'word_count'].mean()
positive_avg_words = df.loc[df['label'] == 'positive', 'word_count'].mean()
word_ratio = negative_avg_words / positive_avg_words
print(f"\n4Ô∏è‚É£  REVIEW CHARACTERISTICS:")
print(f"   Average review length: {df['text_length'].mean():.0f} characters")
print(f"   Average word count: {df['word_count'].mean():.0f} words")
# Report the ratio neutrally: the previous wording claimed negative reviews
# are "longer" even when the ratio is below 1.
print(f"   Negative reviews average {word_ratio:.2f}x the word count of positive reviews")

# 5. Temporal insights for the most recent month present in the data
latest_period = df['year_month'].max()
recent_month = df[df['year_month'] == latest_period]
recent_pos_pct = (recent_month['label'] == 'positive').mean() * 100
print(f"\n5Ô∏è‚É£  RECENT TRENDS:")
print(f"   Latest month ({latest_period}): {recent_pos_pct:.1f}% positive")
print(f"   Total reviews in latest month: {len(recent_month):,}")

print("\n" + "="*80)