In [None]:
# Import libraries
import warnings
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
# Silence library warnings so they do not clutter the rendered output.
# NOTE(review): this hides *all* warnings, including deprecation notices;
# consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Set style: seaborn grid background plus a default size for every figure
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Confirmation message (the check-mark emoji was previously mis-encoded)
print("✅ Libraries imported successfully!")

## 1. Load Dataset

In [None]:
# Read the raw reviews CSV into the working DataFrame
raw_csv_path = '../data/raw/coursera_reviews.csv'
df = pd.read_csv(raw_csv_path)

# Report dimensions and column names, then preview the first rows
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
df.head()

## 2. Dataset Overview

In [None]:
# Basic info: per-column non-null counts and dtypes, as printed by DataFrame.info()
# (the clipboard emoji in the header was previously mis-encoded)
print("\n📋 Dataset Info:")
df.info()

In [None]:
# Statistical summary; describe() is the last expression so the notebook
# renders it as a table (the chart emoji in the header was previously mis-encoded)
print("\n📊 Statistical Summary:")
df.describe()

In [None]:
# Check for missing values: per-column count and share of all rows
print("\n❓ Missing Values:")
missing = df.isnull().sum()
# Guard the denominator so an empty DataFrame reports 0% instead of NaN
missing_pct = (missing / max(len(df), 1)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Percentage': missing_pct
})

# Show only the columns that actually have gaps; say so explicitly when
# there are none instead of printing an "Empty DataFrame" dump
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]
if columns_with_missing.empty:
    print("No missing values found.")
else:
    print(columns_with_missing)

## 3. Text Analysis

In [None]:
# Name of the column holding the review text.
# Adjust this to match your dataset (e.g. 'review' or 'text').
text_column = 'review'  # Change this to your actual text column name

# Fail early with an actionable message if the column name is wrong
if text_column not in df.columns:
    raise KeyError(
        f"Text column '{text_column}' not found; "
        f"available columns: {df.columns.tolist()}"
    )

# Calculate review lengths as whitespace-separated word counts.
# Missing reviews are counted as 0 words: a plain astype(str) would turn NaN
# into the literal string 'nan' and report it as a one-word review.
reviews = df[text_column]
word_counts = reviews.astype(str).str.split().str.len()
df['review_length'] = word_counts.where(reviews.notna(), 0)

print("\n📝 Review Length Statistics:")
print(f"Mean: {df['review_length'].mean():.2f} words")
print(f"Median: {df['review_length'].median():.2f} words")
print(f"Min: {df['review_length'].min()} words")
print(f"Max: {df['review_length'].max()} words")

In [None]:
# Plot review length distribution: histogram (left) and box plot (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Compute the mean once; it drives both the marker line and its legend label
mean_length = df['review_length'].mean()

# Histogram
ax1.hist(df['review_length'], bins=50, color='skyblue', edgecolor='black')
ax1.set_title('Review Length Distribution', fontsize=14, fontweight='bold')
ax1.set_xlabel('Number of Words')
ax1.set_ylabel('Frequency')
ax1.axvline(mean_length, color='red', linestyle='--',
            label=f'Mean: {mean_length:.1f}')
ax1.legend()

# Box plot
ax2.boxplot(df['review_length'])
ax2.set_title('Review Length Box Plot', fontsize=14, fontweight='bold')
ax2.set_ylabel('Number of Words')

plt.tight_layout()

# savefig does not create missing folders, so make sure the target exists
figure_path = Path('../reports/figures/review_length_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Sentiment Distribution (if available)

In [None]:
# Check if sentiment/label column exists ('sentiment' takes precedence)
# Adjust the candidate column names as needed
sentiment_col = next(
    (col for col in ('sentiment', 'label') if col in df.columns), None
)

if sentiment_col is None:
    print("\n⚠️ No sentiment column found. You may need to create labels.")
else:
    # Count once and reuse for the printed tables and both charts
    sentiment_counts = df[sentiment_col].value_counts()
    sentiment_pct = df[sentiment_col].value_counts(normalize=True) * 100

    if sentiment_counts.empty:
        # Every label is missing: there is nothing to tabulate or draw
        print(f"\n⚠️ Column '{sentiment_col}' has no non-missing labels to plot.")
    else:
        print("\n😊 Sentiment Distribution:")
        print(sentiment_counts)
        print("\nPercentage:")
        print(sentiment_pct)

        # Plot in label order rather than frequency order, so a class keeps
        # its colour no matter how common it is.
        # NOTE(review): assumes labels sort from negative to positive
        # (e.g. 'negative' < 'neutral' < 'positive', or ascending numeric
        # codes) - confirm against the actual label values.
        plot_counts = sentiment_counts.sort_index()

        # Red/green for two classes, red/grey/green for three; any other
        # class count gets one distinct colour per class.
        n_classes = len(plot_counts)
        if n_classes == 2:
            colors = ['#e74c3c', '#2ecc71']
        elif n_classes == 3:
            colors = ['#e74c3c', '#95a5a6', '#2ecc71']
        else:
            colors = list(sns.color_palette('viridis', n_classes).as_hex())

        # Plot distribution
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        # Bar chart
        plot_counts.plot(kind='bar', ax=ax1, color=colors)
        ax1.set_title('Sentiment Distribution', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Sentiment')
        ax1.set_ylabel('Count')
        ax1.tick_params(axis='x', rotation=0)

        # Pie chart (same order and colours as the bar chart)
        ax2.pie(plot_counts.values, labels=plot_counts.index,
                autopct='%1.1f%%', startangle=90, colors=colors)
        ax2.set_title('Sentiment Percentage', fontsize=14, fontweight='bold')

        plt.tight_layout()

        # savefig does not create missing folders, so make sure the target exists
        figure_path = Path('../reports/figures/sentiment_distribution.png')
        figure_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(figure_path, dpi=300, bbox_inches='tight')
        plt.show()

## 5. Word Cloud

In [None]:
# Generate word cloud from every non-missing review.
# dropna() keeps missing reviews from entering the corpus as the word 'nan'.
text = ' '.join(df[text_column].dropna().astype(str))

if not text.strip():
    # Nothing to draw; skip instead of letting WordCloud.generate raise
    print("No review text available; skipping word cloud.")
else:
    wordcloud = WordCloud(
        width=1200,
        height=600,
        background_color='white',
        colormap='viridis',
        max_words=100,
        random_state=42  # fixed seed so the layout is reproducible across runs
    ).generate(text)

    fig = plt.figure(figsize=(14, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud - All Reviews', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()

    # savefig does not create missing folders, so make sure the target exists
    figure_path = Path('../reports/figures/wordcloud_all.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

## 6. Most Common Words

In [None]:
# Get all words: lower-cased, whitespace-separated tokens from every
# non-missing review (dropna() keeps NaN from being counted as the word 'nan').
# No punctuation or stop-word handling is applied here.
all_words = ' '.join(df[text_column].dropna().astype(str)).lower().split()

# Count words
word_freq = Counter(all_words)
top_20 = word_freq.most_common(20)

# Create dataframe
top_words_df = pd.DataFrame(top_20, columns=['Word', 'Frequency'])

if top_words_df.empty:
    # No tokens at all: skip the chart instead of plotting an empty frame
    print("No words found in the review text; skipping top-words plot.")
else:
    # Plot
    fig = plt.figure(figsize=(12, 6))
    # Recent seaborn deprecates `palette` without `hue`; mapping hue to the
    # word itself keeps one colour per bar, and dodge=False keeps the bars
    # full width.
    ax = sns.barplot(data=top_words_df, x='Frequency', y='Word',
                     hue='Word', dodge=False, palette='viridis')
    # The hue legend would only repeat the y-axis labels, so drop it if drawn
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    plt.title('Top 20 Most Common Words', fontsize=14, fontweight='bold')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.tight_layout()

    # savefig does not create missing folders, so make sure the target exists
    figure_path = Path('../reports/figures/top_words.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("\n📝 Top 20 Words:")
    print(top_words_df.to_string(index=False))

## 7. Key Insights

**Summary** (TODO: replace the bracketed placeholders with the values produced by the cells above):
- Total reviews: [Number]
- Average review length: [X] words
- Sentiment distribution: [Breakdown]
- Most common topics: [List]

**Next Steps:**
1. Data preprocessing (cleaning, tokenization)
2. Feature extraction (TF-IDF)
3. Model training (Logistic Regression, Naive Bayes)
4. Model evaluation and optimization