# IMDB電影評論數據探索

本notebook用於探索IMDB電影評論數據集，了解數據的基本特徵和分布。

In [None]:
import sys
import os
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from collections import Counter

from src.data_preprocessing import IMDBDataPreprocessor
from tensorflow.keras.datasets import imdb

# Global plot styling for the notebook.
# The seaborn style sheet is named 'seaborn-v0_8' on Matplotlib >= 3.6 and
# plain 'seaborn' on older releases; use whichever is installed and fall back
# to the default style instead of raising if neither exists.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# rcParams are set after the style sheet so the style cannot override them.
plt.rcParams['font.size'] = 12
plt.rcParams['figure.figsize'] = (10, 6)

# Every title and axis label in this notebook is Chinese. Put CJK-capable
# fonts first in the sans-serif fallback list so those glyphs are not drawn as
# empty boxes; fonts that are not installed are simply skipped.
cjk_fonts = [
    'Microsoft JhengHei',
    'PingFang TC',
    'Heiti TC',
    'Noto Sans CJK TC',
    'Noto Sans CJK JP',
    'WenQuanYi Zen Hei',
    'Arial Unicode MS',
]
plt.rcParams['font.sans-serif'] = cjk_fonts + list(plt.rcParams['font.sans-serif'])
# Many CJK fonts lack the Unicode minus sign; use the ASCII hyphen instead.
plt.rcParams['axes.unicode_minus'] = False

## 1. 載入數據

In [None]:
# Load the IMDB dataset (vocabulary capped with num_words=10000) and report
# how many reviews each split contains.
print("載入IMDB數據集...")
train_split, test_split = imdb.load_data(num_words=10000)
X_train, y_train = train_split
X_test, y_test = test_split

print(
    f"訓練集大小: {len(X_train)}",
    f"測試集大小: {len(X_test)}",
    f"總樣本數: {len(X_train) + len(X_test)}",
    sep="\n",
)

## 2. 標籤分布分析

In [None]:
# Plot the label distribution of the training and test splits side by side.
train_labels = pd.Series(y_train)
test_labels = pd.Series(y_test)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# value_counts() orders its result by frequency, not by label value, so the
# bar order depends on the data. Sorting by label pins 0 (negative) to the
# first bar (red) and 1 (positive) to the second bar (green), which is what
# the x-axis label promises.
for ax, split_labels, title in (
    (ax1, train_labels, '訓練集標籤分布'),
    (ax2, test_labels, '測試集標籤分布'),
):
    split_labels.value_counts().sort_index().plot(
        kind='bar', ax=ax, color=['red', 'green']
    )
    ax.set_title(title)
    ax.set_xlabel('標籤 (0=負面, 1=正面)')
    ax.set_ylabel('數量')
    ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

# Labels are 0/1, so the mean is the fraction of positive reviews.
print(f"訓練集正面評論比例: {np.mean(y_train):.2%}")
print(f"測試集正面評論比例: {np.mean(y_test):.2%}")

## 3. 評論長度分析

In [None]:
# Compare review lengths (number of encoded tokens) across the two splits.
train_lengths = [len(review) for review in X_train]
test_lengths = [len(review) for review in X_test]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Both histograms share one set of bin edges. With bins=50 computed per call,
# each split gets its own bin width and the overlaid bar heights are not
# comparable.
shared_bins = np.histogram_bin_edges(train_lengths + test_lengths, bins=50)
ax1.hist(train_lengths, bins=shared_bins, alpha=0.7, color='blue', label='訓練集')
ax1.hist(test_lengths, bins=shared_bins, alpha=0.7, color='red', label='測試集')
ax1.set_title('評論長度分布')
ax1.set_xlabel('評論長度 (詞數)')
ax1.set_ylabel('頻率')
ax1.legend()

# Label the boxes through the axis ticks rather than boxplot(labels=...):
# that keyword is deprecated since Matplotlib 3.9 (renamed tick_labels), while
# explicit tick labels work on every version. Boxes are drawn at x = 1, 2.
ax2.boxplot([train_lengths, test_lengths])
ax2.set_xticks([1, 2])
ax2.set_xticklabels(['訓練集', '測試集'])
ax2.set_title('評論長度箱型圖')
ax2.set_ylabel('評論長度 (詞數)')

plt.tight_layout()
plt.show()

# Summary statistics for the training split only.
print("訓練集評論長度統計:")
print(f"  平均長度: {np.mean(train_lengths):.2f}")
print(f"  中位數長度: {np.median(train_lengths):.2f}")
print(f"  最短長度: {np.min(train_lengths)}")
print(f"  最長長度: {np.max(train_lengths)}")
print(f"  標準差: {np.std(train_lengths):.2f}")

## 4. 詞彙分析

In [None]:
# Fetch the word -> index vocabulary and build the inverse (index -> word)
# table used for decoding.
word_index = imdb.get_word_index()
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

print(f"詞彙表大小: {len(word_index)}")
print(f"前10個最常見詞彙:")
# Indices 1..10 are treated here as the ten most frequent words; an index with
# no entry is shown as '?'.
for i in range(1, 11):
    print(f"  {i}: {reverse_word_index.get(i, '?')}")

In [None]:
# Token-frequency distribution over a sample of the training reviews.
# Only the first 1000 reviews are scanned to keep the cell fast.
all_words = [token for review in X_train[:1000] for token in review]

word_freq = Counter(all_words)
most_common = word_freq.most_common(20)

words, frequencies = zip(*most_common)

# Ids produced by imdb.load_data() are shifted by 3 relative to
# imdb.get_word_index(): by default 0 is padding, 1 marks the start of a
# review and 2 stands for out-of-vocabulary words. Looking ids up without
# undoing that shift labels every bar with the wrong word, so apply the same
# "- 3" offset that decode_review uses and name the reserved ids explicitly.
reserved_tokens = {0: '<PAD>', 1: '<START>', 2: '<OOV>'}
word_names = [
    reserved_tokens[word]
    if word in reserved_tokens
    else reverse_word_index.get(word - 3, f'UNK_{word}')
    for word in words
]

plt.figure(figsize=(12, 6))
plt.bar(range(len(word_names)), frequencies)
plt.xticks(range(len(word_names)), word_names, rotation=45, ha='right')
plt.title('最常見的20個詞彙')
plt.xlabel('詞彙')
plt.ylabel('頻率')
plt.tight_layout()
plt.show()

## 5. 樣本展示

In [None]:
def decode_review(encoded_review):
    """Convert an integer-encoded review back into a space-separated string.

    Each id is shifted down by 3 before being looked up in the module-level
    ``reverse_word_index``; ids that have no entry after the shift are
    rendered as '?'.
    """
    decoded_words = (
        reverse_word_index.get(token_id - 3, '?') for token_id in encoded_review
    )
    return ' '.join(decoded_words)

# Print the first three decoded reviews of each class, truncated to 500
# characters, with a separator line between the two groups.
positive_indices = np.flatnonzero(y_train == 1)[:3]
negative_indices = np.flatnonzero(y_train == 0)[:3]

sample_groups = (
    ("正面評論樣本:", positive_indices),
    ("負面評論樣本:", negative_indices),
)
for group_no, (heading, group_indices) in enumerate(sample_groups):
    if group_no > 0:
        # Separator goes between groups only, not before the first one.
        print("\n" + "="*50)
    print(heading)
    for i, idx in enumerate(group_indices):
        print(f"\n樣本 {i+1}:")
        print(decode_review(X_train[idx])[:500] + "...")

## 6. 數據預處理建議

基於以上分析，我們可以得出以下結論和建議：

1. **數據平衡性**: 數據集在正面和負面評論之間保持良好的平衡
2. **序列長度**: 評論長度變化很大，建議設置合適的最大長度進行截斷或填充
3. **詞彙處理**: 需要處理低頻詞和高頻停用詞
4. **文本清理**: 此處使用的Keras版IMDB數據已預先轉換為整數序列，無需再做文本清理；若改用原始文本數據，則需清理HTML標籤（如 `<br />`）等內容

In [None]:
# For each candidate maximum length, compute the fraction of training reviews
# that are no longer than it.
length_thresholds = [100, 200, 300, 400, 500, 600, 800, 1000]
lengths_array = np.asarray(train_lengths)
coverage_rates = [
    np.mean(lengths_array <= threshold) for threshold in length_thresholds
]

for threshold, coverage in zip(length_thresholds, coverage_rates):
    print(f"長度閾值 {threshold}: 覆蓋率 {coverage:.2%}")

# Coverage as a function of the length threshold.
coverage_fig, coverage_ax = plt.subplots(figsize=(10, 6))
coverage_ax.plot(length_thresholds, coverage_rates, marker='o')
coverage_ax.set_title('不同長度閾值的數據覆蓋率')
coverage_ax.set_xlabel('長度閾值')
coverage_ax.set_ylabel('覆蓋率')
coverage_ax.grid(True)
plt.show()