1. Instalasi Packages

In [None]:
# ===============================================
# 1. PACKAGE INSTALLATION
# ===============================================

# Install the third-party packages for this notebook. `%pip` is an IPython
# magic, so this cell runs only inside Jupyter/IPython, not as a plain script.
# NOTE(review): PySastrawi, requests, beautifulsoup4, textblob and scipy are
# not imported by any cell visible here — confirm they are still needed.
%pip install nltk PySastrawi requests beautifulsoup4 textblob wordcloud matplotlib seaborn scipy pandas numpy

# NOTE(review): this message is printed unconditionally once the magic
# returns — presumably also when the install failed; confirm.
print("✅ Packages berhasil diinstall!")

2. Import Libraries dan Konfigurasi Awal

In [None]:
# ===============================================
# 2. IMPORT LIBRARIES DAN KONFIGURASI AWAL
# ===============================================

# Manipulasi & Analisis Data
import pandas as pd
import numpy as np
import re
from collections import Counter
import warnings

# Visualisasi
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Pemrosesan Teks (NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK data this notebook relies on.
# word_tokenize loads the 'punkt_tab' resource on NLTK >= 3.8.2, while older
# releases load 'punkt'. Request both so tokenization works on either; a
# resource unknown to the installed release is reported by the downloader
# without raising.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Warning and plot-appearance settings.
# All warnings are silenced for the rest of the session.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("✅ Semua library berhasil diimport dan dikonfigurasi!")

3. Pemuatan Data

In [None]:
# ===============================================
# 3. DATA LOADING
# ===============================================

# Sample redBus reviews, written as (review text, star rating, sentiment label).
_review_fields = ("review", "rating", "sentiment")
_review_rows = (
    ("Our first trip was very good...", 3, "mixed"),
    ("I had a medical emergency so not able to travel...", 1, "negative"),
    ("very bad algorithm, I went upto the payment...", 1, "negative"),
    ("Excellent app! Easy to use, great interface...", 5, "positive"),
    ("Good app overall but sometimes the bus tracking...", 4, "positive"),
)

# One dict per review, keyed by field name.
redbus_reviews = [dict(zip(_review_fields, row)) for row in _review_rows]

# Tabular view used by every later cell.
df = pd.DataFrame(redbus_reviews)

print("✅ Data berhasil dimuat!")
print(df.head())

4. EDA pada Data Mentah

In [None]:
# ===============================================
# 4. EDA ON THE RAW DATA
# ===============================================

print("=== Melakukan EDA pada Raw Dataset ===\n")

# Size features taken straight from the raw text: characters per review and
# whitespace-separated words per review.
raw_reviews = df['review']
df['review_length'] = raw_reviews.str.len()
df['word_count'] = [len(text.split()) for text in raw_reviews]

section_rule = "=" * 50

# 1. Basic information and descriptive statistics
print("1. INFORMASI DASAR DATASET")
print(section_rule)
df.info()
print("\nSTATISTIK DESKRIPTIF:")
print(df.describe())

# 2. Data quality: total missing cells and fully duplicated rows
print("\n2. ANALISIS KUALITAS DATA")
print(section_rule)
missing_total = df.isnull().sum().sum()
print(f"Jumlah Missing Values: {missing_total}")
duplicate_total = df.duplicated().sum()
print(f"Jumlah Baris Duplikat: {duplicate_total}")

# 3. Distribution plots, side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
rating_ax, length_ax = axes
fig.suptitle('Distribusi pada Data Mentah', fontsize=16, fontweight='bold')

# Left panel: number of reviews per star rating
sns.countplot(data=df, x='rating', palette='viridis', ax=rating_ax)
rating_ax.set_title('Distribusi Rating', fontweight='bold')

# Right panel: review length in characters, with a KDE overlay
sns.histplot(data=df, x='review_length', bins=15, kde=True, ax=length_ax)
length_ax.set_title('Distribusi Panjang Review (Karakter)', fontweight='bold')

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

5. Data Preprocessing

In [None]:
# ===============================================
# 5. DATA PREPROCESSING
# ===============================================
print("=== Memulai Proses Pembersihan Teks ===\n")

# 1. Case folding: lowercase copy of every review
df['clean_review'] = df['review'].str.lower()
print("1. Teks setelah diubah ke lowercase:")
print(df[['review', 'clean_review']].head())

# 2. Punctuation removal: drop everything except ASCII letters, digits and
#    whitespace
non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
df['clean_review'] = df['clean_review'].apply(lambda text: non_alphanumeric.sub('', text))
print("\n2. Teks setelah dihilangkan tanda baca:")
print(df['clean_review'].head())

# 3. Tokenization with NLTK's word tokenizer
df['tokens'] = df['clean_review'].apply(word_tokenize)
print("\n3. Teks setelah tokenisasi:")
print(df['tokens'].head())

# 4. Stopword removal, keeping the unfiltered tokens in a comparison column
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not English stopwords, order preserved."""
    return [token for token in tokens if token not in stop_words]


df['tokens_before_stopwords'] = df['tokens']
df['tokens_after_stopwords'] = df['tokens'].apply(_without_stopwords)
print("\n4. Perbandingan Sebelum dan Sesudah Menghilangkan Stopwords:")
print(df[['tokens_before_stopwords', 'tokens_after_stopwords']].head())

# 5. Stemming: reduce each remaining token with the Porter stemmer
stemmer = PorterStemmer()
df['tokens_stemmed'] = df['tokens_after_stopwords'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)
print("\n5. Teks setelah proses stemming (hasil akhir):")
print(df['tokens_stemmed'].head())

print("\n✅ Proses preprocessing selesai!")

6. EDA pada Data Bersih

In [None]:
# ===============================================
# 6. EDA ON THE CLEANED DATA
# ===============================================
print("=== Melakukan EDA pada Data yang Sudah Dibersihkan ===\n")

# 1. Frequency of the 100 most common stemmed tokens across all reviews
all_tokens = []
for stemmed_tokens in df['tokens_stemmed']:
    all_tokens.extend(stemmed_tokens)

word_freq = Counter(all_tokens)
top_100_words = word_freq.most_common(100)
df_top_words = pd.DataFrame(top_100_words, columns=['Kata', 'Frekuensi'])

print("Top 100 Kata Paling Sering Muncul:")
print(df_top_words)

# 2. Horizontal bar chart of the 20 most common tokens
top_20_words = df_top_words.head(20)
plt.figure(figsize=(12, 8))
sns.barplot(data=top_20_words, x='Frekuensi', y='Kata', palette='mako')
plt.title('20 Kata Paling Sering Muncul Setelah Preprocessing', fontweight='bold')
plt.show()

# 3. Word cloud built from the cleaned tokens joined back into one string
cleaned_text = " ".join(all_tokens)
cloud_builder = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
)
wordcloud = cloud_builder.generate(cleaned_text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud dari Data Bersih', fontweight='bold')
plt.show()

7. Analisis Statistik Deskriptif

In [None]:
# ===============================================
# 7. DESCRIPTIVE STATISTICS
# ===============================================

section_divider = "=" * 50

print("=== ANALISIS STATISTIK DESKRIPTIF ===\n")

# Basic dataset information (df.info() writes its own output)
print("1. INFORMASI DASAR DATASET", section_divider, sep="\n")
df.info()

# Descriptive statistics for the numeric columns
print("\n2. STATISTIK DESKRIPTIF NUMERIK", section_divider, df.describe(), sep="\n")

# Review count per star rating, ordered by rating value
rating_distribution = df['rating'].value_counts().sort_index()
print("\n3. ANALISIS DISTRIBUSI RATING", section_divider, rating_distribution, sep="\n")

# Review count per sentiment label
sentiment_distribution = df['sentiment'].value_counts()
print("\n4. ANALISIS DISTRIBUSI SENTIMEN", section_divider, sentiment_distribution, sep="\n")

8. Analisis Kualitas Data

In [None]:
# ===============================================
# 8. DATA QUALITY ANALYSIS
# ===============================================

print("=== ANALISIS KUALITAS DATA ===\n")

# 1. Missing values per column
print("1. ANALISIS MISSING VALUES")
print("=" * 50)
print(df.isnull().sum())

# 2. Duplicate rows
print("\n2. ANALISIS DATA DUPLIKAT")
print("=" * 50)
# By this point the preprocessing cell has added list-valued columns
# (tokens, tokens_stemmed, ...). Lists are unhashable, so calling
# df.duplicated() directly raises TypeError. Comparing the string form of
# every cell keeps all columns in the check while staying hashable.
duplicate_count = df.astype(str).duplicated().sum()
print(f"Jumlah baris duplikat: {duplicate_count}")

# 3. Rating vs. sentiment consistency: a high rating (>= 4) labelled
#    negative, or a low rating (<= 2) labelled positive, is flagged.
print("\n3. ANALISIS KONSISTENSI DATA")
print("=" * 50)
high_but_negative = (df['rating'] >= 4) & (df['sentiment'] == 'negative')
low_but_positive = (df['rating'] <= 2) & (df['sentiment'] == 'positive')
inconsistencies = df[high_but_negative | low_but_positive]
print(f"Ditemukan {len(inconsistencies)} inkonsistensi antara rating dan sentimen.")

9. Visualisasi Data Eksploratif (EDA)

In [None]:
# ===============================================
# 9. EXPLORATORY DATA VISUALISATION (EDA)
# ===============================================

print("=== VISUALISASI DATA ===\n")

# 2x2 grid: rating counts, sentiment share, review length, length by rating.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(rating_ax, sentiment_ax), (length_ax, box_ax) = axes
fig.suptitle('Analisis Eksploratif Review redBus', fontsize=16, fontweight='bold')

# 1. Rating distribution (bar chart)
sns.countplot(data=df, x='rating', palette='husl', ax=rating_ax)
rating_ax.set_title('Distribusi Rating', fontweight='bold')

# 2. Sentiment distribution (pie chart), one slice per label
sentiment_counts = df['sentiment'].value_counts()
slice_colors = ['#ff6b6b', '#4ecdc4', '#45b7d1']
sentiment_ax.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=slice_colors,
)
sentiment_ax.set_title('Distribusi Sentimen', fontweight='bold')

# 3. Review length distribution (histogram with KDE overlay)
sns.histplot(data=df, x='review_length', bins=15, kde=True, ax=length_ax)
length_ax.set_title('Distribusi Panjang Review (Karakter)', fontweight='bold')

# 4. Review length grouped by rating (box plot)
sns.boxplot(data=df, x='rating', y='review_length', ax=box_ax)
box_ax.set_title('Rating vs Panjang Review', fontweight='bold')

# Leave room at the top for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

10. Analisis Korelasi

In [None]:
# ===============================================
# 10. CORRELATION ANALYSIS
# ===============================================

# The heatmap selects 'sentence_count' and 'sentiment_numeric', which none of
# the preceding cells create, so selecting them raised KeyError. Derive them
# here when they are missing; existing columns are left untouched.
if 'sentence_count' not in df.columns:
    # Number of sentences per review according to NLTK's sentence tokenizer.
    df['sentence_count'] = df['review'].apply(
        lambda text: len(nltk.sent_tokenize(text))
    )
if 'sentiment_numeric' not in df.columns:
    # Ordinal encoding of the sentiment label; labels outside this mapping
    # become NaN and are skipped by corr().
    sentiment_scale = {'negative': -1, 'mixed': 0, 'neutral': 0, 'positive': 1}
    df['sentiment_numeric'] = df['sentiment'].map(sentiment_scale)

correlation_features = [
    'rating',
    'review_length',
    'word_count',
    'sentence_count',
    'sentiment_numeric',
]

plt.figure(figsize=(10, 7))
# Pairwise correlation of the numeric features (pandas default method).
correlation_matrix = df[correlation_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, square=True)
plt.title('Heatmap Korelasi', fontweight='bold')
plt.show()

11. Analisis Teks dengan Word Cloud

In [None]:
# ===============================================
# 11. TEXT ANALYSIS WITH WORD CLOUDS
# ===============================================

print("=== WORD CLOUD ANALYSIS ===\n")

# Join the raw review text: everything, then positive-only and negative-only.
review_texts = df['review']
sentiment_labels = df['sentiment']
all_text = ' '.join(review_texts)
positive_text = ' '.join(review_texts[sentiment_labels == 'positive'])
negative_text = ' '.join(review_texts[sentiment_labels == 'negative'])

# One word cloud per text, all built with the same settings.
cloud_settings = dict(width=800, height=400, background_color='white')
wordcloud_all = WordCloud(**cloud_settings).generate(all_text)
wordcloud_positive = WordCloud(**cloud_settings).generate(positive_text)
wordcloud_negative = WordCloud(**cloud_settings).generate(negative_text)

# Show the three clouds side by side, each with its own title and no axes.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
cloud_panels = (
    (wordcloud_all, 'Semua Review'),
    (wordcloud_positive, 'Review Positif'),
    (wordcloud_negative, 'Review Negatif'),
)
for panel_ax, (panel_cloud, panel_title) in zip(axes, cloud_panels):
    panel_ax.imshow(panel_cloud, interpolation='bilinear')
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.axis('off')

plt.show()

12. Analisis Frekuensi Kata Kunci

In [None]:
# ===============================================
# 12. KEYWORD FREQUENCY ANALYSIS
# ===============================================

print("=== ANALISIS KATA KUNCI ===\n")

def get_top_n_words(corpus, n=None):
    """Return the most frequent alphabetic tokens of ``corpus``.

    The text is lowercased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphabetic (punctuation, numbers, mixed tokens) are
    discarded before counting.

    Args:
        corpus: Text to analyse, as a single string.
        n: How many entries to return; ``None`` returns every counted token.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    lowered_tokens = nltk.word_tokenize(corpus.lower())
    alphabetic_tokens = filter(str.isalpha, lowered_tokens)
    return Counter(alphabetic_tokens).most_common(n)

keyword_columns = ['Kata', 'Frekuensi']

# Ten most frequent words in the positive reviews
positive_keywords = get_top_n_words(positive_text, 10)
positive_table = pd.DataFrame(positive_keywords, columns=keyword_columns)
print("Top 10 Kata Kunci Positif:", positive_table, sep="\n")

# Ten most frequent words in the negative reviews
negative_keywords = get_top_n_words(negative_text, 10)
negative_table = pd.DataFrame(negative_keywords, columns=keyword_columns)
print("\nTop 10 Kata Kunci Negatif:", negative_table, sep="\n")

13. Pembuatan Laporan Ringkasan EDA

In [None]:
# ===============================================
# 13. EDA SUMMARY REPORT GENERATION
# ===============================================

def generate_eda_summary(df_summary):
    """Build the Markdown EDA summary, write it to disk and return it.

    The overview and data-quality figures are computed from ``df_summary``.
    The "Temuan Utama" and "Rekomendasi" sections are fixed text and are not
    derived from the data passed in.

    Args:
        df_summary: DataFrame with at least the ``rating``, ``sentiment`` and
            ``review_length`` columns. Extra columns, including list-valued
            ones such as token lists, are allowed.

    Returns:
        The report as a string. The same text is written to
        ``eda_summary_report.md`` in the current working directory.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from datetime import datetime

    analysis_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # DataFrame.duplicated() hashes every cell and fails on list-valued
    # columns (TypeError: unhashable type: 'list'); compare string forms.
    duplicate_count = df_summary.astype(str).duplicated().sum()

    # Count rating/sentiment mismatches on the frame being summarised rather
    # than reading a global computed elsewhere from a possibly different frame.
    ratings = df_summary['rating']
    sentiments = df_summary['sentiment']
    high_but_negative = (ratings >= 4) & (sentiments == 'negative')
    low_but_positive = (ratings <= 2) & (sentiments == 'positive')
    inconsistency_count = int((high_but_negative | low_but_positive).sum())

    report = f"""
# Laporan Ringkasan EDA - Review redBus

**Tanggal Analisis:** {analysis_time}

## 1. Ikhtisar Dataset
- **Total Review:** {len(df_summary)}
- **Rata-rata Rating:** {df_summary['rating'].mean():.2f}
- **Rata-rata Panjang Review:** {df_summary['review_length'].mean():.1f} karakter

## 2. Kualitas Data
- **Missing Values:** {df_summary.isnull().sum().sum()}
- **Data Duplikat:** {duplicate_count}
- **Inkonsistensi Rating-Sentimen:** {inconsistency_count}

## 3. Temuan Utama
- Mayoritas review memberikan rating **positif** (4-5 bintang) atau **negatif** (1-2 bintang), dengan sedikit review campuran.
- Terdapat korelasi negatif antara **rating** dengan **panjang review**, menunjukkan review negatif cenderung lebih panjang.
- Kata kunci negatif yang sering muncul adalah terkait **masalah teknis** ('payment', 'app', 'ticket') dan **layanan pelanggan** ('cancel', 'reschedule').
- Kata kunci positif berfokus pada **kemudahan penggunaan** ('easy', 'smooth', 'good') dan **fitur** ('booking', 'interface').

## 4. Rekomendasi
- **Untuk Tim Produk:** Fokus pada perbaikan bug terkait proses pembayaran dan fitur referral.
- **Untuk Tim Layanan Pelanggan:** Tinjau kembali kebijakan pembatalan dan penjadwalan ulang, terutama untuk kasus darurat.
- **Untuk Analisis Lanjutan:** Lakukan analisis sentimen yang lebih mendalam menggunakan model machine learning dan analisis topik untuk mengidentifikasi masalah spesifik.
"""

    # Explicit UTF-8 so the non-ASCII report text is written the same way on
    # every platform.
    with open('eda_summary_report.md', 'w', encoding='utf-8') as f:
        f.write(report)

    return report

# Build the report from the working DataFrame, echo it, then confirm where
# the file was saved.
saved_notice = "\n✅ Laporan ringkasan berhasil disimpan ke: eda_summary_report.md"
summary_report = generate_eda_summary(df)
print(summary_report, saved_notice, sep="\n")