1. Instalasi Packages

In [None]:
# ===============================================
# 1. INSTALASI PACKAGES
# ===============================================

# Install the third-party packages this notebook depends on.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# different releases over time — consider pinning for reproducibility.
%pip install nltk PySastrawi requests beautifulsoup4 textblob wordcloud matplotlib seaborn scipy pandas numpy

# Confirmation message; the outcome of the install line is not inspected here.
print("✅ Packages berhasil diinstall!")

2. Import Libraries dan Konfigurasi Awal

In [None]:
# ===============================================
# 2. IMPORT LIBRARIES DAN KONFIGURASI AWAL
# ===============================================

# Manipulasi Data
import pandas as pd
import numpy as np

# Visualisasi
import matplotlib.pyplot as plt
import seaborn as sns

# Pemrosesan Teks
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from textblob import TextBlob
from wordcloud import STOPWORDS, WordCloud

# Web Scraping
import requests
from bs4 import BeautifulSoup

# Utilitas
import re
from collections import Counter
import time
from datetime import datetime
import warnings

# Statistik
from scipy import stats
from scipy.stats import skew, kurtosis, shapiro

# Fetch the NLTK tokenizer resources needed for tokenization.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance: base style first, then the colour palette, then the
# rcParams overrides (figure size and font size).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("✅ Semua library berhasil diimport dan dikonfigurasi!")

3. Pemuatan dan Preprocessing Dasar Data

In [None]:
# ===============================================
# 3. PEMUATAN DAN PREPROCESSING DASAR DATA
# ===============================================

# redBus review samples, written as (review text, star rating, sentiment label)
# rows and expanded into dicts with the keys "review", "rating", "sentiment".
redbus_reviews = [
    {"review": review_text, "rating": star_rating, "sentiment": sentiment_label}
    for review_text, star_rating, sentiment_label in (
        (
            "Our first trip was very good, and the booking went smooth. However, when I'm trying to use my 'referral code' in my friends' devices, it's just not working, showing 'something seems to be wrong'. I tried to use the 'chat with us' in the app, but the 'Help' section is not even opening, showing some webpage error. Please fix it soon.",
            3,
            "mixed",
        ),
        (
            "I had a medical emergency so not able to travel and tried for cancellation but directly redbus informed that I can't cancel my ticket and even I tried to reschedule as I have to travel in coming days but then also it's not possible. I don't understand if you can't provide any of this options then what services you provide and it's not at all expected. My experience is very worst and has occurred loss of 2500.",
            1,
            "negative",
        ),
        (
            "very bad algorithm, I went upto the payment procedure but due to technical issues the payment failed. next time the payment increase, I totally shocked. 2nd time also I checked the price after closing the app, again the price increase. very bad experience so I did not book the ticket. now I am going to uninstall the app. i know this will not impact you. but I don't want your app on my phone. so ta ta bye bye",
            1,
            "negative",
        ),
        (
            "Excellent app! Easy to use, great interface, and booking is very smooth. Love the new features like train booking and metro tickets. Highly recommended!",
            5,
            "positive",
        ),
        (
            "Good app overall but sometimes the bus tracking is not accurate. Customer service is helpful though. Would give 4 stars if tracking was better.",
            4,
            "positive",
        ),
    )
]

def _count_sentences(text):
    """Return the number of sentences in ``text``.

    A sentence is any non-blank fragment delimited by a run of ``.``, ``!``
    or ``?``. Unlike ``len(text.split('.'))`` this does not count the empty
    fragment that follows a trailing period, and it recognises sentences that
    end in ``!`` or ``?``.

    Limitation: a period inside a token (e.g. a decimal such as ``2.5``) is
    still treated as a sentence boundary.
    """
    return sum(1 for fragment in re.split(r'[.!?]+', text) if fragment.strip())


# Build the working DataFrame from the raw review records.
df = pd.DataFrame(redbus_reviews)

# Sastrawi stemmer instance.
# NOTE(review): Sastrawi targets Indonesian text while the review strings are
# English, and `stemmer` is not used in the cells visible here — confirm it
# is still needed.
stemmer = StemmerFactory().create_stemmer()

# Derived size features: characters, whitespace-separated words, sentences.
df['review_length'] = df['review'].str.len()
df['word_count'] = df['review'].str.split().str.len()
df['sentence_count'] = df['review'].apply(_count_sentences)

# Numeric encoding of the sentiment label (used by the correlation analysis).
sentiment_map = {'positive': 1, 'negative': -1, 'mixed': 0}
df['sentiment_numeric'] = df['sentiment'].map(sentiment_map)

print("✅ Data berhasil dimuat!")
print(f"📊 Dataset berisi {len(df)} review")
print("\n📋 Preview Data:")
print(df.head())

4. Analisis Statistik Deskriptif

In [None]:
# ===============================================
# 4. ANALISIS STATISTIK DESKRIPTIF
# ===============================================

print("=== ANALISIS STATISTIK DESKRIPTIF ===\n")

# Horizontal rule printed under every section title.
section_rule = "=" * 50

# Section 1 — basic dataset information (df.info() prints by itself).
print("1. INFORMASI DASAR DATASET", section_rule, sep="\n")
df.info()

# Section 2 — summary statistics of the numeric columns.
numeric_summary = df.describe()
print("\n2. STATISTIK DESKRIPTIF NUMERIK", section_rule, sep="\n")
print(numeric_summary)

# Section 3 — number of reviews per star rating, ordered by rating.
rating_distribution = df['rating'].value_counts().sort_index()
print("\n3. ANALISIS DISTRIBUSI RATING", section_rule, sep="\n")
print(rating_distribution)

# Section 4 — number of reviews per sentiment label.
sentiment_distribution = df['sentiment'].value_counts()
print("\n4. ANALISIS DISTRIBUSI SENTIMEN", section_rule, sep="\n")
print(sentiment_distribution)

5. Analisis Kualitas Data

In [None]:
# ===============================================
# 5. ANALISIS KUALITAS DATA
# ===============================================

print("=== ANALISIS KUALITAS DATA ===\n")

# Horizontal rule printed under every section title.
divider = "=" * 50

# Check 1 — missing values per column.
missing_per_column = df.isnull().sum()
print("1. ANALISIS MISSING VALUES", divider, sep="\n")
print(missing_per_column)

# Check 2 — fully duplicated rows.
duplicate_row_count = df.duplicated().sum()
print("\n2. ANALISIS DATA DUPLIKAT", divider, sep="\n")
print(f"Jumlah baris duplikat: {duplicate_row_count}")

# Check 3 — rating/sentiment disagreement: a high rating (>= 4) labelled
# negative, or a low rating (<= 2) labelled positive.
high_rating_but_negative = (df['rating'] >= 4) & (df['sentiment'] == 'negative')
low_rating_but_positive = (df['rating'] <= 2) & (df['sentiment'] == 'positive')
inconsistencies = df[high_rating_but_negative | low_rating_but_positive]
print("\n3. ANALISIS KONSISTENSI DATA", divider, sep="\n")
print(f"Ditemukan {len(inconsistencies)} inkonsistensi antara rating dan sentimen.")

6. Visualisasi Data Eksploratif (EDA)

In [None]:
# ===============================================
# 6. VISUALISASI DATA EKSPLORATIF (EDA)
# ===============================================

print("=== VISUALISASI DATA ===\n")

# 2x2 grid: rating counts, sentiment share, review-length histogram,
# and review length by rating.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Analisis Eksploratif Review redBus', fontsize=16, fontweight='bold')

# 1. Rating distribution (bar chart)
sns.countplot(ax=axes[0, 0], data=df, x='rating', palette='husl')
axes[0, 0].set_title('Distribusi Rating', fontweight='bold')

# 2. Sentiment distribution (pie chart)
# Colours are bound to the sentiment label rather than to the slice position:
# value_counts() orders slices by frequency, so positional colours would move
# between sentiments whenever the class balance changes.
sentiment_counts = df['sentiment'].value_counts()
sentiment_colors = {'negative': '#ff6b6b', 'positive': '#4ecdc4', 'mixed': '#45b7d1'}
pie_colors = [sentiment_colors.get(label, '#cccccc') for label in sentiment_counts.index]  # grey for unknown labels
axes[0, 1].pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=pie_colors)
axes[0, 1].set_title('Distribusi Sentimen', fontweight='bold')

# 3. Review length distribution (histogram with KDE overlay)
sns.histplot(ax=axes[1, 0], data=df, x='review_length', bins=15, kde=True)
axes[1, 0].set_title('Distribusi Panjang Review (Karakter)', fontweight='bold')

# 4. Review length by rating (box plot)
sns.boxplot(ax=axes[1, 1], data=df, x='rating', y='review_length')
axes[1, 1].set_title('Rating vs Panjang Review', fontweight='bold')

# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

7. Analisis Korelasi

In [None]:
# ===============================================
# 7. ANALISIS KORELASI
# ===============================================

# Numeric columns whose pairwise correlations are shown in the heatmap.
numeric_features = [
    'rating',
    'review_length',
    'word_count',
    'sentence_count',
    'sentiment_numeric',
]
correlation_matrix = df[numeric_features].corr()

# Diverging colormap centred on zero so the sign of a correlation is visible.
plt.figure(figsize=(10, 7))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
)
plt.title('Heatmap Korelasi', fontweight='bold')
plt.show()

8. Analisis Teks dengan Word Cloud

In [None]:
# ===============================================
# 8. ANALISIS TEKS DENGAN WORD CLOUD
# ===============================================

print("=== WORD CLOUD ANALYSIS ===\n")

# Concatenate review texts: the whole corpus plus one string per sentiment.
is_positive = df['sentiment'] == 'positive'
is_negative = df['sentiment'] == 'negative'
all_text = ' '.join(df['review'])
positive_text = ' '.join(df.loc[is_positive, 'review'])
negative_text = ' '.join(df.loc[is_negative, 'review'])

# One word cloud per text, all rendered with the same canvas settings.
cloud_settings = dict(width=800, height=400, background_color='white')
wordcloud_all = WordCloud(**cloud_settings).generate(all_text)
wordcloud_positive = WordCloud(**cloud_settings).generate(positive_text)
wordcloud_negative = WordCloud(**cloud_settings).generate(negative_text)

# Show the three clouds side by side, each in its own axis-free panel.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
cloud_panels = (
    (wordcloud_all, 'Semua Review'),
    (wordcloud_positive, 'Review Positif'),
    (wordcloud_negative, 'Review Negatif'),
)
for panel_ax, (panel_cloud, panel_title) in zip(axes, cloud_panels):
    panel_ax.imshow(panel_cloud, interpolation='bilinear')
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.axis('off')

plt.show()

9. Analisis Frekuensi Kata Kunci

In [None]:
# ===============================================
# 9. ANALISIS FREKUENSI KATA KUNCI
# ===============================================

print("=== ANALISIS KATA KUNCI ===\n")

def get_top_n_words(corpus, n=None, stopwords=None):
    """Return the ``n`` most frequent alphabetic words in ``corpus``.

    The corpus is lower-cased and tokenized with NLTK; tokens that are not
    purely alphabetic (punctuation, numbers, contraction pieces such as
    ``n't``) are dropped before counting.

    Args:
        corpus: Text to analyse.
        n: Number of entries to return; ``None`` returns every word.
        stopwords: Optional iterable of words to exclude (matched
            case-insensitively). ``None`` disables filtering, which is the
            original behaviour.

    Returns:
        A list of ``(word, frequency)`` tuples, most frequent first.
    """
    ignored = frozenset(word.lower() for word in stopwords) if stopwords is not None else frozenset()
    tokens = nltk.word_tokenize(corpus.lower())
    words = [token for token in tokens if token.isalpha() and token not in ignored]
    return Counter(words).most_common(n)

# Stop words are filtered out so the rankings surface content words instead of
# function words such as "the", "i" or "is".

# Top 10 positive keywords
positive_keywords = get_top_n_words(positive_text, 10, stopwords=STOPWORDS)
print("Top 10 Kata Kunci Positif:")
print(pd.DataFrame(positive_keywords, columns=['Kata', 'Frekuensi']))

# Top 10 negative keywords
negative_keywords = get_top_n_words(negative_text, 10, stopwords=STOPWORDS)
print("\nTop 10 Kata Kunci Negatif:")
print(pd.DataFrame(negative_keywords, columns=['Kata', 'Frekuensi']))

10. Pembuatan Laporan Ringkasan EDA

In [None]:
# ===============================================
# 10. PEMBUATAN LAPORAN RINGKASAN EDA
# ===============================================

def generate_eda_summary(df_summary, output_path='eda_summary_report.md'):
    """Build the Markdown EDA summary, write it to disk and return it.

    Args:
        df_summary: Review DataFrame. Must contain the columns ``rating``,
            ``sentiment`` and ``review_length``.
        output_path: Destination of the Markdown file. Defaults to
            ``eda_summary_report.md`` in the current working directory.

    Returns:
        The report text that was written to ``output_path``.

    Note:
        Only sections 1 and 2 are computed from ``df_summary``. The findings
        and recommendations (sections 3 and 4) are static text and are not
        re-derived from the data.
    """
    # Count rating/sentiment disagreements from the frame passed in, not from
    # a notebook-level global, so the report matches its argument and the
    # function also works on a fresh kernel.
    high_rating_but_negative = (df_summary['rating'] >= 4) & (df_summary['sentiment'] == 'negative')
    low_rating_but_positive = (df_summary['rating'] <= 2) & (df_summary['sentiment'] == 'positive')
    inconsistency_count = int((high_rating_but_negative | low_rating_but_positive).sum())

    report = f"""
# Laporan Ringkasan EDA - Review redBus

**Tanggal Analisis:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 1. Ikhtisar Dataset
- **Total Review:** {len(df_summary)}
- **Rata-rata Rating:** {df_summary['rating'].mean():.2f}
- **Rata-rata Panjang Review:** {df_summary['review_length'].mean():.1f} karakter

## 2. Kualitas Data
- **Missing Values:** {df_summary.isnull().sum().sum()}
- **Data Duplikat:** {df_summary.duplicated().sum()}
- **Inkonsistensi Rating-Sentimen:** {inconsistency_count}

## 3. Temuan Utama
- Mayoritas review memberikan rating **positif** (4-5 bintang) atau **negatif** (1-2 bintang), dengan sedikit review campuran.
- Terdapat korelasi negatif antara **rating** dengan **panjang review**, menunjukkan review negatif cenderung lebih panjang.
- Kata kunci negatif yang sering muncul adalah terkait **masalah teknis** ('payment', 'app', 'ticket') dan **layanan pelanggan** ('cancel', 'reschedule').
- Kata kunci positif berfokus pada **kemudahan penggunaan** ('easy', 'smooth', 'good') dan **fitur** ('booking', 'interface').

## 4. Rekomendasi
- **Untuk Tim Produk:** Fokus pada perbaikan bug terkait proses pembayaran dan fitur referral.
- **Untuk Tim Layanan Pelanggan:** Tinjau kembali kebijakan pembatalan dan penjadwalan ulang, terutama untuk kasus darurat.
- **Untuk Analisis Lanjutan:** Lakukan analisis sentimen yang lebih mendalam menggunakan model machine learning dan analisis topik untuk mengidentifikasi masalah spesifik.
"""

    # UTF-8 is set explicitly because the report contains non-ASCII text.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report)

    return report

# Build the report (this also writes it to disk), then echo it followed by
# the confirmation line.
summary_report = generate_eda_summary(df)
print(
    summary_report,
    "\n✅ Laporan ringkasan berhasil disimpan ke: eda_summary_report.md",
    sep="\n",
)