In [None]:
# ===============================================
# 1. PACKAGE INSTALLATION
# ===============================================

# Install the third-party packages used by this notebook. `%pip` (rather than
# `!pip`) targets the environment of the running kernel.
# NOTE(review): no versions are pinned, and pandas/numpy are imported by the
# next cell without being listed here — presumably they arrive as dependencies
# of the packages below; confirm.
%pip install nltk PySastrawi requests beautifulsoup4 textblob wordcloud matplotlib seaborn scipy

print("✅ Packages berhasil diinstall!")

In [None]:
# ===============================================
# 2. IMPORT LIBRARIES
# ===============================================

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from textblob import TextBlob
from wordcloud import WordCloud

# Web scraping
import requests
from bs4 import BeautifulSoup

# Utilities
import re
from collections import Counter
import time
from datetime import datetime
import warnings

# Statistics
from scipy import stats
from scipy.stats import skew, kurtosis, shapiro

# Fetch the NLTK tokenizer data ('punkt' and 'punkt_tab')
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

# Global plotting defaults: base style, colour palette, figure size, font size
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})

print("✅ Semua library berhasil diimport!")


In [None]:
# ===============================================
# 2. DATA LOADING AND BASIC PREPROCESSING
# ===============================================

# redBus review data, previously analysed, taken from the Google Play Store.
# Each record carries the review text, its numeric rating and a sentiment
# label ("positive", "negative" or "mixed").
# NOTE(review): the sentiment labels look hand-assigned — confirm their origin.
redbus_reviews = [
    {
        "review": "Our first trip was very good, and the booking went smooth. However, when I'm trying to use my 'referral code' in my friends' devices, it's just not working, showing 'something seems to be wrong'. I tried to use the 'chat with us' in the app, but the 'Help' section is not even opening, showing some webpage error. Please fix it soon.",
        "rating": 3,
        "sentiment": "mixed"
    },
    {
        "review": "I had a medical emergency so not able to travel and tried for cancellation but directly redbus informed that I can't cancel my ticket and even I tried to reschedule as I have to travel in coming days but then also it's not possible. I don't understand if you can't provide any of this options then what services you provide and it's not at all expected. My experience is very worst and has occurred loss of 2500.",
        "rating": 1,
        "sentiment": "negative"
    },
    {
        "review": "very bad algorithm, I went upto the payment procedure but due to technical issues the payment failed. next time the payment increase, I totally shocked. 2nd time also I checked the price after closing the app, again the price increase. very bad experience so I did not book the ticket. now I am going to uninstall the app. i know this will not impact you. but I don't want your app on my phone. so ta ta bye bye",
        "rating": 1,
        "sentiment": "negative"
    },
    {
        "review": "Excellent app! Easy to use, great interface, and booking is very smooth. Love the new features like train booking and metro tickets. Highly recommended!",
        "rating": 5,
        "sentiment": "positive"
    },
    {
        "review": "Good app overall but sometimes the bus tracking is not accurate. Customer service is helpful though. Would give 4 stars if tracking was better.",
        "rating": 4,
        "sentiment": "positive"
    }
]

# Convert the review records to a DataFrame for analysis
df = pd.DataFrame(redbus_reviews)

# Initialise the Sastrawi stemmer
# NOTE(review): Sastrawi stems Indonesian text while the reviews above are in
# English, and `stemmer` is not used in the cells visible here — confirm it is
# still needed.
stemmer = StemmerFactory().create_stemmer()

# Length features derived from the raw review text
df['review_length'] = df['review'].str.len()
df['word_count'] = df['review'].apply(lambda text: len(text.split()))
# A sentence ends at a run of '.', '!' or '?'. Blank fragments are dropped so
# the empty piece after the final terminator is not counted as a sentence
# (a plain split('.') counted it and ignored '!' and '?').
df['sentence_count'] = df['review'].apply(
    lambda text: sum(1 for fragment in re.split(r'[.!?]+', text) if fragment.strip())
)

# Numeric encoding of the sentiment label
sentiment_map = {'positive': 1, 'negative': -1, 'mixed': 0}
df['sentiment_numeric'] = df['sentiment'].map(sentiment_map)

print("✅ Data berhasil dimuat!")
print(f"📊 Dataset berisi {len(df)} review")
print(f"📋 Kolom yang tersedia: {list(df.columns)}")
print(f"📈 Shape dataset: {df.shape}")

# Show a preview of the data
print("\n📋 Preview Data:")
print(df.head())

In [None]:
# ===============================================
# 3. BASIC STATISTICAL ANALYSIS
# ===============================================

print("=== ANALISIS STATISTIK DASAR ===\n")

# 1. BASIC DATASET INFORMATION
n_rows, n_cols = df.shape
memory_kb = df.memory_usage(deep=True).sum() / 1024
print("1. INFORMASI DASAR DATASET")
print("=" * 50)
print(
    f"Shape dataset: {df.shape}",
    f"Jumlah baris: {n_rows}",
    f"Jumlah kolom: {n_cols}",
    f"Kolom yang tersedia: {list(df.columns)}",
    f"Tipe data:\n{df.dtypes}",
    f"Memori yang digunakan: {memory_kb:.2f} KB",
    sep="\n",
)

# 2. DESCRIPTIVE STATISTICS
print("\n2. STATISTIK DESKRIPTIF")
print("=" * 50)
print("Statistik untuk kolom numerik:", df.describe(), sep="\n")

# 3. MISSING-VALUE ANALYSIS
print("\n3. ANALISIS MISSING VALUES")
print("=" * 50)
missing_data = df.isna().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = missing_data.to_frame('Missing Count').assign(
    **{'Missing Percentage': missing_percent}
)
print(missing_df)

# 4. RATING DISTRIBUTION ANALYSIS
print("\n4. ANALISIS DISTRIBUSI RATING")
print("=" * 50)
rating_stats = df['rating'].describe()
rating_mode = df['rating'].mode().iloc[0]
rating_range = rating_stats['max'] - rating_stats['min']
print("Statistik Rating:")
print(
    f"Mean: {rating_stats['mean']:.2f}",
    f"Median: {rating_stats['50%']:.2f}",
    f"Mode: {rating_mode}",
    f"Std Dev: {rating_stats['std']:.2f}",
    f"Min: {rating_stats['min']:.2f}",
    f"Max: {rating_stats['max']:.2f}",
    f"Range: {rating_range:.2f}",
    sep="\n",
)

# Number of reviews per rating, with each rating's share of all reviews
total_reviews = len(df)
rating_dist = df['rating'].value_counts().sort_index()
print("\nDistribusi Rating:")
for rating, count in rating_dist.items():
    percentage = (count / total_reviews) * 100
    print(f"Rating {rating}: {count} review ({percentage:.1f}%)")

# 5. SENTIMENT ANALYSIS
print("\n5. ANALISIS SENTIMEN")
print("=" * 50)
sentiment_dist = df['sentiment'].value_counts()
print("Distribusi Sentimen:")
for sentiment, count in sentiment_dist.items():
    percentage = (count / total_reviews) * 100
    print(f"{sentiment.capitalize()}: {count} review ({percentage:.1f}%)")

# 6. REVIEW LENGTH ANALYSIS
char_lengths = df['review_length']
print("\n6. ANALISIS PANJANG REVIEW")
print("=" * 50)
print("Statistik Panjang Review:")
print(
    f"Rata-rata karakter: {char_lengths.mean():.1f}",
    f"Rata-rata kata: {df['word_count'].mean():.1f}",
    f"Rata-rata kalimat: {df['sentence_count'].mean():.1f}",
    f"Review terpanjang: {char_lengths.max()} karakter",
    f"Review terpendek: {char_lengths.min()} karakter",
    sep="\n",
)

# 7. CORRELATION BETWEEN VARIABLES
numeric_columns = ['rating', 'review_length', 'word_count', 'sentence_count', 'sentiment_numeric']
print("\n7. KORELASI ANTAR VARIABEL")
print("=" * 50)
correlation_matrix = df[numeric_columns].corr()
print("Matriks Korelasi:")
print(correlation_matrix.round(3))

print("\n✅ Analisis statistik dasar selesai!")


In [None]:
# ===============================================
# 4. DATA VISUALISATION
# ===============================================

print("=== VISUALISASI DATA ===\n")

# One fixed colour per category. Colours used to be handed out by position, so
# a missing rating (e.g. no 2-star review) shifted every later bar onto the
# wrong colour, and sentiment colours depended on the order of value_counts().
rating_palette = {1: '#ff4444', 2: '#ff8800', 3: '#ffaa00', 4: '#88cc00', 5: '#00cc44'}
sentiment_palette = {'negative': '#ff6b6b', 'positive': '#4ecdc4', 'mixed': '#45b7d1'}
fallback_color = '#999999'  # any category that is missing from the palettes

# 1. RATING DISTRIBUTION VISUALISATION (2 x 3 overview grid)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Exploratory Data Analysis - redBus Reviews', fontsize=16, fontweight='bold')

# 1.1 Rating distribution (bar chart), coloured by rating value
rating_counts = df['rating'].value_counts().sort_index()
axes[0, 0].bar(rating_counts.index,
               rating_counts.values,
               color=[rating_palette.get(rating_value, fallback_color)
                      for rating_value in rating_counts.index])
axes[0, 0].set_title('Distribusi Rating', fontweight='bold')
axes[0, 0].set_xlabel('Rating')
axes[0, 0].set_ylabel('Jumlah Review')
axes[0, 0].grid(True, alpha=0.3)

# 1.2 Sentiment distribution (pie chart), coloured by sentiment label
sentiment_counts = df['sentiment'].value_counts()
colors = [sentiment_palette.get(label, fallback_color) for label in sentiment_counts.index]
axes[0, 1].pie(sentiment_counts.values, labels=sentiment_counts.index,
               autopct='%1.1f%%', startangle=90, colors=colors)
axes[0, 1].set_title('Distribusi Sentimen', fontweight='bold')

# 1.3 Histogram of review length
axes[0, 2].hist(df['review_length'], bins=15, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 2].set_title('Distribusi Panjang Review (Karakter)', fontweight='bold')
axes[0, 2].set_xlabel('Jumlah Karakter')
axes[0, 2].set_ylabel('Frekuensi')
axes[0, 2].grid(True, alpha=0.3)

# 1.4 Box plot: rating vs review length
sns.boxplot(data=df, x='rating', y='review_length', ax=axes[1, 0])
axes[1, 0].set_title('Rating vs Panjang Review', fontweight='bold')
axes[1, 0].set_xlabel('Rating')
axes[1, 0].set_ylabel('Panjang Review (Karakter)')

# 1.5 Scatter plot: rating vs word count
axes[1, 1].scatter(df['rating'], df['word_count'], alpha=0.7, color='purple')
axes[1, 1].set_title('Rating vs Jumlah Kata', fontweight='bold')
axes[1, 1].set_xlabel('Rating')
axes[1, 1].set_ylabel('Jumlah Kata')
axes[1, 1].grid(True, alpha=0.3)

# 1.6 Correlation heatmap of the numeric columns
correlation_data = df[['rating', 'review_length', 'word_count', 'sentence_count', 'sentiment_numeric']]
correlation_matrix = correlation_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, ax=axes[1, 2])
axes[1, 2].set_title('Heatmap Korelasi', fontweight='bold')

plt.tight_layout()
plt.show()

# 2. ADVANCED VISUALISATION (2 x 2 grid)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Analisis Lanjutan - redBus Reviews', fontsize=16, fontweight='bold')

# 2.1 Violin plot: review length per sentiment
sns.violinplot(data=df, x='sentiment', y='review_length', ax=axes[0, 0])
axes[0, 0].set_title('Distribusi Panjang Review per Sentimen', fontweight='bold')
axes[0, 0].set_xlabel('Sentimen')
axes[0, 0].set_ylabel('Panjang Review (Karakter)')

# 2.2 Bar plot: mean rating per sentiment.
# Bars are coloured by sentiment label instead of by position: the bars are
# sorted by mean rating, so positional colours painted "positive" in the red
# that this notebook's sentiment pie chart uses for "negative".
sentiment_palette = {'negative': '#ff6b6b', 'positive': '#4ecdc4', 'mixed': '#45b7d1'}
sentiment_rating = df.groupby('sentiment')['rating'].mean().sort_values(ascending=False)
axes[0, 1].bar(sentiment_rating.index, sentiment_rating.values,
               color=[sentiment_palette.get(label, '#999999') for label in sentiment_rating.index])
axes[0, 1].set_title('Rata-rata Rating per Sentimen', fontweight='bold')
axes[0, 1].set_xlabel('Sentimen')
axes[0, 1].set_ylabel('Rata-rata Rating')
axes[0, 1].set_ylim(0, 5)

# 2.3 Histogram of word count
axes[1, 0].hist(df['word_count'], bins=12, alpha=0.7, color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Distribusi Jumlah Kata', fontweight='bold')
axes[1, 0].set_xlabel('Jumlah Kata')
axes[1, 0].set_ylabel('Frekuensi')
axes[1, 0].grid(True, alpha=0.3)

# 2.4 Scatter plot: review length vs word count, colour-coded by rating
scatter = axes[1, 1].scatter(df['review_length'], df['word_count'],
                             c=df['rating'], cmap='viridis', alpha=0.7)
axes[1, 1].set_title('Panjang Review vs Jumlah Kata (Color: Rating)', fontweight='bold')
axes[1, 1].set_xlabel('Panjang Review (Karakter)')
axes[1, 1].set_ylabel('Jumlah Kata')
plt.colorbar(scatter, ax=axes[1, 1], label='Rating')

plt.tight_layout()
plt.show()

print("✅ Visualisasi data selesai!")


In [None]:
# ===============================================
# 5. WORD CLOUD ANALYSIS
# ===============================================

print("=== WORD CLOUD ANALYSIS ===\n")

# Seed for the word-cloud layout so a re-run draws the same picture
WORDCLOUD_SEED = 42


def _plot_wordcloud(ax, text, title):
    """Draw a word cloud of `text` on `ax`, or a placeholder if `text` is blank.

    Parameters:
        ax: matplotlib Axes to draw on.
        text: the text to build the cloud from.
        title: title shown above the panel.

    Returns:
        The generated WordCloud, or None when the placeholder was drawn.
    """
    cloud = None
    if text.strip():
        cloud = WordCloud(width=400, height=300, background_color='white',
                          random_state=WORDCLOUD_SEED).generate(text)
        ax.imshow(cloud, interpolation='bilinear')
    else:
        # Nothing to draw: show a "no data" note instead of letting generate() fail
        ax.text(0.5, 0.5, 'Tidak ada data', ha='center', va='center', transform=ax.transAxes)
    ax.axis('off')
    ax.set_title(title, fontweight='bold')
    return cloud


# 1. WORD CLOUDS: all reviews, positive reviews, negative reviews
all_text = ' '.join(df['review'])
positive_text = ' '.join(df[df['sentiment'] == 'positive']['review'])
negative_text = ' '.join(df[df['sentiment'] == 'negative']['review'])

wordcloud_fig, wordcloud_axes = plt.subplots(1, 3, figsize=(15, 5))
wordcloud_all = _plot_wordcloud(wordcloud_axes[0], all_text, 'Word Cloud - Semua Review')
wordcloud_positive = _plot_wordcloud(wordcloud_axes[1], positive_text, 'Word Cloud - Review Positif')
wordcloud_negative = _plot_wordcloud(wordcloud_axes[2], negative_text, 'Word Cloud - Review Negatif')

plt.tight_layout()
plt.show()

# 2. KEYWORD ANALYSIS
print("2. ANALISIS KATA KUNCI")
print("=" * 50)


def _is_keyword(token):
    """Return True for purely alphabetic tokens longer than three characters."""
    return len(token) > 3 and token.isalpha()


# Join all reviews in lowercase
all_text = ' '.join(df['review'].str.lower())
# Tokenise with word_tokenize so punctuation becomes its own token. A plain
# whitespace split leaves it attached ("smooth.", "good,"), and the isalpha()
# filter then silently drops those words from the counts.
words = word_tokenize(all_text)
meaningful_words = [word for word in words if _is_keyword(word)]
word_freq = Counter(meaningful_words)

print("10 Kata Paling Sering Muncul:")
for word, count in word_freq.most_common(10):
    print(f"{word}: {count} kali")

# 3. POSITIVE AND NEGATIVE KEYWORD ANALYSIS
print("\n3. ANALISIS KATA KUNCI POSITIF DAN NEGATIF")
print("=" * 50)

# Split the reviews by rating: 4-5 counts as positive, 1-2 as negative
positive_reviews = df[df['rating'] >= 4]
negative_reviews = df[df['rating'] <= 2]

# Apply the same keyword filter as above; counting raw tokens would rank
# punctuation marks and very short words as the top "keywords".
positive_tokens = [
    token
    for review in positive_reviews['review']
    for token in word_tokenize(review.lower())
    if _is_keyword(token)
]
negative_tokens = [
    token
    for review in negative_reviews['review']
    for token in word_tokenize(review.lower())
    if _is_keyword(token)
]

print("Kata Kunci Positif:")
for word, count in Counter(positive_tokens).most_common(10):
    print(f"  {word}: {count} kali")

print("\nKata Kunci Negatif:")
for word, count in Counter(negative_tokens).most_common(10):
    print(f"  {word}: {count} kali")

print("\n✅ Word cloud analysis selesai!")


In [None]:
# ===============================================
# 6. DATA QUALITY ANALYSIS
# ===============================================

print("=== ANALISIS KUALITAS DATA ===\n")

# 1. DETAILED MISSING-VALUE ANALYSIS
print("1. ANALISIS MISSING VALUES DETAIL")
print("=" * 50)

# `missing_per_row` is a helper column this cell adds to `df` below. Exclude it
# so that re-running the analysis reports on the real data columns only.
data_columns = df.drop(columns=['missing_per_row'], errors='ignore')
null_counts = data_columns.isnull().sum()

# Missing values per column
missing_analysis = pd.DataFrame({
    'Kolom': data_columns.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df)) * 100,
    'Data_Type': data_columns.dtypes,
    'Unique_Values': data_columns.nunique()
})

print("Ringkasan Missing Values:")
print(missing_analysis)

# Missing values per row. The first line printed below is labelled as the
# number of ROWS with missing values, so count rows with at least one missing
# cell rather than summing the missing cells.
df['missing_per_row'] = data_columns.isnull().sum(axis=1)
rows_with_missing = (df['missing_per_row'] > 0).sum()
print(f"\nBaris dengan missing values: {rows_with_missing}")
print(f"Persentase baris dengan missing values: {rows_with_missing / len(df) * 100:.2f}%")

# 2. TEXT DATA QUALITY ANALYSIS
print("\n2. ANALISIS KUALITAS DATA TEXT")
print("=" * 50)

# 2.1 Special-character and symbol analysis
def analyze_text_quality(text):
    """Summarise basic quality indicators of one review text.

    Parameters:
        text: the review string; a missing value (as detected by `pd.isna`)
            is reported like an empty string.

    Returns:
        dict with the keys 'length', 'word_count', 'has_special_chars',
        'has_numbers', 'has_uppercase', 'has_punctuation' and 'is_empty'.
    """
    # A missing value yields exactly the same report as the empty string
    content = '' if pd.isna(text) else text

    flag_patterns = {
        'has_special_chars': r'[^a-zA-Z0-9\s.,!?]',
        'has_numbers': r'\d',
        'has_uppercase': r'[A-Z]',
        'has_punctuation': r'[.,!?]',
    }

    report = {'length': len(content), 'word_count': len(content.split())}
    for flag_name, pattern in flag_patterns.items():
        report[flag_name] = re.search(pattern, content) is not None
    # Whitespace-only text counts as empty
    report['is_empty'] = not content.strip()
    return report

# Run the text-quality check on every review and tabulate the results
text_quality = df['review'].apply(analyze_text_quality)
text_quality_df = pd.DataFrame(text_quality.tolist())

print("Statistik Kualitas Teks:")
quality_flags = [
    ("Review kosong", 'is_empty'),
    ("Review dengan karakter khusus", 'has_special_chars'),
    ("Review dengan angka", 'has_numbers'),
    ("Review dengan huruf kapital", 'has_uppercase'),
    ("Review dengan tanda baca", 'has_punctuation'),
]
for flag_label, flag_column in quality_flags:
    print(f"{flag_label}: {text_quality_df[flag_column].sum()}")

# 2.2 Extreme review lengths
review_lengths = text_quality_df['length']
print(f"\nReview terpendek: {review_lengths.min()} karakter")
print(f"Review terpanjang: {review_lengths.max()} karakter")
print(f"Rata-rata panjang: {review_lengths.mean():.1f} karakter")

# Flag reviews that are very short (< 10 chars) or very long (> 500 chars)
very_short = review_lengths < 10
very_long = review_lengths > 500
print(f"Review sangat pendek (<10 karakter): {very_short.sum()}")
print(f"Review sangat panjang (>500 karakter): {very_long.sum()}")

# 3. DATA CONSISTENCY ANALYSIS
print("\n3. ANALISIS KONSISTENSI DATA")
print("=" * 50)

# 3.1 Sentiment breakdown within each rating value
print("Analisis Konsistensi Rating vs Sentimen:")
for rating in sorted(df['rating'].unique()):
    rating_data = df.loc[df['rating'] == rating]
    sentiment_dist = rating_data['sentiment'].value_counts()
    print(f"\nRating {rating}:")
    for sentiment, count in sentiment_dist.items():
        percentage = (count / len(rating_data)) * 100
        print(f"  {sentiment}: {count} ({percentage:.1f}%)")

# 3.2 Identify inconsistencies: a high rating (>= 4) labelled negative, or a
# low rating (<= 2) labelled positive
inconsistencies = []
for idx, row in df.iterrows():
    rating, sentiment = row['rating'], row['sentiment']
    high_but_negative = rating >= 4 and sentiment == 'negative'
    low_but_positive = rating <= 2 and sentiment == 'positive'
    if high_but_negative or low_but_positive:
        inconsistencies.append(f"Row {idx}: Rating {rating} tapi sentimen {sentiment}")

print(f"\nInkonsistensi yang ditemukan: {len(inconsistencies)}")
for inconsistency in inconsistencies[:5]:  # show only the first five
    print(f"  - {inconsistency}")

# 4. DETAILED OUTLIER ANALYSIS
print("\n4. ANALISIS OUTLIERS DETAIL")
print("=" * 50)

# 4.1 Outliers using the IQR method
def detect_outliers_iqr(data, column_name):
    """Find outliers in `data` with the 1.5 x IQR rule.

    Parameters:
        data: the numeric pandas Series to check.
        column_name: accepted for the caller's convenience; not used here.

    Returns:
        Tuple `(outliers, lower_bound, upper_bound)` where `outliers` holds the
        values of `data` strictly below `lower_bound` or strictly above
        `upper_bound`.
    """
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker
    too_low = data < lower_bound
    too_high = data > upper_bound
    return data[too_low | too_high], lower_bound, upper_bound

# Run the IQR check on every length metric and keep the result per metric
outlier_analysis = {}
metrics = ['review_length', 'word_count', 'sentence_count']

for metric in metrics:
    outliers, lower, upper = detect_outliers_iqr(df[metric], metric)
    outlier_analysis[metric] = dict(
        outliers=outliers,
        lower_bound=lower,
        upper_bound=upper,
        count=len(outliers),
    )

    print(
        f"\n{metric.upper()}:",
        f"  Outliers: {len(outliers)}",
        f"  Lower bound: {lower:.1f}",
        f"  Upper bound: {upper:.1f}",
        sep="\n",
    )
    if not outliers.empty:
        print(f"  Outlier values: {outliers.tolist()}")

# 4.2 Outliers using the Z-score method
# NOTE(review): with scipy's default population standard deviation, |z| cannot
# exceed sqrt(n - 1) for n samples, which is exactly 2 for the five reviews
# loaded in this notebook — so `> 2` cannot flag anything at that sample size.
print("\nOutliers menggunakan Z-score (|z| > 2):")
for metric in metrics:
    z_scores = np.abs(stats.zscore(df[metric]))
    outliers_z = df.loc[z_scores > 2]
    print(f"  {metric}: {len(outliers_z)} outliers")

# 5. DUPLICATE DATA ANALYSIS
print("\n5. ANALISIS DUPLICATE DATA")
print("=" * 50)

# Duplicates by review text (every member of a duplicate group is marked)
duplicate_reviews = df.duplicated(subset=['review'], keep=False)
n_duplicate_reviews = duplicate_reviews.sum()
print(f"Review duplikat: {n_duplicate_reviews}")

if n_duplicate_reviews > 0:
    print("Review yang duplikat:")
    duplicate_data = df.loc[duplicate_reviews, ['review', 'rating', 'sentiment']]
    print(duplicate_data)

# Duplicates across all columns
duplicate_all = df.duplicated(keep=False)
print(f"Baris duplikat (semua kolom): {duplicate_all.sum()}")

print("\n✅ Analisis kualitas data selesai!")


In [None]:
# ===============================================
# ANALISIS KUALITAS DATA DAN MISSING VALUES
# ===============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import warnings
# Suppress all warnings for the remainder of the session
warnings.filterwarnings('ignore')

print("=== ANALISIS KUALITAS DATA ===\n")

# 1. DETAILED MISSING-VALUE ANALYSIS
print("1. ANALISIS MISSING VALUES DETAIL")
print("=" * 50)

# `missing_per_row` is a helper column this cell adds to `df` below (it may
# already exist from an earlier run). Exclude it so the per-column summary
# covers the real data columns only.
data_columns = df.drop(columns=['missing_per_row'], errors='ignore')
null_counts = data_columns.isnull().sum()

# Missing values per column
missing_analysis = pd.DataFrame({
    'Kolom': data_columns.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df)) * 100,
    'Data_Type': data_columns.dtypes,
    'Unique_Values': data_columns.nunique()
})

print("Ringkasan Missing Values:")
print(missing_analysis)

# Missing values per row. The first line printed below is labelled as the
# number of ROWS with missing values, so count rows with at least one missing
# cell rather than summing the missing cells.
df['missing_per_row'] = data_columns.isnull().sum(axis=1)
rows_with_missing = (df['missing_per_row'] > 0).sum()
print(f"\nBaris dengan missing values: {rows_with_missing}")
print(f"Persentase baris dengan missing values: {rows_with_missing / len(df) * 100:.2f}%")

# 2. TEXT DATA QUALITY ANALYSIS
print("\n2. ANALISIS KUALITAS DATA TEXT")
print("=" * 50)

# 2.1 Special-character and symbol analysis
def analyze_text_quality(text):
    """Summarise basic quality indicators of one review text.

    Parameters:
        text: the review string; a missing value (as detected by `pd.isna`)
            is reported like an empty string.

    Returns:
        dict with the keys 'length', 'word_count', 'has_special_chars',
        'has_numbers', 'has_uppercase', 'has_punctuation' and 'is_empty'.
    """
    # NOTE(review): a function with this name is also defined in the previous
    # data-quality cell; this definition replaces it when the cell runs.
    # A missing value yields exactly the same report as the empty string
    content = '' if pd.isna(text) else text

    flag_patterns = {
        'has_special_chars': r'[^a-zA-Z0-9\s.,!?]',
        'has_numbers': r'\d',
        'has_uppercase': r'[A-Z]',
        'has_punctuation': r'[.,!?]',
    }

    report = {'length': len(content), 'word_count': len(content.split())}
    for flag_name, pattern in flag_patterns.items():
        report[flag_name] = re.search(pattern, content) is not None
    # Whitespace-only text counts as empty
    report['is_empty'] = not content.strip()
    return report

# Run the text-quality check on every review and tabulate the results
text_quality = df['review'].apply(analyze_text_quality)
text_quality_df = pd.DataFrame(text_quality.tolist())

print("Statistik Kualitas Teks:")
quality_flags = [
    ("Review kosong", 'is_empty'),
    ("Review dengan karakter khusus", 'has_special_chars'),
    ("Review dengan angka", 'has_numbers'),
    ("Review dengan huruf kapital", 'has_uppercase'),
    ("Review dengan tanda baca", 'has_punctuation'),
]
for flag_label, flag_column in quality_flags:
    print(f"{flag_label}: {text_quality_df[flag_column].sum()}")

# 2.2 Extreme review lengths
review_lengths = text_quality_df['length']
print(f"\nReview terpendek: {review_lengths.min()} karakter")
print(f"Review terpanjang: {review_lengths.max()} karakter")
print(f"Rata-rata panjang: {review_lengths.mean():.1f} karakter")

# Flag reviews that are very short (< 10 chars) or very long (> 500 chars)
very_short = review_lengths < 10
very_long = review_lengths > 500
print(f"Review sangat pendek (<10 karakter): {very_short.sum()}")
print(f"Review sangat panjang (>500 karakter): {very_long.sum()}")

# 3. DATA CONSISTENCY ANALYSIS
print("\n3. ANALISIS KONSISTENSI DATA")
print("=" * 50)

# 3.1 Sentiment breakdown within each rating value
print("Analisis Konsistensi Rating vs Sentimen:")
for rating in sorted(df['rating'].unique()):
    rating_data = df.loc[df['rating'] == rating]
    sentiment_dist = rating_data['sentiment'].value_counts()
    print(f"\nRating {rating}:")
    for sentiment, count in sentiment_dist.items():
        percentage = (count / len(rating_data)) * 100
        print(f"  {sentiment}: {count} ({percentage:.1f}%)")

# 3.2 Identify inconsistencies: a high rating (>= 4) labelled negative, or a
# low rating (<= 2) labelled positive
inconsistencies = []
for idx, row in df.iterrows():
    rating, sentiment = row['rating'], row['sentiment']
    high_but_negative = rating >= 4 and sentiment == 'negative'
    low_but_positive = rating <= 2 and sentiment == 'positive'
    if high_but_negative or low_but_positive:
        inconsistencies.append(f"Row {idx}: Rating {rating} tapi sentimen {sentiment}")

print(f"\nInkonsistensi yang ditemukan: {len(inconsistencies)}")
for inconsistency in inconsistencies[:5]:  # show only the first five
    print(f"  - {inconsistency}")

# 4. DETAILED OUTLIER ANALYSIS
print("\n4. ANALISIS OUTLIERS DETAIL")
print("=" * 50)

# 4.1 Outliers using the IQR method
def detect_outliers_iqr(data, column_name):
    """Find outliers in `data` with the 1.5 x IQR rule.

    Parameters:
        data: the numeric pandas Series to check.
        column_name: accepted for the caller's convenience; not used here.

    Returns:
        Tuple `(outliers, lower_bound, upper_bound)` where `outliers` holds the
        values of `data` strictly below `lower_bound` or strictly above
        `upper_bound`.
    """
    # NOTE(review): a function with this name is also defined in the previous
    # data-quality cell; this definition replaces it when the cell runs.
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker
    too_low = data < lower_bound
    too_high = data > upper_bound
    return data[too_low | too_high], lower_bound, upper_bound

# Run the IQR check on every length metric and keep the result per metric
outlier_analysis = {}
metrics = ['review_length', 'word_count', 'sentence_count']

for metric in metrics:
    outliers, lower, upper = detect_outliers_iqr(df[metric], metric)
    outlier_analysis[metric] = dict(
        outliers=outliers,
        lower_bound=lower,
        upper_bound=upper,
        count=len(outliers),
    )

    print(
        f"\n{metric.upper()}:",
        f"  Outliers: {len(outliers)}",
        f"  Lower bound: {lower:.1f}",
        f"  Upper bound: {upper:.1f}",
        sep="\n",
    )
    if not outliers.empty:
        print(f"  Outlier values: {outliers.tolist()}")

# 4.2 Outliers using the Z-score method
from scipy import stats

# NOTE(review): with scipy's default population standard deviation, |z| cannot
# exceed sqrt(n - 1) for n samples, which is exactly 2 for the five reviews
# loaded in this notebook — so `> 2` cannot flag anything at that sample size.
print("\nOutliers menggunakan Z-score (|z| > 2):")
for metric in metrics:
    z_scores = np.abs(stats.zscore(df[metric]))
    outliers_z = df.loc[z_scores > 2]
    print(f"  {metric}: {len(outliers_z)} outliers")

# 5. DUPLICATE DATA ANALYSIS
print("\n5. ANALISIS DUPLICATE DATA")
print("=" * 50)

# Duplicates by review text (every member of a duplicate group is marked)
duplicate_reviews = df.duplicated(subset=['review'], keep=False)
n_duplicate_reviews = duplicate_reviews.sum()
print(f"Review duplikat: {n_duplicate_reviews}")

if n_duplicate_reviews > 0:
    print("Review yang duplikat:")
    duplicate_data = df.loc[duplicate_reviews, ['review', 'rating', 'sentiment']]
    print(duplicate_data)

# Duplicates across all columns
duplicate_all = df.duplicated(keep=False)
print(f"Baris duplikat (semua kolom): {duplicate_all.sum()}")

# 6. DATA DISTRIBUTION ANALYSIS
print("\n6. ANALISIS DATA DISTRIBUSI")
print("=" * 50)

# 6.1 Skewness and kurtosis of each length metric
from scipy.stats import skew, kurtosis

print("Skewness dan Kurtosis:")
for metric in metrics:
    skewness = skew(df[metric])
    kurt = kurtosis(df[metric])
    print(f"{metric}:")
    print(f"  Skewness: {skewness:.3f} ({'Right skewed' if skewness > 0 else 'Left skewed' if skewness < 0 else 'Symmetric'})")
    print(f"  Kurtosis: {kurt:.3f} ({'Heavy tailed' if kurt > 0 else 'Light tailed' if kurt < 0 else 'Normal'})")

# 6.2 Normality of the data
from scipy.stats import shapiro

# Sample-size window in which the Shapiro-Wilk test is run: scipy raises a
# ValueError for fewer than 3 observations, and the upper limit is the
# notebook's existing cut-off for large samples.
SHAPIRO_MIN_SAMPLES = 3
SHAPIRO_MAX_SAMPLES = 5000

print(f"\nUji Normalitas (Shapiro-Wilk):")
for metric in metrics:
    sample_size = len(df[metric])
    if sample_size < SHAPIRO_MIN_SAMPLES:
        # Report instead of crashing the whole cell on a tiny dataset
        print(f"{metric}: Sample too small for Shapiro-Wilk test")
    elif sample_size <= SHAPIRO_MAX_SAMPLES:
        stat, p_value = shapiro(df[metric])
        print(f"{metric}: p-value = {p_value:.6f} ({'Normal' if p_value > 0.05 else 'Not normal'})")
    else:
        print(f"{metric}: Sample too large for Shapiro-Wilk test")

# 7. DATA CLEANING RECOMMENDATIONS
print("\n7. REKOMENDASI PERBAIKAN DATA")
print("=" * 50)

recommendations = []

# Based on missing values
if missing_analysis['Missing_Count'].sum() > 0:
    recommendations.append("Terdapat missing values yang perlu ditangani")

# Based on IQR outliers, summed over all metrics
total_outliers = sum(analysis['count'] for analysis in outlier_analysis.values())
if total_outliers > 0:
    recommendations.append(f"Terdapat {total_outliers} outliers yang perlu dievaluasi")

# Based on rating/sentiment inconsistencies
if len(inconsistencies) > 0:
    recommendations.append(f"Terdapat {len(inconsistencies)} inkonsistensi rating-sentimen")

# Based on duplicated review texts
if duplicate_reviews.sum() > 0:
    recommendations.append(f"Terdapat {duplicate_reviews.sum()} review duplikat")

# Based on text quality
if text_quality_df['is_empty'].sum() > 0:
    recommendations.append("Terdapat review kosong yang perlu dihapus")

if very_short.sum() > 0:
    recommendations.append(f"Terdapat {very_short.sum()} review sangat pendek yang perlu dievaluasi")

print("Rekomendasi Perbaikan Data:")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

if not recommendations:
    print("Data dalam kondisi baik, siap untuk preprocessing!")

print("\n=== ANALISIS KUALITAS DATA SELESAI ===")
print("Data telah dianalisis secara menyeluruh untuk kualitas dan konsistensi!")


In [None]:
# ===============================================
# SUMMARY REPORT EXPLORATORY DATA ANALYSIS
# ===============================================

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Banner printed once, ahead of the summary report defined below.
print("=== SUMMARY REPORT EDA - REDBUS REVIEWS ===\n")

# Generate comprehensive EDA report
def generate_eda_summary(data=None):
    """Print a nine-section EDA report and return its headline figures.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Review table to summarise. It must provide the columns ``rating``,
        ``sentiment``, ``review_length``, ``word_count`` and
        ``sentence_count``. When omitted, the notebook-level ``df`` is used,
        so the original zero-argument call keeps working.

    Returns
    -------
    dict
        Keys ``total_records``, ``missing_values``, ``duplicates``,
        ``inconsistencies``, ``avg_rating``, ``avg_length`` and ``outliers``
        (review-length plus word-count outliers, IQR method). For an empty
        table the counts are 0 and the averages are NaN.

    Raises
    ------
    KeyError
        If one of the required columns is absent from a non-empty table.
    """
    # Explicit argument wins; the global is only looked up as a fallback.
    frame = df if data is None else data
    total_rows = len(frame)

    # Every percentage below divides by the row count, so an empty table
    # cannot be reported on. Say so instead of failing on a division by zero.
    if total_rows == 0:
        print("Dataset is empty - nothing to summarise.")
        return {
            'total_records': 0,
            'missing_values': 0,
            'duplicates': 0,
            'inconsistencies': 0,
            'avg_rating': float('nan'),
            'avg_length': float('nan'),
            'outliers': 0
        }

    # Fail before printing anything rather than halfway through the report.
    required_columns = ('rating', 'sentiment', 'review_length',
                        'word_count', 'sentence_count')
    absent = [name for name in required_columns if name not in frame.columns]
    if absent:
        raise KeyError(
            f"generate_eda_summary: missing required column(s) {absent}"
        )

    rule = "=" * 60
    min_records = 100            # below this the sample is flagged as small
    weak_corr_threshold = 0.3    # |r| below this is reported as weak

    def percent_of_rows(count):
        """Return ``count`` as a percentage of all rows."""
        return (count / total_rows) * 100

    def count_outliers_iqr(values):
        """Count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        return int(((values < lower_bound) | (values > upper_bound)).sum())

    ratings = frame['rating']
    sentiments = frame['sentiment']
    lengths = frame['review_length']
    words = frame['word_count']

    avg_rating = ratings.mean()
    avg_length = lengths.mean()
    avg_words = words.mean()

    # 1. EXECUTIVE SUMMARY
    print("1. EXECUTIVE SUMMARY")
    print(rule)
    print(f"Tanggal Analisis: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("Dataset: redBus Reviews")
    print(f"Total Records: {total_rows}")
    print(f"Total Features: {len(frame.columns)}")
    print(f"Memory Usage: {frame.memory_usage(deep=True).sum() / 1024:.2f} KB")

    # 2. DATA OVERVIEW
    print("\n2. DATA OVERVIEW")
    print(rule)
    print("Dataset Structure:")
    print(f"- Shape: {frame.shape}")
    print(f"- Columns: {list(frame.columns)}")
    print("- Data Types:")
    for col, dtype in frame.dtypes.items():
        print(f"  * {col}: {dtype}")

    # 3. STATISTICAL SUMMARY
    print("\n3. STATISTICAL SUMMARY")
    print(rule)

    rating_stats = ratings.describe()
    print("Rating Statistics:")
    for label, key in (("Mean", 'mean'), ("Median", '50%'), ("Std Dev", 'std'),
                       ("Min", 'min'), ("Max", 'max')):
        print(f"- {label}: {rating_stats[key]:.2f}")

    print("\nReview Length Statistics:")
    print(f"- Mean characters: {avg_length:.1f}")
    print(f"- Mean words: {avg_words:.1f}")
    print(f"- Mean sentences: {frame['sentence_count'].mean():.1f}")

    # 4. DATA DISTRIBUTION
    print("\n4. DATA DISTRIBUTION")
    print(rule)

    print("Rating Distribution:")
    for rating, count in ratings.value_counts().sort_index().items():
        print(f"- Rating {rating}: {count} reviews ({percent_of_rows(count):.1f}%)")

    print("\nSentiment Distribution:")
    for sentiment, count in sentiments.value_counts().items():
        print(f"- {sentiment.capitalize()}: {count} reviews ({percent_of_rows(count):.1f}%)")

    # 5. DATA QUALITY ASSESSMENT
    print("\n5. DATA QUALITY ASSESSMENT")
    print(rule)

    # Missing cells, relative to the total number of cells in the table.
    missing_count = frame.isnull().sum().sum()
    missing_percentage = (missing_count / (total_rows * len(frame.columns))) * 100
    print("Missing Values:")
    print(f"- Total missing: {missing_count}")
    print(f"- Percentage: {missing_percentage:.2f}%")

    # Fully identical rows.
    duplicate_count = frame.duplicated().sum()
    print("\nDuplicate Records:")
    print(f"- Total duplicates: {duplicate_count}")
    print(f"- Percentage: {percent_of_rows(duplicate_count):.2f}%")

    # A row is inconsistent when a high rating (>= 4) carries a negative label
    # or a low rating (<= 2) carries a positive one. Computed as one boolean
    # mask instead of a row-by-row loop.
    contradiction_mask = (
        ((ratings >= 4) & (sentiments == 'negative'))
        | ((ratings <= 2) & (sentiments == 'positive'))
    )
    inconsistencies = int(contradiction_mask.sum())

    print("\nData Consistency:")
    print(f"- Inconsistencies: {inconsistencies}")
    print(f"- Consistency rate: {percent_of_rows(total_rows - inconsistencies):.1f}%")

    # 6. KEY INSIGHTS
    print("\n6. KEY INSIGHTS")
    print(rule)

    high_rating_count = int((ratings >= 4).sum())
    low_rating_count = int((ratings <= 2).sum())

    print("Rating Insights:")
    print(f"- High ratings (4-5): {high_rating_count} reviews ({percent_of_rows(high_rating_count):.1f}%)")
    print(f"- Low ratings (1-2): {low_rating_count} reviews ({percent_of_rows(low_rating_count):.1f}%)")
    print(f"- Average rating: {avg_rating:.2f}")

    print("\nText Insights:")
    print(f"- Average review length: {avg_length:.1f} characters")
    print(f"- Average word count: {avg_words:.1f} words")
    print(f"- Longest review: {lengths.max()} characters")
    print(f"- Shortest review: {lengths.min()} characters")

    rating_length_corr = ratings.corr(lengths)
    rating_words_corr = ratings.corr(words)

    print("\nCorrelation Insights:")
    print(f"- Rating vs Review Length: {rating_length_corr:.3f}")
    print(f"- Rating vs Word Count: {rating_words_corr:.3f}")

    # 7. OUTLIERS ANALYSIS
    print("\n7. OUTLIERS ANALYSIS")
    print(rule)

    review_length_outliers = count_outliers_iqr(lengths)
    word_count_outliers = count_outliers_iqr(words)

    print("Outliers (IQR method):")
    print(f"- Review length outliers: {review_length_outliers}")
    print(f"- Word count outliers: {word_count_outliers}")

    # 8. RECOMMENDATIONS
    print("\n8. RECOMMENDATIONS")
    print(rule)

    recommendations = []

    # Data quality recommendations
    if missing_count > 0:
        recommendations.append("Handle missing values before analysis")
    if duplicate_count > 0:
        recommendations.append("Remove or handle duplicate records")
    if inconsistencies > 0:
        recommendations.append("Review and correct rating-sentiment inconsistencies")
    if review_length_outliers > 0 or word_count_outliers > 0:
        recommendations.append("Evaluate outliers for data quality issues")

    # Analysis recommendations
    if total_rows < min_records:
        recommendations.append("Consider collecting more data for robust analysis")
    # A NaN correlation compares as False here, so it is not reported as weak.
    if abs(rating_length_corr) < weak_corr_threshold:
        recommendations.append("Weak correlation between rating and text length - investigate further")

    # Preprocessing recommendations (always included)
    recommendations.append("Apply text preprocessing (tokenization, stemming, stop words removal)")
    recommendations.append("Consider feature engineering for text analysis")
    recommendations.append("Implement proper train-test split for modeling")

    print("Data Preprocessing Recommendations:")
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec}")

    # 9. NEXT STEPS
    print("\n9. NEXT STEPS")
    print(rule)
    print("Recommended next steps for data processing:")
    print("1. Text preprocessing and cleaning")
    print("2. Feature engineering (TF-IDF, word embeddings)")
    print("3. Sentiment analysis model training")
    print("4. Model evaluation and validation")
    print("5. Results interpretation and business insights")

    return {
        'total_records': total_rows,
        'missing_values': missing_count,
        'duplicates': duplicate_count,
        'inconsistencies': inconsistencies,
        'avg_rating': avg_rating,
        'avg_length': avg_length,
        'outliers': review_length_outliers + word_count_outliers
    }

# Run the report once and keep the headline figures it returns.
summary_stats = generate_eda_summary()

# 10. FINAL SUMMARY
# Recap of the returned figures, one check-marked line per figure.
print("\n10. FINAL SUMMARY")
print("=" * 60)
print("EDA telah selesai dengan hasil sebagai berikut:")
for recap_line in (
    f"✓ Dataset berisi {summary_stats['total_records']} review",
    f"✓ Rata-rata rating: {summary_stats['avg_rating']:.2f}",
    f"✓ Rata-rata panjang review: {summary_stats['avg_length']:.1f} karakter",
    f"✓ Missing values: {summary_stats['missing_values']}",
    f"✓ Duplicates: {summary_stats['duplicates']}",
    f"✓ Inconsistencies: {summary_stats['inconsistencies']}",
    f"✓ Outliers: {summary_stats['outliers']}",
):
    print(recap_line)

print("\n🎯 Data siap untuk tahap preprocessing dan analisis lebih lanjut!")
print(f"📊 EDA Report generated on {datetime.now():%Y-%m-%d %H:%M:%S}")

# Persist the same figures as a small markdown report in the working directory.
with open('eda_summary_report.md', 'w', encoding='utf-8') as report_file:
    report_file.writelines([
        "# EDA Summary Report - redBus Reviews\n\n",
        f"**Generated:** {datetime.now():%Y-%m-%d %H:%M:%S}\n\n",
        "## Dataset Overview\n",
        f"- Total Records: {summary_stats['total_records']}\n",
        f"- Average Rating: {summary_stats['avg_rating']:.2f}\n",
        f"- Average Review Length: {summary_stats['avg_length']:.1f} characters\n\n",
        "## Data Quality\n",
        f"- Missing Values: {summary_stats['missing_values']}\n",
        f"- Duplicates: {summary_stats['duplicates']}\n",
        f"- Inconsistencies: {summary_stats['inconsistencies']}\n",
        f"- Outliers: {summary_stats['outliers']}\n\n",
        "## Status\n",
        "✅ EDA Complete - Ready for Data Preprocessing\n",
    ])

print("\n📄 Summary report saved to: eda_summary_report.md")
