# Google Play Store Review Scraper - Kaggle Version

## Overview
Scraping 10,000+ Indonesian app reviews from Google Play Store using parallel processing.

**Features:**
- ⚡ Parallel scraping (optimized for Kaggle)
- 📊 Real-time progress tracking
- 🔄 Auto-retry on failures
- 💾 Save to CSV & JSON
- 📈 Detailed statistics

**Target:** 10,000+ reviews in ~5-10 minutes

## 1. Install Dependencies

In [None]:
%%time
# Install the google-play-scraper package that the import cell relies on.
# -q keeps pip's output short. The %%time cell magic has to stay on the
# first line of the cell.
!pip install -q google-play-scraper

print("‚úì Dependencies installed!")

## 2. Import Libraries

In [None]:
import json
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import numpy as np
import pandas as pd
from google_play_scraper import Sort, reviews, reviews_all
from tqdm.notebook import tqdm
# Install a global "ignore" filter so no warnings are shown from here on
# (presumably to keep the notebook output clean).
warnings.filterwarnings('ignore')

print("‚úì Libraries imported successfully!")

## 3. Scraper Configuration

In [None]:
# Scraper settings
TARGET_REVIEWS = 12000  # overall goal, including a buffer
MAX_WORKERS = 6  # thread-pool size (Kaggle optimized)
MAX_RETRIES = 3  # attempts allowed per app

# Popular Indonesian apps as (package id, display name) pairs.
# NOTE(review): the package ids are kept exactly as given; confirm each
# one still resolves on the Play Store — TODO verify.
APP_LIST = list({
    'com.gojek.app': 'Gojek',
    'com.tokopedia.tkpd': 'Tokopedia',
    'com.shopee.id': 'Shopee',
    'com.instagram.android': 'Instagram',
    'com.whatsapp': 'WhatsApp',
    'com.spotify.music': 'Spotify',
    'com.netflix.mediaclient': 'Netflix',
    'id.dana': 'Dana',
    'com.traveloka.android': 'Traveloka',
    'com.bukalapak.android': 'Bukalapak',
    'com.lazada.android': 'Lazada',
    'id.co.bri.brimo': 'BRI Mobile',
    'com.dbs.id.digibank': 'digibank',
    'com.LinkAja': 'LinkAja',
    'com.ovo.id': 'OVO',
}.items())

# Even share of the target per app, plus 200 extra per app.
REVIEWS_PER_APP = 200 + TARGET_REVIEWS // len(APP_LIST)

print(
    "Configuration:",
    f"  Target reviews: {TARGET_REVIEWS:,}",
    f"  Apps to scrape: {len(APP_LIST)}",
    f"  Reviews per app: ~{REVIEWS_PER_APP:,}",
    f"  Max workers: {MAX_WORKERS}",
    "  Estimated time: ~5-10 minutes",
    sep="\n",
)

## 4. Scraping Functions

In [None]:
def scrape_single_app(app_info, count=1000, max_retries=3):
    """
    Scrape the newest reviews of a single app with retry mechanism

    The request is bounded by ``count``: ``reviews`` stops paginating once
    enough reviews were fetched, whereas ``reviews_all`` ignores ``count``
    and downloads an app's entire review history before any slicing can
    happen.

    Args:
        app_info: Tuple of (app_id, app_name)
        count: Maximum number of reviews to scrape
        max_retries: Maximum number of attempts. Values below 1 are
            treated as 1 so that the app is always tried once.

    Returns:
        Dict with the keys 'app_id', 'app_name', 'reviews', 'count' and
        'success'. A successful result also carries 'elapsed_time'
        (seconds) and 'speed' (reviews per second); a failed result
        carries 'error' with the message of the last exception.
        The function never returns None.
    """
    app_id, app_name = app_info
    attempts = max(1, max_retries)
    last_error = None

    for attempt in range(1, attempts + 1):
        try:
            start_time = time.time()

            # reviews() returns (review_list, continuation_token); the
            # token is only needed for manual pagination.
            result, _ = reviews(
                app_id,
                lang='id',
                country='id',
                sort=Sort.NEWEST,
                count=count,
            )

            # Defensive cap in case more than requested comes back
            result = result[:count]

            elapsed = time.time() - start_time

            return {
                'app_id': app_id,
                'app_name': app_name,
                'reviews': result,
                'count': len(result),
                'success': True,
                'elapsed_time': elapsed,
                'speed': len(result) / elapsed if elapsed > 0 else 0
            }

        except Exception as e:
            # Broad on purpose: any failure (network, parsing, unknown
            # app) is reported in the result instead of killing the worker.
            last_error = e
            if attempt < attempts:
                time.sleep(2 ** attempt)  # Exponential backoff: 2s, 4s, 8s, ...

    return {
        'app_id': app_id,
        'app_name': app_name,
        'reviews': [],
        'count': 0,
        'success': False,
        'error': str(last_error)
    }


def scrape_parallel(app_list, reviews_per_app, max_workers=6, max_retries=3):
    """
    Scrape multiple apps in parallel

    One task per app is submitted to a thread pool and results are
    collected as they finish. A worker that raises or returns nothing is
    recorded as a failed app instead of aborting the whole run.

    Args:
        app_list: List of (app_id, app_name) tuples
        reviews_per_app: Number of reviews per app
        max_workers: Number of parallel workers
        max_retries: Attempts per app, forwarded to scrape_single_app

    Returns:
        Tuple of (all_reviews, results_summary): the review dicts of all
        successful apps in one flat list, and one result dict per app in
        completion order.
    """
    all_reviews = []
    results = []
    succeeded = 0
    failed = 0

    print("\n" + "="*80)
    print("STARTING PARALLEL SCRAPING")
    print("="*80)

    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_app = {
            executor.submit(scrape_single_app, app_info, reviews_per_app, max_retries): app_info
            for app_info in app_list
        }

        # Progress bar
        with tqdm(total=len(app_list), desc="üì± Scraping apps", unit="app") as pbar:
            for future in as_completed(future_to_app):
                app_info = future_to_app[future]

                try:
                    result = future.result()
                    error = 'scraper returned no result'
                except Exception as exc:
                    # Keep going: one broken app must not discard the
                    # reviews already collected from the others.
                    result, error = None, str(exc)

                if result is None:
                    result = {
                        'app_id': app_info[0],
                        'app_name': app_info[1],
                        'reviews': [],
                        'count': 0,
                        'success': False,
                        'error': error
                    }

                results.append(result)

                if result['success']:
                    succeeded += 1
                    all_reviews.extend(result['reviews'])
                else:
                    failed += 1

                # Refresh on every completion so failures show up immediately
                pbar.set_postfix({
                    'collected': f"{len(all_reviews):,}",
                    'success': succeeded,
                    'failed': failed
                })
                pbar.update(1)

    total_time = time.time() - start_time
    # Guard against a zero interval on coarse clocks
    avg_speed = len(all_reviews) / total_time if total_time > 0 else 0.0

    print(f"\n‚úì Scraping completed in {total_time:.2f}s ({total_time/60:.2f} minutes)")
    print(f"‚úì Total reviews collected: {len(all_reviews):,}")
    print(f"‚úì Average speed: {avg_speed:.1f} reviews/second")

    return all_reviews, results

# Confirmation message printed once the function definitions above have run.
print("‚úì Scraping functions defined!")

## 5. Start Scraping 🚀

In [None]:
%%time

# Run parallel scraping over every app in APP_LIST using the configured
# per-app quota and worker count. all_reviews and results are reused by
# the cells that follow.
all_reviews, results = scrape_parallel(
    app_list=APP_LIST,
    reviews_per_app=REVIEWS_PER_APP,
    max_workers=MAX_WORKERS
)

## 6. Scraping Results Summary

In [None]:
# Headline numbers for the scraping run
print("\n" + "="*80, "SCRAPING SUMMARY", "="*80, sep="\n")

# Split the per-app results by outcome
successful_apps = list(filter(lambda entry: entry['success'], results))
failed_apps = list(filter(lambda entry: not entry['success'], results))

print(
    f"Total apps scraped: {len(results)}",
    f"Successful: {len(successful_apps)}",
    f"Failed: {len(failed_apps)}",
    f"Total reviews: {len(all_reviews):,}",
    f"Target achieved: {'‚úì YES' if len(all_reviews) >= 10000 else '‚úó NO'}",
    sep="\n",
)

# List each failure with its recorded error message
if failed_apps:
    print("\n‚ö†Ô∏è  Failed apps:")
for app in failed_apps:
    print(f"  - {app['app_name']}: {app.get('error', 'Unknown error')}")

In [None]:
# Table with one row per app, largest review count first
print("\n" + "="*80, "PER-APP BREAKDOWN", "="*80, sep="\n")

sorted_results = sorted(results, key=lambda entry: entry['count'], reverse=True)

print(f"{'App Name':<25} {'Reviews':<10} {'Time (s)':<10} {'Speed (rev/s)':<15} {'Status'}")
print("-"*80)

for result in sorted_results:
    if result['success']:
        status = "‚úì"
        time_str = f"{result.get('elapsed_time', 0):.2f}"
        speed_str = f"{result.get('speed', 0):.1f}"
    else:
        # Failed apps have no timing information to show
        status = "‚úó"
        time_str = "N/A"
        speed_str = "N/A"

    print(f"{result['app_name']:<25} {result['count']:<10,} {time_str:<10} {speed_str:<15} {status}")

print("="*80)

## 7. Convert to DataFrame

In [None]:
# Convert the scraped review dicts to a DataFrame
print("Converting to DataFrame...")

# Scraper field -> output column. An explicit mapping keeps each rename
# tied to its source field instead of relying on list positions.
column_map = {
    'reviewId': 'review_id',
    'userName': 'username',
    'content': 'review_text',
    'score': 'rating',
    'at': 'date',
    'replyContent': 'reply',
    'appVersion': 'app_version',
    'thumbsUpCount': 'helpful_count',
}
columns_to_keep = list(column_map)

df = pd.DataFrame(all_reviews)

if len(df) > 0:
    # Select the relevant fields, then rename them
    df = df[columns_to_keep].rename(columns=column_map)

    print(f"‚úì DataFrame created with {len(df):,} rows")
    print(f"\nDataFrame shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
else:
    # Keep the expected columns even when nothing was scraped, so code
    # that looks columns up by name does not fail with KeyError.
    df = pd.DataFrame(columns=list(column_map.values()))
    print("‚ö†Ô∏è  No reviews collected!")

## 8. Dataset Overview

In [None]:
# Basic info
print("Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line.
df.info()

In [None]:
# First few rows. df.head() is kept as the last expression so the notebook
# renders it as the cell's output.
print("\nFirst 5 rows:")
df.head()

In [None]:
# Headline statistics for the review dataset
print(
    "\nDataset Statistics:",
    f"Total reviews: {len(df):,}",
    f"Date range: {df['date'].min()} to {df['date'].max()}",
    f"Unique users: {df['username'].nunique():,}",
    f"Average review length: {df['review_text'].str.len().mean():.1f} characters",
    "Missing values:",
    sep="\n",
)
# Per-column count of missing entries
print(df.isna().sum())

In [None]:
# Rating counts, ordered by star value
print("\nRating Distribution:")
print(df['rating'].value_counts().sort_index())

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Two panels side by side: rating counts and review-length histogram
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rating_ax, length_ax = axes

# Left panel: bar chart of ratings
df['rating'].value_counts().sort_index().plot.bar(ax=rating_ax, color='skyblue')
rating_ax.set_title('Rating Distribution', fontsize=14, fontweight='bold')
rating_ax.set(xlabel='Rating', ylabel='Count')
rating_ax.grid(axis='y', alpha=0.3)

# Right panel: histogram of review lengths with the mean marked
review_lengths = df['review_text'].str.len()
mean_length = review_lengths.mean()
length_ax.hist(review_lengths, bins=50, color='lightcoral', edgecolor='black')
length_ax.set_title('Review Length Distribution', fontsize=14, fontweight='bold')
length_ax.set(xlabel='Characters', ylabel='Frequency')
length_ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
length_ax.legend()
length_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Save to Files

In [None]:
import os

# Output locations
csv_filename = 'playstore_reviews.csv'
json_filename = 'playstore_reviews.json'

# CSV holds the DataFrame
df.to_csv(csv_filename, index=False, encoding='utf-8')
print(f"‚úì Saved to {csv_filename}")

# JSON holds the raw review dicts; default=str stringifies any value the
# json module cannot encode itself
with open(json_filename, 'w', encoding='utf-8') as f:
    f.write(json.dumps(all_reviews, ensure_ascii=False, indent=2, default=str))
print(f"‚úì Saved to {json_filename}")

# Report both file sizes in MB
csv_size = os.path.getsize(csv_filename) / (1024 * 1024)
json_size = os.path.getsize(json_filename) / (1024 * 1024)

print(
    "\nFile sizes:",
    f"  CSV:  {csv_size:.2f} MB",
    f"  JSON: {json_size:.2f} MB",
    sep="\n",
)

## 10. Download Files (For Kaggle)

Click on the **Output** tab on the right sidebar, then click the download button for each file.

In [None]:
# Check that both export files are present and report their sizes
import os

files_to_check = ['playstore_reviews.csv', 'playstore_reviews.json']

print("Files ready for download:")
for filename in files_to_check:
    if not os.path.exists(filename):
        print(f"  ‚úó {filename} (not found)")
        continue
    size = os.path.getsize(filename) / 1024 / 1024
    print(f"  ‚úì {filename} ({size:.2f} MB)")

print("\nüì• Go to Output tab ‚Üí Click download button")

## 11. Quick Data Sample

In [None]:
# Sample reviews
print("Sample reviews:")
print("\n" + "="*80)

# Never ask for more rows than exist: df.sample(5) raises ValueError on a
# DataFrame with fewer than five rows.
for idx, row in df.sample(min(5, len(df))).iterrows():
    # The review text may be missing (None/NaN); show it as empty rather
    # than failing on the slice below.
    text = row['review_text'] if isinstance(row['review_text'], str) else ''
    print(f"Rating: {row['rating']} ‚≠ê")
    print(f"Review: {text[:150]}...")
    print(f"Date: {row['date']}")
    print("-"*80)

## 12. Final Summary

In [None]:
# Closing banner
print("\n" + "="*80)
print("SCRAPING COMPLETE! üéâ")
print("="*80)

# Totals, saved files and follow-up steps
print(
    f"Total reviews collected: {len(df):,}",
    f"Target (10,000): {'‚úì ACHIEVED' if len(df) >= 10000 else '‚úó NOT ACHIEVED'}",
    "",
    "Files saved:",
    "  1. playstore_reviews.csv",
    "  2. playstore_reviews.json",
    "",
    "Next steps:",
    "  1. Download files from Output tab",
    "  2. Upload to your project",
    "  3. Run training notebook",
    "",
    "Good luck with your submission! üöÄ",
    sep="\n",
)
print("="*80)