In [8]:
# YouTube Data Collection for Southeast Asian Food Travel Content
# This notebook collects video data using YouTube Data API v3 and filters for Southeast Asian food travel content

import pandas as pd
import numpy as np
import requests
import json
import time
from datetime import datetime, timedelta
import re
import os
from typing import List, Dict, Any

# Install required packages (run once)
# !pip install google-api-python-client pandas numpy requests

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

class YouTubeDataCollector:
    def __init__(self, api_key: str):
        """
        Initialize YouTube Data Collector
        
        Args:
            api_key (str): YouTube Data API v3 key
        """
        self.api_key = api_key
        self.youtube = build('youtube', 'v3', developerKey=api_key)
        self.collected_videos = []
        self.search_terms = [
            # Thai cuisine
            "Thai street food", "Thai cooking", "Bangkok food", "Thai restaurant",
            "Pad Thai", "Tom Yum", "Green curry Thai", "Thai market food",
            
            # Vietnamese cuisine
            "Vietnamese pho", "Vietnamese street food", "Saigon food", "Hanoi food",
            "Vietnamese cooking", "Banh mi", "Vietnamese restaurant",
            
            # Malaysian cuisine
            "Malaysian food", "Kuala Lumpur food", "Malaysian hawker", "Nasi lemak",
            "Malaysian street food", "Penang food", "Malaysian cooking",
            
            # Singaporean cuisine
            "Singapore hawker", "Singapore food", "Singapore street food",
            "Singapore cooking", "Hainanese chicken rice",
            
            # Indonesian cuisine
            "Indonesian food", "Jakarta food", "Indonesian street food",
            "Nasi goreng", "Indonesian cooking", "Bali food",
            
            # Filipino cuisine
            "Filipino food", "Manila food", "Filipino cooking", "Adobo",
            "Filipino street food", "Philippines food",
            
            # General Southeast Asian terms
            "Southeast Asian cuisine", "ASEAN food", "Asian street food travel",
            "Southeast Asian cooking", "Asian food travel"
        ]
    
    def search_videos(self, query: str, max_results: int = 50, order: str = 'relevance') -> List[Dict]:
        """
        Search for videos using YouTube Data API
        
        Args:
            query (str): Search query
            max_results (int): Maximum number of results per query
            order (str): Order of results ('relevance', 'date', 'viewCount', 'rating')
        
        Returns:
            List[Dict]: List of video data dictionaries
        """
        try:
            # Calculate date range for recent content (last 2 years)
            published_after = (datetime.now() - timedelta(days=730)).isoformat() + 'Z'
            
            search_response = self.youtube.search().list(
                q=query,
                part='id,snippet',
                maxResults=max_results,
                type='video',
                order=order,
                publishedAfter=published_after,
                videoDuration='medium',  # 4-20 minutes
                regionCode='SG'  # Singapore region for Southeast Asian content
            ).execute()
            
            video_ids = [item['id']['videoId'] for item in search_response['items']]
            
            # Get detailed video statistics
            videos_response = self.youtube.videos().list(
                part='statistics,snippet,contentDetails',
                id=','.join(video_ids)
            ).execute()
            
            videos_data = []
            for video in videos_response['items']:
                video_data = self.extract_video_data(video)
                if self.is_relevant_content(video_data):
                    videos_data.append(video_data)
            
            print(f"Found {len(videos_data)} relevant videos for query: '{query}'")
            return videos_data
            
        except HttpError as e:
            print(f"An HTTP error occurred: {e}")
            return []
        except Exception as e:
            print(f"An error occurred: {e}")
            return []
    
    def extract_video_data(self, video: Dict) -> Dict:
        """
        Extract relevant data from YouTube video object
        
        Args:
            video (Dict): YouTube video object from API
        
        Returns:
            Dict: Processed video data
        """
        snippet = video['snippet']
        statistics = video['statistics']
        content_details = video['contentDetails']
        
        # Parse duration from ISO 8601 format (PT4M13S -> 253 seconds)
        duration_str = content_details['duration']
        duration_seconds = self.parse_duration(duration_str)
        
        return {
            'video_id': video['id'],
            'title': snippet['title'],
            'description': snippet['description'],
            'channel_title': snippet['channelTitle'],
            'channel_id': snippet['channelId'],
            'published_at': snippet['publishedAt'],
            'tags': snippet.get('tags', []),
            'category_id': snippet['categoryId'],
            'view_count': int(statistics.get('viewCount', 0)),
            'like_count': int(statistics.get('likeCount', 0)),
            'comment_count': int(statistics.get('commentCount', 0)),
            'duration_seconds': duration_seconds,
            'thumbnail_url': snippet['thumbnails']['high']['url'] if 'high' in snippet['thumbnails'] else '',
            'language': snippet.get('defaultLanguage', 'unknown')
        }
    
    def parse_duration(self, duration_str: str) -> int:
        """
        Parse YouTube duration format (PT4M13S) to seconds
        
        Args:
            duration_str (str): Duration in ISO 8601 format
        
        Returns:
            int: Duration in seconds
        """
        import re
        pattern = r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?'
        match = re.match(pattern, duration_str)
        
        if not match:
            return 0
        
        hours = int(match.group(1)) if match.group(1) else 0
        minutes = int(match.group(2)) if match.group(2) else 0
        seconds = int(match.group(3)) if match.group(3) else 0
        
        return hours * 3600 + minutes * 60 + seconds
    
    def is_relevant_content(self, video_data: Dict) -> bool:
        """
        Filter videos to ensure they're relevant to Southeast Asian food travel
        
        Args:
            video_data (Dict): Video data dictionary
        
        Returns:
            bool: True if content is relevant
        """
        # Combine title, description, and tags for content analysis
        content_text = (
            video_data['title'].lower() + ' ' + 
            video_data['description'].lower() + ' ' + 
            ' '.join(video_data['tags']).lower()
        )
        
        # Southeast Asian countries and food-related keywords
        sea_countries = [
            'thailand', 'thai', 'vietnam', 'vietnamese', 'malaysia', 'malaysian',
            'singapore', 'singaporean', 'indonesia', 'indonesian', 'philippines',
            'filipino', 'myanmar', 'burma', 'cambodia', 'cambodian', 'laos',
            'lao', 'brunei', 'bangkok', 'saigon', 'hanoi', 'kuala lumpur',
            'penang', 'jakarta', 'bali', 'manila', 'phuket', 'ho chi minh'
        ]
        
        food_keywords = [
            'food', 'cuisine', 'cooking', 'recipe', 'restaurant', 'street food',
            'hawker', 'market', 'eating', 'taste', 'flavor', 'dish', 'meal',
            'cooking', 'chef', 'kitchen', 'spicy', 'noodles', 'rice', 'curry'
        ]
        
        travel_keywords = [
            'travel', 'trip', 'visit', 'tour', 'explore', 'journey', 'vacation',
            'adventure', 'guide', 'vlog', 'experience'
        ]
        
        # Check for Southeast Asian country mention
        has_sea_country = any(country in content_text for country in sea_countries)
        
        # Check for food-related content
        has_food_content = any(keyword in content_text for keyword in food_keywords)
        
        # Check for travel element (optional but preferred)
        has_travel_element = any(keyword in content_text for keyword in travel_keywords)
        
        # Must have SEA country and food content
        # Travel element is a bonus but not required
        return has_sea_country and has_food_content
    
    def collect_comments(self, video_id: str, max_comments: int = 20) -> List[Dict]:
        """
        Collect comments for a specific video
        
        Args:
            video_id (str): YouTube video ID
            max_comments (int): Maximum number of comments to collect
        
        Returns:
            List[Dict]: List of comment data
        """
        try:
            comments_response = self.youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=max_comments,
                order='relevance'
            ).execute()
            
            comments = []
            for item in comments_response['items']:
                comment = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'video_id': video_id,
                    'comment_text': comment['textDisplay'],
                    'like_count': comment.get('likeCount', 0),
                    'published_at': comment['publishedAt']
                })
            
            return comments
            
        except HttpError as e:
            print(f"Could not retrieve comments for video {video_id}: {e}")
            return []
    
    def collect_all_data(self, videos_per_query: int = 50, include_comments: bool = True) -> pd.DataFrame:
        """
        Collect data for all search terms and combine into single dataset
        
        Args:
            videos_per_query (int): Number of videos to collect per search term
            include_comments (bool): Whether to collect comments for sentiment analysis
        
        Returns:
            pd.DataFrame: Complete dataset with comments aggregated per video
        """
        all_videos = []
        
        print(f"Starting data collection for {len(self.search_terms)} search terms...")
        
        for i, term in enumerate(self.search_terms):
            print(f"\nProcessing search term {i+1}/{len(self.search_terms)}: '{term}'")
            
            # Collect videos for this search term
            videos = self.search_videos(term, max_results=videos_per_query)
            
            # Collect comments if requested and aggregate them per video
            if include_comments:
                for video in videos:
                    comments = self.collect_comments(video['video_id'])
                    
                    # Aggregate comment data for this video
                    if comments:
                        comment_texts = [c['comment_text'] for c in comments]
                        comment_likes = [c['like_count'] for c in comments]
                        
                        video['comments_text'] = ' | '.join(comment_texts)  # Join with separator
                        video['comments_count_collected'] = len(comments)
                        video['comments_total_likes'] = sum(comment_likes)
                        video['comments_avg_likes'] = np.mean(comment_likes) if comment_likes else 0
                    else:
                        video['comments_text'] = ''
                        video['comments_count_collected'] = 0
                        video['comments_total_likes'] = 0
                        video['comments_avg_likes'] = 0
                    
                    time.sleep(0.1)  # Rate limiting
            else:
                # Add empty comment fields if not collecting comments
                for video in videos:
                    video['comments_text'] = ''
                    video['comments_count_collected'] = 0
                    video['comments_total_likes'] = 0
                    video['comments_avg_likes'] = 0
            
            all_videos.extend(videos)
            
            # Rate limiting to respect API quotas
            time.sleep(1)
            
            # Progress update
            if (i + 1) % 5 == 0:
                print(f"Collected {len(all_videos)} videos so far...")
        
        # Remove duplicates based on video_id
        videos_df = pd.DataFrame(all_videos)
        if not videos_df.empty:
            videos_df = videos_df.drop_duplicates(subset=['video_id'])
            print(f"\nTotal unique videos collected: {len(videos_df)}")
        
        return videos_df
    
    def add_derived_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add derived features for analysis
        
        Args:
            df (pd.DataFrame): Raw video data
        
        Returns:
            pd.DataFrame: Enhanced dataset with derived features
        """
        df = df.copy()
        
        # Convert published_at to datetime
        df['published_at'] = pd.to_datetime(df['published_at'])
        
        # Extract time-based features
        df['publish_hour'] = df['published_at'].dt.hour
        df['publish_day_of_week'] = df['published_at'].dt.dayofweek
        df['publish_month'] = df['published_at'].dt.month
        
        # Calculate engagement ratios
        df['like_to_view_ratio'] = df['like_count'] / (df['view_count'] + 1)  # +1 to avoid division by zero
        df['comment_to_view_ratio'] = df['comment_count'] / (df['view_count'] + 1)
        df['engagement_score'] = (df['like_count'] + df['comment_count']) / (df['view_count'] + 1)
        
        # Duration categories
        df['duration_category'] = pd.cut(df['duration_seconds'], 
                                       bins=[0, 300, 600, 1200, float('inf')], 
                                       labels=['Short', 'Medium', 'Long', 'Very Long'])
        
        # Title and description length
        df['title_length'] = df['title'].str.len()
        df['description_length'] = df['description'].str.len()
        df['tags_count'] = df['tags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
        
        # Performance indicators (for labeling trending videos)
        # Define trending based on view count percentiles within the dataset
        view_threshold = df['view_count'].quantile(0.8)  # Top 20% by views
        engagement_threshold = df['engagement_score'].quantile(0.75)  # Top 25% by engagement
        
        df['is_trending'] = (
            (df['view_count'] >= view_threshold) & 
            (df['engagement_score'] >= engagement_threshold)
        )
        
        return df



In [10]:
# Configuration
API_KEY = "xxx"  # Replace with your actual API key
VIDEOS_PER_QUERY = 50
INCLUDE_COMMENTS = True
OUTPUT_DIR = "data"



In [1]:
# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize collector
print("Initializing YouTube Data Collector...")
collector = YouTubeDataCollector(API_KEY)

# Collect data
print("Starting data collection process...")
start_time = time.time()

try:
    videos_df = collector.collect_all_data(
        videos_per_query=VIDEOS_PER_QUERY,
        include_comments=INCLUDE_COMMENTS
    )
    
    if not videos_df.empty:
        # Add derived features
        print("\nAdding derived features...")
        videos_df = collector.add_derived_features(videos_df)
        
        # Data quality check
        print("\n=== DATA QUALITY SUMMARY ===")
        print(f"Total videos collected: {len(videos_df)}")
        print(f"Date range: {videos_df['published_at'].min()} to {videos_df['published_at'].max()}")
        print(f"Average views: {videos_df['view_count'].mean():.0f}")
        print(f"Videos marked as trending: {videos_df['is_trending'].sum()}")
        print(f"Videos with comments: {(videos_df['comments_count_collected'] > 0).sum()}")
        
        # Show sample data
        print("\n=== SAMPLE VIDEO DATA ===")
        print(videos_df[['title', 'channel_title', 'view_count', 'like_count', 'comments_count_collected', 'is_trending']].head())
        
        # Save to single CSV file
        output_filename = f"{OUTPUT_DIR}/youtube_sea_food_travel_data.csv"
        videos_df.to_csv(output_filename, index=False, encoding='utf-8')
        print(f"\nComplete dataset saved to: {output_filename}")
        
        # Generate collection report
        report = {
            'collection_date': datetime.now().isoformat(),
            'total_videos': len(videos_df),
            'videos_with_comments': int((videos_df['comments_count_collected'] > 0).sum()),
            'total_comments_collected': int(videos_df['comments_count_collected'].sum()),
            'search_terms_used': len(collector.search_terms),
            'date_range': {
                'start': videos_df['published_at'].min().isoformat(),
                'end': videos_df['published_at'].max().isoformat()
            },
            'trending_videos': int(videos_df['is_trending'].sum()),
            'avg_views': float(videos_df['view_count'].mean()),
            'avg_engagement_score': float(videos_df['engagement_score'].mean())
        }
        
        report_filename = f"{OUTPUT_DIR}/collection_report.json"
        with open(report_filename, 'w') as f:
            json.dump(report, f, indent=2)
        print(f"Collection report saved to: {report_filename}")
        
    else:
        print("No videos were collected. Please check your API key and search terms.")

except Exception as e:
    print(f"Error during data collection: {e}")

finally:
    end_time = time.time()
    print(f"\nData collection completed in {end_time - start_time:.2f} seconds")

# Display collection statistics
if 'videos_df' in locals() and not videos_df.empty:
    print("\n=== FINAL COLLECTION STATISTICS ===")
    print(f"Unique channels: {videos_df['channel_id'].nunique()}")
    print(f"Videos per channel (avg): {len(videos_df) / videos_df['channel_id'].nunique():.1f}")
    print(f"View count distribution:")
    print(videos_df['view_count'].describe())
    
    print(f"\nTop 5 most viewed videos:")
    top_videos = videos_df.nlargest(5, 'view_count')[['title', 'channel_title', 'view_count', 'like_count']]
    for idx, row in top_videos.iterrows():
        print(f"- {row['title'][:60]}... | {row['channel_title']} | {row['view_count']:,} views")
    
    print(f"\nContent by country/cuisine (based on title keywords):")
    # Simple keyword counting in titles
    countries = ['thai', 'vietnam', 'malaysia', 'singapore', 'indonesia', 'filipino']
    for country in countries:
        count = videos_df['title'].str.lower().str.contains(country).sum()
        print(f"- {country.capitalize()}: {count} videos")
    
    print(f"\nComment collection summary:")
    print(f"- Videos with comments collected: {(videos_df['comments_count_collected'] > 0).sum()}")
    print(f"- Total comments collected: {videos_df['comments_count_collected'].sum()}")
    print(f"- Average comments per video: {videos_df['comments_count_collected'].mean():.1f}")


NameError: name 'os' is not defined