In [1]:
from googleapiclient.discovery import build
import pandas as pd
from tqdm import tqdm
import time

In [2]:
# La tua API Key
API_KEY = "AIzaSyA9BMJiGhlfOT7uTsskle7Rc_YcClbxWmM"

# Crea il client YouTube
youtube = build('youtube', 'v3', developerKey=API_KEY)

def get_video_statistics_batch(video_ids):
    """
    Recupera statistiche per fino a 50 video ID in una singola chiamata API.
    Ritorna un dizionario con video_id come chiave.
    """
    # YouTube API accetta max 50 IDs per chiamata
    video_ids_str = ','.join(video_ids)
    
    try:
        request = youtube.videos().list(
            part='statistics,contentDetails,snippet',
            id=video_ids_str
        )
        response = request.execute()
        
        stats_dict = {}
        
        for item in response.get('items', []):
            video_id = item['id']
            stats = item.get('statistics', {})
            content_details = item.get('contentDetails', {})
            snippet = item.get('snippet', {})
            
            stats_dict[video_id] = {
                'view_count': int(stats.get('viewCount', 0)),
                'like_count': int(stats.get('likeCount', 0)),
                'comment_count': int(stats.get('commentCount', 0)),
                'duration': content_details.get('duration', None),
                'definition': content_details.get('definition', None),
                'published_at': snippet.get('publishedAt', None),
                'category_id': snippet.get('categoryId', None)
            }
        
        return stats_dict
    
    except Exception as e:
        print(f"Errore nel recupero batch: {e}")
        return {}

def enrich_with_youtube_statistics(df, video_id_col='youtube_video_id', 
                                   batch_size=50, output_file='youtube_stats_enriched.csv'):
    """
    Arricchisce il dataframe con statistiche YouTube usando batch API calls.
    """
    # Filtra solo le righe con video ID validi
    df_with_ids = df[df[video_id_col].notna()].copy()
    video_ids_list = df_with_ids[video_id_col].tolist()
    
    print(f"Totale video ID da processare: {len(video_ids_list)}")
    print(f"Batch da {batch_size} video per chiamata API")
    print(f"Chiamate API necessarie: {len(video_ids_list) // batch_size + 1}")
    print(f"Quota consumata stimata: ~{len(video_ids_list) // batch_size + 1} units\n")
    
    # Inizializza nuove colonne
    df['yt_view_count'] = None
    df['yt_like_count'] = None
    df['yt_comment_count'] = None
    df['yt_duration'] = None
    df['yt_definition'] = None
    df['yt_published_at'] = None
    df['yt_category_id'] = None
    
    # Processa in batch
    all_stats = {}
    
    for i in tqdm(range(0, len(video_ids_list), batch_size), 
                  desc="Recupero statistiche YouTube", 
                  unit="batch",
                  colour="blue"):
        
        batch = video_ids_list[i:i+batch_size]
        batch_stats = get_video_statistics_batch(batch)
        all_stats.update(batch_stats)
        
        # Piccola pausa per rispettare rate limits (opzionale ma consigliato)
        time.sleep(0.2)
    
    print(f"\n✓ Recuperate statistiche per {len(all_stats)} video su {len(video_ids_list)}")
    
    # Mappa le statistiche sul dataframe
    for col_name, stat_key in [
        ('yt_view_count', 'view_count'),
        ('yt_like_count', 'like_count'),
        ('yt_comment_count', 'comment_count'),
        ('yt_duration', 'duration'),
        ('yt_definition', 'definition'),
        ('yt_published_at', 'published_at'),
        ('yt_category_id', 'category_id')
    ]:
        df[col_name] = df[video_id_col].map(
            lambda vid: all_stats.get(vid, {}).get(stat_key, None) if pd.notna(vid) else None
        )
    
    # Salva il risultato
    df.to_csv(output_file, index=False)
    print(f"\n✓ Dataset arricchito salvato in: {output_file}")
    
    # Statistiche finali
    print(f"\nStatistiche recupero:")
    print(f"  - View count trovati: {df['yt_view_count'].notna().sum()}/{len(df)}")
    print(f"  - Like count trovati: {df['yt_like_count'].notna().sum()}/{len(df)}")
    print(f"  - Comment count trovati: {df['yt_comment_count'].notna().sum()}/{len(df)}")
    
    return df



In [3]:

# 1. Carica il dataframe con gli ID scrapati
df = pd.read_csv('songs_with_youtube_ids.csv')  # o il tuo file con gli ID

# 2. Arricchisci con le statistiche YouTube
df_enriched = enrich_with_youtube_statistics(
    df, 
    video_id_col='youtube_video_id',
    batch_size=50,
    output_file='songs_with_youtube_stats.csv'
)

Totale video ID da processare: 11142
Batch da 50 video per chiamata API
Chiamate API necessarie: 223
Quota consumata stimata: ~223 units



Recupero statistiche YouTube: 100%|[34m██████████[0m| 223/223 [02:06<00:00,  1.76batch/s]



✓ Recuperate statistiche per 10453 video su 11142

✓ Dataset arricchito salvato in: songs_with_youtube_stats.csv

Statistiche recupero:
  - View count trovati: 11135/11166
  - Like count trovati: 11135/11166
  - Comment count trovati: 11135/11166
