In [2]:
!pip install google-api-python-client pandas

from googleapiclient.discovery import build
import pandas as pd
import time as time
from collections import Counter
import re



In [14]:
API_KEY = 'AIzaSyDjJjXe3yfIpBWi5yyGW5IODUbBoIpVqUM'
youtube = build('youtube', 'v3', developerKey=API_KEY)

def search_youtube_with_stats(query, total_results=100):
    all_videos = []
    video_ids = []
    next_page_token = None
    results_per_page = 50
    fetched_results = 0

    while fetched_results < total_results:
        request = youtube.search().list(
            q=query,
            part='snippet',
            type='video',
            maxResults=results_per_page,
            pageToken=next_page_token
        )
        response = request.execute()

        for item in response['items']:
            if item['id']['kind'] != 'youtube#video':
                continue

            video_id = item['id']['videoId']
            video = {
                'video_id': video_id,
                'title': item['snippet']['title'],
                'description': item['snippet']['description'],
                'channel': item['snippet']['channelTitle'],
                'published': item['snippet']['publishedAt']
            }
            all_videos.append(video)
            video_ids.append(video_id)
            fetched_results += 1

            if fetched_results >= total_results:
                break

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

        time.sleep(0.2)  # Small delay to avoid quota spikes

    # 🔁 Now fetch stats in batches of 50
    stats = []
    for i in range(0, len(video_ids), 50):
        ids = ",".join(video_ids[i:i+50])
        stats_response = youtube.videos().list(
            part="statistics",
            id=ids
        ).execute()

        for item in stats_response.get("items", []):
            stats.append({
                'video_id': item['id'],
                'views': item['statistics'].get('viewCount'),
                'likes': item['statistics'].get('likeCount'),
                'comments': item['statistics'].get('commentCount')
            })
        time.sleep(0.2)  # Optional pause

    # 🧩 Merge metadata and stats
    df_videos = pd.DataFrame(all_videos)
    df_stats = pd.DataFrame(stats)

    df_full = df_videos.merge(df_stats, on='video_id', how='left')

    return df_full

queries = [
    "innisfree green tea serum review",
    "innisfree no sebum powder wear test",
    "innisfree not sponsored review",
    "innisfree before and after",
    "innisfree skincare routine 2024",
    "is innisfree worth it",
    "innisfree vs cosrx",
    "innisfree volcanic clay mask review",
    "innisfree hydration test",
    "innisfree honest review",
    "why I stopped using innisfree",
    "innisfree eco-friendly beauty campaign",
    "innisfree"
]


all_results = []
for q in queries:
    df = search_youtube_with_stats(q, total_results=300)
    all_results.append(df)
    print(f"✅ {len(df)} videos fetched for query: {q}")

# Combine all into one DataFrame
combined_df = pd.concat(all_results, ignore_index=True)

# Convert numeric columns
for col in ['views', 'likes', 'comments']:
    combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

combined_df.to_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/youtube_etude.csv", index=False)
print("📦 All queries saved. Total videos:", combined_df.shape[0])

✅ 334 videos fetched for query: innisfree green tea serum review
✅ 336 videos fetched for query: innisfree no sebum powder wear test
✅ 306 videos fetched for query: innisfree not sponsored review


HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?q=innisfree+before+and+after&part=snippet&type=video&maxResults=50&pageToken=CMgBEAA&key=AIzaSyDjJjXe3yfIpBWi5yyGW5IODUbBoIpVqUM&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">

In [6]:
combined_df.to_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/youtube_tarte.csv", index=False)

In [16]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   video_id     3818 non-null   object 
 1   title        3818 non-null   object 
 2   description  3818 non-null   object 
 3   channel      3818 non-null   object 
 4   published    3818 non-null   object 
 5   views        3818 non-null   int64  
 6   likes        3640 non-null   float64
 7   comments     3778 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 238.8+ KB


In [22]:
combined_df.to_csv("/Users/giselle/Desktop/Dartmouth/Skinfluence/data/youtube_estee_lauder.csv", index=False)

In [26]:
def get_top_comments(video_id, max_results=50):
    comments = []
    next_page_token = None

    while len(comments) < max_results:
        try:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=min(100, max_results - len(comments)),  # API max is 100
                textFormat="plainText",
                pageToken=next_page_token
            )
            response = request.execute()
        except Exception as e:
            print(f"Failed to fetch comments for {video_id}: {e}")
            break

        for item in response.get("items", []):
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)
            if len(comments) >= max_results:
                break

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

        time.sleep(0.1)  # avoid quota spikes

    return comments

In [27]:
all_comments = {}

for video_id in df['video_id']:
    try:
        comments = get_top_comments(video_id, max_results=20)
        all_comments[video_id] = comments
        print(f"Fetched {len(comments)} comments for video {video_id}")
    except Exception as e:
        print(f"Failed to fetch comments for {video_id}: {e}")
    time.sleep(0.1)  # be kind to the API


Fetched 20 comments for video A50_AmSTdVE
Fetched 20 comments for video GY90IvsNbvw
Fetched 20 comments for video Xy4LPjPOQ_w
Fetched 20 comments for video 4OawHDNzZW8
Fetched 20 comments for video QWwxA2pKJVg
Fetched 20 comments for video -0AT8YwD9Ow
Fetched 20 comments for video th-SfqKeOVg
Fetched 20 comments for video tPjEziErRNI
Fetched 0 comments for video 01HygSG2EQ8
Fetched 20 comments for video i4fxs5JLBvc
Fetched 20 comments for video KbpMDe5LK1g


KeyboardInterrupt: 

In [None]:
all_comments

{'8bGE4-t6vks': ['1.byoma cleanser\n2. Bubble cleaner\n3.buble slam dunk\n4.glow recipe due drops\n5.drunk elephant bronzing drops\n6.tower 28 toner',
  'Pls pin me',
  'y’all just called me poor in 50 ways…\n(this is a joke)',
  'you both are buaettiful the way you are',
  'My favorite skincare brand is Bubble, and here is my skincare routine! 😁 \n1. Bubble Fresh Start cleanser \n2. Bubble Water Slide serum \n3. Bubble Cloud Surf Moisturizer\n4. Bubble Solar Mate SPF \nP.S. I hoped u liked my skincare routine! ❤❤',
  '🌛☀️☀️☀️☀️☀️',
  'Utan, wow cool I wish we had that in Masterton',
  'I love skincare❤❤❤😮😮😮😂😂😂',
  'I WANT TO SEE A NIGHT 🌃 AND MORNING 🌄 ROUTINE ❤❤❤',
  'What is your favorite Skincare product',
  'My skin care routine \nFresh start bubble gel cleanser\nBubble mist bubble moisture gel and bubble soft moisture and way more skin care',
  'I love you',
  'I have a skin care routine too. I’m only a year older than Callie because I’m 17 and she is 16 and I entered your giveaw