In [9]:
# Import required libraries
import pandas as pd  # For data manipulation and CSV export
from selenium import webdriver  # For web automation
from selenium.webdriver.common.by import By  # For locating elements
from selenium.webdriver.support.ui import WebDriverWait  # For waiting for elements to load
from selenium.webdriver.support import expected_conditions as EC  # For element conditions
import time  # For adding delays
import re  # For text pattern matching

In [10]:
# Initialize Chrome browser
driver = webdriver.Chrome()  # Create a new Chrome browser instance

# List of YouTube channels to scrape (replace with actual channel URLs)
youtube_channels = [
    "https://www.youtube.com/@DataWithBaraa/videos",
    "https://www.youtube.com/@emanraslan1382/videos",
]

# Create empty list to store all video data
all_videos_data = []

# Loop through each YouTube channel
for channel_url in youtube_channels:
    print(f"Scraping channel: {channel_url}")
    
    # Navigate to the channel's videos page
    driver.get(channel_url)
    
    # Wait for page to load completely
    time.sleep(2)
    
    try:
        # Get channel follower count using XPath
        # Note: YouTube may change their HTML structure, so XPath might need adjustment
        follower_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//*[contains(text(), 'subscribers') or contains(text(), 'followers')]"))
        )
        followers = follower_element.text  # Extract follower count text
    except:
        followers = "Not found"  # Default value if followers not found
    
    # Find all video elements on the page using XPath
    video_elements = driver.find_elements(By.XPATH, '//*[@id="video-title-link"]')
    
    # Extract video links from the first 10 videos
    video_links = []
    for video in video_elements[:10]:  # Limit to 10 videos
        video_url = video.get_attribute('href')  # Get video URL
        if video_url:  # Check if URL exists
            video_links.append(video_url)  # Add URL to list
    
    # Loop through each video link to get detailed information
    for video_link in video_links:
        try:
            # Navigate to individual video page
            driver.get(video_link)
            
            # Wait for video page to load
            time.sleep(3)
            
            # Extract video title using XPath
            title_element = driver.find_element(By.XPATH, '//h1[@class="style-scope ytd-watch-metadata"]')
            title = title_element.text  # Get video title text
            
            # Extract view count using XPath
            view_element = driver.find_element(By.XPATH, '//span[contains(text(), "views")]')
            views = view_element.text  # Get view count text
            
            # Extract like count using XPath
            like_element = driver.find_element(By.XPATH, '//button[contains(@aria-label, "likes")]')
            likes = like_element.get_attribute('aria-label')  # Get like count from aria-label
            
            # Extract publication date using XPath
            date_element = driver.find_element(By.XPATH, '//*[contains(text(), "Published") or contains(text(), "ago")]')
            publish_date = date_element.text  # Get publication date text
            
            # Create dictionary with all video data
            video_data = {
                'channel_url': channel_url,
                'followers': followers,
                'title': title,
                'views': views,
                'likes': likes,
                'publication_date': publish_date,
                'video_link': video_link
            }
            
            # Add video data to main list
            all_videos_data.append(video_data)
            
            print(f"Scraped: {title[:50]}...")  # Print first 50 characters of title
            
        except Exception as e:
            print(f"Error scraping video: {video_link} - {str(e)}")
            continue  # Continue to next video if error occurs
    
    # Small delay between channels to avoid being blocked
    time.sleep(2)

# Close the browser when done
driver.quit()

# Convert collected data to pandas DataFrame
df = pd.DataFrame(all_videos_data)

# Export DataFrame to CSV file
df.to_csv('youtube_viral_videos.csv', index=False)

# Display first few rows of data
print(df.head())

Scraping channel: https://www.youtube.com/@DataWithBaraa/videos
Error scraping video: https://www.youtube.com/watch?v=hgQD2znCc_I - Message: no such element: Unable to locate element: {"method":"xpath","selector":"//button[contains(@aria-label, "likes")]"}
  (Session info: chrome=141.0.7390.109); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
	GetHandleVerifier [0x0x7ff6ad7ee8e5+80021]
	GetHandleVerifier [0x0x7ff6ad7ee940+80112]
	(No symbol) [0x0x7ff6ad57060f]
	(No symbol) [0x0x7ff6ad5c8854]
	(No symbol) [0x0x7ff6ad5c8b1c]
	(No symbol) [0x0x7ff6ad61c927]
	(No symbol) [0x0x7ff6ad5f126f]
	(No symbol) [0x0x7ff6ad61968a]
	(No symbol) [0x0x7ff6ad5f1003]
	(No symbol) [0x0x7ff6ad5b95d1]
	(No symbol) [0x0x7ff6ad5ba3f3]
	GetHandleVerifier [0x0x7ff6adaadc7d+2960429]
	GetHandleVerifier [0x0x7ff6adaa7f3a+2936554]
	GetHandleVerifier [0x0x7ff6adac8977+3070247]
	GetHandleVerifier [0x0x7ff6ad8083

In [11]:
# Data Cleaning and Transformation
print("Starting data cleaning...")

# Create a copy of the dataframe for cleaning
df_clean = df.copy()

# Clean followers column - extract numbers only
df_clean['followers_clean'] = df_clean['followers'].str.extract('(\d+\.?\d*)')[0]

# Convert follower count to numeric (handle K, M suffixes)
def convert_followers(follower_str):
    if pd.isna(follower_str):
        return 0
    follower_str = str(follower_str).lower()
    if 'k' in follower_str:
        return float(follower_str.replace('k', '')) * 1000
    elif 'm' in follower_str:
        return float(follower_str.replace('m', '')) * 1000000
    else:
        return float(follower_str)

df_clean['followers_numeric'] = df_clean['followers_clean'].apply(convert_followers)

# Clean views column - extract numbers only
df_clean['views_clean'] = df_clean['views'].str.extract('(\d+\.?\d*)')[0]

# Convert view count to numeric (handle K, M suffixes)
def convert_views(view_str):
    if pd.isna(view_str):
        return 0
    view_str = str(view_str).lower()
    if 'k' in view_str:
        return float(view_str.replace('k', '')) * 1000
    elif 'm' in view_str:
        return float(view_str.replace('m', '')) * 1000000
    else:
        return float(view_str)

df_clean['views_numeric'] = df_clean['views_clean'].apply(convert_views)

# Clean likes column - extract numbers only from aria-label
df_clean['likes_clean'] = df_clean['likes'].str.extract('(\d+\.?\d*)')[0]

# Convert like count to numeric (handle K, M suffixes)
df_clean['likes_numeric'] = df_clean['likes_clean'].apply(convert_views)

# Clean publication date - extract meaningful date information
df_clean['publication_date_clean'] = df_clean['publication_date'].str.extract('(\w+ \d+, \d+|\d+ \w+ ago)')[0]

# Calculate engagement rate (likes/views)
df_clean['engagement_rate'] = (df_clean['likes_numeric'] / df_clean['views_numeric']).round(4)

# Select and reorder final columns for analysis
final_columns = [
    'channel_url', 'followers_numeric', 'title', 'views_numeric', 
    'likes_numeric', 'publication_date_clean', 'engagement_rate', 'video_link'
]

# Create final dataframe
df_final = df_clean[final_columns]

# Rename columns for better readability
df_final.columns = [
    'channel_url', 'followers', 'title', 'views', 
    'likes', 'publication_date', 'engagement_rate', 'video_link'
]

# Display cleaned data sample
print("Cleaned data sample:")
print(df_final.head())

# Basic Data Analysis
print("\n=== BASIC DATA ANALYSIS ===")

# Display dataset info
print(f"Total videos scraped: {len(df_final)}")
print(f"Total channels: {len(df_final['channel_url'].unique())}")

# Basic statistics for numerical columns
print("\nBasic Statistics:")
print(df_final[['followers', 'views', 'likes', 'engagement_rate']].describe())

# Top performing videos by views
print("\nTop 5 Videos by Views:")
top_views = df_final.nlargest(5, 'views')[['title', 'views', 'likes', 'engagement_rate']]
print(top_views)

# Top performing videos by engagement rate
print("\nTop 5 Videos by Engagement Rate:")
top_engagement = df_final.nlargest(5, 'engagement_rate')[['title', 'engagement_rate', 'views', 'likes']]
print(top_engagement)

# Save to CSV file
output_filename = 'youtube_viral_videos_analysis.csv'
df_final.to_csv(output_filename, index=False)
print(f"\nData saved to: {output_filename}")

# Optional: Save raw data for backup
df.to_csv('youtube_raw_data_backup.csv', index=False)

print("Scraping and analysis completed successfully!")

Starting data cleaning...


  df_clean['followers_clean'] = df_clean['followers'].str.extract('(\d+\.?\d*)')[0]
  df_clean['views_clean'] = df_clean['views'].str.extract('(\d+\.?\d*)')[0]
  df_clean['likes_clean'] = df_clean['likes'].str.extract('(\d+\.?\d*)')[0]
  df_clean['publication_date_clean'] = df_clean['publication_date'].str.extract('(\w+ \d+, \d+|\d+ \w+ ago)')[0]


KeyError: 'followers'