In [None]:
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
from googleapiclient.discovery import build

# -----------------------------
# Global Setup: API Key and YouTube Client
# -----------------------------
# Developer key used to authenticate every YouTube Data API request.
API_KEY = '<API>'  # Replace with your actual API key

# Shared YouTube Data API v3 client used by the fetch routine below.
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=API_KEY,
)

# ============================================================
# Part 1: Fetch Videos via Date-Segmented Search Query ("Artificial Intelligence")
# ============================================================
def fetch_searched_videos():
    """
    Collects videos matching "Artificial Intelligence" published between
    2024-01-01 and 2025-01-01 and saves their details to
    'youtube_search_raw.csv'.

    The work is done in four stages:
      1. Search month by month (one paginated search per date window) and
         collect the video IDs.
      2. Fetch snippet, contentDetails and statistics for every unique ID.
      3. Look up each channel's subscriberCount and attach it to every video
         as 'channelSubscriberCount' (None when the API does not report it).
      4. Flatten the records with pandas.json_normalize and write the CSV.

    Uses the module-level ``youtube`` client. Returns None; any error raised
    by the API client propagates to the caller.
    """
    query = "Artificial Intelligence"
    # search.list only accepts maxResults in the range 0-50; asking for more
    # makes the API reject the request.
    max_results_per_page = 50
    # Number of comma-separated IDs sent per videos.list / channels.list call.
    batch_size = 50

    # Helper function to generate date ranges
    def generate_date_ranges(start_date, end_date, delta_months=1):
        """Split [start_date, end_date) into consecutive windows of
        ``delta_months`` months, formatted as the timestamp strings passed to
        publishedAfter / publishedBefore. The last window is clipped to
        ``end_date``."""
        date_ranges = []
        current_start = start_date
        while current_start < end_date:
            current_end = min(
                current_start + relativedelta(months=delta_months),
                end_date,
            )
            date_ranges.append({
                'publishedAfter': current_start.strftime("%Y-%m-%dT%H:%M:%SZ"),
                'publishedBefore': current_end.strftime("%Y-%m-%dT%H:%M:%SZ")
            })
            current_start = current_end
        return date_ranges

    # Define the overall date range and segmentation period
    start_date = datetime.datetime(2024, 1, 1)
    end_date = datetime.datetime(2025, 1, 1)
    date_ranges = generate_date_ranges(start_date, end_date, delta_months=1)

    # Gather all video IDs across all segments
    all_video_ids = []
    for segment in date_ranges:
        next_page_token = None
        print(f"Fetching videos from {segment['publishedAfter']} to {segment['publishedBefore']}")
        while True:
            response = youtube.search().list(
                part="snippet",
                q=query,
                type="video",
                maxResults=max_results_per_page,
                pageToken=next_page_token,
                publishedAfter=segment['publishedAfter'],
                publishedBefore=segment['publishedBefore']
            ).execute()
            for item in response.get("items", []):
                video_id = item['id'].get('videoId')
                if video_id:
                    all_video_ids.append(video_id)
            next_page_token = response.get("nextPageToken")
            if not next_page_token:
                break

    # Remove duplicate video IDs while keeping first-seen order, so the row
    # order of the output is reproducible (a set would shuffle it).
    all_video_ids = list(dict.fromkeys(all_video_ids))
    print(f"Total unique video IDs collected from search: {len(all_video_ids)}")

    # Fetch detailed video information (including snippet, contentDetails, and statistics)
    video_details = []
    for i in range(0, len(all_video_ids), batch_size):
        ids_chunk = all_video_ids[i:i + batch_size]
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=",".join(ids_chunk)
        ).execute()
        video_details.extend(response.get("items", []))

    print(f"Total videos fetched from search: {len(video_details)}")

    # --- Fetch Channel Subscriber Counts ---
    # Unique channel IDs, in the order their videos were fetched.
    unique_channel_ids = list(dict.fromkeys(
        video['snippet']['channelId']
        for video in video_details
        if 'channelId' in video.get('snippet', {})
    ))

    print(f"Total unique channel IDs found: {len(unique_channel_ids)}")

    # Map channel ID -> subscriberCount, looked up in batches.
    channel_subscribers = {}
    for i in range(0, len(unique_channel_ids), batch_size):
        ids_chunk = unique_channel_ids[i:i + batch_size]
        response = youtube.channels().list(
            part="statistics",
            id=",".join(ids_chunk)
        ).execute()
        for channel in response.get("items", []):
            stats = channel.get("statistics", {})
            # subscriberCount is returned as a string. It may be hidden or missing.
            channel_subscribers[channel.get("id")] = stats.get("subscriberCount")

    # Append subscriber count to each video detail based on its channelId;
    # videos whose channel was not returned get None.
    for video in video_details:
        channel_id = video.get("snippet", {}).get("channelId")
        video["channelSubscriberCount"] = channel_subscribers.get(channel_id)

    # Normalize and save the raw search results to CSV
    df_search = pd.json_normalize(video_details)
    search_csv = 'youtube_search_raw.csv'
    df_search.to_csv(search_csv, index=False)
    print(f"Search videos data has been saved to '{search_csv}'.")

# ============================================================
# Main Execution
# ============================================================
if __name__ == "__main__":
    # Run the fetch-and-save pipeline only when this code is executed
    # directly, not when it is imported as a module.
    fetch_searched_videos()


In [None]:
import re
import pandas as pd

# ISO 8601 duration in the form P[nW][nD][T[nH][nM][nS]]; every component is
# optional. Compiled once because the function is applied row by row.
_ISO_DURATION_RE = re.compile(
    r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)
# Seconds per captured component, in group order: weeks, days, hours,
# minutes, seconds.
_ISO_DURATION_UNIT_SECONDS = (604800, 86400, 3600, 60, 1)


def iso_duration_to_seconds(duration):
    """
    Converts an ISO8601 duration string (e.g., "PT5M12S") to total seconds.

    Week and day components are supported as well, so "P1DT2H3M4S" yields
    93784 and "P0D" yields 0.

    Returns None when ``duration`` is missing (None/NaN), is not a string, or
    does not fully match the expected duration format.
    """
    # Covers None, NaN and any other non-text value in one check.
    if not isinstance(duration, str):
        return None
    # fullmatch rejects strings with trailing garbage instead of silently
    # treating them as a zero-length duration.
    match = _ISO_DURATION_RE.fullmatch(duration)
    if match is None:
        return None
    return sum(
        int(value) * unit_seconds
        for value, unit_seconds in zip(match.groups(), _ISO_DURATION_UNIT_SECONDS)
        if value
    )

def remove_non_ascii(text):
    """
    Strips every character outside the ASCII range (0x00-0x7F) from text.
    Null values (None/NaN) are handed back unchanged.
    """
    if not pd.isnull(text):
        non_ascii_run = re.compile(r'[^\x00-\x7F]+')
        text = non_ascii_run.sub('', text)
    return text

# Columns of the raw export that are dropped before any processing.
_COLUMNS_TO_DROP = [
    'kind',
    'etag',
    'snippet.channelId',
    *(
        f'snippet.thumbnails.{size}.{field}'
        for size in ('default', 'medium', 'high', 'standard', 'maxres')
        for field in ('url', 'width', 'height')
    ),
    'snippet.liveBroadcastContent',
    'snippet.localized.title',
    'snippet.localized.description',
    'contentDetails.dimension',
    'contentDetails.definition',
    'contentDetails.regionRestriction.allowed',
    'contentDetails.projection',
    'contentDetails.regionRestriction.blocked',
]

# Engagement counters that must be numeric.
_STATS_COLUMNS = [
    'statistics.viewCount',
    'statistics.likeCount',
    'statistics.favoriteCount',
    'statistics.commentCount',
]

# Category ID -> human-readable category name.
_CATEGORY_NAMES = {
    1: "Film & Animation",
    2: "Autos & Vehicles",
    10: "Music",
    15: "Pets & Animals",
    17: "Sports",
    19: "Travel & Events",
    20: "Gaming",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
    29: "Nonprofits & Activism",
    30: "Movies",
    31: "Shows",
}

# Raw (json_normalize-style) column name -> friendlier output name.
_RENAME_MAP = {
    'id': 'videoId',
    'snippet.publishedAt': 'publishTime',
    'snippet.title': 'title',
    'snippet.description': 'description',
    'snippet.channelTitle': 'channelTitle',
    'snippet.tags': 'tags',
    'snippet.categoryId': 'categoryId',
    'contentDetails.duration': 'duration',
    'contentDetails.caption': 'caption',
    'contentDetails.licensedContent': 'licensedContent',
    'statistics.viewCount': 'viewCount',
    'statistics.likeCount': 'likeCount',
    'statistics.favoriteCount': 'favoriteCount',
    'statistics.commentCount': 'commentCount',
    'snippet.defaultAudioLanguage': 'defaultAudioLanguage',
    'snippet.defaultLanguage': 'defaultLanguage',
    'duration_seconds': 'durationSeconds',
}


def clean_youtube_data(input_csv: str, output_csv: str):
    """
    Loads, cleans, and transforms YouTube video data and writes the result
    to a new CSV file.

    Cleaning steps, in order: drop unneeded columns, drop exact duplicate
    rows, parse the publish timestamp, convert engagement counters to
    numbers, convert the ISO 8601 duration to seconds, map category IDs to
    names, rename columns, strip non-ASCII characters from text columns,
    split publishTime into publishDate & publishTimeUTC, and keep only rows
    with viewCount >= 1000 (rows whose viewCount is missing are removed too,
    because the comparison is False for NaN).

    Every step is skipped silently when its source column is absent, except
    category mapping, which prints a notice.

    Args:
        input_csv: Path of the raw CSV to read.
        output_csv: Path the cleaned CSV is written to (without the index).

    Returns:
        None. Progress and a summary of the result are printed.
    """
    # -----------------------------
    # Step 1: Load the CSV into a DataFrame
    # -----------------------------
    df = pd.read_csv(input_csv)
    print("Initial columns:")
    print(df.columns.tolist())
    print("\nFirst few rows of data:")
    print(df.head())

    # -----------------------------
    # Step 2: Remove Unwanted Columns
    # -----------------------------
    # errors='ignore' already skips names that are not present.
    df = df.drop(columns=_COLUMNS_TO_DROP, errors='ignore')

    # -----------------------------
    # Step 3: Remove Duplicate Rows
    # -----------------------------
    df = df.drop_duplicates()

    # -----------------------------
    # Step 4: Convert Date/Time Values
    # -----------------------------
    if 'snippet.publishedAt' in df.columns:
        # Unparseable values become NaT instead of raising.
        df['snippet.publishedAt'] = pd.to_datetime(df['snippet.publishedAt'], errors='coerce')

    # -----------------------------
    # Step 5: Convert Engagement Statistics to Numeric Types
    # -----------------------------
    for col in _STATS_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # -----------------------------
    # Step 6: Convert Video Duration from ISO 8601 to Seconds
    # -----------------------------
    if 'contentDetails.duration' in df.columns:
        df['duration_seconds'] = df['contentDetails.duration'].apply(iso_duration_to_seconds)

    # -----------------------------
    # Step 7: Transform List Columns (e.g., snippet.tags)
    # -----------------------------
    if 'snippet.tags' in df.columns:
        # Only real list objects are converted; any other value (including
        # text) is left untouched.
        df['snippet.tags'] = df['snippet.tags'].apply(lambda x: tuple(x) if isinstance(x, list) else x)

    # -----------------------------
    # Step 8: Map Category IDs to Names
    # -----------------------------
    if 'snippet.categoryId' in df.columns:
        df['snippet.categoryId'] = pd.to_numeric(df['snippet.categoryId'], errors='coerce')
        # IDs without an entry in the table map to NaN.
        df['categoryName'] = df['snippet.categoryId'].map(_CATEGORY_NAMES)
    else:
        print("Column 'snippet.categoryId' not found. Skipping category mapping.")

    # -----------------------------
    # Step 9: Rename Columns for Easier Understanding
    # -----------------------------
    df = df.rename(columns=_RENAME_MAP)

    # -----------------------------
    # Step 10: Remove Non-ASCII Symbols from Text Columns
    # -----------------------------
    for col in ('title', 'description', 'channelTitle', 'tags'):
        if col in df.columns:
            df[col] = df[col].apply(remove_non_ascii)

    # -----------------------------
    # Step 11: (Optional) Remove Unneeded Columns
    # -----------------------------
    # categoryId and duration have been replaced by categoryName and
    # durationSeconds above.
    df = df.drop(columns=['categoryId', 'duration', 'favoriteCount'], errors='ignore')

    # -----------------------------
    # Step 12: Split publishTime into Date & Time (without UTC offset)
    # -----------------------------
    if 'publishTime' in df.columns:
        # strftime leaves unparseable timestamps (NaT) as missing values in
        # both columns, rather than writing the literal text "NaT".
        df['publishDate'] = df['publishTime'].dt.strftime('%Y-%m-%d')
        df['publishTimeUTC'] = df['publishTime'].dt.strftime('%H:%M:%S')
        df = df.drop(columns=['publishTime'])

    # -----------------------------
    # Step 13: Filter rows with viewCount < 1000
    # -----------------------------
    if 'viewCount' in df.columns:
        df = df[df['viewCount'] >= 1000]

    # -----------------------------
    # Step 14: Save the Final Cleaned Data
    # -----------------------------
    df.to_csv(output_csv, index=False)
    print(f"\nCleaned data has been saved to '{output_csv}'.")
    print(f"Total rows in cleaned data: {len(df)}")
    print("\nFinal DataFrame info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line.
    df.info()
    print("\nPreview of cleaned data:")
    print(df.head())

# ---------------
# Example Usage
# ---------------
if __name__ == "__main__":
    # NOTE(review): fetch_searched_videos() writes 'youtube_search_raw.csv'
    # relative to the working directory, while this reads it from '/content/'.
    # The two only line up when the working directory is /content
    # (presumably a Colab session - confirm before running elsewhere).
    input_file = "/content/youtube_search_raw.csv"  # Adjust path as needed
    # Written relative to the current working directory.
    output_file = "cleaned_data.csv"
    clean_youtube_data(input_file, output_file)
