In [1]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from collections import Counter
import colorsys
import os
# File locations: the raw input dataset and the CSV that receives the
# thumbnail color features. Defined before use so the path lives in one place.
INPUT_CSV = "dataset_copy.csv"
OUTPUT_CSV = "dataset_thumbnail_featured.csv"

# sep=None together with the python engine asks pandas to detect the delimiter.
data = pd.read_csv(INPUT_CSV, sep=None, engine='python')

In [2]:
"""
[ YouTube API ]  --->  [ dataset_copy.csv ]
                             |
                             V
                    [ Thumbnail Script ] ---> [ dataset_thumbnail_featured.csv ]
                             |
                             V
                    [ Feature Engineering ] (Merge & Clean & Featuring)
                             |
                             V
                    [ data_featured.csv ]
                             |
                             V
                    [ ML Models ]

This is the roadmap of the project and its Python files.
"""

'\n[ YouTube API ]  --->  [ dataset_copy.csv ]\n                             |\n                             V\n                    [ Thumbnail Script ] ---> [ dataset_thumbnail_featured.csv ]\n                             |\n                             V\n                    [ Feature Engineering ] (Merge & Clean & Featuring)\n                             |\n                             V\n                    [ data_featured.csv ]\n                             |\n                             V\n                    [ ML Models (XGBoost) ]\n\nThis is the roadmap of project and python files.\n'

In [None]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO
from collections import Counter
import colorsys
import os

# Initialize the dataframe for processing.
# Reuse the frame loaded in the first cell when it is still in the namespace;
# otherwise reload it from INPUT_CSV (previously both branches referenced
# `data`, so the fallback raised NameError exactly when it was needed).
if 'data' in globals():
    df_process = data.copy()
else:
    df_process = pd.read_csv(INPUT_CSV, sep=None, engine='python')

def extract_all_color_features(url, timeout=3):
    """Download an image and describe its most frequent color.

    The image is fetched from ``url``, reduced to a 16-color palette, resized
    to 25x25 pixels, and the most common pixel color is reported in several
    representations.

    Parameters
    ----------
    url : str
        Address of the image. Missing values (None/NaN) and the empty string
        return the default result without any network access.
    timeout : float, optional
        Timeout in seconds handed to ``requests.get`` (default 3).

    Returns
    -------
    pd.Series
        Eight positional values, in this order: hue (0-360), saturation
        (0-100), value (0-100), red, green, blue, hex code ``#rrggbb`` and
        perceived brightness (0.299*R + 0.587*G + 0.114*B).
        When the URL is missing or anything fails while downloading or
        decoding, the series is ``[0, 0, 0, 0, 0, 0, "#000000", 0]``.
    """
    # Single definition of the fallback row used for missing URLs and failures.
    default_features = [0, 0, 0, 0, 0, 0, "#000000", 0]

    try:
        # Return default values if the URL is empty or invalid
        if pd.isna(url) or url == "":
            return pd.Series(default_features)

        # Download the image content; an HTTP error status counts as a failure
        # instead of trying to analyse whatever body came with it.
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        img = Image.open(BytesIO(resp.content)).convert("RGB")

        # Reduce the image to 16 colors to simplify the color space
        img = img.quantize(colors=16).convert("RGB")

        # Resize image for faster pixel analysis and find the most frequent color
        pixels = list(img.resize((25, 25)).getdata())
        r, g, b = Counter(pixels).most_common(1)[0][0]

        # Convert RGB to Hexadecimal format
        hex_code = "#{:02x}{:02x}{:02x}".format(r, g, b)

        # Convert RGB to Hue, Saturation, and Value (HSV) scale
        h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
        h, s, v = h * 360, s * 100, v * 100

        # Calculate brightness as perceived by the human eye
        perceived_bright = (r * 0.299 + g * 0.587 + b * 0.114)

        return pd.Series([h, s, v, r, g, b, hex_code, perceived_bright])

    except Exception:
        # Best effort: any download/decoding error yields the default row.
        # `Exception` (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate so a long-running loop can still be stopped.
        return pd.Series(default_features)

# Resume support: if an earlier run already created the output file, count the
# rows it holds so processing picks up right after them.
start_index = 0
if not os.path.exists(OUTPUT_CSV):
    # First run: write a header-only CSV made of the input columns followed by
    # the thumbnail feature columns.
    cols = [
        'thumb_hue', 'thumb_saturation', 'thumb_brightness',
        'thumb_r', 'thumb_g', 'thumb_b',
        'thumb_hex', 'thumb_perceived_brightness',
    ]

    pd.DataFrame(columns=[*df_process.columns, *cols]).to_csv(OUTPUT_CSV, sep=";", index=False)
else:
    df_existing = pd.read_csv(OUTPUT_CSV, sep=";")
    start_index = len(df_existing)
    print(f"{start_index} video processed. Continuing")

print(f" {len(df_process) - start_index} will be processed")

# Process the data in batches of 50: every full batch is appended to the
# output CSV, so an interrupted run only loses the rows of the unsaved batch.
batch_size = 50
batch_data = []
rows_to_process = df_process.iloc[start_index:].copy()

# Names for the eight values returned by extract_all_color_features, in the
# order the function returns them.
feature_cols = ['thumb_hue', 'thumb_saturation', 'thumb_brightness',
                'thumb_r', 'thumb_g', 'thumb_b',
                'thumb_hex', 'thumb_perceived_brightness']

for index, row in rows_to_process.iterrows():
    # Extract features for each row using the defined function
    features = extract_all_color_features(row['thumbnail_url'])

    # Original columns first, then the feature columns appended in order.
    row_data = row.to_dict()
    row_data.update(zip(feature_cols, features))

    batch_data.append(row_data)

    # Save the batch to CSV and clear memory when batch limit is reached
    if len(batch_data) >= batch_size:
        pd.DataFrame(batch_data).to_csv(OUTPUT_CSV, sep=";", mode='a', header=False, index=False)
        print(f"Progress: {index+1}/{len(df_process)} | Color: {row_data['thumb_hex']} | Sat: {row_data['thumb_saturation']:.0f}")
        batch_data = []

# Save any remaining records after the loop finishes
if batch_data:
    pd.DataFrame(batch_data).to_csv(OUTPUT_CSV, sep=";", mode='a', header=False, index=False)

# Report completion unconditionally: previously the message was skipped when
# the last batch was exactly full or when there was nothing left to process.
print(f"Done. All feature saved to the '{OUTPUT_CSV}'")

13317 video processed. Continuing
 0 will be processed
