In [None]:
# Cleaning Amplitude data, calculating click rate, and isolating UUID tag

import pandas as pd

# Load the CSV of Amplitude Data
df = pd.read_csv('/Users/parker.pape/Downloads/Data Table - Thumbnail Colorfulness Correlation Analysis.csv')

# Clean column names: collapse tabs/newlines/repeated spaces into one space, then strip
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True).str.strip()

# Show the cleaned column names so the exact labels used below can be checked
print("Cleaned column names:")
for i, col in enumerate(df.columns):
    print(f"{i}: '{col}'")

# Use the exact column names you got from above!
page_views_col = '[Amplitude] Page Views--All Users'
video_plays_col = 'Video Plays--All Users'

# Remove thousands separators (",") from the count columns, then convert to float
for count_col in (page_views_col, video_plays_col):
    df[count_col] = df[count_col].astype(str).str.replace(',', '', regex=False).astype(float)

# Drop rows with a missing count in either column
df_clean = df.dropna(subset=[page_views_col, video_plays_col])

# Guard the division below: page views is the denominator of the rate, so rows
# with zero page views are the ones that would produce inf/NaN.
has_page_views = df_clean[page_views_col] != 0

# NOTE(review): the original filter removed rows with zero video plays instead.
# That does not protect the division and drops rows whose rate would simply be 0.
# It is kept here so the set of retained rows only shrinks -- confirm whether it
# is intended and remove it if not.
has_video_plays = df_clean[video_plays_col] != 0

# .copy() makes df_clean an independent frame, so the column assignments below
# do not raise SettingWithCopyWarning.
df_clean = df_clean.loc[has_page_views & has_video_plays].copy()

# Compute conversion rate (video plays per page view)
df_clean['conversion_rate'] = df_clean[video_plays_col] / df_clean[page_views_col]

# Add UUID Tag: what is left of the CMS URL after removing the fixed admin-search prefix.
# Surrounding whitespace is stripped first, so the prefix is removed whether or not
# the cell starts with a tab character.
cms_url_prefix = 'https://cms.weather.com/admin/content?title=&type=All&status=All&langcode=All&uuid='
df_clean['UUID'] = df_clean['CMS Url'].str.strip().str.replace(cms_url_prefix, '', regex=False)

output_path = '/Users/parker.pape/Downloads/Video_Performance_Cleaned.csv'
df_clean.to_csv(output_path, index=False)

print(f"\n✅ Cleaned data saved to: {output_path}")

In [None]:
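# Manual step (done in Excel, outside this notebook): added a new column with the corresponding live URL for each video, taken from CMS data (1000 entries).
# The result was saved as Thumbnail_Colorfulness_Analysis_Live_URLs.csv, which the next cell loads.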

In [None]:
# Extracting thumbnail image from Live URLs

import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

# The image URL found in the page HTML is a low-quality rendition; rewrite its query parameters to request a higher-quality version
def upgrade_thumbnail_url(thumbnail_url, width='1920', quality='90'):
    """Return ``thumbnail_url`` with its ``width`` and ``quality`` query parameters overridden.

    Args:
        thumbnail_url: URL of the thumbnail image.
        width: Value to set for the ``width`` query parameter.
        quality: Value to set for the ``quality`` query parameter.

    Returns:
        The rebuilt URL. Scheme, host, path, params and fragment are carried
        over unchanged; only the query string is re-encoded.
    """
    parts = urlparse(thumbnail_url)

    # parse_qs yields {name: [values]}; overriding the two keys leaves every
    # other parameter in place.
    params = parse_qs(parts.query)
    params.update(width=width, quality=quality)

    # doseq=True expands the list values back into repeated name=value pairs.
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))

# Extraction of JPG from URL
def extract_video_thumbnail(url, timeout=10):
    """Fetch a page and return a high-quality URL for its thumbnail image.

    The page is downloaded and parsed, and the first ``<img>`` whose class
    attribute is exactly
    ``'Image--rounded--BnTyA Image--intrinsicRatioElement--bFd9O'`` is looked
    up. Its ``src`` is passed through ``upgrade_thumbnail_url``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole batch
            indefinitely.

    Returns:
        The upgraded thumbnail URL, or the string ``"na"`` when the request
        fails or no matching ``<img>`` with a ``src`` is found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        img_tag = soup.find('img', class_='Image--rounded--BnTyA Image--intrinsicRatioElement--bFd9O')

        if img_tag and img_tag.get('src'):
            low_quality_url = img_tag['src']
            high_quality_url = upgrade_thumbnail_url(low_quality_url)
            return high_quality_url
        else:
            return "na"

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Request failed: {e}")
        return "na"


# Load the CSV
df = pd.read_csv('/Users/parker.pape/Downloads/Thumbnail_Colorfulness_Analysis_Live_URLs.csv')

# Clean column names: collapse tabs/newlines/repeated spaces into one space, then strip
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True).str.strip()

# Remove rows with a missing live URL. .copy() makes df_clean an independent
# frame, so adding a column below does not raise SettingWithCopyWarning.
df_clean = df.dropna(subset=['Live Url']).copy()

# Add Corresponding Preview JPEG (one HTTP request per row; "na" when none is found)
df_clean['Thumbnail JPEG'] = df_clean['Live Url'].apply(extract_video_thumbnail)

# Output to CSV
output_path = '/Users/parker.pape/Downloads/Thumbnail_Colorfulness_Analysis_with_Image.csv'
df_clean.to_csv(output_path, index=False)

In [None]:
import pandas as pd
import cv2
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import BytesIO
from PIL import Image

# Function to calculate colorfulness from an image URL
def _colorfulness_from_rgb(image):
    """Compute the colorfulness score of an image array in R, G, B channel order."""
    pixels = image.astype("float")

    # Channel order is R, G, B because the array comes from a PIL image
    # converted with .convert("RGB"). The original code first converted to BGR
    # and then unpacked the planes as (R, G, B), which swapped red and blue in
    # the two formulas below.
    R, G, B = pixels[..., 0], pixels[..., 1], pixels[..., 2]

    # rg = |R - G|
    rg = np.absolute(R - G)

    # yb = |0.5*(R + G) - B|
    yb = np.absolute(0.5 * (R + G) - B)
    # NOTE(review): the absolute values are kept from the original code. This
    # formula resembles the Hasler-Suesstrunk colorfulness metric, whose
    # published definition (as far as I know) uses the signed differences --
    # confirm which is intended.

    # Compute standard deviation and mean of each opponent component
    std_rg, mean_rg = np.std(rg), np.mean(rg)
    std_yb, mean_yb = np.std(yb), np.mean(yb)

    # Combine the metrics
    std_root = np.sqrt(std_rg**2 + std_yb**2)
    mean_root = np.sqrt(mean_rg**2 + mean_yb**2)

    # Final colorfulness score
    return std_root + (0.3 * mean_root)


# Function to calculate colorfulness from an image URL
def calculate_colorfulness(image_url, timeout=10):
    """Download an image and return its colorfulness score.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole batch
            indefinitely.

    Returns:
        The colorfulness score as a float, or ``None`` when the download fails
        or the response body cannot be decoded as an image.
    """
    try:
        # Download the image from the URL
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Raise error for bad responses

        # Decode with PIL and force three channels in R, G, B order
        image_pil = Image.open(BytesIO(response.content)).convert("RGB")
    except (requests.exceptions.RequestException, OSError):
        # OSError covers PIL failing to identify or read the image data (for
        # example an HTML error page served with status 200). Previously that
        # exception propagated and aborted the whole run.
        return None

    return _colorfulness_from_rgb(np.array(image_pil))

# Load the CSV
df = pd.read_csv('/Users/parker.pape/Downloads/Thumbnail_Colorfulness_Analysis_with_Image.csv')

# Clean column names: collapse tabs/newlines/repeated spaces into one space, then strip
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True).str.strip()

# Remove rows without a usable JPEG link: the "na" placeholder (any letter case)
# and empty cells. A missing value compares as "not equal" to 'na', so it has
# to be excluded explicitly.
thumbnail_urls = df['Thumbnail JPEG']
has_thumbnail = thumbnail_urls.notna() & (thumbnail_urls.str.lower() != 'na')

# .copy() makes df_clean an independent frame, so adding a column below does
# not raise SettingWithCopyWarning.
df_clean = df.loc[has_thumbnail].copy()

# Add Corresponding Colorfulness Score (one HTTP request per row; None on failure)
df_clean['Colorfulness Score'] = df_clean['Thumbnail JPEG'].apply(calculate_colorfulness)

# Output to CSV
output_path = '/Users/parker.pape/Downloads/Thumbnail_Colorfulness_Analysis_Final.csv'
df_clean.to_csv(output_path, index=False)