In [1]:
# sponsor_processing_example.ipynb
import pandas as pd
from multi_processing.processor import LLMProcessor
from multi_processing.processor_config import ProcessorConfig
from multi_processing.llm_client import DeepSeekClient


In [2]:

# 1. Setup Configuration
config = ProcessorConfig(
    batch_size=10,                    # Process 10 videos at a time
    max_workers=100,                  # High concurrency for DeepSeek
    cache_dir="sponsor_cache",        # Cache directory
    save_interval=5,                  # Save every 5 batches
    show_progress=True,
    metrics_output_path="metrics.json"
)

# 2. Initialize DeepSeek Client
client = DeepSeekClient(
    api_key='sk-cd405682db094b6781f9f815840163d8',
    model="deepseek-chat",
    temperature=0.1
)

# 3. Initialize Processor
processor = LLMProcessor(client, config)


In [3]:
# sponsor_processing.py

import json
import pandas as pd
from typing import List, Dict, Any, Optional
from dataclasses import dataclass

@dataclass
class VideoData:
    """Structure for video data"""
    video_id: str
    title: str
    description: str
    channel_id: Optional[str] = None
    channel_title: Optional[str] = None

def create_prompt(videos: List[Dict[str, Any]], desc_length: int = 200) -> str:
    """
    Create prompt for sponsor detection
    
    Args:
        videos: List of video data dictionaries
        desc_length: Max length for description truncation
    """
    videos_text = ""
    for i, video in enumerate(videos, 1):
        description = video['description']
        if len(description) > desc_length:
            description = description[:desc_length] + "..."
            
        videos_text += f"""VIDEO {i}:
ID: {video['videoId']}
Title: {video['title']}
Description: {description}

"""
    
    prompt = f"""Analyze these {len(videos)} videos for brand sponsorships.

{videos_text}
Return a JSON object with video IDs mapping to their sponsors:
{{
    "video_sponsors": [
        {{
            "video_id": "the_video_id",
            "sponsors": [
                {{
                    "name": "Brand name (e.g., 'Surfshark' not 'surfshark vpn')",
                    "domain": "Main company domain (e.g., 'surfshark.com' not promo URLs)",
                    "evidence": "Exact text snippet showing sponsorship"
                }}
            ]
        }}
    ]
}}

Guidelines for identifying sponsorships:
- Look for direct mentions of brands with promotional intent
- Include sponsored integrations, brand deals, partnerships
- Use main company domains (e.g., 'nordvpn.com' not 'nordvpn.com/creator')
- For each brand found, use their official domain regardless of promo links
- Include multiple sponsors if present
- Ignore: merch, generic affiliate links, social media, donations, self promo

Examples of correct domain mapping:
- Surfshark promo link -> surfshark.com
- Nord VPN creator link -> nordvpn.com
- Skillshare special offer -> skillshare.com"""

    return prompt

def process_batch_response(content: str) -> Dict[str, List[Dict[str, str]]]:
    """
    Process LLM response into structured sponsor data
    
    Args:
        content: Raw LLM response text
    
    Returns:
        Dictionary mapping video IDs to sponsor lists
    """
    try:
        # Clean up response if it contains markdown code blocks
        if content.startswith("```"):
            json_start = content.find("{")
            json_end = content.rfind("}") + 1
            if json_start != -1 and json_end != -1:
                content = content[json_start:json_end]
        
        # Parse JSON response
        result = json.loads(content)
        batch_results = {}
        
        if 'video_sponsors' in result:
            for video_data in result['video_sponsors']:
                video_id = video_data['video_id']
                sponsors = video_data.get('sponsors', [])
                batch_results[video_id] = sponsors
                
        return batch_results
        
    except Exception as e:
        print(f"Error processing response: {e}")
        print(f"Raw content: {content[:200]}...")  # Print start of content for debugging
        return {}

def process_video_batch(batch: Dict[str, Any], client) -> Dict[str, Any]:
    """
    Process a single video batch for sponsor detection
    
    Args:
        batch: Dictionary containing video data
        client: LLM client instance
    """
    # Create single-item batch for prompt
    batch_list = [batch]
    
    # Generate and call prompt
    prompt = create_prompt(batch_list)
    response = client.call_api(prompt)
    
    # Return structured result
    result = {
        'video_id': batch['videoId'],
        'processed_data': response
    }
    return result

def transform_results(result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Transform LLM results into structured sponsor records
    
    Args:
        result: Dictionary containing video ID and processed data
        
    Returns:
        List of sponsor records with normalized structure
    """
    video_id = result['video_id']
    processed_data = result['processed_data']
    
    if not processed_data.get('success'):
        print(f"Processing failed for video {video_id}: {processed_data.get('error')}")
        return []
    
    # Parse sponsors from LLM response
    sponsor_data = process_batch_response(processed_data['content'])
    sponsors = sponsor_data.get(video_id, [])
    
    # Create individual records for each sponsor
    records = []
    for i, sponsor in enumerate(sponsors, 1):
        record = {
            'video_id': video_id,
            f'sponsor_{i}_name': sponsor.get('name'),
            f'sponsor_{i}_domain': sponsor.get('domain'),
            f'sponsor_{i}_evidence': sponsor.get('evidence')
        }
        records.append(record)
    
    return records

def validate_sponsor_record(record: Dict[str, Any]) -> bool:
    """
    Validate a sponsor record
    
    Args:
        record: Dictionary containing sponsor data
        
    Returns:
        True if record is valid, False otherwise
    """
    required_fields = ['video_id']
    sponsor_fields = ['name', 'domain', 'evidence']
    
    # Check required fields
    if not all(field in record for field in required_fields):
        return False
        
    # Check that at least one sponsor exists
    has_sponsor = False
    i = 1
    while f'sponsor_{i}_name' in record:
        sponsor_valid = all(
            record.get(f'sponsor_{i}_{field}') 
            for field in sponsor_fields
        )
        if sponsor_valid:
            has_sponsor = True
        i += 1
        
    return has_sponsor

def process_sponsor_batch(
    videos: List[Dict[str, Any]],
    processor,
    cache_prefix: str = "sponsor_detection"
) -> pd.DataFrame:
    """
    Process a batch of videos for sponsor detection
    
    Args:
        videos: List of video data dictionaries
        processor: LLMProcessor instance
        cache_prefix: Prefix for cache keys
    
    Returns:
        DataFrame containing processed sponsor data
    """
    # Process videos through LLM processor
    results = processor.process_batch(
        items=videos,
        process_fn=process_video_batch,
        transform_fn=transform_results,
        cache_prefix=cache_prefix
    )
    
    # Convert to DataFrame
    if not results:
        return pd.DataFrame()
        
    df = pd.DataFrame(results)
    
    # Add metadata
    df['processed_timestamp'] = pd.Timestamp.now()
    df['cache_prefix'] = cache_prefix
    
    return df

In [4]:
# test_sponsor_processing.py

import pytest

@pytest.fixture
def test_video():
    """Sample video data for testing"""
    return {
        'videoId': 'test123',
        'title': 'Test Video',
        'description': 'This video is sponsored by NordVPN. Check them out at nordvpn.com/creator'
    }

@pytest.fixture
def test_processor():
    """Set up test processor"""
    config = ProcessorConfig(
        batch_size=1,
        max_workers=1,
        cache_dir="test_cache"
    )
    client = DeepSeekClient(
        api_key='your_api_key',
        model="deepseek-chat"
    )
    return LLMProcessor(client, config)

def test_create_prompt(test_video):
    """Test prompt creation"""
    prompt = create_prompt([test_video])
    assert "VIDEO 1" in prompt
    assert test_video['videoId'] in prompt
    assert "Return a JSON object" in prompt

def test_process_batch_response():
    """Test response processing"""
    sample_response = """```json
    {
        "video_sponsors": [
            {
                "video_id": "test123",
                "sponsors": [
                    {
                        "name": "NordVPN",
                        "domain": "nordvpn.com",
                        "evidence": "sponsored by NordVPN"
                    }
                ]
            }
        ]
    }```"""
    
    result = process_batch_response(sample_response)
    assert 'test123' in result
    assert len(result['test123']) == 1
    assert result['test123'][0]['name'] == 'NordVPN'

def test_transform_results():
    """Test result transformation"""
    sample_result = {
        'video_id': 'test123',
        'processed_data': {
            'content': '{"video_sponsors": [{"video_id": "test123", "sponsors": [{"name": "NordVPN", "domain": "nordvpn.com", "evidence": "test"}]}]}',
            'success': True
        }
    }
    
    records = transform_results(sample_result)
    assert len(records) == 1
    assert records[0]['video_id'] == 'test123'
    assert records[0]['sponsor_1_name'] == 'NordVPN'

def test_end_to_end(test_video, test_processor):
    """Test full processing pipeline"""
    df = process_sponsor_batch(
        videos=[test_video],
        processor=test_processor,
        cache_prefix="test"
    )
    
    assert not df.empty
    assert 'video_id' in df.columns
    assert 'sponsor_1_name' in df.columns

if __name__ == "__main__":
    pytest.main([__file__])

NameError: name '__file__' is not defined

In [13]:

# 5. Load and Process Data
# Load your video data
df = pd.read_csv('/Users/parthkocheta/Documents/sponsorFind/sponsorFind/chunk_8_of_245.csv')

# Convert DataFrame rows to list of dicts
video_data = df.to_dict('records')

# Process videos with our framework
results = processor.process_batch(
    items=video_data,
    process_fn=process_video_batch,
    transform_fn=transform_results,
    cache_prefix='sponsor_extraction',
    output_path='sponsor_results.csv'
)

# 6. Analyze Results
results_df = pd.DataFrame(results)

# Print statistics
print("\nProcessing Statistics:")
print(f"Total videos processed: {len(df)}")
print(f"Videos with sponsors found: {len(results_df)}")

# Show sample results
print("\nSample Sponsor Results:")
print(results_df.head())

# 7. Check Processing Metrics
import json
with open("metrics.json", 'r') as f:
    metrics = json.load(f)

print("\nProcessing Metrics:")
print(f"Total processing time: {metrics['total_time']:.2f} seconds")
print(f"Average time per item: {metrics['avg_process_time']:.2f} seconds")
print(f"Cache hits: {metrics['cache_hits']}")
print(f"Total errors: {metrics['errors']}")

# 8. Additional Analysis
if results_df.empty:
    print("No results found")
else:
    # Get sponsor frequency
    sponsor_cols = [col for col in results_df.columns if 'sponsor_' in col and 'name' in col]
    all_sponsors = results_df[sponsor_cols].values.flatten()
    sponsor_counts = pd.Series(all_sponsors).value_counts().dropna()

    print("\nTop Sponsors:")
    print(sponsor_counts.head())

# 9. Save Final Results
results_df.to_csv('final_sponsor_analysis.csv', index=False)
print("\nResults saved to 'final_sponsor_analysis.csv'")

  0%|          | 0/1668 [00:03<?, ?it/s]


ZeroDivisionError: float division by zero