# GAIM Notebook - End-to-End Influencer Selection

This notebook runs the complete pipeline without needing the web frontend or API server.

What it does:
- Expand campaign keywords
- Search the top N videos for each keyword (N is `TOP_VIDEOS_PER_KEYWORD`)
- Gather candidate channels and details
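- Sample comments from up to three videos per channel and compute an "organic" score (unique-author ratio and average comment length, penalized for repeated comment text)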
- Rank influencers using content match, keyword hits, comments, and optional network signal

Prerequisites:
- Python environment with project dependencies installed (`pip install -r requirements.txt`)
- `.env` file at project root with `YOUTUBE_API_KEY`



In [1]:
# Setup: imports and environment
import asyncio
import concurrent.futures
import logging
import os
import sys
from collections import Counter
from typing import Any, Dict, List, Optional

import pandas as pd
from dotenv import load_dotenv

# Local project imports
sys.path.append('./backend')

from youtube_api import YouTubeAPI
from matcher import InfluencerMatcher
from network_analyzer import NetworkAnalyzer
from database import Database

# Pull variables from a .env file into the process environment, then read the key.
load_dotenv()
YOUTUBE_API_KEY = os.environ.get('YOUTUBE_API_KEY')

# Fail fast with an actionable message when the key is missing or empty.
if YOUTUBE_API_KEY is None or YOUTUBE_API_KEY == '':
    raise RuntimeError("YOUTUBE_API_KEY not found. Create a .env at project root with YOUTUBE_API_KEY=your_key")

# Shared service objects used by the helper functions in the cells below.
yt = YouTubeAPI(api_key=YOUTUBE_API_KEY)
matcher = InfluencerMatcher()
net = NetworkAnalyzer()
db = Database()



In [2]:
# Helpers: keyword expansion, video search, comments score, selection

async def expand_keywords(campaign_text: str, seed_keywords: List[str], max_related_per_keyword: int = 5) -> List[str]:
    """Expand seed keywords into a sorted, de-duplicated list of search terms.

    For each of the first 20 seeds this adds simple spelling variants and the
    most frequent words found in the titles/descriptions of the top videos
    returned for that seed.

    Args:
        campaign_text: Free-text campaign description. Its words (3+ characters)
            are used as seeds only when ``seed_keywords`` has no usable entry.
        seed_keywords: Seed phrases; blank entries are ignored.
        max_related_per_keyword: Number of videos fetched per seed and the
            number of frequent words kept per seed.

    Returns:
        Alphabetically sorted list of unique terms.

    Note:
        Related-word discovery is best effort: if the search for a seed fails,
        the failure is logged and the seed contributes only its variants.
    """
    punctuation = '.,!?:;"\'()[]{}'
    # Built once; the original rebuilt this set for every word it examined.
    stopwords = frozenset({
        'with', 'from', 'that', 'this', 'have', 'your', 'about', 'into',
        'over', 'under', 'what', 'when', 'where', 'which', 'there',
    })
    log = logging.getLogger(__name__)

    # Blank seeds would otherwise inject '' as a search term.
    seeds = [s for s in (seed_keywords or []) if s and s.strip()]
    if not seeds:
        tokens = {tok.strip(punctuation) for tok in (campaign_text or '').lower().split()}
        # Sorted so the 20-seed cap below picks the same seeds on every run
        # (set iteration order varies between interpreter sessions).
        seeds = sorted(tok for tok in tokens if len(tok) >= 3)

    def variants(k: str) -> List[str]:
        """Return ``k`` plus naive stem/plural/spacing variants."""
        v = {k}
        if k.endswith('ing') and len(k) > 3:  # never emit an empty stem
            v.add(k[:-3])
        if not k.endswith('s'):
            v.add(k + 's')
        if '-' in k:
            v.add(k.replace('-', ' '))
        if ' ' in k:
            parts = k.split(' ')
            if len(parts) == 2:
                v.add(''.join(parts))
        return list(v)

    all_terms = set()
    for s in seeds[:20]:
        all_terms.update(variants(s.lower()))
        try:
            vids = await yt.search_videos(s, max_results=max_related_per_keyword, order='relevance')
            freq: Counter = Counter()
            for v in vids or []:
                # `or ''` tolerates fields that are present but None.
                text = f"{v.get('title') or ''} {v.get('description') or ''}".lower()
                for w in text.split():
                    w = w.strip(punctuation)
                    if len(w) >= 4 and w not in stopwords:
                        freq[w] += 1
            # most_common keeps first-seen order among equal counts, matching
            # a stable descending sort over insertion order.
            all_terms.update(w for w, _ in freq.most_common(max_related_per_keyword))
        except Exception as exc:
            # Best effort: keep the variants, but do not hide the failure.
            log.warning("Related-term lookup failed for %r: %s", s, exc)
    return sorted(all_terms)

async def search_top_videos(keywords: List[str], top_videos_per_keyword: int = 5, order: str = 'viewCount') -> Dict[str, List[Dict[str, Any]]]:
    """Search videos for each keyword and persist the hits.

    Only the first 20 keywords are searched. Every returned video that carries
    a non-empty ``channel_id`` is stored through ``db.save_videos``.

    Args:
        keywords: Search terms.
        top_videos_per_keyword: ``max_results`` passed to each search.
        order: Ordering passed through to ``yt.search_videos``.

    Returns:
        Mapping of keyword to the list of video dicts returned for it.
    """
    results: Dict[str, List[Dict[str, Any]]] = {}
    for keyword in keywords[:20]:
        found = await yt.search_videos(keyword, max_results=top_videos_per_keyword, order=order)
        results[keyword] = found
        for video in found:
            channel_id = video.get('channel_id')
            if not channel_id:
                continue  # nothing to file the video under
            db.save_videos(channel_id, [video])
    return results

async def comment_organic_score(video_ids: List[str], max_comments_per_video: int = 50) -> float:
    """Score how "organic" the comments on a few videos look, in [0.0, 1.0].

    Comments are fetched for at most the first three ``video_ids`` and stored
    via ``db.save_comments``. A failed fetch is treated as "no comments".

    The score blends the share of distinct authors (weight 0.6) with an
    average-length score (weight 0.4; comment text is truncated to 200
    characters, and 10..120 characters maps to 0..1), then scales it down by
    up to half when the same comment text (8+ characters) repeats.

    Args:
        video_ids: Candidate video ids; only the first three are sampled.
        max_comments_per_video: ``max_results`` for each comment fetch.

    Returns:
        The clamped score, or the neutral value 0.5 when no comments were found.
    """
    comment_count = 0
    char_count = 0
    authors = set()
    text_counts: Dict[str, int] = {}

    for video_id in video_ids[:3]:
        try:
            fetched = await yt.get_video_comments(video_id, max_results=max_comments_per_video)
        except Exception:
            fetched = []
        db.save_comments(video_id, fetched)
        comment_count += len(fetched)
        for comment in fetched:
            authors.add(comment.get('author', ''))
            snippet = (comment.get('text', '') or '')[:200]
            char_count += len(snippet)
            normalized = snippet.lower().strip()
            if len(normalized) >= 8:
                text_counts[normalized] = text_counts.get(normalized, 0) + 1

    # No evidence either way: return the neutral midpoint.
    if comment_count == 0:
        return 0.5

    uniq_ratio = len(authors) / comment_count
    avg_len = char_count / comment_count

    # Each extra copy of the most repeated comment adds 0.1 penalty, capped at 1.0.
    most_repeats = max(text_counts.values()) if text_counts else 1
    rep_penalty = min(max(most_repeats - 1, 0) / 10.0, 1.0)

    len_score = max(min((avg_len - 10) / (120 - 10), 1.0), 0.0)

    score = 0.6 * uniq_ratio + 0.4 * len_score
    score = score * (1.0 - 0.5 * rep_penalty)
    return max(min(score, 1.0), 0.0)

async def select_influencers(
    campaign_text: str,
    seed_keywords: List[str],
    keywords: Optional[List[str]] = None,
    top_videos_per_keyword: int = 5,
    past_videos_to_check: int = 5,
    top_n: int = 20,
    use_network: bool = False,
    max_comments_per_video: int = 50,
):
    """Rank candidate channels for a campaign and return the best ``top_n``.

    Pipeline: expand keywords (unless given) -> search videos per keyword ->
    group videos by channel -> fetch channel details -> comment "organic"
    score per channel -> matcher score -> optional PageRank -> weighted blend.

    Args:
        campaign_text: Free-text campaign description (used for keyword expansion).
        seed_keywords: Seed phrases for keyword expansion.
        keywords: Explicit keyword list; when empty/None, keywords are expanded
            from ``campaign_text`` and ``seed_keywords``.
        top_videos_per_keyword: Videos requested per keyword search.
        past_videos_to_check: Accepted for interface compatibility; not used
            by this function.
        top_n: Maximum number of ranked results to return.
        use_network: When True (and more than one channel was found), add a
            PageRank term from ``net.build_network``.
        max_comments_per_video: Comments requested per sampled video.

    Returns:
        ``(ranked, keywords)``: ``ranked`` is a list of matcher result dicts
        extended with ``hit_score``, ``comment_score``, ``network_score`` and
        ``final_score``, sorted by ``final_score`` descending; ``keywords`` is
        the keyword list that was used.
    """
    # 1) keywords
    if not keywords:
        keywords = await expand_keywords(campaign_text, seed_keywords)

    # 2) search videos
    kw_videos = await search_top_videos(keywords, top_videos_per_keyword, order='viewCount')

    # 3) aggregate channels: hits per channel and the videos that produced them
    channel_hits: Dict[str, int] = {}
    channel_to_vids: Dict[str, List[Dict[str, Any]]] = {}
    for vids in kw_videos.values():
        for v in vids:
            cid = v.get('channel_id', '')
            if cid:
                channel_hits[cid] = channel_hits.get(cid, 0) + 1
                channel_to_vids.setdefault(cid, []).append(v)

    # Nothing to rank: skip the channel-details and matcher calls entirely.
    if not channel_hits:
        return [], keywords

    # 4) channel details
    channel_ids = list(channel_hits)
    channels_data = await yt.get_channels_details(channel_ids)

    # 5) comments organic score per channel (sampled from up to three videos)
    channel_comment_score: Dict[str, float] = {}
    for cid, vids in channel_to_vids.items():
        sample_video_ids = [v.get('video_id') for v in vids if v.get('video_id')][:3]
        channel_comment_score[cid] = await comment_organic_score(sample_video_ids, max_comments_per_video)

    # 6) base match
    matches = matcher.find_matches(
        channels_data=channels_data,
        brand_keywords=keywords or [],
        target_audience=None,
    )

    # 7) optional network
    pagerank_scores: Dict[str, float] = {}
    if use_network and len(channels_data) > 1:
        net_data = net.build_network(channels_data)
        pr = net_data.get('metrics', {}).get('pagerank', {})
        pagerank_scores = {c['channel_id']: pr.get(c['channel_id'], 0.0) for c in channels_data}

    # 8) final score
    # Normalize hits by the number of keywords actually searched: the search
    # step caps the keyword list, so dividing by len(keywords) would understate
    # hit_score whenever expansion produced more keywords than were searched.
    searched_keywords = max(len(kw_videos), 1)
    enriched = []
    for m in matches:
        cid = m['channel_id']
        hit_score = min(channel_hits.get(cid, 0) / searched_keywords, 1.0)
        comment_score = channel_comment_score.get(cid, 0.5)
        net_score = pagerank_scores.get(cid, 0.0)
        final_score = 0.6 * m['match_score'] + 0.15 * hit_score + 0.2 * comment_score + 0.05 * net_score
        enriched.append({
            **m,
            'hit_score': round(hit_score, 3),
            'comment_score': round(comment_score, 3),
            'network_score': round(net_score, 3),
            'final_score': round(final_score, 4),
        })

    enriched.sort(key=lambda x: x['final_score'], reverse=True)
    return enriched[:top_n], keywords



In [3]:
# Configure your campaign here
# Free-text brief. Its words are used as keyword seeds only when
# seed_keywords is empty (see expand_keywords).
campaign_text = "Launching an eco-friendly fitness water bottle for gym-goers"
# Seed phrases that keyword expansion starts from.
seed_keywords = ["eco fitness", "water bottle", "gym"]

# Controls
# Videos requested per keyword search.
TOP_VIDEOS_PER_KEYWORD = 5
# Passed to select_influencers as past_videos_to_check; the function body in
# this notebook does not read it.
PAST_VIDEOS_TO_CHECK = 5
# Maximum number of ranked channels returned.
TOP_N = 15
# When True, a PageRank term (weight 0.05) is added to the final score.
USE_NETWORK = True
# Comments requested for each sampled video when computing the organic score.
MAX_COMMENTS_PER_VIDEO = 50



In [4]:
# Run the pipeline

def _run_blocking(coro):
    """Run *coro* to completion and return its result.

    asyncio.run() raises RuntimeError when an event loop is already running in
    the current thread, which is always the case inside a Jupyter/IPython
    kernel. In that situation the coroutine is driven by asyncio.run() on a
    private loop in a short-lived worker thread instead. (In a notebook,
    `await select_influencers(...)` at the top level of a cell is an
    alternative.)
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running (plain script): asyncio.run() is safe here.
        return asyncio.run(coro)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        # .result() blocks until done and re-raises any exception from coro.
        return pool.submit(asyncio.run, coro).result()


ranked, used_keywords = _run_blocking(select_influencers(
    campaign_text=campaign_text,
    seed_keywords=seed_keywords,
    keywords=None,  # set to a custom list to skip expansion
    top_videos_per_keyword=TOP_VIDEOS_PER_KEYWORD,
    past_videos_to_check=PAST_VIDEOS_TO_CHECK,
    top_n=TOP_N,
    use_network=USE_NETWORK,
    max_comments_per_video=MAX_COMMENTS_PER_VIDEO,
))

print(f"Keywords used ({len(used_keywords)}):", ', '.join(used_keywords[:20]), ('...' if len(used_keywords) > 20 else ''))

# Display results
if ranked:
    df = pd.DataFrame(ranked)
    wanted_columns = [
        'channel_id', 'title', 'country', 'subscriber_count', 'video_count', 'view_count',
        'match_score', 'hit_score', 'comment_score', 'network_score', 'final_score'
    ]
    # Show only the columns that exist, so a field the matcher did not
    # return cannot raise KeyError and hide the whole table.
    display(df[[col for col in wanted_columns if col in df.columns]])
else:
    print("No results.")



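Output recorded from an earlier run (Jupyter already has an event loop running, so `asyncio.run()` refuses to start another): RuntimeError: asyncio.run() cannot be called from a running event loop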

In [None]:
# Optional: Save ranked results to CSV
SAVE_CSV = True
CSV_PATH = 'ranked_influencers.csv'

# Write the file only when there are results and saving is switched on.
if ranked:
    if SAVE_CSV:
        pd.DataFrame(ranked).to_csv(path_or_buf=CSV_PATH, index=False)
        print(f"Saved to {CSV_PATH}")



In [None]:
# Cleanup aiohttp session

def _run_blocking(coro):
    """Run *coro* to completion and return its result.

    asyncio.run() raises RuntimeError when an event loop is already running in
    the current thread (always true inside a Jupyter/IPython kernel), so in
    that case the coroutine is run via asyncio.run() in a worker thread.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop running (plain script): asyncio.run() is safe here.
        return asyncio.run(coro)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        # .result() blocks until done and re-raises any exception from coro.
        return pool.submit(asyncio.run, coro).result()


_run_blocking(yt.close())
print("Done.")

