In [7]:
#!pip install google-search-results

In [8]:
import json
import os
import time
from serpapi import GoogleSearch
import requests
import hashlib
from pathlib import Path
import pandas as pd

In [9]:
base_dir = Path("llama_data")
src_dir = base_dir / "src"
results_dir = base_dir / "results"

In [None]:
base_dir.mkdir(exist_ok=True)
src_dir.mkdir(exist_ok=True)
results_dir.mkdir(exist_ok=True)


In [None]:
with open('generated_outlines.json', 'r') as file:
    content = file.read()
    data = json.loads(content)

In [12]:
print(f"Loaded {len(data)} report outlines")


Loaded 5 report outlines


In [None]:
print("\nSample report title:", data[0].get('original_goal', {}).get('Report Title', 'No title'))
print("Sample queries:")
for query in data[0].get('Web Queries', [])[:2]:
    print(f"- {query.get('query')}: {query.get('purpose')}")


Sample report title: Llama 3.3: A Revolutionary Leap in AI
Sample queries:
- Llama 3.3 new features and enhancements: To gather information on the new features and enhancements in Llama 3.3
- Llama 3.3 vs Llama 3.1 performance comparison: To gather information on the performance comparison between Llama 3.3 and Llama 3.1


In [None]:
all_queries = []

In [15]:
for report_index, report_data in enumerate(data):
    report_title = report_data.get('original_goal', {}).get('Report Title', f"Report {report_index}")
    
    for query_index, query_data in enumerate(report_data.get('Web Queries', [])):
        query = query_data.get('query', '')
        purpose = query_data.get('purpose', '')
        
        all_queries.append({
            'report_index': report_index,
            'report_title': report_title,
            'query_index': query_index,
            'query': query,
            'purpose': purpose
        })

In [None]:
queries_df = pd.DataFrame(all_queries)
print(f"Total queries extracted: {len(queries_df)}")
queries_df.head()


Total queries extracted: 15


Unnamed: 0,report_index,report_title,query_index,query,purpose
0,0,Llama 3.3: A Revolutionary Leap in AI,0,Llama 3.3 new features and enhancements,To gather information on the new features and ...
1,0,Llama 3.3: A Revolutionary Leap in AI,1,Llama 3.3 vs Llama 3.1 performance comparison,To gather information on the performance compa...
2,0,Llama 3.3: A Revolutionary Leap in AI,2,Cost of running Llama 3.3 on cloud vs local in...,To gather information on the cost-effectivenes...
3,1,Llama 3.3 vs Llama 3.1: A Comparative Analysis,0,Llama 3.3 new features and improvements,To gather information on new features and impr...
4,1,Llama 3.3 vs Llama 3.1: A Comparative Analysis,1,Llama 3.1 vs Llama 3.3 performance comparison,To gather information on performance differenc...


In [28]:
SERPAPI_KEY = ""
SERPAPI_KEY

''

In [None]:
def search_with_serpapi(query, num_results=5):
    print(f"Searching for: {query}")
    
    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_KEY,
        "num": num_results,
    }
    
    search = GoogleSearch(params)
    results = search.get_dict()
    
    # Check if we have organic results
    if "organic_results" not in results:
        print(f"Warning: No organic results found for query: {query}")
        return []
    
    return results["organic_results"]

In [None]:
def fetch_html(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching HTML from {url}: {str(e)}")
        return None

In [None]:
def save_html(html_content, report_index, report_title, query_index, query, result_index, title, url):
    if html_content is None:
        return None
    
    sanitized_report = report_title.replace(" ", "_").replace(":", "").replace("/", "")[:30]
    sanitized_query = query.replace(" ", "_").replace(":", "").replace("/", "")[:30]
    
    url_hash = hashlib.md5(url.encode()).hexdigest()[:8]

    report_dir = results_dir / f"report_{report_index}_{sanitized_report}"
    report_dir.mkdir(exist_ok=True)
    
    query_dir = report_dir / f"query_{query_index}_{sanitized_query}"
    query_dir.mkdir(exist_ok=True)
    
    sanitized_title = ''.join(c if c.isalnum() or c in ['_', '-'] else '_' for c in title)[:30]
    filename = f"result_{result_index}_{url_hash}_{sanitized_title}.html"
    filepath = query_dir / filename

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(html_content)
    
    metadata = {
        "report_index": report_index,
        "report_title": report_title,
        "query_index": query_index,
        "query": query,
        "result_index": result_index,
        "title": title,
        "url": url,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }
    
    metadata_path = query_dir / f"result_{result_index}_{url_hash}_metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)
    
    return str(filepath)

In [None]:
def process_all_queries(queries_df):
    results = []
    
    for index, row in queries_df.iterrows():
        print(f"\nProcessing query {index + 1}/{len(queries_df)}")
        print(f"Report: {row['report_title']}")
        print(f"Query: {row['query']}")
        
        search_results = search_with_serpapi(row['query'])
        
        query_results = []
        for result_index, result in enumerate(search_results):
            title = result.get('title', 'No Title')
            url = result.get('link', '')
            snippet = result.get('snippet', '')
            
            print(f"  Result {result_index + 1}: {title[:50]}...")
            
            html_content = fetch_html(url)
            filepath = save_html(
                html_content, 
                row['report_index'], 
                row['report_title'],
                row['query_index'], 
                row['query'], 
                result_index, 
                title, 
                url
            )
            
            result_info = {
                "result_index": result_index,
                "title": title,
                "url": url,
                "snippet": snippet,
                "filepath": filepath
            }
            
            query_results.append(result_info)
            
            # Timeout
            time.sleep(1)
        
        query_result = {
            "report_index": row['report_index'],
            "report_title": row['report_title'],
            "query_index": row['query_index'],
            "query": row['query'],
            "purpose": row['purpose'],
            "results": query_results
        }
        
        results.append(query_result)
        
        with open(base_dir / "results_so_far.json", "w") as f:
            json.dump(results, f, indent=2)
    
    return results


In [24]:
results = process_all_queries(queries_df)


Processing query 1/15
Report: Llama 3.3: A Revolutionary Leap in AI
Query: Llama 3.3 new features and enhancements
Searching for: Llama 3.3 new features and enhancements
  Result 1: Introducing the new Llama 3.3: Features and Overvi...
  Result 2: What is Meta Llama 3.3 70B? Features, Use Cases & ...
  Result 3: Key Features and Improvements in LLaMA 3.3...
  Result 4: Everything You Need to Know About Llama 3.3 | by A...

Processing query 2/15
Report: Llama 3.3: A Revolutionary Leap in AI
Query: Llama 3.3 vs Llama 3.1 performance comparison
Searching for: Llama 3.3 vs Llama 3.1 performance comparison
  Result 1: Llama 3 vs 3.1 vs 3.2 : r/LocalLLaMA...
  Result 2: Llama 3.3 70B Instruct vs Llama 3.1 405B Instruct...
  Result 3: Choosing the Best Llama Model: Llama 3 vs 3.1 vs 3...
  Result 4: Llama 3.3 just dropped — is it better than GPT-4 o...
  Result 5: Llama 3 vs Llama 3.1 : Which is Better for Your AI...

Processing query 3/15
Report: Llama 3.3: A Revolutionary Leap in AI
Query:

In [None]:
def analyze_results():

    try:
        with open(base_dir / "results_so_far.json", "r") as f:
            results = json.load(f)
        
        total_results = sum(len(query["results"]) for query in results)
        print(f"Total queries processed: {len(results)}")
        print(f"Total search results fetched: {total_results}")
        
        summary_data = []
        for query in results:
            report_title = query["report_title"]
            query_text = query["query"]
            results_count = len(query["results"])
            
            summary_data.append({
                "Report": report_title,
                "Query": query_text,
                "Results Count": results_count
            })
        
        summary_df = pd.DataFrame(summary_data)
        return summary_df
    except FileNotFoundError:
        print("No results file found. Run the processing first.")
        return None


In [26]:
summary_df = analyze_results()
# if summary_df is not None:
#     summary_df

Total queries processed: 15
Total search results fetched: 70
