In [None]:
#saves only 1 page with all params in json resp even when requested for 3 pages
from serpapi import GoogleSearch
import json
import os
from datetime import datetime
import time

def extract_jobs_data(api_key, num_pages=3):
    all_jobs = []
    
    for page in range(num_pages):
        # Parameters for the API request
        params = {
            'api_key': api_key,                           # Your API key
            'engine': 'google_jobs',                      # Search engine
            'q': 'software engineer',                     # Search query
            'hl': 'en',                                  # Language
            'gl': 'us',                                  # Country
            'google_domain': 'google.com',               # Google domain
            'start': page * 10 if page > 0 else None,    # Pagination
        }
        
        try:
            # Make the API request
            search = GoogleSearch(params)
            results = search.get_dict()
            
            # Check if we have job results and it's not empty
            if 'jobs_results' not in results or not results['jobs_results']:
                print(f"No more results found on page {page + 1}")
                break
            
            # Extract specific parameters from each job
            for job in results['jobs_results']:
                job_data = {
                    'title': job.get('title', 'N/A'),
                    'company': job.get('company_name', 'N/A'),
                    'location': job.get('location', 'N/A'),
                    'description': job.get('description', 'N/A')[:500] + '...' if job.get('description') else 'N/A',
                    'via': job.get('via', 'N/A'),
                    'posted_at': job.get('detected_extensions', {}).get('posted_at', 'N/A'),
                    'schedule_type': job.get('detected_extensions', {}).get('schedule_type', 'N/A'),
                    'salary': job.get('detected_extensions', {}).get('salary', 'N/A'),
                    'benefits': [ext for ext in job.get('extensions', []) if 'insurance' in ext.lower() or 'benefit' in ext.lower()],
                    'apply_link': job.get('apply_options', [{}])[0].get('link', 'N/A') if job.get('apply_options') else 'N/A'
                }
                all_jobs.append(job_data)
                
            print(f"Processed page {page + 1} - Found {len(results['jobs_results'])} jobs")
            
            # Get next page token if available
            if 'serpapi_pagination' in results and 'next_page_token' in results['serpapi_pagination']:
                params['next_page_token'] = results['serpapi_pagination']['next_page_token']
            else:
                break  # No more pages available
                
            # Add a small delay between requests
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing page {page + 1}: {str(e)}")
            continue
    
    return all_jobs

def save_to_json(jobs_data, filename='software_engineer_jobs.json'):
    """Save the extracted jobs data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(jobs_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(jobs_data)} jobs to {filename}")

def main():
    # Your SerpApi key
    API_KEY = "your key"
    
    try:
        # Extract jobs data
        print("Starting job extraction...")
        jobs_data = extract_jobs_data(API_KEY)
        
        if not jobs_data:
            print("No jobs were found!")
            return
            
        # Save to JSON file
        save_to_json(jobs_data)
        
        # Print summary
        print(f"\nExtraction completed successfully!")
        print(f"Total jobs extracted: {len(jobs_data)}")
        
        # Print sample of first job
        if jobs_data:
            print("\nSample of first job entry:")
            first_job = jobs_data[0]
            for key, value in first_job.items():
                if key == 'description':
                    print(f"{key}: {value[:150]}...")  # Show first 150 chars of description
                else:
                    print(f"{key}: {value}")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Starting job extraction...
Processed page 1 - Found 10 jobs
No more results found on page 2

Saved 10 jobs to software_engineer_jobs.json

Extraction completed successfully!
Total jobs extracted: 10

Sample of first job entry:
title: Lead Software Engineer-Java, Bank Modernization
company: Capital One
location: New York, NY
description: 114 5th Ave (22114), United States of America, New York, New York

Lead Software Engineer-Java, Bank Modernization

Do you love building and pioneerin...
via: Capital One Careers
posted_at: 5 days ago
schedule_type: Full-time and Part-time
salary: N/A
benefits: ['Health insurance']
apply_link: https://www.capitalonecareers.com/job/new-york/lead-software-engineer-java-bank-modernization/1732/73175694832?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic


In [None]:
#saves 3 pages data all params in json resp when requested for 3 pages
from serpapi import GoogleSearch
import json
import os
from datetime import datetime
import time

def extract_jobs_data(api_key, num_pages=3):
    all_jobs = []
    
    for page in range(num_pages):
        # Parameters for the API request
        params = {
            'api_key': api_key,                           # Your API key
            'engine': 'google_jobs',                      # Search engine
            'q': 'software engineer',                     # Search query
            'hl': 'en',                                  # Language
            'gl': 'us',                                  # Country
            'google_domain': 'google.com',               # Google domain
            'start': page * 10 if page > 0 else None,    # Pagination
        }
        
        try:
            # Make the API request
            search = GoogleSearch(params)
            results = search.get_dict()
            
            # Check if we have job results and it's not empty
            if 'jobs_results' not in results or not results['jobs_results']:
                print(f"No more results found on page {page + 1}")
                break
            
            # Extract specific parameters from each job
            for job in results['jobs_results']:
                job_data = {
                    'title': job.get('title', 'N/A'),
                    'company': job.get('company_name', 'N/A'),
                    'location': job.get('location', 'N/A'),
                    'description': job.get('description', 'N/A')[:500] + '...' if job.get('description') else 'N/A',
                    'via': job.get('via', 'N/A'),
                    'posted_at': job.get('detected_extensions', {}).get('posted_at', 'N/A'),
                    'schedule_type': job.get('detected_extensions', {}).get('schedule_type', 'N/A'),
                    'salary': job.get('detected_extensions', {}).get('salary', 'N/A'),
                    'benefits': [ext for ext in job.get('extensions', []) if 'insurance' in ext.lower() or 'benefit' in ext.lower()],
                    'apply_link': job.get('apply_options', [{}])[0].get('link', 'N/A') if job.get('apply_options') else 'N/A'
                }
                all_jobs.append(job_data)
                
            print(f"Processed page {page + 1} - Found {len(results['jobs_results'])} jobs")
            
            # Get next page token if available
            if 'serpapi_pagination' in results and 'next_page_token' in results['serpapi_pagination']:
                params['next_page_token'] = results['serpapi_pagination']['next_page_token']
            else:
                break  # No more pages available
                
            # Add a small delay between requests
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing page {page + 1}: {str(e)}")
            continue
    
    return all_jobs

def save_to_json(jobs_data, filename='software_engineer_jobs.json'):
    """Save the extracted jobs data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(jobs_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(jobs_data)} jobs to {filename}")

def main():
    # Your SerpApi key
    API_KEY = "your key"
    
    try:
        # Extract jobs data
        print("Starting job extraction...")
        jobs_data = extract_jobs_data(API_KEY)
        
        if not jobs_data:
            print("No jobs were found!")
            return
            
        # Save to JSON file
        save_to_json(jobs_data)
        
        # Print summary
        print(f"\nExtraction completed successfully!")
        print(f"Total jobs extracted: {len(jobs_data)}")
        
        # Print sample of first job
        if jobs_data:
            print("\nSample of first job entry:")
            first_job = jobs_data[0]
            for key, value in first_job.items():
                if key == 'description':
                    print(f"{key}: {value[:150]}...")  # Show first 150 chars of description
                else:
                    print(f"{key}: {value}")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Starting job extraction...

Checking response structure for page 1:
Pagination info found
Next page token: eyJmYyI6IkVvd0ZDc3dFUVVwSE9VcHJVRUZ5TUhVMGNtWTBVM28wVlZCMmJGSjFOVkptZVZRMWRWWjRNRFU0TmpsU2FrOWpibkpWUVVwVWFYWkVaVXR0U1ZkMVdETmlSbGhwZFcxSU9GQkpabWhTUzBOQ2VHRm5ielpYV2xCaFJ6YzFhbU5xZG1SV2FHRlphMVIxYW1KUVNtUlJiM3BvWkZORWJIcDBOVU42YjA4M05IUXdOWFY2ZVdkaVNsOXpSbEZqVG1Gck1sRndRbmh5VVVORmJsRlpZazF1UWtsSFpURmZTWEpFWTBWcWVGUlBRaTFUTVU4dE9UaDBkMWgxZFU4ek5tTlpVeko1V0ZwdU5rOW5ja1ZEY0ROVk4wRlJXbUYwUVVJeVJreGFlVlZSU25sbFMzWTJNMlJQYW1GeGFHUlhMWEZsU2xoV1NXcHdWRTVLUW5wbmR6Qm9SMkZCUjNKSmVtOVBiMEpUWTBWNFZVZExTVk5ZWVVaQ1pISllkVzh4WmpKcVRubzJXRzVyU1RObGJWOUJaVmhRU1RKU1dVSXhXbUp1Y3pOU1NtSldjRU13U2pVeGVGTmljV1Z1UVdFeFQxWnJlWGhhVVRCR01VdGxOMFZOY0RNMGVYVnVVbU54UVU5elRrazVXVTk1TUhvMWNFZzNTbEJKU1ZOT1RqaFJlbU01ZWt0UlowZDJPREpQVjJGU1MwUlFTaTFoVWt0SVpVOXlUakZqUjJWT2JYWjZNbk0xY0dSdFZVVldibEJYZFdoM1oyUlFPVU5XYUZCWGVIRkJaamx2YVVWeVQzUTNaMU4wTTFSRVpFbHFiblY2YURGblNFcFJRMDR0Vm5ORGRXUk5jbkJDZUdOa2VuaFdhMHBrWmtac2JUR

In [4]:
%pip install google-search-results

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: google-search-results
  Building wheel for google-search-results (setup.py) ... [?25ldone
[?25h  Created wheel for google-search-results: filename=google_search_results-2.4.2-py3-none-any.whl size=32003 sha256=5a87bf0a3bfafbde11a8675bf2e13569f09125c32f364a243ba333dcfd19dd94
  Stored in directory: /private/var/folders/n6/v6mbk2mj25xc5yqv9prgz43m0000gp/T/pip-ephem-wheel-cache-46un_ssq/wheels/6e/42/3e/aeb691b02cb7175ec70e2da04b5658d4739d2b41e5f73cd06f
Successfully built google-search-results
Installing collected packages: google-search-results
Successfully installed google-search-results-2.4.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install serpapi

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting serpapi
  Obtaining dependency information for serpapi from https://files.pythonhosted.org/packages/df/6a/21deade04100d64844e494353a5d65e7971fbdfddf78eb1f248423593ad0/serpapi-0.1.5-py2.py3-none-any.whl.metadata
  Downloading serpapi-0.1.5-py2.py3-none-any.whl.metadata (10 kB)
Downloading serpapi-0.1.5-py2.py3-none-any.whl (10 kB)
Installing collected packages: serpapi
Successfully installed serpapi-0.1.5
Note: you may need to restart the kernel to use updated packages.


In [None]:
#saves all 3 pages with all params in json resp
from serpapi import GoogleSearch
import json
import os
from datetime import datetime
import time

def extract_jobs_data(api_key, num_pages=3):
    all_jobs = []
    current_page = 1
    next_page_token = None
    
    while current_page <= num_pages:
        # Parameters for the API request
        params = {
            'api_key': api_key,                  
            'engine': 'google_jobs',             
            'q': 'software engineer',            
            'hl': 'en',                         
            'gl': 'us',                         
            'google_domain': 'google.com'        
        }
        
        # Add next_page_token if we have one (for pages after first)
        if next_page_token:
            params['next_page_token'] = next_page_token
        
        try:
            # Make the API request
            search = GoogleSearch(params)
            results = search.get_dict()
            
            # Debug print to see the structure (optional)
            print(f"\nChecking response structure for page {current_page}:")
            if 'serpapi_pagination' in results:
                print("Pagination info found")
                print("Next page token:", results.get('serpapi_pagination', {}).get('next_page_token', 'None'))
            
            # Check if we have job results and it's not empty
            if 'jobs_results' not in results or not results['jobs_results']:
                print(f"No more results found on page {current_page}")
                break
            
            # Extract jobs from current page
            jobs_on_this_page = results['jobs_results']
            print(f"Found {len(jobs_on_this_page)} jobs on page {current_page}")
            
            # Extract specific parameters from each job
            for job in jobs_on_this_page:
                job_data = {
                    'title': job.get('title', 'N/A'),
                    'company': job.get('company_name', 'N/A'),
                    'location': job.get('location', 'N/A'),
                    'description': job.get('description', 'N/A')[:500] + '...' if job.get('description') else 'N/A',
                    'via': job.get('via', 'N/A'),
                    'posted_at': job.get('detected_extensions', {}).get('posted_at', 'N/A'),
                    'schedule_type': job.get('detected_extensions', {}).get('schedule_type', 'N/A'),
                    'salary': job.get('detected_extensions', {}).get('salary', 'N/A'),
                    'benefits': [ext for ext in job.get('extensions', []) if 'insurance' in ext.lower() or 'benefit' in ext.lower()],
                    'apply_link': job.get('apply_options', [{}])[0].get('link', 'N/A') if job.get('apply_options') else 'N/A'
                }
                all_jobs.append(job_data)
            
            print(f"Successfully processed page {current_page} - Total jobs so far: {len(all_jobs)}")
            
            # Check for next page token
            serpapi_pagination = results.get('serpapi_pagination', {})
            next_page_token = serpapi_pagination.get('next_page_token')
            
            if not next_page_token:
                print(f"No next page token found after page {current_page}")
                break
                
            current_page += 1
            
            # Add a small delay between requests
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing page {current_page}: {str(e)}")
            break
    
    return all_jobs

def save_to_json(jobs_data, filename='software_engineer_jobs.json'):
    """Save the extracted jobs data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(jobs_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(jobs_data)} jobs to {filename}")

def main():
    # Your SerpApi key
    API_KEY = "your key"
    
    try:
        # Extract jobs data
        print("Starting job extraction...")
        jobs_data = extract_jobs_data(API_KEY, num_pages=3)  # Explicitly requesting 3 pages
        
        if not jobs_data:
            print("No jobs were found!")
            return
            
        # Save to JSON file
        save_to_json(jobs_data)
        
        # Print summary
        print(f"\nExtraction completed successfully!")
        print(f"Total jobs extracted: {len(jobs_data)}")
        
        # Print sample of first job and last job to verify pagination
        if jobs_data:
            print("\nFirst job entry:")
            first_job = jobs_data[0]
            for key, value in first_job.items():
                if key == 'description':
                    print(f"{key}: {value[:150]}...")
                else:
                    print(f"{key}: {value}")
                    
            if len(jobs_data) > 1:
                print("\nLast job entry:")
                last_job = jobs_data[-1]
                for key, value in last_job.items():
                    if key == 'description':
                        print(f"{key}: {value[:150]}...")
                    else:
                        print(f"{key}: {value}")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

In [None]:
#saves as json and csv for all 3 pages jobs data
from serpapi import GoogleSearch
import json
import csv
import os
from datetime import datetime
import time

def extract_jobs_data(api_key, num_pages=3):
    all_jobs = []
    current_page = 1
    next_page_token = None
    
    while current_page <= num_pages:
        # Parameters for the API request
        params = {
            'api_key': api_key,                  
            'engine': 'google_jobs',             
            'q': 'software engineer',            
            'hl': 'en',                         
            'gl': 'us',                         
            'google_domain': 'google.com'        
        }
        
        # Add next_page_token if we have one (for pages after first)
        if next_page_token:
            params['next_page_token'] = next_page_token
        
        try:
            # Make the API request
            search = GoogleSearch(params)
            results = search.get_dict()
            
            # Check if we have job results and it's not empty
            if 'jobs_results' not in results or not results['jobs_results']:
                print(f"No more results found on page {current_page}")
                break
            
            # Extract jobs from current page
            jobs_on_this_page = results['jobs_results']
            print(f"Found {len(jobs_on_this_page)} jobs on page {current_page}")
            
            # Extract specific parameters from each job
            for job in jobs_on_this_page:
                # Get all apply links if available
                apply_links = []
                if job.get('apply_options'):
                    for option in job['apply_options']:
                        if option.get('link'):
                            apply_links.append(f"{option.get('title', 'Unknown')}: {option.get('link')}")
                
                job_data = {
                    'title': job.get('title', 'N/A'),
                    'company': job.get('company_name', 'N/A'),
                    'location': job.get('location', 'N/A'),
                    'description': job.get('description', 'N/A')[:500] + '...' if job.get('description') else 'N/A',
                    'posted_at': job.get('detected_extensions', {}).get('posted_at', 'N/A'),
                    'apply_links': ' | '.join(apply_links) if apply_links else 'N/A'
                }
                all_jobs.append(job_data)
            
            print(f"Successfully processed page {current_page} - Total jobs so far: {len(all_jobs)}")
            
            # Check for next page token
            serpapi_pagination = results.get('serpapi_pagination', {})
            next_page_token = serpapi_pagination.get('next_page_token')
            
            if not next_page_token:
                print(f"No next page token found after page {current_page}")
                break
                
            current_page += 1
            time.sleep(2)  # Small delay between requests
            
        except Exception as e:
            print(f"Error processing page {current_page}: {str(e)}")
            break
    
    return all_jobs

def save_to_csv(jobs_data, filename='software_engineer_jobs.csv'):
    """Save the extracted jobs data to a CSV file"""
    # Define the fieldnames for the CSV
    fieldnames = ['title', 'company', 'location', 'description', 'posted_at', 'apply_links']
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            # Write the header
            writer.writeheader()
            
            # Write the job data
            for job in jobs_data:
                writer.writerow(job)
                
        print(f"\nSuccessfully saved {len(jobs_data)} jobs to {filename}")
        
    except Exception as e:
        print(f"Error saving to CSV: {str(e)}")

def save_to_json(jobs_data, filename='software_engineer_jobs.json'):
    """Save the extracted jobs data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(jobs_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(jobs_data)} jobs to {filename}")

def main():
    # Your SerpApi key
    API_KEY = "your key"
    
    try:
        # Extract jobs data
        print("Starting job extraction...")
        jobs_data = extract_jobs_data(API_KEY, num_pages=3)
        
        if not jobs_data:
            print("No jobs were found!")
            return
            
        # Save to both JSON and CSV
        save_to_json(jobs_data)
        save_to_csv(jobs_data)
        
        # Print summary
        print(f"\nExtraction completed successfully!")
        print(f"Total jobs extracted: {len(jobs_data)}")
        
        # Print sample of first job
        if jobs_data:
            print("\nSample of first job entry:")
            first_job = jobs_data[0]
            for key, value in first_job.items():
                if key == 'description':
                    print(f"{key}: {value[:150]}...")
                else:
                    print(f"{key}: {value}")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Starting job extraction...
Found 10 jobs on page 1
Successfully processed page 1 - Total jobs so far: 10
Found 10 jobs on page 2
Successfully processed page 2 - Total jobs so far: 20
Found 10 jobs on page 3
Successfully processed page 3 - Total jobs so far: 30

Saved 30 jobs to software_engineer_jobs.json

Successfully saved 30 jobs to software_engineer_jobs.csv

Extraction completed successfully!
Total jobs extracted: 30

Sample of first job entry:
title: Lead Software Engineer-Java, Bank Modernization
company: Capital One
location: New York, NY
description: 114 5th Ave (22114), United States of America, New York, New York

Lead Software Engineer-Java, Bank Modernization

Do you love building and pioneerin...
posted_at: 5 days ago
apply_links: Capital One Careers: https://www.capitalonecareers.com/job/new-york/lead-software-engineer-java-bank-modernization/1732/73175694832?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic | Dice: https://www.dice.com/job-d

In [None]:
#multi-job data extraction while above is for single file
from serpapi import GoogleSearch
import json
import csv
import os
from datetime import datetime
import time

def extract_jobs_for_title(api_key, job_title, num_pages=3):
    all_jobs = []
    current_page = 1
    next_page_token = None
    
    while current_page <= num_pages:
        # Parameters for the API request
        params = {
            'api_key': api_key,                  
            'engine': 'google_jobs',             
            'q': job_title,            
            'hl': 'en',                         
            'gl': 'us',                         
            'google_domain': 'google.com'        
        }
        
        # Add next_page_token if we have one (for pages after first)
        if next_page_token:
            params['next_page_token'] = next_page_token
        
        try:
            # Make the API request
            search = GoogleSearch(params)
            results = search.get_dict()
            
            # Check if we have job results and it's not empty
            if 'jobs_results' not in results or not results['jobs_results']:
                print(f"No more results found on page {current_page} for {job_title}")
                break
            
            # Extract jobs from current page
            jobs_on_this_page = results['jobs_results']
            print(f"Found {len(jobs_on_this_page)} jobs on page {current_page} for {job_title}")
            
            # Extract specific parameters from each job
            for job in jobs_on_this_page:
                # Get all apply links if available
                apply_links = []
                if job.get('apply_options'):
                    for option in job['apply_options']:
                        if option.get('link'):
                            apply_links.append(f"{option.get('title', 'Unknown')}: {option.get('link')}")
                
                job_data = {
                    'search_query': job_title,  # Add the search query that found this job
                    'title': job.get('title', 'N/A'),
                    'company': job.get('company_name', 'N/A'),
                    'location': job.get('location', 'N/A'),
                    'description': job.get('description', 'N/A')[:500] + '...' if job.get('description') else 'N/A',
                    'posted_at': job.get('detected_extensions', {}).get('posted_at', 'N/A'),
                    'apply_links': ' | '.join(apply_links) if apply_links else 'N/A'
                }
                all_jobs.append(job_data)
            
            print(f"Successfully processed page {current_page} for {job_title} - Total jobs so far: {len(all_jobs)}")
            
            # Check for next page token
            serpapi_pagination = results.get('serpapi_pagination', {})
            next_page_token = serpapi_pagination.get('next_page_token')
            
            if not next_page_token:
                print(f"No next page token found after page {current_page} for {job_title}")
                break
                
            current_page += 1
            time.sleep(2)  # Small delay between requests
            
        except Exception as e:
            print(f"Error processing page {current_page} for {job_title}: {str(e)}")
            break
    
    return all_jobs

def save_to_csv(jobs_data, filename='tech_jobs.csv'):
    """Save the extracted jobs data to a CSV file"""
    # Define the fieldnames for the CSV
    fieldnames = ['search_query', 'title', 'company', 'location', 'description', 'posted_at', 'apply_links']
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            # Write the header
            writer.writeheader()
            
            # Write the job data
            for job in jobs_data:
                writer.writerow(job)
                
        print(f"\nSuccessfully saved {len(jobs_data)} jobs to {filename}")
        
    except Exception as e:
        print(f"Error saving to CSV: {str(e)}")

def save_to_json(jobs_data, filename='tech_jobs.json'):
    """Save the extracted jobs data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(jobs_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(jobs_data)} jobs to {filename}")

def main():
    # Your SerpApi key
    API_KEY = "your key"
    
    # Define job titles to search for
    job_titles = [
        'software engineer',
        'data engineer',
        'data scientist'
    ]
    
    try:
        # List to store all jobs from all searches
        all_jobs = []
        
        # Extract jobs data for each job title
        print("Starting job extraction...")
        for job_title in job_titles:
            print(f"\nSearching for {job_title} positions...")
            jobs = extract_jobs_for_title(API_KEY, job_title)
            all_jobs.extend(jobs)
            print(f"Found {len(jobs)} {job_title} positions")
            time.sleep(3)  # Add delay between different job title searches
        
        if not all_jobs:
            print("No jobs were found!")
            return
            
        # Save all jobs to both JSON and CSV
        save_to_json(all_jobs)
        save_to_csv(all_jobs)
        
        # Print summary
        print(f"\nExtraction completed successfully!")
        print(f"Total jobs extracted: {len(all_jobs)}")
        
        # Print summary by job title
        for job_title in job_titles:
            count = len([job for job in all_jobs if job['search_query'] == job_title])
            print(f"- {job_title}: {count} positions")
        
        # Print sample of first job for each type
        for job_title in job_titles:
            matching_jobs = [job for job in all_jobs if job['search_query'] == job_title]
            if matching_jobs:
                print(f"\nSample of first {job_title} job entry:")
                first_job = matching_jobs[0]
                for key, value in first_job.items():
                    if key == 'description':
                        print(f"{key}: {value[:150]}...")
                    else:
                        print(f"{key}: {value}")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

In [None]:
#multi-page, multi-jobs, with proper dates for posted job added in CSV
from serpapi import GoogleSearch
import json
import csv
import os
from datetime import datetime, timedelta
import time
import re

def extract_days_ago(posted_at):
    """Extract number of days from posted_at string"""
    if not posted_at or posted_at == 'N/A':
        return None
        
    # Extract numbers using regex
    match = re.search(r'(\d+)', posted_at)
    if match:
        days = int(match.group(1))
        if 'month' in posted_at.lower():
            days = days * 30  # Approximate month to days
        elif 'week' in posted_at.lower():
            days = days * 7   # Convert weeks to days
        return days
    elif 'hour' in posted_at.lower() or 'today' in posted_at.lower():
        return 0
    return None

def extract_time_info(posted_at):
    """
    Extract time information from posted_at string
    Returns tuple of (number, unit)
    """
    if not posted_at or posted_at == 'N/A':
        return None, None
    
    # Extract numbers using regex
    match = re.search(r'(\d+)\s*(\w+)', posted_at.lower())
    if not match:
        return None, None
        
    number = int(match.group(1))
    unit = match.group(2).rstrip('s')  # remove plural 's' if present
    
    return number, unit

def calculate_posted_date(posted_at):
    """
    Calculate actual posted date from time ago format
    Keeps same date for hours, changes date for days/weeks/months
    """
    if not posted_at or posted_at == 'N/A':
        return 'N/A'
    
    number, unit = extract_time_info(posted_at)
    if not number or not unit:
        return 'N/A'
    
    today = datetime.now()
    
    # Handle different time units
    if 'hour' in unit:
        # For hours, keep today's date
        return today.strftime('%Y-%m-%d')
    elif 'day' in unit:
        posted_date = today - timedelta(days=number)
    elif 'week' in unit:
        posted_date = today - timedelta(days=number * 7)
    elif 'month' in unit:
        posted_date = today - timedelta(days=number * 30)  # approximate
    else:
        return 'N/A'
    
    return posted_date.strftime('%Y-%m-%d')

def extract_jobs_for_title(api_key, job_title, num_pages=3):
    all_jobs = []
    current_page = 1
    next_page_token = None
    
    while current_page <= num_pages:
        params = {
            'api_key': api_key,                  
            'engine': 'google_jobs',             
            'q': job_title,            
            'hl': 'en',                         
            'gl': 'us',                         
            'google_domain': 'google.com'        
        }
        
        if next_page_token:
            params['next_page_token'] = next_page_token
        
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            
            if 'jobs_results' not in results or not results['jobs_results']:
                print(f"No more results found on page {current_page} for {job_title}")
                break
            
            jobs_on_this_page = results['jobs_results']
            print(f"Found {len(jobs_on_this_page)} jobs on page {current_page} for {job_title}")
            
            for job in jobs_on_this_page:
                apply_links = []
                if job.get('apply_options'):
                    for option in job['apply_options']:
                        if option.get('link'):
                            apply_links.append(f"{option.get('title', 'Unknown')}: {option.get('link')}")
                
                posted_at = job.get('detected_extensions', {}).get('posted_at', 'N/A')
                posted_date = calculate_posted_date(posted_at)
                
                job_data = {
                    'search_query': job_title,
                    'title': job.get('title', 'N/A'),
                    'company': job.get('company_name', 'N/A'),
                    'location': job.get('location', 'N/A'),
                    'description': job.get('description', 'N/A')[:500] + '...' if job.get('description') else 'N/A',
                    'posted_at': posted_at,
                    'posted_date': posted_date,
                    'apply_links': ' | '.join(apply_links) if apply_links else 'N/A'
                }
                all_jobs.append(job_data)
            
            print(f"Successfully processed page {current_page} for {job_title} - Total jobs so far: {len(all_jobs)}")
            
            serpapi_pagination = results.get('serpapi_pagination', {})
            next_page_token = serpapi_pagination.get('next_page_token')
            
            if not next_page_token:
                print(f"No next page token found after page {current_page} for {job_title}")
                break
                
            current_page += 1
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing page {current_page} for {job_title}: {str(e)}")
            break
    
    return all_jobs

def save_to_csv(jobs_data, filename='tech_jobs.csv'):
    """Save the extracted jobs data to a CSV file"""
    fieldnames = ['search_query', 'title', 'company', 'location', 'description', 'posted_at', 'posted_date', 'apply_links']
    
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(jobs_data)
        print(f"\nSuccessfully saved {len(jobs_data)} jobs to {filename}")
        
    except Exception as e:
        print(f"Error saving to CSV: {str(e)}")

def save_to_json(jobs_data, filename='tech_jobs.json'):
    """Save the extracted jobs data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(jobs_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved {len(jobs_data)} jobs to {filename}")

def main():
    API_KEY = "your key"
    
    job_titles = [
        'software engineer',
        'data engineer',
        'data scientist'
    ]
    
    try:
        all_jobs = []
        
        print("Starting job extraction...")
        for job_title in job_titles:
            print(f"\nSearching for {job_title} positions...")
            jobs = extract_jobs_for_title(API_KEY, job_title)
            all_jobs.extend(jobs)
            print(f"Found {len(jobs)} {job_title} positions")
            time.sleep(3)
        
        if not all_jobs:
            print("No jobs were found!")
            return
            
        save_to_json(all_jobs)
        save_to_csv(all_jobs)
        
        print(f"\nExtraction completed successfully!")
        print(f"Total jobs extracted: {len(all_jobs)}")
        
        # Print summary by job title
        for job_title in job_titles:
            count = len([job for job in all_jobs if job['search_query'] == job_title])
            print(f"- {job_title}: {count} positions")
        
        # Print sample entry
        if all_jobs:
            print("\nSample job entry with actual posted date:")
            first_job = all_jobs[0]
            for key, value in first_job.items():
                if key == 'description':
                    print(f"{key}: {value[:150]}...")
                else:
                    print(f"{key}: {value}")
    
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Starting job extraction...

Searching for software engineer positions...
Found 10 jobs on page 1 for software engineer
Successfully processed page 1 for software engineer - Total jobs so far: 10
Found 10 jobs on page 2 for software engineer
Successfully processed page 2 for software engineer - Total jobs so far: 20
Found 10 jobs on page 3 for software engineer
Successfully processed page 3 for software engineer - Total jobs so far: 30
Found 30 software engineer positions

Searching for data engineer positions...
Found 10 jobs on page 1 for data engineer
Successfully processed page 1 for data engineer - Total jobs so far: 10
Found 9 jobs on page 2 for data engineer
Successfully processed page 2 for data engineer - Total jobs so far: 19
Found 8 jobs on page 3 for data engineer
Successfully processed page 3 for data engineer - Total jobs so far: 27
Found 27 data engineer positions

Searching for data scientist positions...
Found 10 jobs on page 1 for data scientist
Successfully processed 