In [None]:
!pip install kagglehub

In [None]:
import kagglehub

# Fetch the latest version of the dataset and remember where it was stored
DATASET_HANDLE = "arshkon/linkedin-job-postings"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

In [None]:
import shutil
import os

# dirs_exist_ok=True keeps this cell re-runnable: without it copytree raises
# FileExistsError as soon as "data" exists from an earlier run.
shutil.copytree(path, "data", dirs_exist_ok=True)

# Read Raw Data

In [None]:
import pandas as pd

# Load all datasets
postings = pd.read_csv('data/postings.csv')
benefits = pd.read_csv('data/jobs/benefits.csv')
salaries = pd.read_csv('data/jobs/salaries.csv')
job_industries = pd.read_csv('data/jobs/job_industries.csv')
job_skills = pd.read_csv('data/jobs/job_skills.csv')

companies = pd.read_csv('data/companies/companies.csv')
employee_counts = pd.read_csv('data/companies/employee_counts.csv')
company_industries = pd.read_csv('data/companies/company_industries.csv')
company_specialities = pd.read_csv('data/companies/company_specialities.csv')

# Seed for the per-company sampling below, so that re-running this cell
# produces the same merged table.
SAMPLE_SEED = 42

# NOTE: salaries and job_industries are loaded but not merged in this cell.

# Collapse the one-to-many job tables to one row per job_id (values gathered into lists)
benefits = benefits.groupby('job_id').agg(benefits_types=('type', list)).reset_index()
job_skills = job_skills.groupby('job_id').agg(skills_abreviation=('skill_abr', list)).reset_index()

# Same for the one-to-many company tables, keyed by company_id
company_industries = company_industries.groupby('company_id').agg(industries=('industry', list)).reset_index()
company_specialities = company_specialities.groupby('company_id').agg(specialities=('speciality', list)).reset_index()

# Keep one randomly chosen employee-count row per company. GroupBy.sample
# returns rows of the original frame, so company_id stays a regular column
# (no reliance on groupby.apply keeping the grouping column), and random_state
# makes the pick reproducible.
employee_counts = (
    employee_counts
    .groupby('company_id')
    .sample(n=1, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# Attach the per-job aggregates to the postings
postings = pd.merge(postings, benefits, on='job_id', how='left')
postings = pd.merge(postings, job_skills, on='job_id', how='left')

# Merge company-related datasets, now one row per company_id each
companies = pd.merge(companies, employee_counts, on='company_id', how='left')
companies = pd.merge(companies, company_industries, on='company_id', how='left')
companies = pd.merge(companies, company_specialities, on='company_id', how='left')

# Merge jobs and companies datasets; clashing company columns get a "_company" suffix
postings = pd.merge(postings, companies, on='company_id', how='left', suffixes=("", "_company"))

# Generate Synthetic Data

In [None]:
import json
import os
import time
import random
import vertexai
from vertexai.preview.batch_prediction import BatchPredictionJob
from google.cloud import storage


def upload_to_gcs(bucket_name, blob_name, content, content_type):
    """Store ``content`` as the object ``blob_name`` inside ``bucket_name``.

    The payload is sent with ``upload_from_string`` using the given
    ``content_type``; a confirmation line is printed once the call returns.
    """
    target = storage.Client().bucket(bucket_name).blob(blob_name)
    target.upload_from_string(content, content_type=content_type)
    print(f"File uploaded to {bucket_name}/{blob_name}")

def _build_listing_prompt(job_info):
    """Render one job record (a dict) as the text prompt sent to the model."""
    import math

    detail_lines = []
    for key, value in job_info.items():
        # Records taken from a DataFrame carry missing cells as float NaN.
        # NaN is truthy, so a plain truthiness test would emit "Key: nan".
        if isinstance(value, float) and math.isnan(value):
            continue
        # Skip remaining empty values (None, "", [], 0, ...)
        if not value:
            continue
        detail_lines.append(f"{key.replace('_', ' ').capitalize()}: {value}\n")

    return (
        "Generate a well-formatted HTML page for a job listing with the following details:\n\n"
        + "".join(detail_lines)
        + "\nMake sure the HTML page is well-structured and visually appealing."
        + "\nIn your response, make sure you generate the HTML page and nothing else."
    )


def prepare_jsonl(job_info_list, bucket_name, blob_name):
    """Prepares and uploads a JSONL file with prompts to Google Cloud Storage.

    Args:
        job_info_list: Iterable of dicts, one per job listing; each non-empty,
            non-NaN entry becomes a "Key: value" line of the prompt.
        bucket_name: Name of the destination bucket.
        blob_name: Object name of the JSONL file inside the bucket.

    One JSON document is written per listing, each on its own line, in the
    ``{"request": {"contents": [...]}}`` shape used for the Gemini Batch API.
    """
    jsonl_lines = [
        json.dumps({
            "request": {
                "contents": [{"role": "user", "parts": [{"text": _build_listing_prompt(job_info)}]}]
            }
        }) + "\n"
        for job_info in job_info_list
    ]

    # Upload JSONL file to GCS
    upload_to_gcs(bucket_name, blob_name, "".join(jsonl_lines), 'application/jsonl')

def batch_predict_gemini_createjob(input_uri: str, output_uri: str, model) -> BatchPredictionJob:
    """Submit a Gemini batch prediction job and wait until it has ended.

    Args:
        input_uri (str): URI of the input, a BigQuery table or a Google Cloud Storage file.
            Example: "gs://[BUCKET]/[DATASET].jsonl" OR "bq://[PROJECT].[DATASET].[TABLE]"

        output_uri (str): URI prefix for the output, a BigQuery table or a Google Cloud Storage location.
            Example: "gs://[BUCKET]/[OUTPUT].jsonl" OR "bq://[PROJECT].[DATASET].[TABLE]"

        model: Passed to ``BatchPredictionJob.submit`` as ``source_model``.
    Returns:
        The batch prediction job object. It is returned whether the job
        succeeded or not; a failure is only reported through a printed message.
    """
    job = BatchPredictionJob.submit(
        source_model=model,
        input_dataset=input_uri,
        output_uri_prefix=output_uri,
    )

    # Report what was submitted
    print(f"Job resource name: {job.resource_name}")
    print(f"Model resource name with the job: {job.model_name}")
    print(f"Job state: {job.state.name}")

    # Poll every five seconds until the job reports that it has ended
    while True:
        if job.has_ended:
            break
        time.sleep(5)
        job.refresh()

    # Report the outcome
    if not job.has_succeeded:
        print(f"Job failed: {job.error}")
    else:
        print("Job succeeded!")

    # Report where the results were written
    print(f"Job output location: {job.output_location}")

    return job


def load_output_and_upload_html(bucket_name, folder_name):
    """Loads JSONL results from GCS and uploads each HTML result to GCS.

    Args:
        bucket_name: Bucket holding the batch output; the HTML pages are
            written back to the same bucket.
        folder_name: Object-name prefix under which the result files live.

    Only objects whose name ends in ".jsonl" are read. Blank lines and lines
    without a usable ``response.candidates[0].content.parts[0].text`` (for
    example failed requests) are reported and skipped instead of aborting the
    whole run. Each page is uploaded as ``data/job_listing_<id>.html`` with a
    random id that is unique within this call.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    used_ids = set()
    for blob in bucket.list_blobs(prefix=folder_name):
        # The prefix may also match objects that are not result files
        if not blob.name.endswith(".jsonl"):
            continue

        # download_as_bytes replaces the deprecated download_as_string alias
        output_content = blob.download_as_bytes().decode("utf-8")

        # Split on "\n" only: JSON strings may legally contain other Unicode
        # line separators, which str.splitlines() would break apart.
        for line_number, line in enumerate(output_content.split('\n'), start=1):
            if not line.strip():
                continue
            try:
                response_data = json.loads(line)
                html_content = response_data["response"]["candidates"][0]["content"]["parts"][0]["text"]
            except (ValueError, KeyError, IndexError, TypeError) as exc:
                print(f"Skipping {blob.name} line {line_number}: no usable response ({exc!r})")
                continue

            # Random ids can repeat; redraw so one page never overwrites
            # another page produced by this same call.
            job_id = random.randint(10**6, 10**7)
            while job_id in used_ids:
                job_id = random.randint(10**6, 10**7)
            used_ids.add(job_id)

            destination_blob_name = f"data/job_listing_{job_id}.html"
            upload_to_gcs(bucket_name, destination_blob_name, html_content, 'text/html')

# Run configuration. Defined at module level (not inside main) so that later
# cells, such as the bucket utilities, can refer to the same names.
PROJECT_ID = "vertexai-explore-437408"
LOCATION = "us-central1"
BUCKET_NAME = "job-listings-data-light"
MODEL_NAME = "gemini-1.5-flash-002"
NUM_LISTINGS = 20  # number of postings (taken from the top of the table) to generate pages for


def main():
    """Generate HTML job pages for the first NUM_LISTINGS rows of ``postings``.

    Steps: write the prompts as a JSONL file to the bucket, run a batch
    prediction job over that file, then upload each generated page as an
    HTML object.

    Raises:
        RuntimeError: If the batch prediction job did not succeed.
    """
    input_blob = "batch_input/job_listings.jsonl"
    output_folder = "batch_output/"

    # Prepare job listings JSONL
    job_info = postings.iloc[:NUM_LISTINGS].to_dict(orient='records')
    prepare_jsonl(job_info, BUCKET_NAME, input_blob)

    # Create batch prediction job
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    input_uri = f"gs://{BUCKET_NAME}/{input_blob}"
    output_uri = f"gs://{BUCKET_NAME}/{output_folder}"
    job = batch_predict_gemini_createjob(input_uri, output_uri, MODEL_NAME)

    # A failed job may not have a usable output location; stop here instead of
    # scanning an unintended prefix of the bucket.
    if not job.has_succeeded:
        raise RuntimeError(f"Batch prediction job did not succeed: {job.error}")

    # Turn the gs:// URI into an object-name prefix inside the bucket
    output_location = job.output_location.replace(f"gs://{BUCKET_NAME}/", "")

    # Load output and upload HTML pages
    load_output_and_upload_html(BUCKET_NAME, output_location)


if __name__ == "__main__":
    main()


# Extra functions

In [None]:
def count_objects_in_bucket(bucket_name):
    """Count the objects in ``bucket_name``, print the total and return it."""
    bucket = storage.Client().bucket(bucket_name)

    # Walk the listing once, tallying entries as they arrive
    object_count = 0
    for _blob in bucket.list_blobs():
        object_count += 1

    print(f"Total number of objects in bucket '{bucket_name}': {object_count}")
    return object_count

count_objects_in_bucket(BUCKET_NAME)

In [None]:
def delete_bucket_contents(bucket_name):
    """Delete every object listed in ``bucket_name``, printing each name after its deletion."""
    gcs = storage.Client()
    target_bucket = gcs.get_bucket(bucket_name)

    # Remove the objects one at a time, in listing order
    for blob in target_bucket.list_blobs():
        blob.delete()
        print(f'Deleted {blob.name}')

    print(f'All contents of bucket {bucket_name} have been deleted.')

# Runs as soon as this cell executes: every object in BUCKET_NAME is deleted.
delete_bucket_contents(BUCKET_NAME)