In [2]:
import sys
import pickle
from google.cloud import storage

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext

# Name of the GCS bucket that is listed below.
BUCKET_NAME = "ir_project_2025"

# gs:// URI prefix; each matching blob name is appended to it.
full_path = f"gs://{BUCKET_NAME}/"
paths=[]

storage_client = storage.Client()

# Collect the gs:// URI of every blob whose name contains "multistre"
# (presumably the "multistream" dump files -- confirm against the bucket).
# NOTE: the listing must go through `storage_client`; the previous code
# referenced an undefined name `client`, which raises NameError on a
# fresh kernel (Restart & Run All).
blobs_read = storage_client.list_blobs(BUCKET_NAME)
for b in blobs_read:
    if "multistre" in b.name:
        paths.append(full_path+b.name)

print("Number of files: " + str(len(paths)))

Number of files: 60


In [3]:
# Build an {id: title} dictionary from the files in `paths`, pickle it
# locally, and upload the pickle to the bucket -- but only if the destination
# object does not exist yet.
#
# Relies on names created outside this cell: `storage_client`, `BUCKET_NAME`,
# `paths`, and `spark` (expected to already exist in the kernel; it is not
# defined in the visible notebook cells).

# Define the destination path
DESTINATION_FILENAME = "id_to_title.pkl"
FULL_DESTINATION_PATH = f"id_to_title_dict/{DESTINATION_FILENAME}"

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(FULL_DESTINATION_PATH)

# Safety check: never overwrite an object left by a previous run.
# This early check avoids doing the expensive Spark work for nothing; the
# upload below additionally enforces the same rule on the server side.
if blob.exists():
    print(f"SAFETY STOP: The file '{FULL_DESTINATION_PATH}' already exists in the bucket.")
    print("To prevent accidental data loss, this code will not run. Please delete the file from the bucket or change the destination name.")
else:
    print(f"Safe to write. Destination '{FULL_DESTINATION_PATH}' does not exist yet.")

    try:
        print("Reading Parquet files...")
        df_wiki = spark.read.parquet(*paths)

        print("Extracting ID and Title pairs...")
        # Keep only the two needed columns and turn each Row into a plain
        # (id, title) tuple so the RDD is a key/value RDD.
        id_title_pairs = df_wiki.select("id", "title").rdd.map(lambda x: (x[0], x[1]))

        print("Collecting to Dictionary...")
        # collectAsMap() materializes every pair in the driver process as a
        # Python dict, so the driver needs enough memory for the whole mapping.
        id_to_title_dict = id_title_pairs.collectAsMap()

        print(f"   Collected {len(id_to_title_dict)} pairs.")

        print("Saving to local pickle file...")
        with open(DESTINATION_FILENAME, "wb") as f:
            pickle.dump(id_to_title_dict, f)

        print("Uploading...")
        # if_generation_match=0 tells GCS to accept the upload only when no
        # live object exists at this path. This closes the gap between the
        # exists() check above and the upload (the Spark job in between can
        # take a long time), so an object created meanwhile is never replaced.
        blob.upload_from_filename(DESTINATION_FILENAME, if_generation_match=0)

        print("\nSUCCESS: Dictionary created and uploaded!")
        print(f"Location: gs://{BUCKET_NAME}/{FULL_DESTINATION_PATH}")

    except Exception as e:
        print(f"\nError occurred: {str(e)}")
        # Re-raise so the cell is marked as failed and the full traceback is
        # shown; printing alone would let a failed run look like a success
        # and let "Run All" continue with a missing output file.
        raise

Safe to write. Destination 'id_to_title_dict/id_to_title.pkl' does not exist yet.
Reading Parquet files...


                                                                                

Extracting ID and Title pairs...
Collecting to Dictionary...


                                                                                

   Collected 6348910 pairs.
Saving to local pickle file...
Uploading...

SUCCESS: Dictionary created and uploaded!
Location: gs://ir_project_2025/id_to_title_dict/id_to_title.pkl
