In [None]:
# Sanity check: list the Dataproc clusters in region us-central1 via the gcloud CLI.
# If the following command generates an error, you probably didn't enable
# the cluster security option "Allow API access to all Google Cloud services"
# under Manage Security → Project Access when setting up the cluster.
!gcloud dataproc clusters list --region us-central1

In [None]:
# !pip install -q google-cloud-storage==1.43.0
# Quietly install the graphframes Python package into the notebook environment.
# NOTE(review): the version is not pinned, so reruns may pick up a different release.
!pip install -q graphframes

In [None]:
import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import pandas as pd
from google.cloud import storage

import hashlib
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

nltk.download('stopwords')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [None]:
# List the jars matching graph* in Spark's jar directory.
# If nothing prints here you forgot to include the initialization script when starting the cluster.
!ls -l /usr/lib/spark/jars/graph*

In [None]:
# Display the `spark` object. It is not created in this notebook — presumably the
# SparkSession injected by the Dataproc PySpark kernel (TODO confirm).
spark

In [None]:
# Name of the GCS bucket (no gs:// prefix) this notebook reads from and uploads to.
BUCKET_NAME = 'ir_3_207472234'

# Folder inside the bucket that receives the .npy artifacts saved at the end.
relative_path = "meta_data"

# Google Cloud Storage client created with the environment's default credentials.
client = storage.Client()

In [None]:
# Glob matching every preprocessed parquet file in the project bucket.
# Built from BUCKET_NAME so the bucket is configured in exactly one place.
paths = f"gs://{BUCKET_NAME}/multistream*_preprocessed.parquet"

# Load all matching parquet files as a single Spark DataFrame.
parquetFile = spark.read.parquet(paths)

# RDD of rows carrying only the `id` column.
doc_ids_rdd = parquetFile.select("id").rdd

In [None]:
# Show how many rows were loaded from the parquet files.
parquetFile.count()

In [None]:
# Peek at the first five id rows to eyeball their shape.
doc_ids_rdd.take(5)

In [None]:
# RDD of rows with columns in the order (text, id); later cells index them as x[0], x[1].
doc_text_rdd = parquetFile.select("text", "id").rdd

In [None]:
# Peek at one (text, id) row.
doc_text_rdd.take(1)

In [None]:
# Switch to /home/dataproc (quietly) and check that inverted_index_gcp.py is there.
# If nothing prints here you forgot to upload the file inverted_index_gcp.py to the home dir.
%cd -q /home/dataproc
!ls inverted_index_gcp.py

In [None]:
# Adding our python module to the cluster: register the file with the SparkContext,
# then put the SparkFiles root directory first on sys.path so the module can be imported.
# `sc` is not created in this notebook — presumably injected by the kernel (TODO confirm).
sc.addFile("/home/dataproc/inverted_index_gcp.py")
sys.path.insert(0,SparkFiles.getRootDirectory())

In [None]:
from inverted_index_gcp import InvertedIndex as idx

In [None]:
import numpy as np
from pyspark.sql import functions as F

def ids_df_stats_and_sorted(doc_ids_df):
    """
    Collect the distinct, non-null doc ids of a DataFrame, sorted ascending.

    doc_ids_df: Spark DataFrame with a column 'id' (any numeric type; cast to long here)

    Returns:
      ids_sorted_np: numpy array sorted ascending; uint32 when every id fits,
                     uint64 otherwise (int64 if any id is negative)
      N            : distinct doc count
      max_id       : max doc id
      min_id       : min doc id

    Raises:
      ValueError: if the DataFrame contains no non-null id.
    """
    # Make sure we only work with one column and correct type
    ids_df = doc_ids_df.select(F.col("id").cast("long").alias("id")).filter(F.col("id").isNotNull())

    # Spark deduplicates and sorts distributed; the result is collected once.
    # Note: .toPandas() can be memory-heavy; .collect() is fine for 6M on a big VM.
    rows = ids_df.dropDuplicates(["id"]).orderBy("id").select("id").collect()
    ids_int64 = np.array([r["id"] for r in rows], dtype=np.int64)

    if ids_int64.size == 0:
        # Previously this surfaced as an opaque `int(None)` TypeError.
        raise ValueError("doc_ids_df contains no non-null ids")

    # The ids are distinct and sorted, so the stats come straight from the array.
    # No separate aggregate job is needed, and no cached DataFrame is left behind.
    N = int(ids_int64.shape[0])
    min_id = int(ids_int64[0])
    max_id = int(ids_int64[-1])

    # Choose smallest dtype that fits; negative ids cannot live in an unsigned dtype.
    if min_id < 0:
        target_dtype = np.int64
    elif max_id <= np.iinfo(np.uint32).max:
        target_dtype = np.uint32
    else:
        target_dtype = np.uint64
    ids_sorted_np = ids_int64.astype(target_dtype, copy=False)

    return ids_sorted_np, N, max_id, min_id

def ids_rdd_stats_and_sorted(doc_ids_rdd):
    """
    Collect the distinct, non-null doc ids of an RDD, sorted ascending.

    doc_ids_rdd: RDD[int] or RDD[Row]; elements with an `id` attribute contribute
                 that attribute, anything else is used as the id itself.

    Returns:
      ids_sorted_np: int64 numpy array of distinct ids, sorted ascending
      N            : distinct doc count
      max_id       : max doc id
      min_id       : min doc id

    Raises:
      ValueError: if no non-null id is present.
    """
    # doc_ids_rdd can be RDD[int] or RDD[Row] – handle both:
    sorted_ids = doc_ids_rdd.map(lambda x: x.id if hasattr(x, "id") else x) \
                            .filter(lambda x: x is not None) \
                            .map(lambda x: int(x)) \
                            .distinct() \
                            .sortBy(lambda x: x) \
                            .collect()

    ids_sorted_np = np.array(sorted_ids, dtype=np.int64)
    if ids_sorted_np.size == 0:
        raise ValueError("doc_ids_rdd contains no non-null ids")

    # The array is distinct and sorted, so count/min/max are read off it directly.
    # This replaces three extra Spark jobs and a cached RDD that was never released.
    N = int(ids_sorted_np.shape[0])
    min_id = int(ids_sorted_np[0])
    max_id = int(ids_sorted_np[-1])
    return ids_sorted_np, N, max_id, min_id

def build_docid_to_pos(ids_sorted_np, max_id):
    """
    Build a dense lookup table from doc id to its index in `ids_sorted_np`.

    ids_sorted_np: sorted numpy array of distinct doc ids
    max_id: maximum doc id (int)

    Returns:
      docid_to_pos: int32 numpy array of length max_id+1; entry [doc_id] holds the
                    position of doc_id in ids_sorted_np, and -1 for ids not present
    """
    n_ids = ids_sorted_np.shape[0]

    # Start with every slot marked "absent".
    lookup = np.empty((max_id + 1,), dtype=np.int32)
    lookup.fill(-1)

    # Scatter 0..n_ids-1 into the slots addressed by the ids themselves.
    positions = np.arange(n_ids, dtype=np.int32)
    slots = ids_sorted_np.astype(np.int64)
    lookup[slots] = positions
    return lookup

import os
import numpy as np
import subprocess
import tempfile

def save_numpy_to_gcs(arr: np.ndarray, filename: str, bucket_name: str, relative_path: str):
    """
    Saves a numpy array to GCS in .npy format.

    The array is written to a temporary local directory and copied to
    gs://{bucket_name}/{relative_path}/{filename} with `gsutil cp`.

    arr            : numpy array
    filename       : e.g. 'docid_to_pos.npy' (used verbatim, with or without extension)
    bucket_name    : GCS bucket name (no gs://)
    relative_path  : path inside bucket, e.g. 'metadata/body'

    Raises:
      subprocess.CalledProcessError: if `gsutil cp` exits with a non-zero status.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        local_path = os.path.join(tmpdir, filename)
        # Write through an open handle: np.save(<path>) appends ".npy" when the
        # name lacks it, which would leave `local_path` pointing at no file.
        with open(local_path, "wb") as fh:
            np.save(fh, arr)

        gcs_path = f"gs://{bucket_name}/{relative_path}/{filename}"
        subprocess.check_call(["gsutil", "cp", local_path, gcs_path])

        print(f"Saved {filename} → {gcs_path}")

# Token pattern: one of '#', '@' or a word character, followed by 2 to 24 word
# characters, each optionally preceded by an apostrophe or hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

# NLTK's English stopword list.
english_stopwords = frozenset(stopwords.words('english'))

# Extra words dropped in addition to the NLTK list.
corpus_stopwords = [
    "category",
    "references",
    "also",
    "external",
    "links",
    "may",
    "first",
    "see",
    "history",
    "people",
    "one",
    "two",
    "part",
    "thumb",
    "including",
    "second",
    "following",
    "many",
    "however",
    "would",
    "became",
]

# Union of both lists, as an immutable set for fast membership tests.
all_stopwords = english_stopwords | frozenset(corpus_stopwords)

In [None]:
# Collect the sorted distinct doc ids together with their count, max and min,
# then build the dense doc_id -> position lookup array from them.
ids_sorted_np, N, max_id, min_id = ids_rdd_stats_and_sorted(doc_ids_rdd)
docid_to_pos = build_docid_to_pos(ids_sorted_np, max_id)

In [None]:
# N = ids_sorted_np.shape[0]

# `idx` is already bound to the name `InvertedIndex` from inverted_index_gcp, so
# `idx.InvertedIndex` only resolves if that object exposes a nested attribute of
# the same name. Fall back to `idx` itself so either layout of the module works.
index_cls = getattr(idx, "InvertedIndex", idx)

# Read the previously written index named "index" from indexes/postings_gcp in the bucket.
index = index_cls.read_index(
    base_dir="indexes/postings_gcp",
    name="index",
    bucket_name=BUCKET_NAME,
)

# N must be the total number of distinct docs in the corpus. It is reused from
# the ids_rdd_stats_and_sorted call in the previous cell and coerced to a plain int.
N = int(N)

# Broadcast the read-only inputs that doc_invlen_and_norm reads on the executors.
bc_df = sc.broadcast(index.df)   # dict term -> df
bc_N  = sc.broadcast(N)
bc_stop = sc.broadcast(all_stopwords)

In [None]:
def doc_invlen_and_norm(text: str, doc_id: int):
    """
    Compute the inverse length and the TF-IDF norm of one document.

    Returns TWO records (meant to be used via flatMap):
      ("inv_len", (doc_id, inv_doc_len))
      ("norm",    (doc_id, doc_norm))

    doc_len is the number of tokens kept after stopword removal and
    inv_doc_len = 1 / doc_len. A document with no kept tokens (including
    text=None) yields 0.0 for both values.

    doc_norm uses:
      tf_norm = tf / doc_len   (implemented as tf * inv_doc_len)
      idf = log(N / df)
      norm = sqrt( sum_t (tf_norm * idf)^2 )
    Terms missing from the broadcast df dict (or with df <= 0) are skipped.

    Reads the broadcasts bc_stop, bc_df and bc_N and the compiled RE_WORD pattern
    from the enclosing notebook scope.
    """
    # `builtins.sum` is spelled out explicitly, presumably because the notebook's
    # `from pyspark.sql.functions import *` rebinds the bare name `sum`.
    import builtins
    import math

    doc_key = int(doc_id)
    if text is None:
        text = ""

    # Tokenize the lower-cased text and count the terms that are not stopwords.
    stop = bc_stop.value
    tf = {}
    for match in RE_WORD.finditer(text.lower()):
        token = match.group()
        if token in stop:
            continue
        tf[token] = tf.get(token, 0) + 1

    # doc_len = total kept tokens
    doc_len = builtins.sum(tf.values())

    inv_len = 0.0
    norm = 0.0
    if doc_len > 0:
        inv_len = 1.0 / float(doc_len)

        df_dict = bc_df.value
        N_docs = float(bc_N.value)

        # Accumulate the squared TF-IDF weights.
        acc = 0.0
        for term, f in tf.items():
            df_t = df_dict.get(term, 0)
            if df_t <= 0:
                continue
            idf = math.log(N_docs / float(df_t))
            w = (float(f) * inv_len) * idf
            acc += w * w
        norm = math.sqrt(acc)

    return [("inv_len", (doc_key, float(inv_len))),
            ("norm",    (doc_key, float(norm)))]


In [None]:
# Run over doc_text_rdd (rows are (text, id)) and split the output into two RDDs.
# `tagged` feeds both filters below, so cache it: without this, every action on
# doc_to_len and on doc_to_norm re-tokenizes the whole corpus from scratch.
tagged = doc_text_rdd.flatMap(lambda x: doc_invlen_and_norm(x[0], x[1])).cache()

doc_to_len  = tagged.filter(lambda x: x[0] == "inv_len").map(lambda x: x[1])  # (doc_id, inv_len)
doc_to_norm = tagged.filter(lambda x: x[0] == "norm").map(lambda x: x[1])     # (doc_id, norm)


In [None]:

def _scatter_by_position(pairs_rdd, out):
    """Iterate (doc_id, value) pairs locally and store each value at its doc's position in `out`."""
    for doc_id, value in pairs_rdd.toLocalIterator():
        pos = docid_to_pos[doc_id]
        # -1 marks an id that has no slot in the sorted-id array; skip it.
        if pos != -1:
            out[pos] = value
    return out


# One float32 slot per distinct doc, addressed by its position in ids_sorted_np.
inv_doc_len = _scatter_by_position(doc_to_len, np.zeros(N, dtype=np.float32))
doc_norm = _scatter_by_position(doc_to_norm, np.zeros(N, dtype=np.float32))


In [None]:
# title_id_rdd = parquetFile.select("title", "id").rdd

In [None]:
# def build_titles_array_from_rdd(title_id_rdd, docid_to_pos: np.ndarray, N: int, max_id: int):
#     """
#     title_id_rdd: RDD of (title, id) OR (id, title)
#     docid_to_pos: numpy int32 array, length max_id+1, maps doc_id -> pos (or -1)
#     N: number of docs (len(ids_sorted_np))
#     max_id: maximum doc_id (for safety checks)

#     Returns:
#       titles_by_pos: numpy object array of length N, where titles_by_pos[pos] = title
#     """
#     titles_by_pos = np.empty(N, dtype=object)
#     titles_by_pos[:] = ""

#     def normalize(x):
#         a, b = x
#         if isinstance(a, (int, np.integer)) and isinstance(b, str):
#             return int(a), b  # (id, title)
#         if isinstance(b, (int, np.integer)) and isinstance(a, str):
#             return int(b), a  # (id, title)
#         # fallback: assume (title, id)
#         return int(b), str(a)

#     # Stream results to driver (no giant collect)
#     for did, title in title_id_rdd.map(normalize).toLocalIterator():
#         if 0 <= did <= max_id:
#             pos = docid_to_pos[did]
#             if pos != -1:
#                 titles_by_pos[pos] = title if title is not None else ""

#     return titles_by_pos


In [None]:
# titles_by_pos = build_titles_array_from_rdd(title_id_rdd, docid_to_pos, N, max_id)

In [None]:
# Sanity checks before upload: both arrays are float32 and hold exactly N entries.
assert inv_doc_len.dtype == np.float32
assert doc_norm.dtype == np.float32
assert inv_doc_len.shape == doc_norm.shape == (N,)

In [None]:
# Upload every artifact to gs://BUCKET_NAME/relative_path/, in this order.
artifacts = (
    (docid_to_pos, "doc_id_to_pos.npy"),
    (ids_sorted_np, "sorted_doc_ids.npy"),
    (inv_doc_len, "inv_doc_len_body.npy"),
    (doc_norm, "doc_norm_body.npy"),
)

for artifact, artifact_name in artifacts:
    save_numpy_to_gcs(
        artifact,
        filename=artifact_name,
        bucket_name=BUCKET_NAME,
        relative_path=relative_path,
    )

# save_numpy_to_gcs(
#     arr=titles_by_pos,
#     filename="doc_id_to_title.npy",
#     bucket_name=BUCKET_NAME,
#     relative_path=relative_path
# )