In [1]:
!gcloud dataproc clusters list --region us-central1
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

NAME          PLATFORM  PRIMARY_WORKER_COUNT  SECONDARY_WORKER_COUNT  STATUS   ZONE           SCHEDULED_DELETE
cluster-e10b  GCE       4                                             RUNNING  us-central1-a
[0m

In [2]:
import os
import re
import sys
import pickle
import hashlib
from time import time
from collections import Counter, defaultdict
from itertools import groupby
from operator import itemgetter
from pathlib import Path

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from operator import add

from google.cloud import storage
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *
from nltk.stem.porter import *

In [3]:
# Initialize Spark session
spark = SparkSession.builder.appName("InvertedIndex").getOrCreate()
sc = SparkContext.getOrCreate()

sc.addFile("/home/dataproc/inverted_index_gcp.py")
sys.path.insert(0, SparkFiles.getRootDirectory())

from inverted_index_gcp import InvertedIndex

24/03/11 12:40:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Download NLTK stopwords
nltk.download('stopwords')

NUM_BUCKETS = 124

# Set up stopwords
english_stopwords = set(stopwords.words('english'))
corpus_stopwords = ["category", "references", "also", "external", "links",
                    "may", "first", "see", "history", "people", "one", "two",
                    "part", "thumb", "including", "second", "following",
                    "many", "however", "would", "became"]

all_stopwords = english_stopwords.union(corpus_stopwords)
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Constants
PROJECT_ID = 'irproject-416709'
bucket_name = 'irproj_2605'
porter = PorterStemmer()

In [6]:
# Functions
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

def token2bucket_id(token):
    return int(_hash(token), 16) % NUM_BUCKETS

def tokenize(text):
    return [porter.stem(token.group()) for token in RE_WORD.finditer(text.lower()) if token.group() not in all_stopwords]

def word_count(id, tokens):
    token_counts = Counter(tokens)
    result = [(token, (id, count)) for token, count in token_counts.items()]
    return result

def reduce_word_counts(unsorted_pl):
    return sorted(unsorted_pl, key=lambda x: x[0])

def calculate_df(postings):
    return postings.map(lambda x: (x[0], len(x[1]))) # (token, df) - df - in how many documents the term appeard

def partition_postings_and_write(postings, index):
    map_to_buckets = postings.map(lambda item: (token2bucket_id(item[0]), item)).groupByKey()
    return map_to_buckets.map(lambda x: InvertedIndex.write_a_posting_list(x, index.base_dir, bucket_name))

def create_anchor_list(page):
    doc_id, anchors = page[0], page[1]     
    return [(doc_id, anchor[1]) for anchor in anchors]

In [7]:
# Put your bucket name below and make sure you can access it without an error
# Set up Google Cloud Storage client
client = storage.Client()

# Get list of blobs in bucket
full_path = f"gs://{bucket_name}/"

blobs = [b for b in client.list_blobs(bucket_name, prefix='wiki_dump/') if b.name not in ['wiki_dump/graphframes.sh', 'wiki_dump/']]
paths = [full_path + b.name for b in blobs]

# Read parquet files
parquetFile = spark.read.parquet(*paths)

# Count number of wiki pages
N_docs = parquetFile.count()

doc_text_pairs = parquetFile.select("id", "anchor_text").rdd
doc_text_pairs = doc_text_pairs.flatMap(create_anchor_list).groupByKey().mapValues(list).map(lambda x: (x[0], " ".join(x[1])))

                                                                                

In [8]:
inverted = InvertedIndex(base_dir=f'indices/anchor_index/postings_anchor_gcp/')
inverted.num_docs = N_docs

In [9]:
doc_pairs = doc_text_pairs.map(lambda pair: (pair[0], tokenize(pair[1])))

In [10]:
word_counts = doc_pairs.flatMap(lambda x: word_count(x[0], x[1]))
postings = word_counts.groupByKey().mapValues(reduce_word_counts)
postings_filtered = postings.filter(lambda x: len(x[1])>50)

In [11]:
w2df = calculate_df(postings_filtered)
w2df_dict = w2df.collectAsMap()

24/03/11 12:43:58 WARN YarnAllocator: Container from a bad node: container_1710065956596_0051_01_000004 on host: cluster-e10b-w-2.us-central1-a.c.irproject-416709.internal. Exit status: 143. Diagnostics: [2024-03-11 12:43:58.206]Container killed on request. Exit code is 143
[2024-03-11 12:43:58.206]Container exited with a non-zero exit code 143. 
[2024-03-11 12:43:58.206]Killed by external signal
.
24/03/11 12:43:58 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 4 for reason Container from a bad node: container_1710065956596_0051_01_000004 on host: cluster-e10b-w-2.us-central1-a.c.irproject-416709.internal. Exit status: 143. Diagnostics: [2024-03-11 12:43:58.206]Container killed on request. Exit code is 143
[2024-03-11 12:43:58.206]Container exited with a non-zero exit code 143. 
[2024-03-11 12:43:58.206]Killed by external signal
.
24/03/11 12:43:58 ERROR YarnScheduler: Lost executor 4 on cluster-e10b-w-2.us-central1-a.c.irproject-416709.internal:

24/03/11 12:48:29 WARN YarnAllocator: Container from a bad node: container_1710065956596_0051_01_000006 on host: cluster-e10b-w-0.us-central1-a.c.irproject-416709.internal. Exit status: 143. Diagnostics: [2024-03-11 12:48:29.522]Container killed on request. Exit code is 143
[2024-03-11 12:48:29.522]Container exited with a non-zero exit code 143. 
[2024-03-11 12:48:29.522]Killed by external signal
.
24/03/11 12:48:29 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 6 for reason Container from a bad node: container_1710065956596_0051_01_000006 on host: cluster-e10b-w-0.us-central1-a.c.irproject-416709.internal. Exit status: 143. Diagnostics: [2024-03-11 12:48:29.522]Container killed on request. Exit code is 143
[2024-03-11 12:48:29.522]Container exited with a non-zero exit code 143. 
[2024-03-11 12:48:29.522]Killed by external signal
.
24/03/11 12:48:29 ERROR YarnScheduler: Lost executor 6 on cluster-e10b-w-0.us-central1-a.c.irproject-416709.internal:

In [12]:
# _ = partition_postings_and_write(postings_filtered, inverted).collect()

In [13]:
# # collect all posting lists locations into one super-set
# super_posting_locs = defaultdict(list)
# for blob in client.list_blobs(bucket_name, prefix='postings_gcp'):
#     if not blob.name.endswith("pickle"):
#         continue
#     with blob.open("rb") as f:
#         posting_locs = pickle.load(f)
#         for k, v in posting_locs.items():
#             super_posting_locs[k].extend(v)

# Initialize Google Cloud Storage client
client = storage.Client()

bucket_name = 'irproj_2605'
prefix = 'indices/anchor_index/postings_anchor_gcp'

# Collect all posting lists locations into one super-set
super_posting_locs = defaultdict(list)

for blob in client.list_blobs(bucket_name, prefix=prefix):
    if blob.name.endswith("pickle"):  # Assuming posting lists locations are stored in pickle files
        with blob.open("rb") as f:
            posting_locs = pickle.load(f)
            for k, v in posting_locs.items():
                super_posting_locs[k].extend(v)

# Example: Updating the inverted index with the aggregated posting locations
# Assuming `inverted` is your InvertedIndex object instance
inverted.posting_locs = super_posting_locs

In [14]:
# Adding the posting locations dictionary to the inverted index
# inverted.posting_locs = super_posting_locs
# Add the token - df dictionary to the inverted index
inverted.df = w2df_dict

In [15]:
inverted.term_total.update(postings.flatMapValues(lambda x: x).map(lambda x: (x[0], x[1][1])).reduceByKey(add).collectAsMap())

                                                                                

In [16]:
inverted.doc_lengths = doc_pairs.map(lambda x: (x[0], len(x[1]))).collectAsMap()

                                                                                

In [17]:
inverted.avg_doc_length = np.mean(np.array(list(inverted.doc_lengths.values())))

In [18]:
# write the global stats out
inverted.write_index('.', 'index_anchor')
# upload to gs
index_src = "index_anchor.pkl"
index_dst = f'gs://{bucket_name}/postings_gcp/{index_src}'
!gsutil cp $index_src $index_dst

Copying file://index_anchor.pkl [Content-Type=application/octet-stream]...
- [1 files][ 78.4 MiB/ 78.4 MiB]                                                
Operation completed over 1 objects/78.4 MiB.                                     
