In [1]:
!gcloud dataproc clusters list --region us-central1
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

NAME          PLATFORM  PRIMARY_WORKER_COUNT  SECONDARY_WORKER_COUNT  STATUS   ZONE           SCHEDULED_DELETE
cluster-84ed  GCE       4                                             RUNNING  us-central1-c
[0m

In [2]:
import os
import re
import sys
import pickle
import hashlib
from time import time
from collections import Counter, defaultdict
from itertools import groupby
from operator import itemgetter
from pathlib import Path

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from operator import add

from google.cloud import storage
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *
from nltk.stem.porter import *

In [3]:
# Initialize Spark session
spark = SparkSession.builder.appName("InvertedIndex").getOrCreate()
sc = SparkContext.getOrCreate()

# Register the InvertedIndex helper module with Spark, then put the directory
# Spark keeps added files in at the front of sys.path so the import below
# resolves on the driver.
inverted_index_module = "/home/dataproc/inverted_index_gcp.py"
sc.addFile(inverted_index_module)
spark_files_root = SparkFiles.getRootDirectory()
sys.path.insert(0, spark_files_root)

# Must stay below the sys.path change above; it cannot move to the import cell.
from inverted_index_gcp import InvertedIndex

24/03/10 15:21:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Fetch the NLTK stopword corpus (reports "already up-to-date" when present)
nltk.download('stopwords')

# Number of buckets that tokens are hashed into (see token2bucket_id)
NUM_BUCKETS = 124

# Stopwords: NLTK's English list plus frequent corpus-specific terms
english_stopwords = set(stopwords.words('english'))
corpus_stopwords = [
    "category", "references", "also", "external", "links", "may", "first",
    "see", "history", "people", "one", "two", "part", "thumb", "including",
    "second", "following", "many", "however", "would", "became",
]
all_stopwords = english_stopwords | set(corpus_stopwords)

# A token is one leading '#', '@' or word character followed by 2-24 further
# word characters, each optionally preceded by an apostrophe or a hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Constants
# Google Cloud project id. NOTE(review): not referenced by any of the visible cells.
PROJECT_NAME = 'ir-project-415515'
# GCS bucket the wiki parquet files are listed from and the index artifacts are written to.
BUCKET_NAME = 'irproj_26051997'
# Porter stemmer instance. NOTE(review): tokenize() does not stem and no visible
# cell uses this object - confirm whether it is still needed.
stemmer = PorterStemmer()

In [12]:
# Functions
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()


def token2bucket_id(token):
    """Map ``token`` to a bucket number in ``range(NUM_BUCKETS)``.

    The hex fingerprint produced by ``_hash`` is read as a base-16 integer and
    reduced modulo ``NUM_BUCKETS``, so equal tokens always share a bucket.
    """
    fingerprint = int(_hash(token), 16)
    bucket_id = fingerprint % NUM_BUCKETS
    return bucket_id

def tokenize(text):
    """Split ``text`` into lower-cased tokens with stopwords removed.

    Tokens are the successive ``RE_WORD`` matches over ``text.lower()``; a
    match is dropped when it appears in ``all_stopwords``. Order and
    duplicates are preserved.
    """
    tokens = []
    for match in RE_WORD.finditer(text.lower()):
        word = match.group()
        if word in all_stopwords:
            continue
        tokens.append(word)
    return tokens

def word_count(id, tokens):
    """Count term occurrences within one document.

    Args:
        id: identifier of the document the tokens came from.
        tokens: iterable of tokens belonging to that document.

    Returns:
        A list of ``(token, (id, count))`` tuples, one per distinct token, in
        order of first appearance.
    """
    result = []
    for token, count in Counter(tokens).items():
        result.append((token, (id, count)))
    return result

def reduce_word_counts(unsorted_pl):
    """Return the posting list as a new list ordered by each entry's first item.

    ``unsorted_pl`` may be any iterable of indexable entries; it is not
    modified. Entries with equal first items keep their relative order.
    """
    ordered = list(unsorted_pl)
    ordered.sort(key=lambda entry: entry[0])
    return ordered


def calculate_df(postings):
    """Derive document frequencies from posting lists.

    Args:
        postings: an object with a ``map`` method (an RDD in this notebook)
            whose elements are ``(token, posting_list)`` pairs.

    Returns:
        The result of mapping each pair to ``(token, df)``, where ``df`` is
        the length of the posting list, i.e. the number of documents in which
        the term occurred.
    """
    def term_and_df(term_postings):
        term, posting_list = term_postings[0], term_postings[1]
        return (term, len(posting_list))

    return postings.map(term_and_df)


def partition_postings_and_write(postings, index):
    """Group posting lists into hash buckets and write each bucket out.

    Args:
        postings: RDD of ``(token, posting_list)`` pairs.
        index: object whose ``base_dir`` attribute is passed on as the
            destination directory.

    Returns:
        An RDD holding, for every bucket, whatever
        ``InvertedIndex.write_a_posting_list`` returns. ``create_index``
        forces the writes by calling ``.collect()`` on it.
    """
    def with_bucket(posting):
        # Key each (token, posting_list) pair by the token's bucket id.
        return (token2bucket_id(posting[0]), posting)

    def write_bucket(bucket):
        return InvertedIndex.write_a_posting_list(bucket, index.base_dir, BUCKET_NAME)

    buckets = postings.map(with_bucket).groupByKey()
    return buckets.map(write_bucket)


def create_anchor_list(page):
    """Pair a page's id with the second field of each of its anchors.

    Args:
        page: indexable whose item 0 is the document id and item 1 is an
            iterable of anchors; each anchor is indexable and its item 1 is
            the value kept.

    Returns:
        A list of ``(doc_id, anchor[1])`` tuples in anchor order.
    """
    source_id = page[0]
    pairs = []
    for anchor in page[1]:
        pairs.append((source_id, anchor[1]))
    return pairs


def _collect_posting_locs(gcs_client, directory):
    """Merge every posting-location pickle stored under ``directory``.

    Lists the blobs of ``BUCKET_NAME`` whose name starts with ``directory``,
    unpickles those whose name ends in "pickle", and concatenates the
    location lists per term into one ``defaultdict(list)``.
    """
    merged_locs = defaultdict(list)
    for blob in gcs_client.list_blobs(BUCKET_NAME, prefix=directory):
        if not blob.name.endswith("pickle"):
            continue
        with blob.open("rb") as f:
            # SECURITY: pickle.load can execute arbitrary code. This is only
            # acceptable because these blobs are expected to have been written
            # by this job into a bucket we control.
            posting_locs = pickle.load(f)
        for term, locs in posting_locs.items():
            merged_locs[term].extend(locs)
    return merged_locs


def create_index(doc_pairs, directory, filter_tf=False, client=None):
    """Build an inverted index from ``(doc_id, text)`` pairs and write its postings.

    Args:
        doc_pairs: RDD of ``(doc_id, text)`` pairs.
        directory: prefix inside ``BUCKET_NAME`` under which the posting lists
            are written and from which their locations are read back.
        filter_tf: when True, keep only terms whose posting list has more than
            50 entries. Despite the name, this is a threshold on the number of
            documents containing the term, not on term frequency.
        client: optional ``google.cloud.storage.Client`` used to list the
            written blobs. A new client is created when omitted, so the
            function no longer depends on a global ``client`` that a later
            notebook cell happens to define.

    Returns:
        An ``InvertedIndex`` whose ``posting_locs``, ``df`` and ``term_total``
        are populated. All three describe the same vocabulary: the filter is
        applied before any of them is computed.
    """
    gcs_client = client if client is not None else storage.Client()
    inverted = InvertedIndex(base_dir=directory)

    # Transformations are lazy: this only declares the tokenization step.
    doc_pairs = doc_pairs.map(lambda pair: (pair[0], tokenize(pair[1])))
    print("Done tokenization")

    # Calculate word counts and filter
    word_counts = doc_pairs.flatMap(lambda x: word_count(x[0], x[1]))
    postings = word_counts.groupByKey().mapValues(reduce_word_counts)
    if filter_tf:
        postings = postings.filter(lambda x: len(x[1]) > 50)

    # Three separate actions below consume ``postings``; cache it so that
    # tokenization and the shuffle are not repeated for each of them.
    postings = postings.cache()
    try:
        w2df_dict = calculate_df(postings).collectAsMap()
        print("Done posting lists creation and df creation")

        # collect() forces the per-bucket writes; the returned values are not needed here.
        partition_postings_and_write(postings, inverted).collect()

        # Collect all posting lists locations into one super-set
        super_posting_locs = _collect_posting_locs(gcs_client, directory)
        print("Done creating a posting locs list")

        # Create and configure InvertedIndex instance
        inverted.posting_locs = super_posting_locs
        print("Saved posting locs")

        inverted.df.update(w2df_dict)
        print("Updated df")

        # Sum the per-document counts of every term into its corpus-wide total.
        term_totals = (
            postings.flatMapValues(lambda x: x)
            .map(lambda x: (x[0], x[1][1]))
            .reduceByKey(add)
            .collectAsMap()
        )
        inverted.term_total.update(term_totals)
        print("Updated tf")
    finally:
        postings.unpersist()

    return inverted

In [14]:
# Main function
# Set up Google Cloud Storage client
client = storage.Client()

# Get list of blobs in bucket
full_path = f"gs://{BUCKET_NAME}/"

blobs = [b for b in client.list_blobs(BUCKET_NAME, prefix='wiki_files/') if b.name not in ['wiki_files/graphframes.sh', 'wiki_files/']]
paths = [full_path + b.name for b in blobs]

# Read parquet files
parquetFile = spark.read.parquet(*paths)

doc_title_pairs = parquetFile.select("id", "title").rdd
inverted_title = create_index(doc_title_pairs, f'indices/title_index/postings_title_gcp/')
print('Created title index')

doc_token_counts_title = doc_title_pairs.map(lambda x: (x[0], len(tokenize(x[1]))))
inverted_title.doc_lengths = doc_token_counts_title.collectAsMap()
inverted_title.num_docs = len(inverted_title.doc_lengths.keys())
inverted_title.avg_doc_length = np.mean(np.array(list(inverted_title.doc_lengths.values())))


# Write global stats and upload to Google Storage
inverted_title.write_index('.', 'title_index')
index_src = "title_index.pkl"
index_dst = f'gs://{BUCKET_NAME}/indices/title_index/postings_title_gcp/{index_src}'
!gsutil cp $index_src $index_dst
print('Title index saved to bucket successfully')

                                                                                

Done tokenization


Exception in thread "serve RDD 109" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
                                                                                

Done posting lists creation and df creation


                                                                                

Done creating a posting locs list
Saved posting locs
Updated df


                                                                                

Updated tf
Created title index


                                                                                

Copying file://title_index.pkl [Content-Type=application/octet-stream]...
\ [1 files][132.9 MiB/132.9 MiB]                                                
Operation completed over 1 objects/132.9 MiB.                                    
Title index saved to bucket successfully


In [15]:
name = "title_id_dict"
local_pickle = f"{name}.pkl"

# Dump the id -> title mapping, collected onto the driver, to a local pickle file.
with open(local_pickle, "wb") as f:
    pickle.dump(doc_title_pairs.collectAsMap(), f)

# Upload that file to the bucket under the title_id_dict/ prefix.
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
destination = f"title_id_dict/{name}.pkl"
blob = bucket.blob(destination)
blob.upload_from_filename(local_pickle)

                                                                                

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()

24/03/10 15:21:45 WARN TaskSetManager: Lost task 12.0 in stage 2.0 (TID 81) (cluster-84ed-w-3.c.ir-project-415515.internal executor 6): TaskKilled (Stage cancelled)
24/03/10 15:21:46 WARN TaskSetManager: Lost task 9.2 in stage 2.0 (TID 80) (cluster-84ed-w-3.c.ir-project-415515.internal executor 8): TaskKilled (Stage cancelled)
