<a href="https://colab.research.google.com/github/matdjohnson-at-umass-dot-edu/CS532-FinalProject/blob/main/CS532_FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install ir-datasets
!pip install pyspark

import ast
import csv
import pickle
import re

import ir_datasets
from pyspark import SparkContext, StorageLevel



In [2]:
from google.colab import drive

# Make Google Drive visible to the Colab VM; everything below is read from and
# written to this mount.
drive.mount('/content/drive')

# Location of the exported CSV files and the ir_datasets id of the collection.
dataset_directory = "/content/drive/MyDrive/CS532-FinalProject/ir_datasets"
training_dataset_name = "wikir/en1k/training"
# The dataset id contains '/', which cannot appear in a file name, so each '/'
# is turned into '_' to form the file-name prefix.
training_dataset_filename = "_".join(training_dataset_name.split("/"))

Mounted at /content/drive


In [3]:
# read dataset and write to CSV files on Google Drive
# resolves memory issues that occur consequent to initializing RDD from dataset
#     iterables using SparkContext.parallelize()

train_dataset = ir_datasets.load(training_dataset_name)

# Each file is opened in a `with` block so it is flushed and closed even if the
# dataset iterator raises part-way through. newline='' is what the csv module
# asks for on files handed to csv.writer, and the explicit UTF-8 encoding keeps
# the output independent of the VM's locale.

# one row per document: doc_id, text
with open(f"{dataset_directory}/{training_dataset_filename}_docs", 'w',
          newline='', encoding='utf-8') as docs_file:
    csv_writer = csv.writer(docs_file, dialect='unix')
    csv_writer.writerows(
        [doc.doc_id, doc.text] for doc in train_dataset.docs_iter()
    )

# one row per query: query_id, text
with open(f"{dataset_directory}/{training_dataset_filename}_queries", 'w',
          newline='', encoding='utf-8') as queries_file:
    csv_writer = csv.writer(queries_file, dialect='unix')
    csv_writer.writerows(
        [query.query_id, query.text] for query in train_dataset.queries_iter()
    )

# one row per relevance judgement: query_id, doc_id, relevance, iteration
with open(f"{dataset_directory}/{training_dataset_filename}_qrels", 'w',
          newline='', encoding='utf-8') as qrels_file:
    csv_writer = csv.writer(qrels_file, dialect='unix')
    csv_writer.writerows(
        [qrel.query_id, qrel.doc_id, qrel.relevance, qrel.iteration]
        for qrel in train_dataset.qrels_iter()
    )

[INFO] If you have a local copy of https://zenodo.org/record/3565761/files/wikIR1k.zip, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/554299bca984640cb283d6ba55753608
[INFO] [starting] https://zenodo.org/record/3565761/files/wikIR1k.zip
[INFO] [finished] https://zenodo.org/record/3565761/files/wikIR1k.zip: [00:06] [165MB] [24.7MB/s]


In [4]:
# map corpus documents to corpus vocabulary and document id pairs
# D -> (D.text.word, D.doc_id)

# Re-running this cell must not fail because a SparkContext already exists, so
# any context left over from a previous execution is shut down first.
try:
    spark.stop()
except NameError:
    # `spark` is not defined yet (first execution in this kernel): nothing to stop
    pass

# start spark standalone cluster and keep its context in `spark`
spark = SparkContext()

# Read the docs CSV as an RDD of raw text lines and mark it for caching in
# memory, spilling to disk when it does not fit.
train_rdd = (
    spark
    .textFile(f"{dataset_directory}/{training_dataset_filename}_docs")
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# define function for mapping CSV file lines to vocabulary-document-id pairs
def map_function(csv_file_line):
    """Turn one line of the docs CSV into a list of (term, doc_id) pairs.

    The line is expected to look like ``"<doc_id>","<text>"`` (two fully quoted
    fields). Both fields are stripped of every character that is not an ASCII
    letter, digit or space; the text is then lower-cased and split into terms.

    Args:
        csv_file_line: one text line of the docs CSV file.

    Returns:
        A list with one ``(term, doc_id)`` tuple per term occurrence, in text
        order. A line without the ``","`` field separator yields an empty list,
        so ``flatMap`` skips it instead of failing the whole job.
    """
    # Split on the FIRST separator only: a quote followed by a comma inside the
    # text is written as '"",' + '""', which also contains '","', and splitting
    # on every occurrence would silently drop the rest of the text.
    raw_doc_id, separator, raw_text = csv_file_line.partition('","')
    if not separator:
        return []
    # The sanitising regex also removes the surrounding/escaped quote characters.
    doc_id = re.sub("[^A-Za-z0-9 ]", "", raw_doc_id)
    # split() without an argument drops the empty strings that split(' ') would
    # produce for leading, trailing or repeated spaces (they would otherwise be
    # indexed as an empty term).
    words_for_doc = re.sub("[^A-Za-z0-9 ]", "", raw_text).lower().split()
    return [(word, doc_id) for word in words_for_doc]

# map CSV file to vocabulary-document-id pairs, flattening pairs across documents
# (flatMap concatenates the list returned by map_function for each line into a
#  single RDD of (term, doc_id) tuples; train_rdd is rebound to that RDD)
train_rdd = train_rdd.flatMap(map_function)

# confirm mapping is as expected by pulling the first ten pairs to the driver
train_rdd.take(10)


[('it', '1781133'),
 ('was', '1781133'),
 ('used', '1781133'),
 ('in', '1781133'),
 ('landing', '1781133'),
 ('craft', '1781133'),
 ('during', '1781133'),
 ('world', '1781133'),
 ('war', '1781133'),
 ('ii', '1781133')]

In [None]:
# reduce corpus vocabulary term and corpus document id pairs to map of vocab terms to doc id lists
# list((term, doc_id)) -> dict({term: list(doc_id)})

# program currently runs out of memory - additional review of system behavior required

def reduce_function(list_of_doc_ids_for_term_instance_1, list_of_doc_ids_for_term_instance_2):
    """Combine two partial doc-id collections for the same term.

    The second argument is appended to the first with ``+=``. For lists this
    extends the first list in place and returns that same object; for immutable
    values such as ``str`` it builds and returns a new concatenated value.

    Args:
        list_of_doc_ids_for_term_instance_1: accumulated doc ids (modified in
            place when it is a list).
        list_of_doc_ids_for_term_instance_2: doc ids to append.

    Returns:
        The combined collection.
    """
    merged_doc_ids = list_of_doc_ids_for_term_instance_1
    merged_doc_ids += list_of_doc_ids_for_term_instance_2
    return merged_doc_ids

# train_rdd = train_rdd.reduceByKey(reduce_function).collect()


In [8]:
# write index to CSV file on Google Drive

# Hand-made stand-in for the inverted index: every term maps to its own list of
# document ids (a separate [1, 2, 3] list object per term).
dummy_index = {
    term: [1, 2, 3]
    for term in ('the', 'quick', 'brown', 'fox', 'jumpted', 'over', 'lazy', 'dog')
}

# One row per term: the term and its pickled doc-id list. pickle.dumps() returns
# bytes, and csv.writer stores non-string values via str(), so the second column
# holds the *text* of a Python bytes literal (b'...'), not raw bytes.
# The `with` block closes the file even if a row fails to serialise; newline=''
# is what the csv module asks for on files handed to csv.writer.
with open(f"{dataset_directory}/{training_dataset_filename}_index", 'w',
          newline='', encoding='utf-8') as index_file:
    csv_writer = csv.writer(index_file, dialect='unix')
    for k, v in dummy_index.items():
        csv_writer.writerow([k, pickle.dumps(v)])


In [None]:
# map CSV file lines to RDD entries for the corpus inverted index
# each line corresponds to one vocabulary term and document id list

# index loading is currently non functioning due to python object serialization issues

# Read the index CSV as an RDD of raw text lines and mark it for caching in
# memory, spilling to disk when it does not fit.
train_index_rdd = (
    spark
    .textFile(f"{dataset_directory}/{training_dataset_filename}_index")
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# define function for mapping CSV file lines to index entries
def map_function(csv_file_line):
    """Turn one line of the index CSV into a ``(term, doc_id_list)`` pair.

    The line is expected to look like ``"<term>","<bytes literal>"`` where the
    second field is the text ``str()`` produces for the pickled doc-id list
    (for example ``b'\\x80\\x04...'``), with any ``"`` doubled by the CSV writer.

    Args:
        csv_file_line: one text line of the index CSV file.

    Returns:
        A tuple ``(vocab_term, docs_for_term)``.

    Raises:
        ValueError: if the line has no ``","`` separator or the second field is
            not a bytes literal.
    """
    # Split on the FIRST separator only; the bytes literal may itself contain
    # the characters '","'.
    raw_term, separator, raw_bytes = csv_file_line.partition('","')
    if not separator:
        raise ValueError(f"malformed index line: {csv_file_line[:80]!r}")
    # Drop the opening quote of the first field and the closing quote of the
    # second, then undo the CSV quote doubling.
    if raw_term.startswith('"'):
        raw_term = raw_term[1:]
    if raw_bytes.endswith('"'):
        raw_bytes = raw_bytes[:-1]
    vocab_term = raw_term.replace('""', '"')
    bytes_literal = raw_bytes.replace('""', '"')
    # The field is the source text of a bytes literal, so it has to be
    # evaluated as one; encoding that text as UTF-8 does not restore the
    # original pickle bytes.
    pickled_docs = ast.literal_eval(bytes_literal)
    if not isinstance(pickled_docs, bytes):
        raise ValueError(f"index value for {vocab_term!r} is not a bytes literal")
    # SECURITY: pickle.loads can execute arbitrary code. Only use this on index
    # files written by this notebook, never on files from an untrusted source.
    docs_for_term = pickle.loads(pickled_docs)
    return (vocab_term, docs_for_term)

# map CSV file
# train_index_rdd = train_index_rdd.map(map_function)

# confirm mapping is as expected
# train_index_rdd.take(10)

# placeholder: build the index RDD from the in-memory dummy_index instead of the
# CSV file; this rebinds train_index_rdd, replacing the file-backed RDD loaded above
train_index_rdd = spark.parallelize(dummy_index.items())
# show up to ten (term, doc_id_list) entries as a sanity check
train_index_rdd.take(10)

In [17]:
# map CSV file lines to RDD entries for the wikIR queries
# each line corresponds to one query

# query loading is currently non functioning due to time constraints

# Read the queries CSV as an RDD of raw text lines and mark it for caching in
# memory, spilling to disk when it does not fit.
train_query_rdd = (
    spark
    .textFile(f"{dataset_directory}/{training_dataset_filename}_queries")
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# copy from above
# def map_function(csv_file_line):
#     csv_file_line_elements = csv_file_line.split('\",\"')
#     vocab_term = re.sub("^\"", "", csv_file_line_elements[0])
#     bytes_as_string = re.sub("'\"$", "", csv_file_line_elements[1])
#     bytes_as_string = re.sub("^b'", "", bytes_as_string)
#     docs_for_term = pickle.loads(bytes_as_string.encode('utf-8'))
#     return (vocab_term, docs_for_term)

# Placeholder queries as (query_id, text) tuples; this rebinds train_query_rdd,
# replacing the file-backed RDD created earlier in this cell.
train_query_rdd = spark.parallelize([(1, 'brown fox'), (2, 'lazy dog')])
# show the first entry as a sanity check
train_query_rdd.take(1)

[(1, 'brown fox')]

In [19]:
# map CSV file lines to RDD entries for the wikIR qrels
# each line corresponds to one qrel

# qrel loading is currently non functioning due to time constraints

# Read the qrels CSV as an RDD of raw text lines and mark it for caching in
# memory, spilling to disk when it does not fit.
train_qrels_rdd = (
    spark
    .textFile(f"{dataset_directory}/{training_dataset_filename}_qrels")
    .persist(StorageLevel.MEMORY_AND_DISK)
)

# copy from above
# def map_function(csv_file_line):
#     csv_file_line_elements = csv_file_line.split('\",\"')
#     vocab_term = re.sub("^\"", "", csv_file_line_elements[0])
#     bytes_as_string = re.sub("'\"$", "", csv_file_line_elements[1])
#     bytes_as_string = re.sub("^b'", "", bytes_as_string)
#     docs_for_term = pickle.loads(bytes_as_string.encode('utf-8'))
#     return (vocab_term, docs_for_term)

# Placeholder relevance judgements, in the same column order the qrels CSV is
# written with (query_id, doc_id, relevance, iteration): query 1 paired with
# documents 1-3, listed twice. This rebinds train_qrels_rdd, replacing the
# file-backed RDD created earlier in this cell.
train_qrels_rdd = spark.parallelize(
    [(1, doc_id, 1, 0) for doc_id in (1, 2, 3)] * 2
)
# show up to ten entries as a sanity check
train_qrels_rdd.take(10)

[(1, 1, 1, 0),
 (1, 2, 1, 0),
 (1, 3, 1, 0),
 (1, 1, 1, 0),
 (1, 2, 1, 0),
 (1, 3, 1, 0)]

In [None]:
# implementation of Okapi BM25 inverted index search results
def bm_25_evaluator(term):
    """Placeholder for scoring documents for a single term; not implemented.

    Args:
        term: currently ignored.

    Returns:
        None.
    """
    return None

def bm_25_mapping_function(query):
    """Placeholder for ranking documents against one query; not implemented.

    Planned logic (sketch only):
        docs_with_ranks = list()
        for term in query:
            docs_with_ranks_for_term = bm_25_evaluator(term)
            docs_with_ranks.extend(docs_with_ranks_for_term)
        return docs_with_ranks

    Args:
        query: currently ignored.

    Returns:
        None.
    """
    # assume ability to reference other RDDs from context of RDD mapping function
    # if not possible, create index object that can be passed to workers
    # NOTE(review): Spark does not appear to allow using one RDD inside another
    # RDD's transformation - confirm; the index would likely need to be
    # broadcast to the workers or joined with the queries instead.
    return None

# apply the (currently stubbed) BM25 mapping to every query; map() is lazy, so
# nothing is computed until an action is called on train_qrels_evaluation
train_qrels_evaluation = train_query_rdd.map(bm_25_mapping_function)

In [None]:
# implementation of normalized discounted cumulative gain (NDCG)
def ndcg_func():
    """Placeholder for the NDCG metric; not implemented.

    Returns:
        None.
    """
    return None

def qrels_validation_function(qrels_evaluation):
    """Placeholder for scoring one query's search results against its qrels.

    Planned logic (sketch only):
        docs_actual = train_qrels_rdd.lookup(qrels_evaluation.query_id)
        accuracy_for_doc = ndcg_func(docs_actual, qrels_evaluation.docs)

    Args:
        qrels_evaluation: currently ignored.

    Returns:
        None.
    """
    # assume ability to reference other RDDs from context of RDD mapping function
    # if not possible, create index object that can be passed to workers
    # NOTE(review): Spark does not appear to allow calling RDD operations such
    # as lookup() from inside another RDD's transformation - confirm; the qrels
    # would likely need to be broadcast or joined instead.
    return None

# apply the (currently stubbed) validation to every evaluated query
# NOTE: the RDD returned by map() is not assigned to a name and no action is
# called on it, so this line computes nothing yet
train_qrels_evaluation.map(qrels_validation_function)

# method for analyzing and displaying the results of the ndcg evaluation of the BM25 search results
def analyze_and_display_ndcg_results():
    """Placeholder for analyzing and displaying the NDCG results; not implemented.

    Returns:
        None.
    """
    return None