In [1]:
from pyspark.sql import SparkSession
import boto3
from io import BytesIO
from PyPDF2 import PdfReader
import numpy as np
import faiss
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1713759699975_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Initialize Spark Session.
# getOrCreate() returns the currently running session if there is one, otherwise
# it builds a new one with the application name "PDF Processing".
# NOTE(review): the first cell's output reports "SparkSession available as 'spark'",
# so this presumably just re-binds that existing session — confirm.
spark = SparkSession.builder.appName("PDF Processing").getOrCreate()
# Handle on the SparkContext; later cells call sc.parallelize(...) on it.
sc = spark.sparkContext

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Getting PDF Names

In [41]:
# Module-level S3 client; list_all_pdf_files() below reads this global directly.
s3_client = boto3.client('s3')
# Bucket and key prefix that are passed to list_all_pdf_files() further down.
bucket_name = 'metcs777-term-project'
prefix = 'arXiv/'

def list_all_pdf_files(bucket, prefix):
    """Return the s3:// URIs of every ``.pdf`` object found under ``prefix``.

    The listing goes through the ``list_objects_v2`` paginator of the
    module-level ``s3_client``, so every page of results is visited.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix the listing is restricted to.

    Returns:
        List of ``s3://<bucket>/<key>`` strings, in listing order, for the keys
        that end in ``.pdf`` (the match is case-sensitive).
    """
    pages = s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket, Prefix=prefix
    )

    pdf_uris = []
    for page in pages:
        # A page without a 'Contents' entry contributes nothing.
        if 'Contents' not in page:
            continue
        for obj in page['Contents']:
            object_key = obj['Key']
            if object_key.endswith('.pdf'):
                pdf_uris.append('s3://' + bucket + '/' + object_key)
    return pdf_uris

# List every PDF under the configured bucket/prefix and report how many were found.
files = list_all_pdf_files(bucket_name, prefix)
print(f'Total PDF files: {len(files)}')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total PDF files: 9344

## Creating Batches

In [None]:
def create_batches(filenames, batch_size):
    """Lazily split ``filenames`` into consecutive slices of ``batch_size`` items.

    Args:
        filenames: Sliceable sequence (it must support ``len()`` and slicing).
        batch_size: Number of items per slice.

    Yields:
        Successive slices of ``filenames``; the last one is shorter when the
        length is not a multiple of ``batch_size``.
    """
    offsets = range(0, len(filenames), batch_size)
    for offset in offsets:
        stop = offset + batch_size
        yield filenames[offset:stop]
        
def save_batches_to_s3(batches, bucket, prefix):
    """Save each batch of filenames as a separate text file in S3.

    Batch number ``k`` (1-based) is written to ``<prefix>/batch_<k>.txt``; its
    content is the batch's entries joined by newlines, with no trailing newline.

    Args:
        batches: Iterable of batches, each an iterable of filename strings.
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix ("folder") for the batch files. Trailing slashes are
            ignored, and an empty prefix writes the files at the bucket root.

    Returns:
        List of the S3 keys that were written, in upload order.
    """
    s3_client = boto3.client('s3')
    # Normalise the prefix so 'batched_files' and 'batched_files/' produce the
    # same keys, and '' does not produce a key that starts with '/'.
    folder = prefix.rstrip('/')
    key_root = f"{folder}/" if folder else ""
    written_keys = []
    # NOTE: batch numbers are not zero-padded, so a key listing sorted as text
    # places batch_10 before batch_2.
    for number, batch in enumerate(batches, start=1):
        # File path in S3
        file_path = f"{key_root}batch_{number}.txt"
        # Encode explicitly so the uploaded bytes are UTF-8 regardless of how the
        # client library would treat a str body.
        batch_content = '\n'.join(batch).encode('utf-8')
        # Upload the batch file to S3
        s3_client.put_object(Bucket=bucket, Key=file_path, Body=batch_content)
        written_keys.append(file_path)
        print(f"Batch {number} saved to {file_path}")
    return written_keys

In [None]:
# Destination bucket and key prefix for the batch listing files.
bucket_name = 'metcs777-term-project'
file_prefix = 'batched_files'  # passed to save_batches_to_s3 as the key prefix

# Create batches of 500 filenames each; list() materialises the generator so the
# batches can be passed on as a concrete list.
filename_batches = list(create_batches(files, 500))

# Save batches to S3
save_batches_to_s3(filename_batches, bucket_name, file_prefix)

## Functions to extract text from a PDF

In [50]:
# Function to fetch a PDF from S3 and extract its text
def extract_text_from_pdf_s3(bucket, key):
    """Download a PDF from S3 and return the text of all of its pages.

    Args:
        bucket: Name of the S3 bucket holding the PDF.
        key: Object key of the PDF within ``bucket``.

    Returns:
        The text of every page concatenated in page order with no separator.
        A page whose extraction yields ``None`` or ``''`` contributes nothing.

    Raises:
        Any exception raised by the S3 download or by PDF parsing; nothing is
        caught here.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket, Key=key)
    pdf_bytes = response['Body'].read()
    # The context manager releases the in-memory buffer even when parsing raises.
    with BytesIO(pdf_bytes) as pdf_buffer:
        pdf_reader = PdfReader(pdf_buffer)
        # Join once instead of growing a string page by page; `or ''` is the
        # fallback for pages where extract_text() returns None.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

# Function to process one PDF and return its S3 path together with its text
def process_pdf(file_path):
    """Extract the text of the PDF at ``file_path``, an ``s3://<bucket>/<key>`` URI.

    The bucket and key are parsed from the URI itself, so paths from any bucket
    are handled, not only ones under a fixed bucket name.

    Args:
        file_path: Full S3 URI of the PDF.

    Returns:
        ``(file_path, text)`` on success, or ``(file_path, str(error))`` when
        anything fails (malformed URI, download error, parse error).
        NOTE: both cases have the same shape, so a caller cannot tell an error
        message from extracted text by structure alone.
    """
    scheme = 's3://'
    try:
        if not file_path.startswith(scheme):
            raise ValueError(f"Not an S3 URI: {file_path!r}")
        # Everything up to the first '/' after the scheme is the bucket; the rest
        # is the object key (which may itself contain further '/').
        bucket, _, key = file_path[len(scheme):].partition('/')
        if not bucket or not key:
            raise ValueError(f"S3 URI is missing a bucket or key: {file_path!r}")
        text = extract_text_from_pdf_s3(bucket, key)
        return (file_path, text)  # Return both file name and text
    except Exception as e:
        # Best effort: one bad file must not abort the whole batch.
        return (file_path, str(e))  # Return file name and error message

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Reading Batch files

In [42]:
# Get the batch .txt files stored under 'batched_files/'.
s3_client = boto3.client('s3')
files2 = []
# Go through the paginator so the listing is complete even when it spans more
# than one page. Each page is bound to `response2`, which therefore ends up
# holding the last page once the loop finishes.
# NOTE(review): S3 presumably returns keys in text order (batch_1, batch_10, ...,
# batch_2), not numeric order — confirm before relying on positions in files2.
for response2 in s3_client.get_paginator('list_objects_v2').paginate(
        Bucket='metcs777-term-project', Prefix='batched_files/'):
    # 'Contents' is absent when a page holds no matching objects.
    files2 += ['s3://metcs777-term-project/' + item['Key']
               for item in response2.get('Contents', [])
               if item['Key'].endswith('.txt')]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
# Skip the first 10 batch files and keep the remainder: files2[10:] drops
# elements 0-9, it does not "take" them.
# NOTE(review): this rebinds files2, so re-running the cell drops another 10 entries.
files2 = files2[10:]
# NOTE(review): files2_rdd is not used by any cell visible here — confirm it is still needed.
files2_rdd = sc.parallelize(files2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Function to read batch files

In [44]:
def extract_text_from_s3(bucket, key):
    """Fetch a UTF-8 text object from S3 and return its non-empty lines.

    Args:
        bucket: Name of the S3 bucket holding the text file.
        key: Object key of the text file within ``bucket``.

    Returns:
        List of the file's lines in order, without line terminators. Empty
        lines, including the one a trailing newline would produce, are left out,
        so an empty file yields ``[]``.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket, Key=key)
    text_data = response['Body'].read().decode('utf-8')
    # Split into lines on '\n' and drop a trailing '\r' so CRLF files do not
    # leave a stray carriage return on every entry.
    stripped_lines = (line.rstrip('\r') for line in text_data.split("\n"))
    # Skip blank entries so callers never receive '' as if it were real content.
    cleaned_text = [line for line in stripped_lines if line]
    return cleaned_text

def process_text_files(bucket, keys):
    """Read several text files from one bucket and collect their lines in a list.

    Args:
        bucket: Name of the S3 bucket that holds every file in ``keys``.
        keys: Iterable of files to read, each given either as a full
            ``s3://<bucket>/<key>`` URI or as a bare object key.

    Returns:
        One flat list containing what ``extract_text_from_s3`` returns for each
        file, concatenated in the order of ``keys``.
    """
    # Build the URI prefix from `bucket` so the stripping below matches whatever
    # bucket the caller passed in.
    uri_prefix = f's3://{bucket}/'
    contents = []
    for key in keys:
        # Full URIs are reduced to their object key; bare keys pass through as-is.
        object_key = key[len(uri_prefix):] if key.startswith(uri_prefix) else key
        contents.extend(extract_text_from_s3(bucket, object_key))
    return contents

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Getting PDF file names from the batch files

In [51]:
# Names of the PDF files, read from the batch listing files selected in files2.
bucket_name = 'metcs777-term-project'
# Flat list holding the lines of every batch file in files2, in order.
pdf_file_paths = process_text_files(bucket_name, files2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## For each batch of 500 files, extract the text and save it to S3

In [None]:
# Process the PDF paths in chunks: extract the text on the cluster, then write
# one results file per chunk to S3.
BATCH_SIZE = 500
# Added to each chunk's end index to form the number in the output file name.
# NOTE(review): presumably accounts for the paths in the 10 batch files that
# were skipped earlier (10 x 500) — confirm.
OUTPUT_INDEX_OFFSET = 5000

# The upper bound is derived from the data, so a final partial chunk is still
# processed and no empty chunk is ever submitted or written.
for i in range(BATCH_SIZE, len(pdf_file_paths) + BATCH_SIZE, BATCH_SIZE):
    batch_files = pdf_file_paths[i-BATCH_SIZE:i]
    print(len(batch_files))
    files_rdd = sc.parallelize(batch_files)
    # collect() brings every (path, text-or-error) tuple back to the driver.
    results = files_rdd.map(process_pdf).collect()
    # str() of a tuple uses repr() for its items, so newlines inside the text are
    # escaped and each result occupies exactly one line of the output file.
    results_str = '\n'.join([str(result) for result in results])
    s3_client.put_object(Bucket=bucket_name, Key="output/results"+str(i+OUTPUT_INDEX_OFFSET)+ ".txt", Body=results_str)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…