## This file reads .gz files from an AWS S3 bucket into a PySpark DataFrame and writes that DataFrame to a second AWS S3 bucket as JSON.

# In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

# Reuse the active SparkContext if one exists (e.g. when this cell is re-run
# in a notebook); otherwise start one.
sc = SparkContext.getOrCreate()
# Go through the builder instead of calling the SparkSession constructor:
# getOrCreate() hands back the existing session when there is one rather than
# constructing a fresh SparkSession on every run of this cell.
spark = SparkSession.builder.getOrCreate()

# Source bucket holding the raw .gz files.
raw_data_bucket_name = "open-research-corpus"
# Destination bucket that write_aws_s3 writes into.
preprocessed_bucket_name = "preprocessed-open-research-corpus"

# Reads all .gz files from an AWS bucket
def read_all_gz_from_bucket(orig_bucket_name, prefix="corpus-2019-01-31"):
    '''
    Read every .gz object under one key prefix of an S3 bucket into a dataframe.

    orig_bucket_name: name of the source bucket, without the "s3a://" scheme.
    prefix: key prefix (folder) inside the bucket that contains the .gz files.
        Defaults to "corpus-2019-01-31", so existing one-argument calls read
        exactly the same path as before. Leading/trailing slashes are ignored;
        an empty prefix globs *.gz at the bucket root.

    Returns whatever spark.read.text returns for s3a://<bucket>/<prefix>/*.gz.
    Uses the module-level `spark` session.
    '''
    # Drop stray slashes so "folder/" or "/folder" cannot produce "//" in the
    # path, and skip the prefix segment entirely when it is empty.
    key_glob = "/".join(part for part in (prefix.strip("/"), "*.gz") if part)
    # NOTE(review): spark.read.text is documented to produce a single string
    # column named "value" with one row per line, i.e. lines are not parsed
    # here - confirm that is intended rather than spark.read.json.
    return spark.read.text("s3a://{0}/{1}".format(orig_bucket_name, key_glob))

def write_aws_s3(preprocessed_bucket_name, df):
    '''
    Save a dataframe as JSON at the root of the given S3 bucket.

    preprocessed_bucket_name: destination bucket name, without the "s3a://" scheme.
    df: dataframe to write; its .write attribute is used to perform the save.

    Returns nothing. The save uses mode "overwrite".
    '''
    destination = "s3a://{0}".format(preprocessed_bucket_name)
    # Chained form of save(path, format="json", mode="overwrite").
    df.write.format("json").mode("overwrite").save(destination)
    
# Load every .gz file from the source bucket into one dataframe ...
raw_data = read_all_gz_from_bucket(orig_bucket_name=raw_data_bucket_name)
# ... then write that dataframe to the destination bucket.
write_aws_s3(preprocessed_bucket_name=preprocessed_bucket_name, df=raw_data)

# In [None]:
# Path to one individual .gz object under the same bucket and prefix that
# read_all_gz_from_bucket globs over. Nothing in the visible cells reads this
# variable - presumably kept for trying the pipeline on a single file; confirm.
filenames = "s3a://open-research-corpus/corpus-2019-01-31/s2-corpus-00.gz" # 1GB file