## This part attempts to read multiple gzipped (`.gz`) files from an S3 bucket

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.session import SparkSession

In [12]:
# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()
# Session wrapper around `sc`; later cells read and query through `spark`.
spark = SparkSession(sc)
# Source bucket name. NOTE(review): `bucket_name` is reassigned in a later cell
# to the output bucket, so its value depends on which cells have been run.
bucket_name = "open-research-corpus"

In [11]:
def read_all_gz_from_bucket(bucket_name, prefix="corpus-2019-01-31"):
    """Read every ``.gz`` object under one prefix of an S3 bucket as text.

    Args:
        bucket_name: Name of the S3 bucket (no scheme, no slashes).
        prefix: Key prefix ("folder") to read from. Defaults to the
            ``corpus-2019-01-31`` release the notebook originally hard-coded.
            Leading/trailing slashes are ignored; an empty prefix reads the
            ``.gz`` objects at the bucket root.

    Returns:
        The DataFrame produced by ``spark.read.text`` for the glob
        ``s3a://<bucket_name>/<prefix>/*.gz``.

    Relies on the notebook-level ``spark`` session.
    """
    # Normalize so "corpus/", "/corpus" and "corpus" all build the same URI.
    key_prefix = prefix.strip("/")
    if key_prefix:
        pattern = "s3a://{0}/{1}/*.gz".format(bucket_name, key_prefix)
    else:
        pattern = "s3a://{0}/*.gz".format(bucket_name)
    return spark.read.text(pattern)

## This part attempts to write a PySpark DataFrame to S3

In [13]:
# S3 URI of the sample records archive used to exercise the write path below.
filenames = "s3a://open-research-corpus/sample-S2-records.gz" # path to the sample file in the S3 bucket
# Load the archive as text through the `spark` session created earlier.
raw_data = spark.read.text(filenames)

In [19]:
def write_aws_s3(bucket_name, file_name, df, file_format="text"):
    """Save a DataFrame to ``s3a://<bucket_name>/<file_name>``, overwriting it.

    Args:
        bucket_name: Name of the destination S3 bucket.
        file_name: Key (output directory) inside the bucket.
        df: DataFrame to save; only ``df.write.save`` is used.
        file_format: Spark data source short name passed to the writer.
            Defaults to ``"text"``. The previous hard-coded value ``"txt"``
            is not a built-in Spark source name, so the save could not
            resolve a data source. Pass ``"json"``, ``"parquet"``, etc. for
            other layouts.

    Returns:
        None. The write is the only effect.

    Note:
        Per the Spark documentation the ``text`` source expects a DataFrame
        with a single string column; use another format for wider frames.
    """
    destination = "s3a://{0}/{1}".format(bucket_name, file_name)
    # mode="overwrite" replaces whatever already exists at the destination.
    df.write.save(destination, format=file_format, mode="overwrite")

In [20]:
# Destination for the write test. NOTE(review): this rebinds `bucket_name`,
# which an earlier cell set to the source bucket "open-research-corpus".
bucket_name = "preprocessed-open-research-corpus"
# Key under the bucket that the DataFrame is saved to.
file_name = "sample-S2-records"

In [16]:
# Save `raw_data` to s3a://<bucket_name>/<file_name>.
# NOTE(review): the saved execution counts show this call (16) ran before the
# current definition of write_aws_s3 (19) — re-run top to bottom to confirm.
write_aws_s3(bucket_name, file_name, raw_data)

## This part attempts to read JSON from S3 into a PySpark DataFrame

In [1]:
# Full URI of a single part file under the "sample-S2-records" output prefix.
# NOTE(review): the part-file name embeds what looks like a per-write generated
# id, so this path presumably goes stale after a re-write — confirm.
filename = "s3a://preprocessed-open-research-corpus/sample-S2-records/part-00000-9ebb5618-7074-4eab-9e80-1b86f286a8b9-c000.json"

In [2]:
def read_all_json_from_bucket(filename):
    """Load the JSON data at ``filename`` with the notebook's ``spark`` session.

    Args:
        filename: Location handed to ``spark.read.json`` after being rendered
            through ``"{0}".format``.

    Returns:
        Whatever ``spark.read.json`` returns for that location.
    """
    location = "{0}".format(filename)
    json_reader = spark.read
    return json_reader.json(location)

In [None]:
def read_all_json_from_bucket(bucket_name):
    """Read JSON from S3 into a DataFrame using the notebook's ``spark`` session.

    Args:
        bucket_name: Either a bare bucket name, in which case every object at
            the bucket root matching ``*.json*`` is read, or a full URI
            (anything containing ``://``), which is read exactly as given.
            The URI form keeps existing calls that pass a complete
            ``s3a://...`` path working, since this definition replaces the
            earlier function of the same name.

    Returns:
        The DataFrame produced by ``spark.read.json``.

    Note:
        The previous body went through ``sql_context``, a name the notebook
        never defines, so any call raised NameError.
    """
    if "://" in bucket_name:
        # Already a complete location (e.g. a single part file): use it as is.
        location = bucket_name
    else:
        location = "s3a://{0}/*.json*".format(bucket_name)
    return spark.read.json(location)

In [5]:
# `filename` is a full s3a:// URI to a single part file, not a bucket name.
df = read_all_json_from_bucket(filename)

In [4]:
# Sanity check on what kind of object the read returned.
print(type(df))


<class 'pyspark.sql.dataframe.DataFrame'>


In [7]:
print("Schema for raw data + ids + abstracts")
print("-------------------------------------")
# Register `df` as a temp view so it can be queried by name through spark.sql.
df.createOrReplaceTempView("raw_ids_and_abstracts")
df.printSchema()
# Take a 5-row sample of the view.
results = spark.sql("SELECT * FROM raw_ids_and_abstracts LIMIT 5")
print("First 5 entries for add_ids_abstracts data")
print("----------------------------")
results.show()
# Last expression of the cell, so the notebook displays the row count.
results.count()

Schema for raw data + ids + abstracts
-------------------------------------
root
 |-- value: string (nullable = true)

First 5 entries for add_ids_abstracts data
----------------------------
+--------------------+
|               value|
+--------------------+
|{"entities":["Epi...|
|{"entities":["Lip...|
|{"entities":["Spa...|
|{"entities":["ACT...|
|{"entities":[],"j...|
+--------------------+



5