In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import requests

In [0]:
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [0]:
df = spark.read.json("s3a://csparkdata/ol_cdump.json")

In [0]:
df = df.withColumn("publish_date", f.regexp_replace(f.col("publish_date"), "([0-9])th", "$1"))

In [0]:
df = df.withColumn("publish_date_formatted", f.expr('''
case
    when lower(publish_date) like "%u" then null
    when to_date(publish_date, "dd MMM yyyy") is not null and to_date(publish_date, "dd MMM yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "dd MMM yyyy")
    when to_date(publish_date, "dd/MM/yyyy") is not null and to_date(publish_date, "dd/MM/yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "dd/MM/yyyy")
    when to_date(publish_date, "MM/dd/yyyy") is not null and to_date(publish_date, "MM/dd/yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MM/dd/yyyy")
    when to_date(publish_date, "MM-dd-yyyy") is not null and to_date(publish_date, "MM-dd-yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MM-dd-yyyy")
    when to_date(publish_date, "MM.dd.yyyy") is not null and to_date(publish_date, "MM.dd.yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MM.dd.yyyy")
    when to_date(publish_date, "MM/dd/yy") is not null and to_date(publish_date, "MM/dd/yy") <= to_date(current_timestamp()) then to_date(publish_date, "MM/dd/yy")
    when to_date(publish_date, "MM-dd-yy") is not null and to_date(publish_date, "MM-dd-yy") <= to_date(current_timestamp()) then to_date(publish_date, "MM-dd-yy")
    when to_date(publish_date, "MM/yyyy") is not null and to_date(publish_date, "MM/yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MM/yyyy")
    when to_date(publish_date, "MMMM d, yyyy") is not null and to_date(publish_date, "MMMM d, yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MMMM d, yyyy")
    when to_date(publish_date, "MMMM d,yy") is not null and to_date(publish_date, "MMMM d,yy") <= to_date(current_timestamp()) then to_date(publish_date, "MMMM d,yy")
    when to_date(publish_date, "MMMM yyyy") is not null and to_date(publish_date, "MMMM yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MMMM yyyy")
    when to_date(publish_date, "MMMM, yyyy") is not null and to_date(publish_date, "MMMM, yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "MMMM, yyyy")
    when to_date(publish_date, "yyyy") is not null and to_date(publish_date, "yyyy") <= to_date(current_timestamp()) then to_date(publish_date, "yyyy")
    else null
end
'''))

In [0]:
df.createOrReplaceTempView("df")

In [0]:
df_w_pub_year = spark.sql('''
  select *,
  case 
    when publish_date_formatted is not null then year(publish_date_formatted) 
    else 0000
  end as publish_year
  from df
''')

In [0]:
df_w_pub_year.createOrReplaceTempView("df_pub_year")

In [0]:
df_cleaned = spark.sql('''
  select * from df_pub_year 
  where title is not null
  and lower(title) not like "%no title exists%"
  and number_of_pages > 20
  and publish_year > 1950
  and publish_date_formatted is not null
  and key is not null
  and authors is not null
''')

In [0]:
def parse_author_keys(author_keys):
  return ",".join([x[1].split("/")[2] for x in author_keys])

parse_author_keys_udf = f.udf(parse_author_keys)
spark.udf.register("parse_author_keys_udf", parse_author_keys)

Out[148]: <function __main__.parse_author_keys(author_keys)>

In [0]:
def parse_book_key(book_key):
  return book_key.split("/")[2]

parse_book_key_udf = f.udf(parse_book_key)
spark.udf.register("parse_book_key_udf", parse_book_key)

Out[149]: <function __main__.parse_book_key(book_key)>

In [0]:
def get_author_name(author_key):
  url = f"https://openlibrary.org/authors/{author_key}.json"
  try:
    r = requests.get(url = url)
    author_name = r.json()["name"]
    return author_name
  except:
    return None
  
get_author_name_udf = f.udf(get_author_name)
spark.udf.register("get_author_name_udf", get_author_name)

Out[150]: <function __main__.get_author_name(author_key)>

In [0]:
df_cleaned = df_cleaned.withColumn("author_keys", parse_author_keys_udf(f.col("authors")))

In [0]:
df_cleaned = df_cleaned.withColumn("book_key", parse_book_key_udf(f.col("key")))

In [0]:
df_cleaned.createOrReplaceTempView("data")

In [0]:
ques1 = spark.sql('''
  select distinct book_key, title as book_title, number_of_pages from
  (select *, dense_rank() over (order by number_of_pages desc) as page_rank from data)
  where page_rank = 1
''').toPandas()

In [0]:
ques1

Unnamed: 0,book_key,book_title,number_of_pages
0,OL22855337M,Nihon shokuminchi kenchikuron,48418


In [0]:
ques2_inter = spark.sql('''
  select *, explode(genres) as genre from data
''')

ques2_inter = ques2_inter.withColumn("genre", f.regexp_replace(f.col("genre"), "\.$", ""))

ques2_inter.createOrReplaceTempView("ques2_inter")

ques2 = spark.sql('''
  select * from (
  select rank() over (order by books desc) as genre_rank, genre, books from
  (select genre, count(distinct key) as books from ques2_inter group by 1)
  )
  order by genre_rank
''').toPandas()

In [0]:
ques2[ques2.genre_rank <= 5]

Unnamed: 0,genre_rank,genre,books
0,1,Fiction,3392
1,2,Biography,2739
2,3,Juvenile literature,1675
3,4,Exhibitions,1148
4,5,Juvenile fiction,689


In [0]:
ques3_inter = spark.sql('''
  select *, explode(split(author_keys, ",")) as author_key from data
''')

ques3_inter.createOrReplaceTempView("ques3_inter")

ques3 = spark.sql('''
  select author_rank, author_key, author_name_data, get_author_name_udf(author_key) as author_name_site, books from (
  select *, rank() over (order by books desc) as author_rank from
  (select author_key, name as author_name_data, count(distinct key) as books from ques3_inter group by 1,2)
  )
  where author_rank <= 5
  order by author_rank
''').toPandas()

In [0]:
ques3

Unnamed: 0,author_rank,author_key,author_name_data,author_name_site,books
0,1,OL1224818A,,California. Dept. of Water Resources.,236
1,2,OL4283462A,,Jirō Akagawa,116
2,3,OL785848A,,John Harold Haynes,106
3,4,OL539875A,,Philip M. Parker,90
4,5,OL1926829A,,San Francisco (Calif.). Dept. of City Planning.,80


In [0]:
ques4 = spark.sql('''
  select publish_year, count(distinct author_key) as authors from
  (select *, explode(split(author_keys, ",")) as author_key from data)
  group by 1
  order by 1
''').toPandas()

In [0]:
pd.set_option("display.max_rows", None)
ques4

Unnamed: 0,publish_year,authors
0,1951,648
1,1952,601
2,1953,601
3,1954,644
4,1955,589
5,1956,638
6,1957,706
7,1958,755
8,1959,829
9,1960,923


In [0]:
ques5 = spark.sql('''
  select date_format(publish_date_formatted, "yyyy-MM") as publish_year_month, count(distinct author_key) as authors, count(distinct key) as books from
  (select *, explode(split(author_keys, ",")) as author_key from data)
  group by 1
  order by 1
''').toPandas()

In [0]:
ques5

Unnamed: 0,publish_year_month,authors,books
0,1951-01,648,676
1,1952-01,601,633
2,1953-01,600,622
3,1953-10,1,1
4,1954-01,643,665
5,1954-06,1,1
6,1955-01,588,611
7,1955-06,1,1
8,1956-01,638,664
9,1957-01,705,740
