In [1]:
from pyspark.sql import SparkSession
from operator import add

In [2]:
spark_session = SparkSession\
        .builder\
        .master("spark://host-192-168-1-37-ldsa:7077") \
        .appName("project_team25_thursday")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

In [3]:
# Load the csv from HDFS
data_frame = spark_session.read\
    .option("header", "true")\
    .csv('hdfs://192.168.1.153:9000/team25/libraryDataset/Library_Collection_Inventory.csv')

In [29]:
data_frame.show()

+-------+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+--------------+------------+------------+----------+---------+
| BibNum|               Title|              Author|                ISBN|PublicationYear|           Publisher|            Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|ReportDate|ItemCount|
+-------+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+--------------+------------+------------+----------+---------+
|3011076|A tale of two fri...|       O'Ryan, Ellie|1481425730, 14814...|          2014.|    Simon Spotlight,|Musicians Fiction...|    jcbk|         ncrdr|    Floating|         qna|09/01/2017|        1|
|2248846|Naruto. Vol. 1, U...|Kishimoto, Masash...|          1569319006|   2003, c1999.|                Viz,|Ninja Japan Comic...|    acbk|       nycomic|          NA|         lcy|09/01/2017| 

In [30]:
data_frame.count()

2687149

In [32]:
data_frame.printSchema()

root
 |-- BibNum: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- ItemCollection: string (nullable = true)
 |-- FloatingItem: string (nullable = true)
 |-- ItemLocation: string (nullable = true)
 |-- ReportDate: string (nullable = true)
 |-- ItemCount: string (nullable = true)



In [31]:
req_df = data_frame.select('BibNum','Title','Author','PublicationYear','Publisher','Subjects').distinct()

In [33]:
req_df.printSchema()

root
 |-- BibNum: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)



In [34]:
req_df.count()

584901

In [35]:
data_frame_new = spark_session.read\
    .option("header", "true")\
    .csv('hdfs://192.168.1.153:9000/team25/libraryDataset/Checkouts_By_Title_Data_Lens_*.csv')

In [36]:
data_frame_new.show()

+---------+-------------+--------+----------+--------------------+--------------------+
|BibNumber|  ItemBarcode|ItemType|Collection|          CallNumber|    CheckoutDateTime|
+---------+-------------+--------+----------+--------------------+--------------------+
|  1842225|0010035249209|    acbk|     namys|  MYSTERY ELKINS1999|05/23/2005 03:20:...|
|  1928264|0010037335444|    jcbk|     ncpic|            E TABACK|12/14/2005 05:56:...|
|  1982511|0010039952527|   jcvhs|   ncvidnf|VHS J796.2 KNOW_Y...|08/11/2005 01:52:...|
|  2026467|0010040985615|    accd|      nacd|  CD 782.421642 Y71T|10/19/2005 07:47:...|
|  2174698|0010047696215|    jcbk|     ncpic|           E KROSOCZ|12/29/2005 03:42:...|
|  1602768|0010028318730|    jcbk|     ncpic|             E BLACK|10/08/2005 02:15:...|
|  2285195|0010053424767|    accd|      cacd|   CD 782.42166 F19R|09/30/2005 10:16:...|
|  2245955|0010048392665|    jcbk|      ncnf|  J949.73 Or77S 2004|12/05/2005 05:03:...|
|   770918|0010044828100|    jcb

In [37]:
data_frame_new.count()

91980693

In [38]:
data_frame_new.rdd.getNumPartitions()

52

In [39]:
data_frame_new.printSchema()

root
 |-- BibNumber: string (nullable = true)
 |-- ItemBarcode: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- Collection: string (nullable = true)
 |-- CallNumber: string (nullable = true)
 |-- CheckoutDateTime: string (nullable = true)



In [40]:
# Load the csv from HDFS
data_frame_latest = spark_session.read\
    .option("header", "true")\
    .csv('hdfs://192.168.1.153:9000/team25/libraryDataset/Integrated_Library_System__ILS__Data_Dictionary.csv')\
    .cache()

In [41]:
data_frame_latest.show()

+-------+--------------------+---------+------------+---------------+--------------+-----------------+
|   Code|         Description|Code Type|Format Group|Format Subgroup|Category Group|Category Subgroup|
+-------+--------------------+---------+------------+---------------+--------------+-----------------+
| pkbknh|     Peak Picks Book| ItemType|        null|           null|          null|             null|
|  acart|Framed Art: Adult/YA| ItemType|       Media|            Art|          null|             null|
|   acbk|      Book: Adult/YA| ItemType|       Print|           Book|          null|             null|
|  accas|Audio Tape: Adult/YA| ItemType|       Media|     Audio Tape|          null|             null|
|   accd|        CD: Adult/YA| ItemType|       Media|     Audio Disc|          null|             null|
|accdrom|    CD-ROM: Adult/YA| ItemType|       Media|      Data Disc|          null|             null|
| acdisk|  Diskette: Adult/YA| ItemType|       Media|      Data Disc|    

In [42]:
data_frame_latest.count()

555

In [43]:
data_frame_latest.rdd.getNumPartitions()

1

In [44]:
data_frame_latest.printSchema()

root
 |-- Code: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Code Type: string (nullable = true)
 |-- Format Group: string (nullable = true)
 |-- Format Subgroup: string (nullable = true)
 |-- Category Group: string (nullable = true)
 |-- Category Subgroup: string (nullable = true)



In [45]:
data_frame_merged = data_frame_new.join(data_frame_latest, data_frame_new.ItemType == data_frame_latest.Code)

In [46]:
data_frame_merged.show()

+---------+-------------+--------+----------+--------------------+--------------------+-----+--------------------+---------+------------+---------------+--------------+-----------------+
|BibNumber|  ItemBarcode|ItemType|Collection|          CallNumber|    CheckoutDateTime| Code|         Description|Code Type|Format Group|Format Subgroup|Category Group|Category Subgroup|
+---------+-------------+--------+----------+--------------------+--------------------+-----+--------------------+---------+------------+---------------+--------------+-----------------+
|  1842225|0010035249209|    acbk|     namys|  MYSTERY ELKINS1999|05/23/2005 03:20:...| acbk|      Book: Adult/YA| ItemType|       Print|           Book|          null|             null|
|  1928264|0010037335444|    jcbk|     ncpic|            E TABACK|12/14/2005 05:56:...| jcbk|           Book: Juv| ItemType|       Print|           Book|          null|             null|
|  1982511|0010039952527|   jcvhs|   ncvidnf|VHS J796.2 KNOW_Y...

In [47]:
data_frame_merged.count()

91980694

In [48]:
merged_data_frame = data_frame_merged.join(req_df, data_frame_merged.BibNumber == req_df.BibNum)

In [49]:
merged_data_frame.show()

+---------+-------------+--------+----------+-------------+--------------------+----+--------------+---------+------------+---------------+--------------+-----------------+------+--------------------+---------------+---------------+-----------------+--------------------+
|BibNumber|  ItemBarcode|ItemType|Collection|   CallNumber|    CheckoutDateTime|Code|   Description|Code Type|Format Group|Format Subgroup|Category Group|Category Subgroup|BibNum|               Title|         Author|PublicationYear|        Publisher|            Subjects|
+---------+-------------+--------+----------+-------------+--------------------+----+--------------+---------+------------+---------------+--------------+-----------------+------+--------------------+---------------+---------------+-----------------+--------------------+
|   100704|0010007884181|    acbk|      canf|917.9794 W85M|09/21/2006 11:53:...|acbk|Book: Adult/YA| ItemType|       Print|           Book|          null|             null|100704|Men, 

In [50]:
merged_data_frame.count()

62566971

In [51]:
merged_data_frame.rdd.getNumPartitions()

200

In [52]:
merged_data_frame.printSchema()

root
 |-- BibNumber: string (nullable = true)
 |-- ItemBarcode: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- Collection: string (nullable = true)
 |-- CallNumber: string (nullable = true)
 |-- CheckoutDateTime: string (nullable = true)
 |-- Code: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Code Type: string (nullable = true)
 |-- Format Group: string (nullable = true)
 |-- Format Subgroup: string (nullable = true)
 |-- Category Group: string (nullable = true)
 |-- Category Subgroup: string (nullable = true)
 |-- BibNum: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)



In [61]:
x='201909'
x[0:4]

'2019'

In [76]:
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def convTime2Year(data):
    newyear = data[6:10]
    return int(newyear)

udf_toyear = udf(convTime2Year, IntegerType())

In [82]:
import time

In [77]:
newdata = merged_data_frame.withColumn('Year', udf_toyear('CheckoutDateTime'))

In [79]:
newdata.select('Year').first()

Row(Year=2005)

In [83]:
start_time = time.time()
merged_data_frame.select('CheckoutDateTime').first()
print(time.time() - start_time)

139.55632281303406


In [56]:
mostDesc = merged_data_frame.groupby('Description').count().select('Description', f.col('count')).orderBy('count',ascending=False).cache()

In [58]:
mostAuthorBook = merged_data_frame.filter('`Format Subgroup` == "Book" ').groupby('Author','Format Subgroup').count().select('Author','Format Subgroup', f.col('count')).orderBy('count',ascending=False).cache()

In [59]:
mostAuthorBook.show()

+--------------------+---------------+-------+
|              Author|Format Subgroup|  count|
+--------------------+---------------+-------+
|                null|           Book|1479059|
|          Seuss, Dr.|           Book| 217611|
|         Willems, Mo|           Book| 195779|
|      Meadows, Daisy|           Book| 178522|
|  Osborne, Mary Pope|           Book| 161940|
|     Rylant, Cynthia|           Book| 154421|
|Davis, Jim, 1945 ...|           Book| 144607|
|   Stilton, Geronimo|           Book| 129746|
|Patterson, James,...|           Book|  90592|
|Berenstain, Stan,...|           Book|  90097|
|   Holm, Jennifer L.|           Book|  79989|
|        Arnold, Tedd|           Book|  76690|
|  Pilkey, Dav, 1966-|           Book|  75180|
|      O'Connor, Jane|           Book|  71431|
|        Hunter, Erin|           Book|  70437|
|         Carle, Eric|           Book|  64071|
|       Park, Barbara|           Book|  58562|
|       Riordan, Rick|           Book|  57823|
|McCall Smith

In [60]:
mostDesc.show()

+--------------------+--------+
|         Description|   count|
+--------------------+--------+
|      Book: Adult/YA|20076679|
|       DVD: Adult/YA|15881200|
|           Book: Juv|15840463|
|        CD: Adult/YA| 7423766|
|DVD: Juv Circulating| 2284498|
|             CD: Juv|  812341|
|Music Score: Adul...|  112357|
|Book: Adult/Forme...|   29337|
|            Kit: Juv|   27130|
|     Peak Picks Book|   27093|
| Equipment: Adult/YA|   17572|
|  Book: Ref Adult/YA|   11775|
|Laptop: Hourly Ci...|    6359|
|  Uncataloged Folder|    5957|
|DVD: Adult/Former...|    4230|
|Song Book: Bound Juv|    1224|
|Equipment: Adult/...|     887|
|       Kit: Adult/YA|     393|
|     Tablet computer|     386|
|Equipment: Ref Ad...|     338|
+--------------------+--------+
only showing top 20 rows



In [85]:
mostSeuessYear = newdata.filter('`Author` == "Seuss, Dr." ').groupby('Author','Year').count().select('Author','Year', f.col('count')).orderBy('Year',ascending=False).cache()

In [87]:
mostSeuessYear.show()

+----------+----+-----+
|    Author|Year|count|
+----------+----+-----+
|Seuss, Dr.|2017|17029|
|Seuss, Dr.|2016|19137|
|Seuss, Dr.|2015|22353|
|Seuss, Dr.|2014|21170|
|Seuss, Dr.|2013|23796|
|Seuss, Dr.|2012|25102|
|Seuss, Dr.|2011|16706|
|Seuss, Dr.|2010|17019|
|Seuss, Dr.|2009|15387|
|Seuss, Dr.|2008|14722|
|Seuss, Dr.|2007|14667|
|Seuss, Dr.|2006|13695|
|Seuss, Dr.|2005| 9182|
+----------+----+-----+



In [94]:
import pyspark

In [95]:
newdata.persist(pyspark.StorageLevel.MEMORY_ONLY)

DataFrame[BibNumber: string, ItemBarcode: string, ItemType: string, Collection: string, CallNumber: string, CheckoutDateTime: string, Code: string, Description: string, Code Type: string, Format Group: string, Format Subgroup: string, Category Group: string, Category Subgroup: string, BibNum: string, Title: string, Author: string, PublicationYear: string, Publisher: string, Subjects: string, Year: int]

In [96]:
newdata.count()

62566971

In [108]:
sizedf = newdata.filter('`Description` == "Uncataloged Folder"')

In [112]:
sizedf.count()

KeyboardInterrupt: 

In [None]:
sizedf.rdd.getNumPartitions()

In [111]:
sizedf.write.csv('hdfs://192.168.1.153:9000/team25/libraryDataset/temp.csv')

KeyboardInterrupt: 