In [30]:
from pyspark.sql import SparkSession
from operator import add

In [31]:
# Configure the session against the standalone cluster master, with dynamic
# executor allocation and the shuffle service switched on.
session_builder = (
    SparkSession.builder
    .master("spark://host-192-168-1-83-ldsa:7077")
    .appName("project_team25_start")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 3)
)
spark_session = session_builder.getOrCreate()

# Handle to the lower-level RDD API (SparkContext) of the same session.
spark_context = spark_session.sparkContext

In [42]:
# Read the library collection inventory from HDFS, treating the first line as
# the header, and mark the result for caching since later cells reuse it.
inventory_reader = spark_session.read.option("header", "true")
data_frame = (
    inventory_reader
    .csv('hdfs://192.168.1.153:9000/team25/libraryDataset/Library_Collection_Inventory.csv')
    .cache()
)

In [43]:
# Preview the inventory: first 20 rows, long cell values truncated (the show() defaults).
data_frame.show(n=20, truncate=True)

+-------+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+--------------+------------+------------+----------+---------+
| BibNum|               Title|              Author|                ISBN|PublicationYear|           Publisher|            Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|ReportDate|ItemCount|
+-------+--------------------+--------------------+--------------------+---------------+--------------------+--------------------+--------+--------------+------------+------------+----------+---------+
|3011076|A tale of two fri...|       O'Ryan, Ellie|1481425730, 14814...|          2014.|    Simon Spotlight,|Musicians Fiction...|    jcbk|         ncrdr|    Floating|         qna|09/01/2017|        1|
|2248846|Naruto. Vol. 1, U...|Kishimoto, Masash...|          1569319006|   2003, c1999.|                Viz,|Ninja Japan Comic...|    acbk|       nycomic|          NA|         lcy|09/01/2017| 

In [44]:
# Number of rows in the inventory frame.
inventory_row_count = data_frame.count()
inventory_row_count

2687149

In [45]:
# Partition count of the RDD backing the inventory frame.
inventory_rdd = data_frame.rdd
inventory_rdd.getNumPartitions()

7

In [46]:
# Print the inventory column names and types. The load cell sets only the
# "header" option, and the recorded output lists every column as a string.
data_frame.printSchema()

root
 |-- BibNum: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- ItemCollection: string (nullable = true)
 |-- FloatingItem: string (nullable = true)
 |-- ItemLocation: string (nullable = true)
 |-- ReportDate: string (nullable = true)
 |-- ItemCount: string (nullable = true)



In [37]:
# Read every checkout CSV matching the glob from HDFS into one frame, treating
# the first line of each file as the header. This frame is not cached.
checkouts_reader = spark_session.read.option("header", "true")
data_frame_new = checkouts_reader.csv(
    'hdfs://192.168.1.153:9000/team25/libraryDataset/Checkouts_By_Title_Data_Lens_*.csv'
)

In [38]:
# Number of rows across all checkout files.
checkout_row_count = data_frame_new.count()
checkout_row_count

91980693

In [39]:
# Preview the checkouts: first 20 rows, long cell values truncated (the show() defaults).
data_frame_new.show(n=20, truncate=True)

+---------+-------------+--------+----------+--------------------+--------------------+
|BibNumber|  ItemBarcode|ItemType|Collection|          CallNumber|    CheckoutDateTime|
+---------+-------------+--------+----------+--------------------+--------------------+
|  1842225|0010035249209|    acbk|     namys|  MYSTERY ELKINS1999|05/23/2005 03:20:...|
|  1928264|0010037335444|    jcbk|     ncpic|            E TABACK|12/14/2005 05:56:...|
|  1982511|0010039952527|   jcvhs|   ncvidnf|VHS J796.2 KNOW_Y...|08/11/2005 01:52:...|
|  2026467|0010040985615|    accd|      nacd|  CD 782.421642 Y71T|10/19/2005 07:47:...|
|  2174698|0010047696215|    jcbk|     ncpic|           E KROSOCZ|12/29/2005 03:42:...|
|  1602768|0010028318730|    jcbk|     ncpic|             E BLACK|10/08/2005 02:15:...|
|  2285195|0010053424767|    accd|      cacd|   CD 782.42166 F19R|09/30/2005 10:16:...|
|  2245955|0010048392665|    jcbk|      ncnf|  J949.73 Or77S 2004|12/05/2005 05:03:...|
|   770918|0010044828100|    jcb

In [40]:
# Partition count of the RDD backing the checkouts frame.
checkouts_rdd = data_frame_new.rdd
checkouts_rdd.getNumPartitions()

52

In [41]:
# Print the checkout column names and types. The load cell sets only the
# "header" option, and the recorded output lists every column as a string.
data_frame_new.printSchema()

root
 |-- BibNumber: string (nullable = true)
 |-- ItemBarcode: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- Collection: string (nullable = true)
 |-- CallNumber: string (nullable = true)
 |-- CheckoutDateTime: string (nullable = true)



In [47]:
# Inner-join every checkout to the inventory rows that share its bibliographic number.
#
# Both inputs have an `ItemType` column, so the joined frame contains two columns
# with that name and a plain "ItemType" reference on it is ambiguous. Aliasing each
# side leaves the schema untouched but lets later cells pick one explicitly, e.g.
# merged_data_frame.select("checkout.ItemType", "inventory.ItemType").
#
# NOTE(review): the recorded run returned 703,511,452 joined rows for 91,980,693
# checkouts, so BibNum is not unique in the inventory and each checkout is repeated
# once per matching inventory row. If one row per checkout is intended, reduce the
# inventory to one row per BibNum before joining - confirm the intended grain.
checkouts = data_frame_new.alias("checkout")
inventory = data_frame.alias("inventory")
merged_data_frame = checkouts.join(
    inventory,
    on=checkouts["BibNumber"] == inventory["BibNum"],
    how="inner",
)

In [48]:
# Preview the joined frame: first 20 rows, long cell values truncated (the show() defaults).
merged_data_frame.show(n=20, truncate=True)

+---------+-------------+--------+----------+-------------+--------------------+------+--------------------+---------------+----------+---------------+-----------------+--------------------+--------+--------------+------------+------------+----------+---------+
|BibNumber|  ItemBarcode|ItemType|Collection|   CallNumber|    CheckoutDateTime|BibNum|               Title|         Author|      ISBN|PublicationYear|        Publisher|            Subjects|ItemType|ItemCollection|FloatingItem|ItemLocation|ReportDate|ItemCount|
+---------+-------------+--------+----------+-------------+--------------------+------+--------------------+---------------+----------+---------------+-----------------+--------------------+--------+--------------+------------+------------+----------+---------+
|   100704|0000101463974|    acbk|      canf|917.9794 W85M|08/28/2005 02:12:...|100704|Men, mules, and m...|Wood, Robert L.|0916890430|          1976.|The Mountaineers,|ONeil Joseph Patr...|    acbk|          canf|

In [49]:
# Number of rows produced by the checkout/inventory join.
merged_row_count = merged_data_frame.count()
merged_row_count

703511452

In [51]:
# Partition count of the RDD backing the joined frame.
merged_rdd = merged_data_frame.rdd
merged_rdd.getNumPartitions()

200

In [52]:
# Print the joined schema: the checkout columns followed by the inventory columns.
# The recorded output lists `ItemType` twice, once from each input.
merged_data_frame.printSchema()

root
 |-- BibNumber: string (nullable = true)
 |-- ItemBarcode: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- Collection: string (nullable = true)
 |-- CallNumber: string (nullable = true)
 |-- CheckoutDateTime: string (nullable = true)
 |-- BibNum: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- PublicationYear: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Subjects: string (nullable = true)
 |-- ItemType: string (nullable = true)
 |-- ItemCollection: string (nullable = true)
 |-- FloatingItem: string (nullable = true)
 |-- ItemLocation: string (nullable = true)
 |-- ReportDate: string (nullable = true)
 |-- ItemCount: string (nullable = true)

