In [1]:
import prjmod as pm
import prjmod.commons as commons
from pyspark.sql import SparkSession
import pandas as pd

In [2]:

spark = SparkSession.builder.getOrCreate()

In [3]:
# Print the tables in the catalog
print(spark.catalog.listTables())

[]


# Reading Data 

## Local file to Spark DataFrame (SDF)

In [4]:
sdf_flights = spark.read.csv(commons.DL_FILE_FLIGHTS, header=True, sep = ';')

In [5]:
sdf_flights.show()

+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|_c0|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|  1|2013|    1|  1|     517|           515|        2|     830|           819|       11|     UA|  1545| N14228|   EWR| IAH|     227|    1400|   5|    15|2013-01-01 05:00:00|
|  2|2013|    1|  1|     533|           529|        4|     850|           830|       20|     UA|  1714| N24211|   LGA| IAH|     227|    1416|   5|    29|2013-01-01 05:00:00|
|  3|2013|    1|  1|     542|           540|        2|     923|           850|       33|     AA|  1141| N619AA|   JFK| MIA|     16

In [6]:
print(sdf_flights.describe())

DataFrame[summary: string, _c0: string, year: string, month: string, day: string, dep_time: string, sched_dep_time: string, dep_delay: string, arr_time: string, sched_arr_time: string, arr_delay: string, carrier: string, flight: string, tailnum: string, origin: string, dest: string, air_time: string, distance: string, hour: string, minute: string, time_hour: string]


In [7]:
# Add sdf_flights to the catalog
sdf_flights.createOrReplaceTempView("sdf_flights_temp")

print(spark.catalog.listTables())

[Table(name='sdf_flights_temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [17]:
# Now we can get the data as SDF from the table in the Spark cluster:
sdf_flights_frm_table = spark.table('sdf_flights_temp')

## SDF to pandas DataFrame (PDF)

In [8]:
pdf_flights = sdf_flights.toPandas()

In [9]:
print(pdf_flights.head())

  _c0  year month day dep_time sched_dep_time dep_delay arr_time  \
0   1  2013     1   1      517            515         2      830   
1   2  2013     1   1      533            529         4      850   
2   3  2013     1   1      542            540         2      923   
3   4  2013     1   1      544            545        -1     1004   
4   5  2013     1   1      554            600        -6      812   

  sched_arr_time arr_delay carrier flight tailnum origin dest air_time  \
0            819        11      UA   1545  N14228    EWR  IAH      227   
1            830        20      UA   1714  N24211    LGA  IAH      227   
2            850        33      AA   1141  N619AA    JFK  MIA      160   
3           1022       -18      B6    725  N804JB    JFK  BQN      183   
4            837       -25      DL    461  N668DN    LGA  ATL      116   

  distance hour minute            time_hour  
0     1400    5     15  2013-01-01 05:00:00  
1     1416    5     29  2013-01-01 05:00:00  
2     10

In [10]:
pdf_flights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 20 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   _c0             336776 non-null  object
 1   year            336776 non-null  object
 2   month           336776 non-null  object
 3   day             336776 non-null  object
 4   dep_time        336776 non-null  object
 5   sched_dep_time  336776 non-null  object
 6   dep_delay       336776 non-null  object
 7   arr_time        336776 non-null  object
 8   sched_arr_time  336776 non-null  object
 9   arr_delay       336776 non-null  object
 10  carrier         336776 non-null  object
 11  flight          336776 non-null  object
 12  tailnum         336776 non-null  object
 13  origin          336776 non-null  object
 14  dest            336776 non-null  object
 15  air_time        336776 non-null  object
 16  distance        336776 non-null  object
 17  hour            336776 non-nu

## Pandas to SDF

In [11]:
# Create sdf_flights_2 from pd_temp
sdf_flights_2 = spark.createDataFrame(pdf_flights.iloc[0:100, :])


In [None]:
sdf_flights_2.show()

In [13]:
# Examine the tables in the catalog
print(spark.catalog.listTables())

[Table(name='sdf_flights_temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [14]:

# Add sdf_flights_2 to the catalog
sdf_flights_2.createOrReplaceTempView("sdf_flights_2_temp")

# Examine the tables in the catalog again
print(spark.catalog.listTables())

[Table(name='sdf_flights_2_temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True), Table(name='sdf_flights_temp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


# Querying Spark Tables with SQL Spark

In [15]:
flights10 = spark.sql("SELECT * FROM sdf_flights_temp LIMIT 10")

print(flights10)

DataFrame[_c0: string, year: string, month: string, day: string, dep_time: string, sched_dep_time: string, dep_delay: string, arr_time: string, sched_arr_time: string, arr_delay: string, carrier: string, flight: string, tailnum: string, origin: string, dest: string, air_time: string, distance: string, hour: string, minute: string, time_hour: string]


In [16]:
flights10.show()

+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|_c0|year|month|day|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|carrier|flight|tailnum|origin|dest|air_time|distance|hour|minute|          time_hour|
+---+----+-----+---+--------+--------------+---------+--------+--------------+---------+-------+------+-------+------+----+--------+--------+----+------+-------------------+
|  1|2013|    1|  1|     517|           515|        2|     830|           819|       11|     UA|  1545| N14228|   EWR| IAH|     227|    1400|   5|    15|2013-01-01 05:00:00|
|  2|2013|    1|  1|     533|           529|        4|     850|           830|       20|     UA|  1714| N24211|   LGA| IAH|     227|    1416|   5|    29|2013-01-01 05:00:00|
|  3|2013|    1|  1|     542|           540|        2|     923|           850|       33|     AA|  1141| N619AA|   JFK| MIA|     16