In [1]:

from pyspark import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Empty SparkConf: no settings are overridden here beyond what the builder
# below specifies explicitly.
conf = SparkConf()

# Build (or reuse) a local session that runs on all available cores.
spark = (
    SparkSession.builder
    .appName("test-parquet")
    .master('local[*]')
    .config(conf=conf)
    .getOrCreate()
)

# Legacy SQLContext entry point wrapping the same SparkContext; later cells
# read data through it.
sqlContext = SQLContext(spark.sparkContext)
print('Version: {}'.format(spark.version))


Version: 2.4.2


In [2]:

# Load every Parquet file matching the glob into a single DataFrame, using the
# SparkSession reader directly instead of the legacy SQLContext wrapper.
# NOTE(review): the glob matches every *.parquet entry in this directory,
# including the NEW.parquet output written in In [5] by a previous run --
# confirm that is intended before re-running the notebook.
df = spark.read.parquet('/Users/sg0218817/Downloads/*.parquet')

# numberDepartureDate is already a timestamp column (see the printed schema),
# so to_date needs no parsing pattern: it simply truncates to the calendar date.
df = df.withColumn('my_date', to_date(col("numberDepartureDate")))

# Expose the DataFrame to spark.sql() under the name "archive".
# createOrReplaceTempView supersedes registerTempTable (deprecated since Spark 2.0).
df.createOrReplaceTempView("archive")

df.printSchema()


root
 |-- carrier: string (nullable = true)
 |-- departureDateDOW: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- flightId: string (nullable = true)
 |-- hoursToDeparture: integer (nullable = true)
 |-- isCancelled: boolean (nullable = true)
 |-- marketingCarrier: string (nullable = true)
 |-- marketingFlightNumber: integer (nullable = true)
 |-- number: integer (nullable = true)
 |-- numberDepartureDate: timestamp (nullable = true)
 |-- operatingCarrier: string (nullable = true)
 |-- optGroupDate: timestamp (nullable = true)
 |-- flightKey: string (nullable = true)
 |-- my_date: date (nullable = true)



In [3]:

# Basic shape of the data: column names and total row count.
print('column names: {}'.format(df.columns))
print('# of records: {}'.format(df.count()))

# Summary statistics for a single column.
print('the statistical summary of the given column:')
flight_id_stats = df.describe('flightId')
flight_id_stats.show()

# Three equivalent ways to reference a column in select():
# by name, by DataFrame attribute, and via col().
for carrier_ref in ('carrier', df.carrier, col('carrier')):
    df.select(carrier_ref).distinct().show()

print('selecting multiple columns:')
flight_dates = df.select('flightId', 'numberDepartureDate', 'my_date')
flight_dates.show(10)

df.select('carrier').distinct().show()

print('filtering data:')
# Reusable predicate for the flight of interest.
is_target_flight = col('flightId') == 'SU.1007[2018-11-13]'
df.where(is_target_flight).show()

# Same predicate combined with a carrier check.
df.where(is_target_flight & (col('carrier') == 'SU')).show()

# Number of records per departure timestamp, in ascending order.
records_per_departure = (
    df.groupBy('numberDepartureDate')
    .count()
    .orderBy('numberDepartureDate')
)
records_per_departure.show()


column names: ['carrier', 'departureDateDOW', 'timestamp', 'flightId', 'hoursToDeparture', 'isCancelled', 'marketingCarrier', 'marketingFlightNumber', 'number', 'numberDepartureDate', 'operatingCarrier', 'optGroupDate', 'flightKey', 'my_date']
# of records: 24749
the statistical summary of the given column:
+-------+-------------------+
|summary|           flightId|
+-------+-------------------+
|  count|              24749|
|   mean|               null|
| stddev|               null|
|    min|SU.1001[2018-02-05]|
|    max|  XX.77[2018-06-30]|
+-------+-------------------+

+-------+
|carrier|
+-------+
|     SU|
|     XX|
+-------+

+-------+
|carrier|
+-------+
|     SU|
|     XX|
+-------+

+-------+
|carrier|
+-------+
|     SU|
|     XX|
+-------+

selecting multiple columns:
+-------------------+-------------------+----------+
|           flightId|numberDepartureDate|   my_date|
+-------------------+-------------------+----------+
|SU.1001[2018-02-05]|2018-02-05 01:00:00|2018-02-0

In [4]:

# Approach 1: filter inside the SQL statement against the "archive" temp view.
Inventory = spark.sql(
    "select * from archive where flightId='SU.1007[2018-11-13]'"
)
Inventory.show(1)

# Approach 2: select everything via SQL, then filter with the DataFrame API.
# The LIKE pattern contains no % or _ wildcards, so it matches the literal
# flightId string only.
archive_rows = spark.sql("select * from archive")
Inventory = archive_rows.filter(col('flightId').like('SU.1007[2018-11-13]'))

Inventory.show(1)


+-------+----------------+-------------------+-------------------+----------------+-----------+----------------+---------------------+------+-------------------+----------------+------------+--------------------+----------+
|carrier|departureDateDOW|          timestamp|           flightId|hoursToDeparture|isCancelled|marketingCarrier|marketingFlightNumber|number|numberDepartureDate|operatingCarrier|optGroupDate|           flightKey|   my_date|
+-------+----------------+-------------------+-------------------+----------------+-----------+----------------+---------------------+------+-------------------+----------------+------------+--------------------+----------+
|     SU|            null|2018-01-02 01:00:00|SU.1007[2018-11-13]|            null|       null|            null|                 null|  1007|2018-11-13 01:00:00|            null|        null|SU.1007[2018-11-1...|2018-11-13|
+-------+----------------+-------------------+-------------------+----------------+-----------+---------

In [5]:

# Persist the DataFrame (including the derived my_date column) as Parquet.
# NOTE(review): with the default save mode this raises if the target already
# exists, so re-running the notebook fails here unless NEW.parquet is removed
# first. The target also matches the '*.parquet' input glob used in In [2].
df.write.format("parquet").save("/Users/sg0218817/Downloads/NEW.parquet")


In [6]:

# Shut down the Spark session created in In [1]. Cells above that use `spark`,
# `sqlContext` or `df` need the session to be re-created before they can run again.
spark.stop()
