In [1]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession, Row
import pandas as pd


In [2]:
spark = SparkSession.builder.appName("rdd-app").config("spark.config.option", "value").getOrCreate()
scfg = SparkConf().setAppName('rdd-app')
sc = spark.sparkContext

In [3]:
import string

text_file = '/user/student/shakespeare/tragedy/hamlet.txt'
text = sc.textFile(text_file)

In [4]:
def strip_punc(s):
    return s.translate(str.maketrans('', '', string.punctuation)).split(' ')

def search_word_in_line(word):
    count = 1
    for line in text.collect():
        if word in strip_punc(line):
            print('{}. {}'.format(count, line))
        count += 1

In [5]:
flatmap = text.flatMap(lambda line: line.translate(str.maketrans('', '', string.punctuation)).split(' '))
map = flatmap.map(lambda word: (word, 1))
reduced = map.reduceByKey(lambda a, b: a + b)

In [6]:
counts = text.flatMap(lambda line: line.translate(str.maketrans('', '', string.punctuation)).split(' '))\
             .map(lambda word: (word, 1))\
             .reduceByKey(lambda a, b: a + b)    

In [7]:
word = "purpose"
for count in reduced.collect():
    # kv = str(count).translate(str.maketrans('', '', string.punctuation)).split(' ')
    kv = strip_punc(str(count))
    if word == kv[0]:
        print('Found \'{}\' occurs \'{}\' times'.format(kv[0], kv[1])) 
        search_word_in_line(word)
        break

Found 'purpose' occurs '11' times
2599.     Why, any thing, but to the purpose. You were sent
2926.     Black as his purpose, did the night resemble
3216.     And drive his purpose on to these delights.
3540.     from the purpose of playing, whose end, both at the
3909.     The passion ending, doth the purpose lose.
4766.     Is but to whet thy almost blunted purpose.
6202.     And, for that purpose, I'll anoint my sword.
6227.     Our purpose may hold there.
6378.     purpose, confess thyself--
7328.     king hold his purpose, I will win for him an I can;
7376.     I am constant to my purpose; they follow the king's


## Manipulating airline performance data

In [8]:

from pyspark.sql.types import Row
from datetime import datetime


In [9]:
data_by_year = '/user/student/airline/2002.csv'
airline_performance = spark.read.option("header", "true").csv(data_by_year)

In [10]:
airline_performance.head()

Row(Year='2002', Month='1', DayofMonth='13', DayOfWeek='7', DepTime='2231', CRSDepTime='2235', ArrTime='2342', CRSArrTime='2353', UniqueCarrier='US', FlightNum='723', TailNum='N709��', ActualElapsedTime='71', CRSElapsedTime='78', AirTime='55', ArrDelay='-11', DepDelay='-4', Origin='PIT', Dest='CLT', Distance='366', TaxiIn='3', TaxiOut='13', Cancelled='0', CancellationCode='NA', Diverted='0', CarrierDelay='NA', WeatherDelay='NA', NASDelay='NA', SecurityDelay='NA', LateAircraftDelay='NA')

In [11]:
airline_performance

DataFrame[Year: string, Month: string, DayofMonth: string, DayOfWeek: string, DepTime: string, CRSDepTime: string, ArrTime: string, CRSArrTime: string, UniqueCarrier: string, FlightNum: string, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: string, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: string, TaxiIn: string, TaxiOut: string, Cancelled: string, CancellationCode: string, Diverted: string, CarrierDelay: string, WeatherDelay: string, NASDelay: string, SecurityDelay: string, LateAircraftDelay: string]

In [12]:
from pyspark.sql.types import IntegerType
airline_performance = airline_performance.withColumn("ArrDelay", airline_performance["ArrDelay"].cast(IntegerType()))
airline_performance = airline_performance.withColumn("DepDelay", airline_performance["DepDelay"].cast(IntegerType()))

In [13]:
airline_performance.describe(['ArrDelay']).show()
airline_performance.describe(['DepDelay']).show()

+-------+------------------+
|summary|          ArrDelay|
+-------+------------------+
|  count|           5197860|
|   mean| 3.191243511752914|
| stddev|29.479317265839487|
|    min|              -987|
|    max|              2137|
+-------+------------------+

+-------+------------------+
|summary|          DepDelay|
+-------+------------------+
|  count|           5206216|
|   mean| 5.532276225189274|
| stddev|26.099045759971386|
|    min|             -1370|
|    max|              2119|
+-------+------------------+



In [14]:
import pyspark.sql.functions as F
airline_performance.select(airline_performance.Dest,F.when(airline_performance.ArrDelay > 0, 1).otherwise(0)).show()

+----+------------------------------------------+
|Dest|CASE WHEN (ArrDelay > 0) THEN 1 ELSE 0 END|
+----+------------------------------------------+
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|
| CLT|                                         0|


In [15]:
airline_performance = airline_performance.withColumn('ArrDelayCount',F.when(airline_performance.ArrDelay > 0, 1).otherwise(0))
airline_performance = airline_performance.withColumn('DepDelayCount',F.when(airline_performance.DepDelay > 0, 1).otherwise(0))
airline_performance.show(2)

+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+-------------+-------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|ArrDelayCount|DepDelayCount|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+-------------+-------------+
|2002|    

In [16]:
#Most Depature Delay Airport 2002
airline_performance\
.filter(F.col('DepDelayCount') == 1)\
.groupBy('Year','Origin')\
.count()\
.orderBy('count',ascending=False)\
.show(1)

+----+------+------+
|Year|Origin| count|
+----+------+------+
|2002|   ATL|116323|
+----+------+------+
only showing top 1 row



In [17]:
#Least Depature Delay Airport 2002
airline_performance\
.filter(F.col('DepDelayCount') == 1)\
.groupBy('Year','Origin')\
.count()\
.orderBy('count',ascending=True)\
.show(1)

+----+------+-----+
|Year|Origin|count|
+----+------+-----+
|2002|   ERI|    5|
+----+------+-----+
only showing top 1 row



In [18]:
#Most Arrival Delay Airport 2002
airline_performance\
.filter(F.col('ArrDelayCount') == 1)\
.groupBy('Year','Dest')\
.count()\
.orderBy('count',ascending=False)\
.show(1)

+----+----+------+
|Year|Dest| count|
+----+----+------+
|2002| ORD|115841|
+----+----+------+
only showing top 1 row



In [19]:
#least Arrival Delay Airport 2002
airline_performance\
.filter(F.col('ArrDelayCount') == 1)\
.groupBy('Year','Dest')\
.count()\
.orderBy('count',ascending=True)\
.show(1)

+----+----+-----+
|Year|Dest|count|
+----+----+-----+
|2002| ERI|    4|
+----+----+-----+
only showing top 1 row



In [20]:
#Most Arrival Delay Flight 2002
airline_performance\
.filter(F.col('ArrDelayCount') == 1)\
.groupBy('Year','UniqueCarrier','FlightNum')\
.count()\
.orderBy('count',ascending=False)\
.show(1)

+----+-------------+---------+-----+
|Year|UniqueCarrier|FlightNum|count|
+----+-------------+---------+-----+
|2002|           WN|     2233| 1013|
+----+-------------+---------+-----+
only showing top 1 row



In [21]:
#Least Arrival Delay Flight 2002
airline_performance\
.filter(F.col('ArrDelayCount') == 1)\
.groupBy('Year','UniqueCarrier','FlightNum')\
.count()\
.orderBy('count',ascending=True)\
.show(1)

+----+-------------+---------+-----+
|Year|UniqueCarrier|FlightNum|count|
+----+-------------+---------+-----+
|2002|           WN|     2512|    1|
+----+-------------+---------+-----+
only showing top 1 row



In [22]:
#Most Departure Delay Flight 2002
airline_performance\
.filter(F.col('DepDelayCount') == 1)\
.groupBy('Year','UniqueCarrier','FlightNum')\
.count()\
.orderBy('count',ascending=False)\
.show(1)

+----+-------------+---------+-----+
|Year|UniqueCarrier|FlightNum|count|
+----+-------------+---------+-----+
|2002|           WN|     1702|  928|
+----+-------------+---------+-----+
only showing top 1 row



In [23]:
#Least Departure Delay Flight 2002
airline_performance\
.filter(F.col('DepDelayCount') == 1)\
.groupBy('Year','UniqueCarrier','FlightNum')\
.count()\
.orderBy('count',ascending=True)\
.show(1)

+----+-------------+---------+-----+
|Year|UniqueCarrier|FlightNum|count|
+----+-------------+---------+-----+
|2002|           US|     1845|    1|
+----+-------------+---------+-----+
only showing top 1 row



In [24]:
airline_performance\
.groupBy('Year','UniqueCarrier')\
.agg(F.mean('ArrDelay'))\
.show()

+----+-------------+------------------+
|Year|UniqueCarrier|     avg(ArrDelay)|
+----+-------------+------------------+
|2002|           WN| 3.890699404053452|
|2002|           DL| 5.712299163508456|
|2002|           US|1.4699437440761107|
|2002|           AS| 5.404059028695579|
|2002|           UA|2.4330837498304536|
|2002|           CO|2.2972342892478936|
|2002|           NW|3.5336313012895664|
|2002|           MQ| 4.214754405699288|
|2002|           AA| 1.182908445915422|
|2002|           HP| 2.478977337198147|
+----+-------------+------------------+



In [25]:
airline_performance\
.groupBy('Year','UniqueCarrier')\
.agg(F.mean('DepDelay'))\
.show()

+----+-------------+------------------+
|Year|UniqueCarrier|     avg(DepDelay)|
+----+-------------+------------------+
|2002|           WN|  8.92502960886713|
|2002|           DL|5.5655362269371285|
|2002|           US| 3.676496743288962|
|2002|           AS|  7.35249066762614|
|2002|           UA| 4.664783588934704|
|2002|           CO|2.5368067195762296|
|2002|           NW|  4.82061634414722|
|2002|           MQ| 5.546236367849778|
|2002|           AA| 4.473949298662345|
|2002|           HP| 6.269212934339063|
+----+-------------+------------------+

