# Python Spark DataFrames Project


Spark documentation available at:
https://spark.apache.org/docs/2.3.1/


## Q1- How many trips were started in each year present in the data set?



In [31]:
%%time
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import statistics as statis

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try :
    lines = sc.textFile('Taxi_Trips_151MB.csv')
    logRows = lines.filter( lambda line : len(line) > 0 ).map( lambda line : line.split(';'))\
                   .map( lambda line: line[2].split('/')).map( lambda arr: Row( Year = arr[2].split(' ')[0]))
    
    logRowsDF = spark.createDataFrame( logRows )
    frequencies = logRowsDF.groupBy('Year').agg(count('Year').alias('Trips'))
    frequencies_ordered = frequencies.sort('Year')
    frequencies_ordered.show(10)
    
except Exception as err:
    print(err)
    sc.stop()
    


+----+-----+
|Year|Trips|
+----+-----+
|2013|54409|
|2014|74753|
|2015|64761|
|2016|63628|
|2017|50006|
|2018|41567|
|2019|32797|
|2020| 6829|
+----+-----+

CPU times: user 129 ms, sys: 87.9 ms, total: 217 ms
Wall time: 8.21 s


## Q2- For each of the 24 hours of the day, how many taxi trips there were, what was their average trip miles and trip total cost? 
 Non-integer values should be printed with two decimal places




In [32]:
%%time
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import statistics as statis
spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try :
    lines = sc.textFile('Taxi_Trips_151MB.csv')
    rows = lines.filter( lambda line : len(line) > 0 )   \
                        .map( lambda line : line.split(';') ) \
                        .map( lambda arr : Row(   hour = arr[2][11:13].split(":")[0]+" "+arr[2][20:].split(":")[0], \
                                                  miles = arr[5].replace(',',''),\
                                                  cost = arr[14].replace(',','')))
    rowsDF = spark.createDataFrame( rows )
    
       
    result=rowsDF.groupBy(col("hour")).agg(count("hour").alias("Trips"),\
                                           format_number(avg('miles'), 2).alias("Avg_Miles"),\
                                           format_number(avg('cost'), 2).alias("Avg_Cost"))
    resultOrder=result.sort("hour")
    
    resultOrder.show(24)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()



+-----+-----+---------+--------+
| hour|Trips|Avg_Miles|Avg_Cost|
+-----+-----+---------+--------+
|01 AM|11166|     2.46|   13.73|
|01 PM|20181|     3.38|   15.79|
|02 AM| 8832|     2.35|   12.71|
|02 PM|20039|     3.56|   16.10|
|03 AM| 6594|     2.48|   12.96|
|03 PM|20708|     3.59|   17.38|
|04 AM| 4604|     3.80|   15.19|
|04 PM|21714|     3.34|   16.87|
|05 AM| 4087|     5.94|   20.94|
|05 PM|23639|     3.09|   15.20|
|06 AM| 5629|     5.57|   19.93|
|06 PM|25446|     2.94|   14.62|
|07 AM|10145|     3.98|   17.09|
|07 PM|25402|     2.94|   15.26|
|08 AM|15695|     2.98|   13.96|
|08 PM|22222|     3.18|   15.99|
|09 AM|18248|     3.23|   14.47|
|09 PM|19786|     3.40|   15.95|
|10 AM|17777|     3.30|   15.34|
|10 PM|18492|     3.11|   15.29|
|11 AM|18622|     3.56|   16.48|
|11 PM|16303|     3.07|   15.00|
|12 AM|13544|     2.83|   14.45|
|12 PM|19875|     3.38|   16.04|
+-----+-----+---------+--------+

CPU times: user 149 ms, sys: 38.6 ms, total: 188 ms
Wall time: 9.21 s


## Q3- For each of the 24 hours of the day, which are the (up to) 5 most popular routes (pairspickup/dropoff regions) according to the the total number of taxi trips? Also report and the average fare (total trip cost). 
Non-integer values should be printed with two decimal places.


In [33]:
%%time
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col
import pyspark.sql.functions as F

import statistics as statis
spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try :
    lines = sc.textFile('Taxi_Trips_151MB.csv')
    rows = lines.filter( lambda line : len(line) > 0 )   \
                        .map( lambda line : line.split(';') ) \
                        .filter( lambda v: len(v[6])>0 and len(v[7])>0) \
                        .map( lambda arr : Row(    hour = arr[2][11:13].split(":")[0]+" "+arr[2][20:].split(":")[0]+" ", \
                                                  pickup = arr[6]+" ", dropoff = arr[7]+" ",\
                                                  miles = arr[5].replace(',',''),\
                                                  cost = arr[14].replace(',','')))
    df = spark.createDataFrame( rows )
       
    result=df.groupBy('hour','pickup','dropoff').agg(count('*').alias("Trips"),\
                                           format_number(avg('miles'), 2).alias("Avg_Miles"),\
                                           format_number(avg('cost'), 2).alias("Avg_Cost"))
    
    window = Window.partitionBy(result["hour"]).orderBy(result['Trips'].desc())
    
    #result.show()
    resultv1=result.select('*', row_number().over(window).alias('rank')).filter(col('rank') <= 5).sort('hour').drop('rank')
    solutionv0=resultv1.withColumn('Pickup_Location', F.concat('hour','pickup','dropoff'))
    solutionv1=solutionv0.drop('hour','pickup','dropoff')
    solutionv2=solutionv1.select('Pickup_Location','Trips','Avg_Miles','Avg_Cost')
    solutionv2.show(5*24,False)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()





+------------------------------+-----+---------+--------+
|Pickup_Location               |Trips|Avg_Miles|Avg_Cost|
+------------------------------+-----+---------+--------+
|01 AM 17031081700 17031081700 |96   |0.46     |6.91    |
|01 AM 17031081700 17031081800 |73   |0.54     |6.96    |
|01 AM 17031081800 17031081800 |47   |0.42     |5.99    |
|01 AM 17031081700 17031320100 |41   |0.78     |7.55    |
|01 AM 17031081700 17031839100 |41   |0.79     |8.09    |
|01 PM 17031320100 17031839100 |264  |0.76     |7.54    |
|01 PM 17031839100 17031839100 |433  |0.54     |6.68    |
|01 PM 17031839100 17031320100 |271  |0.78     |7.52    |
|01 PM 17031839100 17031081700 |173  |0.59     |7.07    |
|01 PM 17031081500 17031839100 |168  |0.94     |8.04    |
|02 AM 17031081700 17031081800 |75   |0.44     |7.10    |
|02 AM 17031081700 17031081700 |75   |0.43     |6.41    |
|02 AM 17031081700 17031320100 |43   |0.78     |7.67    |
|02 AM 17031081800 17031081800 |43   |0.72     |7.51    |
|02 AM 1703108

## Q4- How many payments, and each payment type, where made in each location

In [34]:
%%time
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import statistics as statis
spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try :
    lines = sc.textFile('Taxi_Trips_151MB.csv')
    rows = lines.filter( lambda line : len(line) > 0 )   \
                        .map( lambda line : line.split(';') )\
                        .filter(lambda v: len(v[6])>0 and len(v[15])>0) \
                        .map( lambda arr : Row(   pickupRegion = arr[6], \
                                                  paymenType=arr[15], ))
    rowsDF = spark.createDataFrame( rows )
    
    table_proc=rowsDF.groupBy(col("pickupRegion"),col("paymenType")).count()
       
    resultOrder=table_proc.sort("pickupRegion","count",ascending=False)
    
    resultOrder.show(100)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()


+------------+-----------+-----+
|pickupRegion| paymenType|count|
+------------+-----------+-----+
| 17031980100|Credit Card| 2919|
| 17031980100|       Cash| 1565|
| 17031980100|     Mobile|   15|
| 17031980100|  No Charge|   13|
| 17031980100|    Unknown|    5|
| 17031980100|    Dispute|    1|
| 17031980100|     Prcard|    1|
| 17031980000|Credit Card|10430|
| 17031980000|       Cash| 6010|
| 17031980000|     Mobile|   67|
| 17031980000|  No Charge|   39|
| 17031980000|    Unknown|   11|
| 17031980000|    Dispute|    9|
| 17031980000|     Prcard|    3|
| 17031980000|      Split|    1|
| 17031843700|       Cash|   16|
| 17031843700|Credit Card|    7|
| 17031843500|       Cash|    1|
| 17031843300|Credit Card|    2|
| 17031843300|       Cash|    1|
| 17031843200|       Cash|    1|
| 17031843100|Credit Card|    2|
| 17031843100|       Cash|    1|
| 17031842900|Credit Card|    2|
| 17031842900|       Cash|    2|
| 17031842300|       Cash|  717|
| 17031842300|Credit Card|  250|
| 17031842