### PySpark Module 5 Assignment

```
Dataset Description:
* dispatching_base_number: The base station ID
* date: The date of the record
* active_vehicles: The number of active vehicles
* trips: The number of trips
```


#### Tasks to be Done


In [0]:
# Configure Spark: resolve the local Spark installation and initialise
# findspark with it so that the pyspark imports in later cells succeed.
import findspark
findspark.init(findspark.find())

In [0]:
from pyspark.sql import SparkSession

# Create (or reuse) the session for this notebook, running on the local master
# with the 'local[2]' setting.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('UberTripsDataAnalysis')
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [0]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DateType

# Explicit schema for the Uber CSV, declared as (column name, Spark type, nullable).
schema = StructType([
    StructField(column_name, spark_type, nullable)
    for column_name, spark_type, nullable in (
        ("dispatching_base_number", StringType(), False),
        ("date", DateType(), True),
        ("active_vehicles", IntegerType(), True),
        ("trips", IntegerType(), True),
    )
])

In [0]:
# 1. Load the dataset
# Read the CSV against the explicit schema. Dates are parsed as month/day/year,
# and the reader runs in 'dropmalformed' mode.
df = (
    spark.read
    .schema(schema)
    .options(
        header=True,
        dateFormat='M/d/y',
        mode='dropmalformed',
        badRecordsPath='./badRecords',
    )
    .csv('../data/Mod5_uber_data.csv')
)

# Preview the first ten rows.
df.show(10)

In [0]:
# 2. Create a temporary SQL table of the dataset
# Register the loaded DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("UberTrip")
# CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TEMPORARY VIEW
# fails on a second execution because UberTripTemp already exists in the session.
spark.sql("CREATE OR REPLACE TEMPORARY VIEW UberTripTemp AS SELECT * FROM UberTrip")
# NOTE: despite its name, `rdd` is the DataFrame returned by spark.sql, not an RDD.
# The name is kept because later cells refer to it.
rdd = spark.sql("SELECT * FROM UberTripTemp")
rdd.show(10)

In [0]:
# 3. Print the schema of the table
# `rdd` is the DataFrame selected from the UberTripTemp view; printSchema()
# prints its column names and types.
rdd.printSchema()

In [0]:
# 4. Print all the distinct 'dispatching_base_number' values
# One row per unique base, exposed under the alias unique_dispatch_number.
distinct_dispatching_base_number = spark.sql(
    'SELECT DISTINCT dispatching_base_number as unique_dispatch_number FROM UberTripTemp'
)
distinct_dispatching_base_number.show()

In [0]:
# 5. Determine which dispatching base is the busiest based on the number of trips
# Order the bases by their summed trips (descending) and keep only the top one.
busiest_base = (
    spark.sql('select dispatching_base_number \
                            from UberTripTemp\
                            group by dispatching_base_number\
                            order by sum(trips) desc\
                            limit 1')
    .collect()[0][0]  # single row, single column -> the base number itself
)
# Bare f-string so the notebook displays the result.
f'Busiest base by trips: {busiest_base}'

In [0]:
#Alternate method - Column Based Expression
import pyspark.sql.functions as f

# Same question via the DataFrame API: total trips per base, sorted
# descending, with only the top row shown.
(
    rdd
    .groupBy('dispatching_base_number')
    .agg(f.sum('trips').alias('TotalTrips'))
    .orderBy('TotalTrips', ascending=False)
    .limit(1)
    .show()
)


In [0]:
# 6. Determine the five busiest days based on the number of trips in the time range of the data
# Sum the trips per date and show the five dates with the highest totals.
(
    spark.sql('select date, sum(trips) as TotalTrips\
            from UberTripTemp\
            group by date\
            order by sum(trips) desc\
            limit 5')
    .show()
)

In [0]:
# Alternate method - Window-Rank
from pyspark.sql.window import Window

# Rank the per-date totals from highest to lowest and keep ranks 1 to 5.
# dense_rank gives tied totals the same rank, so ties can yield more than five rows.
window = Window.orderBy(f.col('TotalTrips').desc())
(
    rdd
    .groupBy('date')
    .agg(f.sum('trips').alias('TotalTrips'))
    .withColumn('drank', f.dense_rank().over(window))
    .filter(f.col('drank') <= 5)
    .show()
)

In [0]:
# 7. Calculate the average number of active vehicles on the base station 'B02512'
# The query applies CEIL to the average, so the figure shown is rounded up.
average_no_of_active_vehicles = spark.sql(
    'SELECT CEIL(AVG(active_vehicles)) as average_active_vehicles_B02512\
                                            FROM UberTripTemp\
                                            WHERE dispatching_base_number="B02512"'
)
average_no_of_active_vehicles.show()

In [0]:
# spark.stop()