<a href="https://colab.research.google.com/github/maheshbabu-r/BIG_DATA_PySpark/blob/main/Spark_taxi_App_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pwd
!ls
!python --version

/content
sample_data		   spark-3.1.2-bin-hadoop3.2.tgz
spark-3.1.2-bin-hadoop3.2  yellow.csv
Python 3.7.11


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz  # supress download output use -q

!tar -zxvf spark-3.1.2-bin-hadoop3.2.tgz | grep "something" 2>/dev/null #Suppress tar output ---| grep "something" 2>/dev/null--- add after file_name"

!pip -q install findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext

# or import like this
from pyspark.sql.functions import *

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))
spark = SparkSession.builder.appName("Analysis taxi App").getOrCreate()
print(spark.sparkContext.appName)

pyspark-shell


**Problem Statement:**

---


Imagine that you are working as an analyst for a famous Taxi App company. Your organization
provides hassle-free travel to people all around the world. You have been provided with a
Spark–Hadoop setup to perform certain analytical tasks.


In [4]:
!wget -q https://raw.githubusercontent.com/maheshbabu-r/BIG_DATA/main/Hadoop%20Datasets/yellow.csv

In [5]:
df=spark.read.format("csv").option("header","true").load("/content/yellow.csv")
df.printSchema()
df.show(5)

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- trip_time: string (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+-----------------+----------------+----------+----------

In [6]:
# 1. What is the total number of trips (equal to the number of rows)?
df.count()

10000

In [7]:
# 2. What is the total revenue generated by all the trips? The fare is stored in the column,total_amount.
df.select(sum("total_amount").alias("total_revenue")).show()

+------------------+
|     total_revenue|
+------------------+
|160546.80999999488|
+------------------+



In [8]:
# 3. What fraction of the total is paid for tolls? The toll is stored in tolls_amount.
df.select((sum("tolls_amount")*100/sum("total_amount")).alias("fraction of the total for tolls")).show()

+-------------------------------+
|fraction of the total for tolls|
+-------------------------------+
|             1.5553034034124158|
+-------------------------------+



In [9]:
# 4. What fraction of it is given as driver tips? The tip is stored in tip_amount.
df.select((sum("tip_amount")*100/sum("total_amount")).alias("fraction of the total for tip")).show()

+-----------------------------+
|fraction of the total for tip|
+-----------------------------+
|           10.785203393328445|
+-----------------------------+



In [10]:
# 5. What is the average trip amount?
df.select(mean("total_amount").alias("avg_trip_amount")).show()

+------------------+
|   avg_trip_amount|
+------------------+
|16.054680999999487|
+------------------+



In [11]:
# 6. What is the average distance of the trips? Distance is stored in the column, trip_distance
df.select(mean("trip_distance").alias("avg_trip_distance")).show()

+-----------------+
|avg_trip_distance|
+-----------------+
|3.253033000000003|
+-----------------+



In [12]:
# 7. How many different payment types are used?
df.select("payment_type").distinct().count()

4

In [13]:
# 8. For each payment type, display the following details:
# a. Average fare generated
# b. Average tip
# c. Average tax – tax is stored in the column, mta_tax


df.groupby("payment_type").agg(mean("fare_amount").alias("avg_fare_amount"),\
                               mean("tip_amount").alias("avg_tip_amount"),\
                               mean("mta_tax").alias("avg_mta_tax")).orderBy("payment_type").show(truncate=False)

+------------+------------------+-----------------+-------------------+
|payment_type|avg_fare_amount   |avg_tip_amount   |avg_mta_tax        |
+------------+------------------+-----------------+-------------------+
|1           |13.561018272684619|2.704248008745903|0.49711072934561923|
|2           |11.393383098591547|0.0              |0.4988732394366197 |
|3           |13.21078947368421 |0.0              |0.42105263157894735|
|4           |12.222222222222221|0.0              |0.5                |
+------------+------------------+-----------------+-------------------+



In [14]:
# 9. On average, which hour of the day generates the highest revenue?
df.groupby(hour(col("tpep_dropoff_datetime")).alias("hour")).agg(sum("total_amount").alias("max_revenue"))\
                                                  .orderBy(desc("max_revenue")).show(1)

+----+------------------+
|hour|       max_revenue|
+----+------------------+
|  23|108332.74000000632|
+----+------------------+
only showing top 1 row

