In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the current OS user so the Hive warehouse path follows whoever runs
# the notebook instead of being pinned to one hard-coded account.
username = getpass.getuser()

# Hive-enabled session on YARN. spark.ui.port is set to '0'
# (NOTE(review): presumably to avoid UI port clashes on a shared cluster).
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

# 1. Write PySpark code to create a new dataframe with the data given below having 2 columns (‘season’) and (‘windspeed’).

In [2]:
# Rows for the seasons DataFrame: (season, windspeed) pairs.
data = [
    ("Spring", 12.3),
    ("Summer", 10.5),
    ("Autumn", 8.2),
    ("Winter", 15.1),
]

In [3]:
# Echo the raw rows to confirm the literal defined in the previous cell.
data

[('Spring', 12.3), ('Summer', 10.5), ('Autumn', 8.2), ('Winter', 15.1)]

In [4]:
# DDL-style schema string for the seasons DataFrame (column name + type).
seasons_schema = "season string , windspeed float"

In [5]:
# Build the DataFrame from the in-memory rows using the DDL schema string,
# then preview it.
df_seasons = spark.createDataFrame(data=data, schema=seasons_schema)
df_seasons.show()

+------+---------+
|season|windspeed|
+------+---------+
|Spring|     12.3|
|Summer|     10.5|
|Autumn|      8.2|
|Winter|     15.1|
+------+---------+



# 2. Consider the library management dataset located at the following path
(/public/trendytech/datasets/library_data.json). Using PySpark, load the
data into a Dataframe and enforce schema using StructType.


In [6]:
# Baseline read with no schema supplied, for comparison with the
# schema-enforced read further down.
lib_df = spark.read.json("/public/trendytech/datasets/library_data.json")

In [7]:
# Preview the read done without an explicit schema; the nested array columns
# are truncated in the displayed output.
lib_df.show()

+--------------------+-----------------+-----------+--------------------+
|               books|     library_name|   location|             members|
+--------------------+-----------------+-----------+--------------------+
|[{F. Scott Fitzge...|  Central Library|City Center|[{28, [B001], M00...|
|[{George Orwell, ...|Community Library|     Suburb|[{42, [B003, B004...|
+--------------------+-----------------+-----------+--------------------+



In [8]:
from pyspark.sql.types import *

In [9]:
# Explicit schema for library_data.json: two top-level string columns plus two
# arrays of structs (books and members). members.books_borrowed is itself an
# array of strings.
schema = StructType(
    [
        StructField("library_name", StringType()),
        StructField("location", StringType()),
        StructField(
            "books",
            ArrayType(
                StructType(
                    [
                        StructField("book_id", StringType()),
                        StructField("book_name", StringType()),
                        StructField("author", StringType()),
                        StructField("copies_available", IntegerType()),
                    ]
                )
            ),
        ),
        StructField(
            "members",
            ArrayType(
                StructType(
                    [
                        StructField("member_id", StringType()),
                        StructField("member_name", StringType()),
                        StructField("age", IntegerType()),
                        StructField("books_borrowed", ArrayType(StringType())),
                    ]
                )
            ),
        ),
    ]
)


In [10]:
# Re-read the same JSON file, this time enforcing the StructType schema.
library_df = (
    spark.read
    .schema(schema)
    .json("/public/trendytech/datasets/library_data.json")
)

In [11]:
# Preview the schema-enforced read; in the output the columns follow the order
# declared in `schema` (library_name, location, books, members).
library_df.show()

+-----------------+-----------+--------------------+--------------------+
|     library_name|   location|               books|             members|
+-----------------+-----------+--------------------+--------------------+
|  Central Library|City Center|[{B001, The Great...|[{M001, John Smit...|
|Community Library|     Suburb|[{B003, 1984, Geo...|[{M003, Michael B...|
+-----------------+-----------+--------------------+--------------------+



# 3. Given the dataset (/public/trendytech/datasets/train.csv), create a Dataframe using PySpark and perform the following operations


In [12]:
# Load train.csv, treating the first line as the header and letting Spark
# infer column types, then preview the result.
train_df = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load("/public/trendytech/datasets/train.csv")
)
train_df.show()

+------------+----------+---------------+--------------+---+-------------+-----------+
|train_number|train_name|seats_available|passenger_name|age|ticket_number|seat_number|
+------------+----------+---------------+--------------+---+-------------+-----------+
|         123|   Express|            100|          John| 25|         T123|         A1|
|         123|   Express|            100|          Emma| 30|         T124|         B2|
|         456| Superfast|            150|       Michael| 35|         T125|         C3|
|         456| Superfast|            150|        Sophia| 40|         T126|         D4|
|         789|     Local|             50|       William| 28|         T127|         E5|
|         789|     Local|             50|        Sophia| 32|         T128|         F6|
|         789|     Local|             50|        Oliver| 45|         T129|         G7|
+------------+----------+---------------+--------------+---+-------------+-----------+



In [13]:
# Inspect the column types produced by the inferSchema read.
train_df.printSchema()

root
 |-- train_number: integer (nullable = true)
 |-- train_name: string (nullable = true)
 |-- seats_available: integer (nullable = true)
 |-- passenger_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- ticket_number: string (nullable = true)
 |-- seat_number: string (nullable = true)



**a) Drop the columns passenger_name and age from the dataset**

In [14]:
# Task 3a: new DataFrame without passenger_name and age
# (train_df itself is not reassigned).
df1 = train_df.drop("passenger_name", "age")

In [15]:
# Preview the frame after dropping passenger_name and age.
df1.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         123|   Express|            100|         T123|         A1|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T125|         C3|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T128|         F6|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



**b) Count the number of rows after removing duplicates of columns
train_number and ticket_number.**

In [16]:
# Task 3b: keep one row per (train_number, ticket_number) pair, then preview.
df2 = df1.dropDuplicates(subset=["train_number", "ticket_number"])
df2.show()

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         789|     Local|             50|         T128|         F6|
|         123|   Express|            100|         T124|         B2|
|         123|   Express|            100|         T123|         A1|
|         456| Superfast|            150|         T126|         D4|
|         456| Superfast|            150|         T125|         C3|
|         789|     Local|             50|         T127|         E5|
|         789|     Local|             50|         T129|         G7|
+------------+----------+---------------+-------------+-----------+



In [17]:
# Report the row counts before and after de-duplication on one line.
summary_parts = ['Before removal ', df1.count(), ' ', 'After Removal ', df2.count()]
print(' '.join(str(part) for part in summary_parts))

Before removal  7   After Removal  7


**c) Count the number of unique train names.**

In [18]:
# Distinct train names, displayed as the cell's output.
df1.select("train_name").distinct()

train_name
Express
Local
Superfast


In [19]:
# Task 3c answer: number of distinct train_name values.
df1.select("train_name").distinct().count()

3

# 4. You are working as a Data Engineer in a large retail company. The company has a dataset named "sales_data.json" that contains sales records from various stores. The dataset is stored in JSON format and may have some corrupt or malformed records due to occasional data quality issues.

**Your task is to read the "sales_data.json" dataset
(/public/trendytech/datasets/sales_data.json) using PySpark, utilizing
different read modes to handle corrupt records. You need to create a
Dataframe using pyspark and perform the following operations:**

In [20]:
# DDL schema shared by every read of sales_data.json below.
sales_schema = (
    "store_id int, "
    "product string, "
    "quantity int, "
    "revenue float"
)

**1. Read the dataset using the "permissive" mode and count the number of records read.**

In [21]:
# Permissive read: the task asks for this mode by name, so set it explicitly
# (it is also Spark's default) to match the failfast/dropmalformed reads below.
sales_df0 = (
    spark.read
    .option("mode", "permissive")
    .schema(sales_schema)
    .json("/public/trendytech/datasets/sales_data.json")
)

In [22]:
# Preview sales_df0; in the output shown, store_id 20 has a null quantity and
# a NaN revenue.
sales_df0.show()

+--------+----------+--------+-------+
|store_id|   product|quantity|revenue|
+--------+----------+--------+-------+
|       1|     Apple|      10|  100.0|
|       2|    Banana|      15|   75.0|
|       3|    Orange|      12|   90.0|
|       4|     Mango|       8|  120.0|
|       5|     Grape|      20|  150.0|
|       6|Watermelon|       5|   50.0|
|       7|Strawberry|      18|  108.0|
|       8| Pineapple|      14|  140.0|
|       9|    Cherry|       7|  105.0|
|      10|      Pear|       9|   81.0|
|      11| Blueberry|      11|   88.0|
|      12|      Kiwi|      16|  128.0|
|      13|     Peach|      13|   91.0|
|      14|      Plum|       6|   54.0|
|      15|     Lemon|      10|   70.0|
|      16| Raspberry|      17|  136.0|
|      17|   Coconut|       4|   80.0|
|      18|   Avocado|      11|   99.0|
|      19|Blackberry|       8|   64.0|
|      20|         G|    null|    NaN|
+--------+----------+--------+-------+
only showing top 20 rows



In [23]:
# Same file and schema, read with mode=failfast. The read is lazy, so nothing
# is parsed until an action runs on sales_df.
sales_df = (
    spark.read
    .option("mode", "failfast")
    .schema(sales_schema)
    .json("/public/trendytech/datasets/sales_data.json")
)

In [24]:
# Demonstrates FAILFAST: per the recorded traceback, this action aborts with
# "Malformed records are detected in record parsing".
# NOTE(review): the exception is uncaught, so Restart & Run All stops at this cell.
sales_df.show()

Py4JJavaError: An error occurred while calling o150.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 48.0 failed 4 times, most recent failure: Lost task 0.3 in stage 48.0 (TID 1022) (w01.itversity.com executor 1): org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:70)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.$anonfun$readFile$9(JsonDataSource.scala:144)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.sql.catalyst.util.BadRecordException: java.lang.RuntimeException: Failed to parse a value for data type int (current token: VALUE_STRING).
	at org.apache.spark.sql.catalyst.json.JacksonParser.parse(JacksonParser.scala:492)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.$anonfun$readFile$7(JsonDataSource.scala:140)
	at org.apache.spark.sql.catalyst.util.FailureSafeParser.parse(FailureSafeParser.scala:60)
	... 23 more
Caused by: java.lang.RuntimeException: Failed to parse a value for data type int (current token: VALUE_STRING).
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:375)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:355)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$$nestedInanonfun$makeConverter$4$1.applyOrElse(JacksonParser.scala:184)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$$nestedInanonfun$makeConverter$4$1.applyOrElse(JacksonParser.scala:184)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parseJsonToken(JacksonParser.scala:343)
	at org.apache.spark.sql.catalyst.json.JacksonParser.$anonfun$makeConverter$4(JacksonParser.scala:184)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$convertObject(JacksonParser.scala:397)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$$nestedInanonfun$makeStructRootConverter$3$1.applyOrElse(JacksonParser.scala:96)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$$nestedInanonfun$makeStructRootConverter$3$1.applyOrElse(JacksonParser.scala:95)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parseJsonToken(JacksonParser.scala:343)
	at org.apache.spark.sql.catalyst.json.JacksonParser.$anonfun$makeStructRootConverter$3(JacksonParser.scala:95)
	at org.apache.spark.sql.catalyst.json.JacksonParser.$anonfun$parse$2(JacksonParser.scala:467)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2622)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parse(JacksonParser.scala:462)
	... 25 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [25]:
# Same file and schema, read with mode=dropmalformed.
sales_df1 = (
    spark.read
    .option("mode", "dropmalformed")
    .schema(sales_schema)
    .json("/public/trendytech/datasets/sales_data.json")
)

In [26]:
# Preview the dropmalformed read; the recorded output ends at store_id 19.
sales_df1.show()

+--------+----------+--------+-------+
|store_id|   product|quantity|revenue|
+--------+----------+--------+-------+
|       1|     Apple|      10|  100.0|
|       2|    Banana|      15|   75.0|
|       3|    Orange|      12|   90.0|
|       4|     Mango|       8|  120.0|
|       5|     Grape|      20|  150.0|
|       6|Watermelon|       5|   50.0|
|       7|Strawberry|      18|  108.0|
|       8| Pineapple|      14|  140.0|
|       9|    Cherry|       7|  105.0|
|      10|      Pear|       9|   81.0|
|      11| Blueberry|      11|   88.0|
|      12|      Kiwi|      16|  128.0|
|      13|     Peach|      13|   91.0|
|      14|      Plum|       6|   54.0|
|      15|     Lemon|      10|   70.0|
|      16| Raspberry|      17|  136.0|
|      17|   Coconut|       4|   80.0|
|      18|   Avocado|      11|   99.0|
|      19|Blackberry|       8|   64.0|
+--------+----------+--------+-------+



In [27]:
# Row count of sales_df1 (the dropmalformed read).
drop_count = sales_df1.count()

In [28]:
# Row count of sales_df0 (the read from the "permissive" step).
default_count = sales_df0.count()

In [29]:
# Difference between the two counts: rows present in sales_df0 that the
# dropmalformed read left out.
default_count - drop_count

1

In [30]:
# Display the active SparkSession object.
# NOTE(review): looks like a leftover check; no output is recorded for this cell.
spark

# 5. You have a hospital dataset with the following fields:
- `patient_id` (integer): Unique identifier for each patient.
- `admission_date` (date): The date the patient was admitted to the hospital (format: MM-dd-yyyy).
- `discharge_date` (date): The date the patient was discharged from the hospital (format: yyyy-MM-dd).
- `diagnosis` (string): The diagnosed medical condition of the patient.
- `doctor_id` (integer): The identifier of the doctor responsible for the patient's care.
- `total_cost` (float): The total cost of the hospital stay for the patient.

Using PySpark, load the data into a Dataframe and perform the following
operations on the hospital dataset
(`/public/trendytech/datasets/hospital.csv`):

In [31]:
# DDL schema for hospital.csv; both date columns are declared as `date`.
hospital_schema = (
    "patient_id int, "
    "admission_date date, "
    "discharge_date date, "
    "diagnosis string, "
    "doctor_id int,"
    "total_cost float"
)

In [32]:
# Raw look at the file with no reader options: in the output, the header line
# appears as a data row and the columns are auto-named _c0.._c5.
spark.read.csv("/public/trendytech/datasets/hospital.csv").show()

+----------+--------------+--------------+-------------+---------+----------+
|       _c0|           _c1|           _c2|          _c3|      _c4|       _c5|
+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
|         1|    01-01-2022|    2022-01-10|    Pneumonia|      101|   5000.00|
|         2|    02-05-2022|    2022-02-09| Appendicitis|      102|   7000.00|
|         3|    03-12-2022|    2022-03-18|Fractured Arm|      103|   3500.00|
|         4|    04-02-2022|    2022-04-08| Heart Attack|      104|  15000.00|
|         5|    05-05-2022|    2022-05-07|    Influenza|      105|   2500.00|
|         6|    06-10-2022|    2022-06-15| Appendicitis|      106|   8000.00|
|         7|    07-20-2022|    2022-07-25|    Pneumonia|      107|   5500.00|
|         8|    08-25-2022|    2022-09-01| Heart Attack|      108|  20000.00|
|         9|    09-15-2022|    2022-09-22|Fractured Leg|      10

In [34]:
# Typed CSV read using hospital_schema, with the first line as the header.
# NOTE(review): dateFormat is 'MM-dd-yyyy', which matches admission_date only;
# the raw file shows discharge_date as yyyy-MM-dd, yet the recorded output
# parses it as well. Confirm this relies on Spark's fallback date parsing.
hospital_df = (
    spark.read.format("csv")
    .schema(hospital_schema)
    .options(header="true", dateFormat="MM-dd-yyyy")
    .load("/public/trendytech/datasets/hospital.csv")
)

In [35]:
# Preview the typed read; in the output, admission_date is rendered as
# yyyy-MM-dd.
hospital_df.show()

+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|      109|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|      11

In [37]:
# New DataFrame without doctor_id (hospital_df is not reassigned), then preview.
df01 = hospital_df.drop("doctor_id")
df01.show()

+----------+--------------+--------------+-------------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|total_cost|
+----------+--------------+--------------+-------------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|    7500.0|
|        11|    2022-11-02|    2022-11-05|    Influenza|    2800.0|
|        12|    2022-12-10|    2022-12-18|    Pn

In [39]:
# Rename total_cost to hospital_bill, rebinding df01 to the renamed frame
# (later cells read hospital_bill from df01), then preview.
df01 = df01.withColumnRenamed(existing="total_cost", new="hospital_bill")
df01.show()

+----------+--------------+--------------+-------------+-------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|
+----------+--------------+--------------+-------------+-------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|       7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|       3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|       2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|       8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|       5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|       6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|       7500.0|
|        11|    2022-11-02|    2022-11-05|    Influenza|       2800.0|
|     

In [42]:
# Keep every existing column and add duration_of_stay, the day difference
# between discharge_date and admission_date.
df02 = df01.selectExpr(
    "*",
    "DATEDIFF(discharge_date,admission_date) as duration_of_stay",
)

In [43]:
# Preview df02, which adds the duration_of_stay column.
df02.show()

+----------+--------------+--------------+-------------+-------------+----------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|
+----------+--------------+--------------+-------------+-------------+----------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|               9|
|         2|    2022-02-05|    2022-02-09| Appendicitis|       7000.0|               4|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|       3500.0|               6|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      15000.0|               6|
|         5|    2022-05-05|    2022-05-07|    Influenza|       2500.0|               2|
|         6|    2022-06-10|    2022-06-15| Appendicitis|       8000.0|               5|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|       5500.0|               5|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      20000.0|               7|
|         9|    2022-09-15|    2

In [69]:
# Derive adjusted_total_cost from hospital_bill: x1.5 for 'Heart Attack',
# x1.2 for 'Appendicitis', unchanged otherwise.
# Exact-match '=' replaces the wildcard-free LIKE (the same rows match), and
# adjacent string literals replace the backslash continuation that sat inside
# the SQL string.
df03 = df02.selectExpr(
    "*",
    "CASE "
    "WHEN diagnosis = 'Heart Attack' THEN hospital_bill * 1.5 "
    "WHEN diagnosis = 'Appendicitis' THEN hospital_bill * 1.2 "
    "ELSE hospital_bill "
    "END AS adjusted_total_cost",
)

**Display the hospital dataset with the derived `adjusted_total_cost` column.**

In [70]:
# Preview df03, which adds the adjusted_total_cost column.
df03.show()

+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|adjusted_total_cost|
+----------+--------------+--------------+-------------+-------------+----------------+-------------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|               9|             5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|       7000.0|               4|             8400.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|       3500.0|               6|             3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      15000.0|               6|            22500.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|       2500.0|               2|             2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|       8000.0|               5|             9600.0|
|         7|    2022-07-20| 

In [77]:
# Cost-focused view of df03, highest adjusted_total_cost first; show up to
# 500 rows instead of the default preview size.
(
    df03
    .select(
        "patient_id",
        "diagnosis",
        "hospital_bill",
        "adjusted_total_cost",
        "duration_of_stay",
    )
    .orderBy("adjusted_total_cost", ascending=False)
    .show(500)
)

+----------+-------------+-------------+-------------------+----------------+
|patient_id|    diagnosis|hospital_bill|adjusted_total_cost|duration_of_stay|
+----------+-------------+-------------+-------------------+----------------+
|        22| Heart Attack|      21000.0|            31500.0|               7|
|         8| Heart Attack|      20000.0|            30000.0|               7|
|        13| Heart Attack|      18000.0|            27000.0|               7|
|        17| Heart Attack|      16000.0|            24000.0|               3|
|         4| Heart Attack|      15000.0|            22500.0|               6|
|         6| Appendicitis|       8000.0|             9600.0|               5|
|        20| Appendicitis|       7800.0|             9360.0|               6|
|        10| Appendicitis|       7500.0|             9000.0|               5|
|        14| Appendicitis|       7200.0|             8640.0|               4|
|         2| Appendicitis|       7000.0|             8400.0|    