In [35]:
from pyspark.sql import SparkSession

In [36]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [37]:
# Echo the session object so the notebook renders its summary as the cell output
spark

In [38]:
# Read the dataset: the first line of the CSV holds the column names and the
# column types are inferred from the data.
# `header=True` on csv() is sufficient on its own; chaining
# `.option('header', 'true')` in front of it just set the same option twice.
df = spark.read.csv(
    'data/health_insurance_coverage_detroit.csv',
    header=True,
    inferSchema=True,
)

In [39]:
# Preview the first five rows as a text table
df.show(n=5)

+---------+-------------------------+--------+
|Geography|Estimated_Percent_Insured|ObjectId|
+---------+-------------------------+--------+
|    48201|                       89|       1|
|    48202|                       90|       2|
|    48203|                       89|       3|
|    48204|                       87|       4|
|    48205|                       86|       5|
+---------+-------------------------+--------+
only showing top 5 rows



In [40]:
# Check the schema: prints each column's name, inferred type and nullability
df.printSchema()

root
 |-- Geography: string (nullable = true)
 |-- Estimated_Percent_Insured: integer (nullable = true)
 |-- ObjectId: integer (nullable = true)



In [41]:
# Column names of the dataframe (returned as a plain Python list of strings)

df.columns

['Geography', 'Estimated_Percent_Insured', 'ObjectId']

In [42]:
# View a few rows: head(n) brings the first n rows back to the driver as a
# Python list of Row objects.
first_rows = df.head(3)  # fetch once and reuse instead of running the same Spark job twice
print(first_rows)
print(type(first_rows))

[Row(Geography='48201', Estimated_Percent_Insured=89, ObjectId=1), Row(Geography='48202', Estimated_Percent_Insured=90, ObjectId=2), Row(Geography='48203', Estimated_Percent_Insured=89, ObjectId=3)]
<class 'list'>


In [43]:
# Project a single column and preview its first five rows
(
    df
    .select('Geography')
    .show(n=5)
)

+---------+
|Geography|
+---------+
|    48201|
|    48202|
|    48203|
|    48204|
|    48205|
+---------+
only showing top 5 rows



In [44]:
# show() only prints the table and returns None, whereas head() hands back
# a list of Row objects that can be used in further Python code.
geography = df.select('Geography')
print(type(geography.show(5)))
print(type(geography.head(5)))

+---------+
|Geography|
+---------+
|    48201|
|    48202|
|    48203|
|    48204|
|    48205|
+---------+
only showing top 5 rows

<class 'NoneType'>
<class 'list'>


In [45]:
# Select multiple columns (select() accepts the names either as a list or as separate arguments)
df.select('Geography', 'Estimated_Percent_Insured').show(n=3)

+---------+-------------------------+
|Geography|Estimated_Percent_Insured|
+---------+-------------------------+
|    48201|                       89|
|    48202|                       90|
|    48203|                       89|
+---------+-------------------------+
only showing top 3 rows



In [46]:
# Bracket indexing returns a Column expression (a reference to the column), not the data itself
df['Estimated_Percent_Insured']

Column<'Estimated_Percent_Insured'>

In [47]:
# select() returns a new DataFrame containing only the requested column; call .show() to see rows
df.select('Estimated_Percent_Insured')

DataFrame[Estimated_Percent_Insured: int]

In [48]:
# Data types: a list of (column name, type name) pairs
df.dtypes

[('Geography', 'string'),
 ('Estimated_Percent_Insured', 'int'),
 ('ObjectId', 'int')]

In [49]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE: Geography was inferred as a string column, so its min/max are compared
# as text rather than as numbers.
df.describe().show()

+-------+------------------+-------------------------+-----------------+
|summary|         Geography|Estimated_Percent_Insured|         ObjectId|
+-------+------------------+-------------------------+-----------------+
|  count|                31|                       31|               31|
|   mean|           48218.1|        88.41935483870968|             16.0|
| stddev|12.169350987145803|       3.3938590969414815|9.092121131323903|
|    min|             48201|                       80|                1|
|    max|   City of Detroit|                       96|               31|
+-------+------------------+-------------------------+-----------------+



In [59]:
# Add a derived column: the uninsured share is the complement of the insured percentage.
# withColumn() returns a new DataFrame; `df` itself is left untouched.
insured_pct = df['Estimated_Percent_Insured']
df_mod = df.withColumn('Percent_Uninsured', 100 - insured_pct)

In [62]:
# Drop a column. The result is only displayed, not assigned, so `df_mod`
# still carries 'Percent_Uninsured' after this cell.
(
    df_mod
    .drop('Percent_Uninsured')
    .show()
)

+---------+-------------------------+--------+
|Geography|Estimated_Percent_Insured|ObjectId|
+---------+-------------------------+--------+
|    48201|                       89|       1|
|    48202|                       90|       2|
|    48203|                       89|       3|
|    48204|                       87|       4|
|    48205|                       86|       5|
|    48206|                       87|       6|
|    48207|                       92|       7|
|    48208|                       92|       8|
|    48209|                       80|       9|
|    48210|                       80|      10|
|    48211|                       86|      11|
|    48212|                       84|      12|
|    48213|                       88|      13|
|    48214|                       89|      14|
|    48215|                       90|      15|
|    48216|                       84|      16|
|    48217|                       90|      17|
|    48219|                       88|      18|
|    48221|                       90|      19|
|    48223|                       88|      20|
+---------+-------------------------+--------+
only showing top 20 rows

In [67]:
# Rename a column. The result is only displayed, not assigned, so `df` keeps its original name.
# NOTE(review): the new name contains a space; such names must be quoted with backticks in
# SQL expressions — consider 'Percent_Insured' if this is ever persisted or queried.
df.withColumnRenamed('Estimated_Percent_Insured', 'Percent Insured').show()

+---------+---------------+--------+
|Geography|Percent Insured|ObjectId|
+---------+---------------+--------+
|    48201|             89|       1|
|    48202|             90|       2|
|    48203|             89|       3|
|    48204|             87|       4|
|    48205|             86|       5|
|    48206|             87|       6|
|    48207|             92|       7|
|    48208|             92|       8|
|    48209|             80|       9|
|    48210|             80|      10|
|    48211|             86|      11|
|    48212|             84|      12|
|    48213|             88|      13|
|    48214|             89|      14|
|    48215|             90|      15|
|    48216|             84|      16|
|    48217|             90|      17|
|    48219|             88|      18|
|    48221|             90|      19|
|    48223|             88|      20|
+---------+---------------+--------+
only showing top 20 rows



23/11/24 14:07:17 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:295)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)