## Module 2: Perform Data Cleansing and preparation using Apache Spark

#### Reading data from delta table

In [1]:
data_df = spark.read.format("delta").load("Tables/heartFailure")
display(data_df)

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 3, Finished, Available)

SynapseWidget(Synapse.DataFrame, 80d95b89-ae1a-4333-90ef-2f38a92e211c)

#### Checking if datatypes are numerical

In [2]:
data_df.dtypes

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 4, Finished, Available)

[('Age', 'int'),
 ('Sex', 'string'),
 ('ChestPainType', 'string'),
 ('RestingBP', 'int'),
 ('Cholesterol', 'int'),
 ('FastingBS', 'int'),
 ('RestingECG', 'string'),
 ('MaxHR', 'int'),
 ('ExerciseAngina', 'string'),
 ('Oldpeak', 'double'),
 ('ST_Slope', 'string'),
 ('HeartDisease', 'int')]

#### Summarize dataframe

In [3]:
display(data_df.summary())

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 5, Finished, Available)

SynapseWidget(Synapse.DataFrame, 9cf0488a-c965-4e26-b2ec-61e4fba4ec7f)

In [4]:
display(data_df.select("age").summary())

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, c639bfbd-91f6-41eb-9689-a2488c66fe97)

In [5]:
display(data_df.groupBy("age").count())

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, c72d8d49-cff6-49a6-b171-958322eba5d9)

####  Missing Observation Analysis

Checking if any column has missing value

In [6]:
data_is_null = {col:data_df.filter(data_df[col].isNull()).count() for col in data_df.columns}
data_is_null

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 8, Finished, Available)

{'Age': 0,
 'Sex': 0,
 'ChestPainType': 0,
 'RestingBP': 0,
 'Cholesterol': 0,
 'FastingBS': 0,
 'RestingECG': 0,
 'MaxHR': 0,
 'ExerciseAngina': 0,
 'Oldpeak': 0,
 'ST_Slope': 0,
 'HeartDisease': 0}

In [7]:
display(data_df.summary())

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 9, Finished, Available)

SynapseWidget(Synapse.DataFrame, 06e70c9a-71ce-4ced-8fa6-c5e60f38a949)

## feature engineering
from sklearn.preprocessing import LabelEncoder- changing the datatype

In [8]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
lab = LabelEncoder()

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 10, Finished, Available)

In [9]:
data_df1 = data_df.toPandas()
obj = data_df1.select_dtypes(include='object')
not_obj = data_df1.select_dtypes(exclude='object')
for i in range(0, obj.shape[1]):
  obj.iloc[:,i] = lab.fit_transform(obj.iloc[:,i])
df_new = pd.concat([obj, not_obj], axis=1)
df_new.head(10)

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 11, Finished, Available)

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
0,1,0,2,0,2,54,150,365,0,134,1.0,0
1,1,1,2,0,2,54,160,195,0,130,1.0,0
2,1,3,1,0,2,54,120,171,0,137,2.0,0
3,0,1,1,0,2,54,120,221,0,138,1.0,0
4,1,2,2,0,2,54,133,203,0,137,0.2,0
5,1,1,2,0,2,54,132,182,0,141,0.1,0
6,1,2,0,0,0,54,125,273,0,152,0.5,0
7,1,2,0,0,2,54,150,232,0,165,1.6,0
8,1,0,1,0,2,54,140,239,0,160,1.2,0
9,0,0,1,0,2,48,108,163,0,175,2.0,0


In [10]:
display(df_new)

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 12, Finished, Available)

  Expected bytes, got a 'int' object
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


SynapseWidget(Synapse.DataFrame, 53e85ee6-7e36-44e4-8f33-959ca7dc4f42)

#### Save processed data to a Delta Table

In [11]:
spark.conf.set("sprk.sql.parquet.vorder.enabled", "true") # Enable Verti-Parquet write
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true") # Enable automatic delta optimized write

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 13, Finished, Available)

In [12]:
table_name = "heartfailure_processed"
data_df_processed = spark.createDataFrame(df_new)
data_df_processed.write.mode("overwrite").format("delta").save(f"Tables/{table_name}")
print(f"Spark dataframe saved to delta table: {table_name}")

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 14, Finished, Available)

Spark dataframe saved to delta table: heartfailure_processed


In [13]:
%%sql

select * from heartfailure_processed limit 100;

StatementMeta(, ce688a6b-0bdc-454e-9d81-6ea593c81805, 15, Finished, Available)

<Spark SQL result set with 100 rows and 12 fields>