In [1]:
# Install PySpark if it's not already installed
#!pip install pyspark

from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session that uses every available core
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("DataCleaning")
    .getOrCreate()
)

# Load the dataset
file_path = 'csv uncleaned (1).csv'

try:
    # Read the CSV file into a DataFrame: the first row supplies the column
    # names and inferSchema asks Spark to derive column types from the data
    data = spark.read.csv(file_path, header=True, inferSchema=True)
    # Show the first few rows to verify the load
    data.show()
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so execution stops here. Every later cell needs `data`; if the
    # error were swallowed they would either fail with an unrelated NameError
    # or silently operate on a stale `data` left over from a previous run.
    raise


+-------+------+-------------+------------------+----------------+--------------------+------------------+-----------+--------------------+--------------+---------+---------+---------+-------------+-------+---------------------+----------------+------------+-----------+-------------------+-----------------------+-----------------------+-----------------+-------------------------+-----------------+--------------------+--------------------+---------+---------------------+---------------+--------------+-----------------+-----+---------------+----------+------------+-------------+--------------------+----------------+--------+
|  State|   Sex|GeneralHealth|PhysicalHealthDays|MentalHealthDays|     LastCheckupTime|PhysicalActivities| SleepHours|        RemovedTeeth|HadHeartAttack|HadAngina|HadStroke|HadAsthma|HadSkinCancer|HadCOPD|HadDepressiveDisorder|HadKidneyDisease|HadArthritis|HadDiabetes|DeafOrHardOfHearing|BlindOrVisionDifficulty|DifficultyConcentrating|DifficultyWalking|DifficultyDre

In [2]:
from pyspark.sql.functions import col, isnan, when, count

# Count missing values in each column.
# A value counts as missing when it is null or, for floating-point columns,
# NaN. isnan() is only defined for float/double input; on any other column
# type Spark has to cast implicitly, which is rejected for some types and can
# raise under ANSI SQL mode, so the NaN test is limited to float/double columns.
nan_capable_cols = {c for c, t in data.dtypes if t in ('float', 'double')}

missing_values = data.select(
    [
        count(
            when(
                (col(c).isNull() | isnan(col(c)))
                if c in nan_capable_cols
                else col(c).isNull(),
                c,  # any non-null marker works: count() tallies rows where the condition held
            )
        ).alias(c)
        for c in data.columns
    ]
)
missing_values.show()


+-----+-----+-------------+------------------+----------------+---------------+------------------+----------+------------+--------------+---------+---------+---------+-------------+-------+---------------------+----------------+------------+-----------+-------------------+-----------------------+-----------------------+-----------------+-------------------------+-----------------+------------+---------------+---------+---------------------+-----------+--------------+-----------------+-----+---------------+----------+------------+-------------+-----------------+----------------+--------+
|State|  Sex|GeneralHealth|PhysicalHealthDays|MentalHealthDays|LastCheckupTime|PhysicalActivities|SleepHours|RemovedTeeth|HadHeartAttack|HadAngina|HadStroke|HadAsthma|HadSkinCancer|HadCOPD|HadDepressiveDisorder|HadKidneyDisease|HadArthritis|HadDiabetes|DeafOrHardOfHearing|BlindOrVisionDifficulty|DifficultyConcentrating|DifficultyWalking|DifficultyDressingBathing|DifficultyErrands|SmokerStatus|ECigarette

In [3]:
from pyspark.sql.functions import col, isnan, mean, when

# Identify numerical columns. Integer and floating-point widths are all
# included: limiting the list to 'int' and 'double' would leave e.g. 'bigint'
# or 'float' columns un-imputed.
NUMERIC_TYPES = {'tinyint', 'smallint', 'int', 'bigint', 'float', 'double'}
FLOATING_TYPES = {'float', 'double'}

numeric_cols = [c for c, t in data.dtypes if t in NUMERIC_TYPES]
floating_cols = {c for c, t in data.dtypes if t in FLOATING_TYPES}

if numeric_cols:
    # Compute every column mean in a single aggregation (one Spark job)
    # instead of launching a separate job per column.
    mean_exprs = [
        # NaN values would propagate into the average, so they are turned into
        # nulls (which mean() ignores) for floating-point columns.
        mean(
            when(~isnan(col(c)), col(c)) if c in floating_cols else col(c)
        ).alias(c)
        for c in numeric_cols
    ]
    mean_row = data.agg(*mean_exprs).first()

    # A column with no usable values has a null mean; there is nothing sensible
    # to fill it with, and na.fill does not accept None as a replacement.
    fill_values = {
        c: mean_row[c] for c in numeric_cols if mean_row[c] is not None
    }

    # Fill missing values for all numeric columns in one call
    if fill_values:
        data = data.na.fill(fill_values)


In [4]:
from pyspark.sql.functions import col, count

# Get categorical columns
categorical_cols = [c for c, t in data.dtypes if t == 'string']

# Alias for the per-value frequency; chosen so it is unlikely to collide with
# a real column name in the dataset.
MODE_COUNT_ALIAS = '__mode_count'

# Collect the most frequent non-null value (mode) of each categorical column.
mode_fill = {}
for col_name in categorical_cols:
    top_row = (
        data.where(col(col_name).isNotNull())   # nulls must never be chosen as the mode
        .groupBy(col_name)
        .agg(count(col_name).alias(MODE_COUNT_ALIAS))
        # Highest frequency first; ties are broken by value so that reruns
        # pick the same mode.
        .orderBy(col(MODE_COUNT_ALIAS).desc(), col(col_name).asc())
        .first()                                # None when the column has no non-null values
    )

    if top_row is not None:
        mode_fill[col_name] = top_row[0]
    else:
        print(f"No mode found for column {col_name}.")

# Fill missing values with the modes in a single call. Filling one column
# does not change the mode of another, so batching gives the same result as
# filling column by column while keeping the query plan small.
if mode_fill:
    data = data.na.fill(mode_fill)


In [5]:
# Keep a single copy of every fully identical row
data = data.distinct()
