In [18]:
from pyspark.sql import SparkSession
import sys
import logging

# Configure logging: emit records at INFO level and above so the status
# messages logged by this script are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

def process_survey_data(spark: "SparkSession", input_path: str):
    """Count survey respondents younger than 40, grouped by country.

    Args:
        spark: Active SparkSession used to read the file and run the query.
        input_path: Path to a CSV file with a header row. The query reads
            its ``Country`` and ``Age`` columns.

    Returns:
        The DataFrame produced by the aggregation query, with the columns
        ``Country`` and ``count``.

    Raises:
        Exception: Any error raised while reading the file or running the
            query is propagated unchanged. The caller decides how to report
            it; this helper never terminates the interpreter itself.

    Side effects:
        Registers (or replaces) the temporary view ``survey_tbl`` on the
        given session.
    """
    survey_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(input_path)
    )

    # The SQL below refers to the data through this view name.
    survey_df.createOrReplaceTempView("survey_tbl")

    return spark.sql("SELECT Country, COUNT(1) AS count FROM survey_tbl WHERE Age < 40 GROUP BY Country")

def main():
    """Run the survey aggregation end to end and print the result.

    Builds a SparkSession with master ``local[3]`` and app name
    ``HelloSparkSQL``, passes ``sample.csv`` to ``process_survey_data``,
    and prints the returned DataFrame with ``show()``.

    Any exception reaching this function is logged with its traceback and
    the process exits with status 1. The session is stopped on the way out,
    but only if it was actually created.
    """
    # Bound before the ``try`` so the ``finally`` clause can tell whether
    # session creation succeeded. Without this, a failure in the builder
    # would surface as an UnboundLocalError from ``spark.stop()`` and hide
    # the real error.
    spark = None
    try:
        spark = (
            SparkSession.builder
            .master("local[3]")
            .appName("HelloSparkSQL")
            .getOrCreate()
        )

        # Input CSV file path
        input_path = "sample.csv"

        count_df = process_survey_data(spark, input_path)
        count_df.show()

    except Exception as e:
        # Top-level boundary: keep the original message, add the traceback.
        logger.exception("An error occurred: %s", e)
        sys.exit(1)

    finally:
        if spark is not None:
            spark.stop()
            logger.info("SparkSession stopped successfully.")

# Run the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


INFO:__main__:SparkSession stopped successfully.


+--------------+-----+
|       Country|count|
+--------------+-----+
| United States|    4|
|        Canada|    2|
|United Kingdom|    1|
+--------------+-----+

