In [None]:
%pip install pyspark

In [10]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session: local master with a single worker
# thread, registered under the application name "Customers".
# getOrCreate() hands back an already-active session when one exists.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Customers")
    .getOrCreate()
)

## 1. Read & Write CSV File

In [None]:
# 1) Shortcut reader: csv() with no options set.
df = spark.read.csv("../data/Customers.csv")
df.printSchema()

# 2) The same read expressed through the generic load() entry point,
#    naming the source format explicitly.
df = spark.read.load("../data/Customers.csv", format="csv")

# 3) Take column names from the header record and let Spark infer the
#    column types from the data; the options are passed as keyword
#    arguments of csv() rather than chained option() calls.
df = spark.read.csv(
    "../data/Customers.csv",
    header=True,
    inferSchema=True,
)

df.printSchema()

In [None]:
# Reading CSV File Options
#
# DataFrameReader.option() takes a positional (key, value) pair and rejects
# keyword arguments; the keyword form belongs to DataFrameReader.options().
# Each example below therefore uses options(...), with the option names
# spelled the way the CSV data source expects them.

# delimiter - specify the column delimiter of the CSV file (alias of "sep")
df_1 = spark.read.options(delimiter=',').csv("../data/Customers.csv")

# header - use the first line as the header
df_2 = spark.read.options(header=True).csv("../data/Customers.csv")

# inferSchema - infers the input schema automatically from data
df_3 = spark.read.options(inferSchema=True).csv("../data/Customers.csv")

# quote - the character used to quote fields
df_4 = spark.read.options(quote='"').csv("../data/Customers.csv")

# nullValue - specifies a string that indicates a null value
df_5 = spark.read.options(nullValue='NA').csv("../data/Customers.csv")

In [None]:
# Specify Custom Schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit column layout for the customers file: each entry is
# (column name, Spark type, nullable). Every column is declared nullable.
schema = StructType([
    StructField("CustomerID", IntegerType(), True),
    StructField("Gender", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Annual Income", IntegerType(), True),
    StructField("Spending Score", IntegerType(), True),
    StructField("Profession", StringType(), True),
    StructField("Work Experience", IntegerType(), True),
    StructField("Family Size", IntegerType(), True),
])

# Read the CSV with the schema above instead of inferring types; the
# header option marks the first line as a header record.
df_with_schema = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("../data/Customers.csv")
)

df_with_schema.printSchema()

In [None]:
# Writing PySpark DataFrame to CSV File
# Overwrite any previous output, emit a header row, and separate fields
# with "|" instead of the default comma.
# NOTE: Spark's CSV writer produces a directory at this path (holding the
# part files), not a single CSV file.
(
    df_with_schema.write
    .mode("overwrite")
    .options(header=True, delimiter="|")
    .csv("output/Customers.csv")
)