In [None]:
# Install PySpark into the running kernel's environment via the %pip magic.
# NOTE(review): no version is pinned here, so the installed PySpark release
# depends on when this cell is run — consider pinning for reproducibility.
%pip install pyspark

In [10]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session that runs locally on a single thread
# under the application name "Customers".
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Customers")
    .getOrCreate()
)

In [None]:
# Load the customers CSV with the reader's default settings and show the
# schema that results when no options are supplied.
df = spark.read.csv("../data/Customers.csv")
df.printSchema()

# Same read, spelled out through the generic format()/load() API.
df = (
    spark.read
    .format("csv")
    .load("../data/Customers.csv")
)

# Treat the first record as column names and let Spark infer column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/Customers.csv")
)

# Show the schema again, now with the header and inferSchema options applied.
df.printSchema()

In [None]:
# Reading CSV File Options
#
# DataFrameReader.option() takes the option name and its value as two
# positional arguments: option("key", value). Passing them as a keyword
# (option(key=value)) raises a TypeError; the keyword form belongs to
# options(**kwargs) instead.

# delimiter - specify the column delimiter of the CSV file
df_1 = spark.read.option("delimiter", ",").csv("../data/Customers.csv")

# header - use the first line as the header
df_2 = spark.read.option("header", True).csv("../data/Customers.csv")

# inferSchema - infers the input schema automatically from data
df_3 = spark.read.option("inferSchema", True).csv("../data/Customers.csv")

# quote - the character used to quote fields
df_4 = spark.read.option("quote", '"').csv("../data/Customers.csv")

# nullValue - specifies a string that indicates a null value
df_5 = spark.read.option("nullValue", "NA").csv("../data/Customers.csv")

In [None]:
# Specify Custom Schema
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the customers file: eight nullable columns, declared
# in the order they are applied to the CSV.
schema = StructType([
    StructField("CustomerID", IntegerType(), True),
    StructField("Gender", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Annual Income", IntegerType(), True),
    StructField("Spending Score", IntegerType(), True),
    StructField("Profession", StringType(), True),
    StructField("Work Experience", IntegerType(), True),
    StructField("Family Size", IntegerType(), True),
])

# Read the CSV with the header option enabled and the schema above supplied
# explicitly instead of inferred.
df_with_schema = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(schema)
    .load("../data/Customers.csv")
)

df_with_schema.printSchema()

In [None]:
# Save the schema-typed DataFrame as pipe-delimited CSV with a header row,
# replacing any output already present at the target path.
# NOTE(review): the output path is "data/..." while the input is read from
# "../data/..." — confirm the differing base directory is intended.
(
    df_with_schema.write
    .option("header", True)
    .option("delimiter", "|")
    .mode("overwrite")
    .csv("data/Customers_output.csv")
)

In [None]:
# Writing CSV File Options
#
# The writer's default save mode fails when the target path already exists.
# Every example here writes to the same path, so the option demos set
# mode("overwrite") explicitly; without it the first write would raise and
# the rest of the cell would never run.
from pyspark.sql.utils import AnalysisException

# header - specify whether to include a header row with column names in the CSV file
df.write.mode("overwrite").option("header", True).csv("data/Customers_output.csv")

# delimiter - specify the delimiter to use between fields in the CSV file
df.write.mode("overwrite").option("delimiter", "|").csv("data/Customers_output.csv")

# quote - specify the character used for quoting fields in the CSV file
df.write.mode("overwrite").option("quote", "*").csv("data/Customers_output.csv")

# escape - specify the character used for escaping the quote character in the CSV file
df.write.mode("overwrite").option("escape", "\\").csv("data/Customers_output.csv")

# nullValue - specify the string to write for null values in the CSV file
df.write.mode("overwrite").option("nullValue", "NA").csv("data/Customers_output.csv")

# mode
# overwrite - overwrite the existing data
df.write.mode("overwrite").csv("data/Customers_output.csv")

# append - append the data to the existing data
df.write.mode("append").csv("data/Customers_output.csv")

# ignore - ignore the operation if the data already exists
df.write.mode("ignore").csv("data/Customers_output.csv")

# error - throw an error if the data already exists
# The path exists at this point, so the write is expected to be rejected;
# catch the exception and report it so the notebook still runs end to end.
try:
    df.write.mode("error").csv("data/Customers_output.csv")
except AnalysisException as exc:
    print(f"mode('error') refused to write: {type(exc).__name__}")