### Creating Spark Session

In [None]:
from pyspark.sql import SparkSession

# Reuse an existing session if one is available; otherwise create one
# whose application name is 'read-csv'.
session_builder = SparkSession.builder.appName('read-csv')
spark = session_builder.getOrCreate()

In [None]:
# Display the session that is currently active on this thread.
SparkSession.getActiveSession()

### Create DF using the data from csv file

In [None]:
# Method 1: DataFrameReader.csv() with no options.
# Because no header option is set, the header line of the file shows up as an
# ordinary row and the columns get generated names (_c0, _c1, ...).

file_path = 'sample_data/customers-100.csv'
csv_reader = spark.read
df = csv_reader.csv(file_path)

df.show(n=2)
df.printSchema()

+-----+---------------+----------+---------+---------------+------------+-------+------------+----------------+--------------------+-----------------+--------------------+
|  _c0|            _c1|       _c2|      _c3|            _c4|         _c5|    _c6|         _c7|             _c8|                 _c9|             _c10|                _c11|
+-----+---------------+----------+---------+---------------+------------+-------+------------+----------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|        Company|        City|Country|     Phone 1|         Phone 2|               Email|Subscription Date|             Website|
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|Rasmussen Group|East Leonard|  Chile|229.077.5154|397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
+-----+---------------+----------+---------+---------------+------------+-------+------------+----------------+--------------------+--------

In [None]:
# Method 2: the generic reader API - choose the format first, then load the path.

csv_format_reader = spark.read.format('csv')
df = csv_format_reader.load(file_path)
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [None]:
# Treat the first line of the file as column names instead of data.
df = spark.read.csv(path=file_path, header=True)
df.show()

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|             Company|             City|             Country|             Phone 1|             Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

### List of params for 'Options' for csv

In [None]:
# Different ways to define the options

# Method - 1: chain one .option(key, value) call per setting before reading.
df = (
    spark.read
    .option('header', True)
    .option('inferschema', True)
    .csv(file_path)
)
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phone 1: string (nullable = true)
 |-- Phone 2: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Subscription Date: date (nullable = true)
 |-- Website: string (nullable = true)



In [None]:
# Method - 2: pass the settings as keyword arguments of csv().
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phone 1: string (nullable = true)
 |-- Phone 2: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Subscription Date: date (nullable = true)
 |-- Website: string (nullable = true)



In [None]:
# Method - 3: collect the settings in a dict and unpack it into .options(...).

options_dict = dict(
    header=True,
    inferSchema=True,
    delimiter=',',                 # 'sep' can be used instead
    encoding='UTF-8',
    # NOTE: 'YYYY-MM-DD' raises an error here; use lowercase year/day letters.
    timestampFormat='yyyy-MM-dd',
    quote='"',                     # default: "
    escape='\\',
    multiline=True,
    nullValue='NA',
)

configured_reader = spark.read.options(**options_dict)
df = configured_reader.csv(file_path)
df.show(n=2)
df.printSchema()

+-----+---------------+----------+---------+---------------+-----------------+--------+------------+----------------+--------------------+-----------------+--------------------+
|Index|    Customer Id|First Name|Last Name|        Company|             City| Country|     Phone 1|         Phone 2|               Email|Subscription Date|             Website|
+-----+---------------+----------+---------+---------------+-----------------+--------+------------+----------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|Rasmussen Group|     East Leonard|   Chile|229.077.5154|397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|    Vega-Gentry|East Jimmychester|Djibouti|  5153435776|686-620-1820x944|     vmata@colon.com|       2021-04-23|http://www.hobbs....|
+-----+---------------+----------+---------+---------------+-----------------+--------+------------+----------

**Error Details:**
Spark raises `SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.DATETIME_PATTERN_RECOGNITION] ...` when the `timestampFormat` option in `options_dict` is set to `YYYY-MM-DD`. Spark 3.0 and later use a different parser for datetime strings; this pattern is not a valid datetime pattern for the new parser, and the old behavior is no longer supported by default. Use `yyyy-MM-dd` (lowercase year and day letters) instead, as in the cell above.

### Multiple csv files

In [None]:
# Read several CSV files into one DataFrame by passing their paths as a list.
# csv() takes the path(s) as its first argument only; a second positional
# argument is interpreted as the schema, not as another file path.
df = spark.read.csv(['file_path_1', 'file_path_2'])

In [None]:
# Point the reader at a directory to load every file inside it.

df = spark.read.csv(path="Folder path")

### Custom user defined schema using StructField and StructType

In [None]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,DateType


# User-defined schema: one StructField(name, type, nullable) per CSV column, in file order.
# The names here replace the ones in the file header (e.g. 'Sub Date').
# NOTE: nullable=False is not enforced by the CSV reader - printSchema() below
# still reports every column as nullable = true.
custom_schema = StructType([
    StructField('Index',IntegerType(),False),
    StructField('Customer Id',StringType(),False),
    StructField('First Name',StringType(),False),
    StructField('Last Name',StringType(),False),
    StructField('Company',StringType(),False),
    StructField('City',StringType(),False),
    StructField('Country',StringType(),False),
    StructField('Phone 1',StringType(),False),
    StructField('Phone 2',StringType(),False),
    StructField('Email',StringType(),False),
    StructField('Sub Date',DateType(),False),
    StructField('Website',StringType(),False),
])

# Equivalent one-liner:
# df = spark.read.csv(file_path, schema=custom_schema, header=True)

# The schema must be set before load(). header=True is still required: without it
# the header line of the file is parsed as a data row under the custom schema.
df = (
    spark.read.format('csv')
    .option('header', True)
    .schema(custom_schema)
    .load(file_path)
)
df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- Customer Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Phone 1: string (nullable = true)
 |-- Phone 2: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Sub Date: date (nullable = true)
 |-- Website: string (nullable = true)



### Writing Data to CSV file

In [None]:
# Write the DataFrame as CSV with a header line.
# Spark creates a directory at this path containing part files, not a single file.
save_csv_path = 'sample_data/sample_output.csv'

# The default save mode raises an error when the path already exists, which makes
# this cell fail on every re-run; 'overwrite' keeps the notebook re-runnable.
df.write.csv(save_csv_path, mode='overwrite', header=True)

### List of Params for 'option' - write to csv

#### list of modes

* overwrite – Overwrites the existing output if it already exists.

* append – New rows are appended to the existing rows.

* ignore – Skips the write operation without raising an error when the output already exists.

* error – Raises an error when the output already exists. This is the default mode.

In [None]:
save_csv_path = 'sample_output_data'

# CSV options that apply when writing.
# The save mode is not a data source option: it is chosen with
# DataFrameWriter.mode(...) below, so it does not belong in this dict.
# 'multiline' is likewise left out because it only affects reading.
options_dict = {
    'header': True,
    'delimiter': ',',           # we can also use 'sep'
    'encoding': 'UTF-8',
    # 'YYYY-MM-DD' raises an error on Spark 3.x; use lowercase year/day letters.
    'timestampFormat': 'yyyy-MM-dd',
    'quote': '"',               # default: "
    'escape': '\\',
    'nullValue': 'NA',
    # 'compression': 'gzip'
}

# Save modes: "overwrite", "append", "ignore", "error" (default).
df.write.mode('overwrite').format('csv').options(**options_dict).save(save_csv_path)