In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the active Spark session, or start a new one if none exists.
spark = SparkSession.builder.getOrCreate()

# Explicit schema for the student CSV: five nullable columns whose names
# and types are fixed here instead of being inferred from the file.
schema = (
    StructType()
    .add('Student_Name', StringType(), True)
    .add('Student_Age', IntegerType(), True)
    .add('Student_Subject', StringType(), True)
    .add('Student_Class', IntegerType(), True)
    .add('Student_Fees', IntegerType(), True)
)

# Load the CSV with the schema above; the 'header' option marks the first
# line of the file as a header row.
df = (
    spark.read
    .format('csv')
    .schema(schema)
    .option('header', True)
    .load('content/student_data.csv')
)

df.printSchema()

root
 |-- Student_Name: string (nullable = true)
 |-- Student_Age: integer (nullable = true)
 |-- Student_Subject: string (nullable = true)
 |-- Student_Class: integer (nullable = true)
 |-- Student_Fees: integer (nullable = true)



In [9]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session; the application name is left empty.
spark = SparkSession.builder.appName('').getOrCreate()

# Read the CSV with a header row and let Spark infer each column's type.
df1 = (
    spark.read
    .option('inferSchema', True)
    .option('header', True)
    .csv('content/student_data.csv')
)

# Derive a second frame whose 'fees' column is cast to float;
# withColumn returns a new DataFrame, so df1 keeps its inferred types.
df2 = df1.withColumn('fees', df1['fees'].cast('float'))

# Print the schema before and after the cast, in that order.
for frame in (df1, df2):
    frame.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- subject: string (nullable = true)
 |-- class: integer (nullable = true)
 |-- fees: integer (nullable = true)

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- subject: string (nullable = true)
 |-- class: integer (nullable = true)
 |-- fees: float (nullable = true)



In [11]:
#PySpark - Apply custom schema to a DataFrame by changing metadata

# Import the libraries SparkSession, StructType,
# StructField, StringType, IntegerType
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the active Spark session, creating one if none is running
spark_session = SparkSession.builder.getOrCreate()

# One entry per column: (column name, Spark data type, description text)
column_specs = [
    ('Student_Name', StringType(), "Name of the student"),
    ('Student_Age', IntegerType(), "Age of the student"),
    ('Student_Subject', StringType(), "Subject of the student"),
    ('Student_Class', IntegerType(), "Class of the student"),
    ('Student_Fees', IntegerType(), "Fees of the student"),
]

# Build the schema: every field is nullable and stores its description
# under the "desc" metadata key
schema = StructType([
    StructField(column_name, column_type, True, metadata={"desc": description})
    for column_name, column_type, description in column_specs
])

# Read the CSV using the custom schema, with the header option enabled
df = (
    spark_session.read
    .format("csv")
    .schema(schema)
    .option("header", True)
    .load("content/student_data.csv")
)

# Show the schema now attached to the data frame
df.printSchema()

# Print each column's 1-based position alongside its "desc" metadata
for position, field in enumerate(df.schema.fields, start=1):
    print('Column ', position, ': ', field.metadata["desc"])

root
 |-- Student_Name: string (nullable = true)
 |-- Student_Age: integer (nullable = true)
 |-- Student_Subject: string (nullable = true)
 |-- Student_Class: integer (nullable = true)
 |-- Student_Fees: integer (nullable = true)

Column  1 :  Name of the student
Column  2 :  Age of the student
Column  3 :  Subject of the student
Column  4 :  Class of the student
Column  5 :  Fees of the student
