### **Inferring schema from files and handling schema changes**

In [0]:
# Load the original sales CSV, treating the first row as the header and asking
# Spark to infer column types from the data.
# NOTE(review): the path embeds a user's e-mail address; consider moving it to a
# configurable location before sharing this notebook.
sales_csv_path = "dbfs:/FileStore/shared_uploads/21embit039@mlvti.ac.in/sales_data-3.csv"
raw_sales = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(sales_csv_path)
)

# Normalise the header names: trim whitespace, lowercase, spaces -> underscores.
normalised_names = [name.strip().lower().replace(" ", "_") for name in raw_sales.columns]
df_v1 = raw_sales.toDF(*normalised_names)

# NOTE(review): the printed schema shows every column as string even with
# inferSchema enabled; presumably some rows hold non-numeric values - confirm.
df_v1.printSchema()
df_v1.show(5)


root
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity_ordered: string (nullable = true)
 |-- price_each: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- purchase_address: string (nullable = true)

+--------+--------------------+----------------+----------+--------------+--------------------+
|order_id|             product|quantity_ordered|price_each|    order_date|    purchase_address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/1

In [0]:
from pyspark.sql import Row

# Second batch of orders: the same fields as the CSV plus an extra 'discount' field.
new_order_records = [
    dict(order_id=101, product="Laptop", quantity_ordered=2, price_each=50000.0,
         order_date="2024-06-01", purchase_address="Delhi", discount=5000.0),
    dict(order_id=102, product="Phone", quantity_ordered=3, price_each=20000.0,
         order_date="2024-06-02", purchase_address="Mumbai", discount=2000.0),
]
# Expand each record into a Row using the same keyword arguments, in the same order.
data_new = [Row(**record) for record in new_order_records]

# No explicit schema is passed, so Spark infers the column types from the Row values.
df_v2 = spark.createDataFrame(data_new)
df_v2.printSchema()
df_v2.show()


root
 |-- order_id: long (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity_ordered: long (nullable = true)
 |-- price_each: double (nullable = true)
 |-- order_date: string (nullable = true)
 |-- purchase_address: string (nullable = true)
 |-- discount: double (nullable = true)

+--------+-------+----------------+----------+----------+----------------+--------+
|order_id|product|quantity_ordered|price_each|order_date|purchase_address|discount|
+--------+-------+----------------+----------+----------+----------------+--------+
|     101| Laptop|               2|   50000.0|2024-06-01|           Delhi|  5000.0|
|     102|  Phone|               3|   20000.0|2024-06-02|          Mumbai|  2000.0|
+--------+-------+----------------+----------+----------+----------------+--------+



In [0]:
from pyspark.sql.functions import lit

# Add every column that df_v2 has but df_v1 lacks (currently only 'discount'),
# filled with nulls and cast to df_v2's data type for that column, so that
# unionByName() finds the same set of column names on both sides.
# Names are compared case-insensitively so that withColumn() never replaces an
# existing df_v1 column that differs only by letter case.
existing_columns = {name.lower() for name in df_v1.columns}
df_v1_aligned = df_v1
for field in df_v2.schema.fields:
    if field.name.lower() not in existing_columns:
        df_v1_aligned = df_v1_aligned.withColumn(field.name, lit(None).cast(field.dataType))


In [0]:
# Stack the aligned historical rows on top of the new batch, matching columns
# by name rather than by position.
# NOTE(review): the printed schema shows order_id, quantity_ordered and
# price_each staying string in the merged frame although df_v2 holds them as
# long/double - cast explicitly if numeric types are needed downstream.
df_merged = df_v1_aligned.unionByName(other=df_v2)
df_merged.printSchema()
df_merged.show(n=5)


root
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity_ordered: string (nullable = true)
 |-- price_each: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- purchase_address: string (nullable = true)
 |-- discount: double (nullable = true)

+--------+--------------------+----------------+----------+--------------+--------------------+--------+
|order_id|             product|quantity_ordered|price_each|    order_date|    purchase_address|discount|
+--------+--------------------+----------------+----------+--------------+--------------------+--------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...|    null|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|    null|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|    null|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/

### **Use SQL to query a DataFrame directly (e.g., SELECT * FROM ...)**

In [0]:
# Build a small three-row DataFrame of (id, name, age) tuples to query with SQL.
data = [
    (1, "Alice", 25),
    (2, "Bob", 30),
    (3, "Charlie", 22),
]
columns = ["id", "name", "age"]

# Only column names are supplied; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=columns)
df.show()


+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 22|
+---+-------+---+



In [0]:
# Register the DataFrame under the name "people" so SQL cells and spark.sql()
# can refer to it; an existing temp view with that name is replaced.
df.createOrReplaceTempView(name="people")


In [0]:
%sql
-- Return the id and name of every row in the `people` temp view with age below 30.
SELECT
  id,
  name
FROM people
WHERE age < 30


id,name
1,Alice
3,Charlie


## **Simulate adding a new column to an existing DataFrame**

In [0]:
from pyspark.sql.functions import lit

# Three sample (id, name, age) records.
data = [(101, "Nikita", 28), (102, "Ravi", 35), (103, "Anita", 22)]
columns = ["id", "name", "age"]

# Build the base DataFrame; column types are inferred from the tuples.
# Note: this rebinds `df`, which earlier cells used for a different DataFrame.
df = spark.createDataFrame(data, schema=columns)

# Derive a new DataFrame with a constant 'salary' column set to 50000 on every
# row. The printed schema reports it as a non-nullable integer column.
df_new = df.withColumn(colName="salary", col=lit(50000))

# Display the rows first, then the schema, to confirm the column was added.
df_new.show()
df_new.printSchema()


+---+------+---+------+
| id|  name|age|salary|
+---+------+---+------+
|101|Nikita| 28| 50000|
|102|  Ravi| 35| 50000|
|103| Anita| 22| 50000|
+---+------+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: integer (nullable = false)



## **Write SQL queries on DataFrames using spark.sql()**

In [0]:
# All three queries run against the "people" temp view. The printed output shows
# the Alice/Bob/Charlie rows, i.e. the DataFrame the view was registered from.
all_rows_query = "SELECT * FROM people"
older_than_24_query = "SELECT name, age FROM people WHERE age > 24"
average_age_query = "SELECT AVG(age) as avg_age FROM people"

# Every row and every column.
result_all = spark.sql(all_rows_query)
result_all.show()

# Only name and age, restricted to rows where age > 24.
result_filtered = spark.sql(older_than_24_query)
result_filtered.show()

# A single aggregated row: the mean age, aliased as avg_age.
result_avg = spark.sql(average_age_query)
result_avg.show()


+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 22|
+---+-------+---+

+-----+---+
| name|age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+

+------------------+
|           avg_age|
+------------------+
|25.666666666666668|
+------------------+

