<a href="https://colab.research.google.com/github/mayureshpawashe/ad_spark/blob/main/ad_spark_day3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import urllib.request
from pyspark.sql import SparkSession

# Obtain the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName('Ad Spark day3 ')
    .getOrCreate()
)

# Fetch the restaurant-customer CSV from GitHub into a local temp file so
# Spark can read it from the local filesystem.
url = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/refs/heads/master/Restaurant%20customer%20data.csv"
file_path = "/tmp/Restaurant_customer_data.csv"
urllib.request.urlretrieve(url, filename=file_path)

# First row holds the column names; column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Preview the first 10 rows.
df.show(n=10)

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+------+------+------+------+
|userID| latitude|  longitude|smoker|   drink_level|dress_preference|ambience|transport|marital_status|      hijos|birth_year|  interest|        personality|religion|    activity| color|weight|budget|height|
+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+------+------+------+------+
| U1001|22.139997|-100.978803| false|    abstemious|        informal|  family|  on foot|        single|independent|      1989|   variety|  thrifty-protector|    none|     student| black|    69|medium|  1.77|
| U1002|22.150087|-100.983325| false|    abstemious|        informal|  family|   public|        single|independent|      1990|technology|hunter-ostentatious|Catholic|  

## Schema Basics, Inference

In [12]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists,
# otherwise it starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName("Schema Inference")
    .getOrCreate()
)

# Load the CSV with a header row, comma delimiter and inferred column types.
df = (
    spark.read
    .options(header="true", inferSchema="true", delimiter=",")
    .csv("/tmp/Restaurant_customer_data.csv")
)

# Full-width preview of the first 10 rows (no cell truncation).
df.show(n=10, truncate=False)


# Column names and the types Spark inferred for them.
df.printSchema()


+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+------+------+------+------+
|userID|latitude |longitude  |smoker|drink_level   |dress_preference|ambience|transport|marital_status|hijos      |birth_year|interest  |personality        |religion|activity    |color |weight|budget|height|
+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+------+------+------+------+
|U1001 |22.139997|-100.978803|false |abstemious    |informal        |family  |on foot  |single        |independent|1989      |variety   |thrifty-protector  |none    |student     |black |69    |medium|1.77  |
|U1002 |22.150087|-100.983325|false |abstemious    |informal        |family  |public   |single        |independent|1990      |technology|hunter-ostentatious|Catholic|st

## Adding a new column to an existing DataFrame

In [13]:
from pyspark.sql.functions import lit

# Attach a constant 'income' column: every row gets the literal 50000.
df = df.withColumn(
    colName="income",
    col=lit(50000),  # default value: 50,000
)

# Schema first, then a five-row untruncated preview.
df.printSchema()

df.show(n=5, truncate=False)

root
 |-- userID: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- smoker: string (nullable = true)
 |-- drink_level: string (nullable = true)
 |-- dress_preference: string (nullable = true)
 |-- ambience: string (nullable = true)
 |-- transport: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- hijos: string (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- interest: string (nullable = true)
 |-- personality: string (nullable = true)
 |-- religion: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- budget: string (nullable = true)
 |-- height: double (nullable = true)
 |-- income: integer (nullable = false)

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+-----

## Changing Data Type (smoker from string → boolean)

In [14]:
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import col, when

# Map the text values of 'smoker' onto booleans: "true" -> True,
# "false" -> False. There is no otherwise() branch, so any other value
# becomes NULL. The result is then typed explicitly as BooleanType.
df = df.withColumn(
    "smoker",
    (
        when(col("smoker") == "true", True)
        .when(col("smoker") == "false", False)
        .astype(BooleanType())
    ),
)

# Confirm the new column type and preview the data.
df.printSchema()
df.show(n=5, truncate=False)


root
 |-- userID: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- smoker: boolean (nullable = true)
 |-- drink_level: string (nullable = true)
 |-- dress_preference: string (nullable = true)
 |-- ambience: string (nullable = true)
 |-- transport: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- hijos: string (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- interest: string (nullable = true)
 |-- personality: string (nullable = true)
 |-- religion: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- budget: string (nullable = true)
 |-- height: double (nullable = true)
 |-- income: integer (nullable = false)

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+----

## Removing a Column (income)

In [15]:
# Remove the 'income' column added earlier.
df = df.drop("income")

# Schema after the drop
df.printSchema()

# Five-row untruncated preview after the drop
df.show(n=5, truncate=False)


root
 |-- userID: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- smoker: boolean (nullable = true)
 |-- drink_level: string (nullable = true)
 |-- dress_preference: string (nullable = true)
 |-- ambience: string (nullable = true)
 |-- transport: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- hijos: string (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- interest: string (nullable = true)
 |-- personality: string (nullable = true)
 |-- religion: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- budget: string (nullable = true)
 |-- height: double (nullable = true)

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+-----+------+------+------+
|userID|latitude

## Merging Two DataFrames with Different Schemas

In [17]:
from pyspark.sql import Row

# Build a small DataFrame whose schema differs from `df`: it carries an
# extra "income" column and only four of the profile columns.
data_new = [
    Row(userID="U2001", latitude=20.0, longitude=-100.0, smoker=False, income=60000),
    Row(userID="U2002", latitude=21.0, longitude=-101.0, smoker=True, income=75000)
]
df_new = spark.createDataFrame(data_new)

# Merge by column name. allowMissingColumns=True fills any column that is
# missing on one side with NULL instead of raising an error.
df_merged = df.unionByName(df_new, allowMissingColumns=True)

# Show updated schema (now includes "income")
df_merged.printSchema()

# Show merged data. Note that this 5-row preview typically contains only
# rows that came from `df`.
df_merged.show(5, truncate=False)

# `df` has no "income" column at this point, so its rows carry NULL there.
# Filtering on a non-NULL income therefore surfaces the appended rows, which
# the preview above does not reach.
df_merged.where("income IS NOT NULL").show(truncate=False)


root
 |-- userID: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- smoker: boolean (nullable = true)
 |-- drink_level: string (nullable = true)
 |-- dress_preference: string (nullable = true)
 |-- ambience: string (nullable = true)
 |-- transport: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- hijos: string (nullable = true)
 |-- birth_year: integer (nullable = true)
 |-- interest: string (nullable = true)
 |-- personality: string (nullable = true)
 |-- religion: string (nullable = true)
 |-- activity: string (nullable = true)
 |-- color: string (nullable = true)
 |-- weight: integer (nullable = true)
 |-- budget: string (nullable = true)
 |-- height: double (nullable = true)
 |-- income: long (nullable = true)

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+----------+-------------------+--------+------------+-----+--

## SQL Queries on PySpark DataFrames

First, I need to register the DataFrame as a temporary SQL view; then I can run SQL queries on it.

In [20]:
# Expose `df` to Spark SQL under the view name "users" (replaces any
# existing temp view with that name).
df.createOrReplaceTempView(name="users")

In [32]:
# Select every row and column from the "users" view; df_ori is bound to the
# same result object as df_sql.
df_sql = df_ori = spark.sql("SELECT * FROM users")
df_sql.show()

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+------------+-------------------+---------+------------+------+------+------+------+
|userID| latitude|  longitude|smoker|   drink_level|dress_preference|ambience|transport|marital_status|      hijos|birth_year|    interest|        personality| religion|    activity| color|weight|budget|height|
+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+------------+-------------------+---------+------------+------+------+------+------+
| U1001|22.139997|-100.978803| false|    abstemious|        informal|  family|  on foot|        single|independent|      1989|     variety|  thrifty-protector|     none|     student| black|    69|medium|  1.77|
| U1002|22.150087|-100.983325| false|    abstemious|        informal|  family|   public|        single|independent|      1990|  technology|hunter-ostentatio

In [24]:
# Project three columns from the "users" view and preview 10 rows.
df_sql = spark.sql(
    "SELECT userID, smoker, drink_level FROM users"
)
df_sql.show(n=10)

+------+------+--------------+
|userID|smoker|   drink_level|
+------+------+--------------+
| U1001| false|    abstemious|
| U1002| false|    abstemious|
| U1003| false|social drinker|
| U1004| false|    abstemious|
| U1005| false|    abstemious|
| U1006|  true|social drinker|
| U1007| false|casual drinker|
| U1008| false|social drinker|
| U1009| false|    abstemious|
| U1010| false|social drinker|
+------+------+--------------+
only showing top 10 rows



In [25]:
# Filtering data (WHERE clause): keep only rows whose smoker value matches
# the literal 'false'.
df_sql = spark.sql(
    "SELECT * FROM users WHERE smoker = 'false'"
)
df_sql.show()

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+------------+-------------------+--------+------------+------+------+------+------+
|userID| latitude|  longitude|smoker|   drink_level|dress_preference|ambience|transport|marital_status|      hijos|birth_year|    interest|        personality|religion|    activity| color|weight|budget|height|
+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+------------+-------------------+--------+------------+------+------+------+------+
| U1001|22.139997|-100.978803| false|    abstemious|        informal|  family|  on foot|        single|independent|      1989|     variety|  thrifty-protector|    none|     student| black|    69|medium|  1.77|
| U1002|22.150087|-100.983325| false|    abstemious|        informal|  family|   public|        single|independent|      1990|  technology|hunter-ostentatious|C

In [27]:
# Ordering data (ORDER BY): highest birth_year first, preview 10 rows.
df_sql = spark.sql(
    "SELECT * FROM users ORDER BY birth_year DESC"
)
df_sql.show(n=10)

+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+------------+-----------------+--------+----------+------+------+------+------+
|userID| latitude|  longitude|smoker|   drink_level|dress_preference|ambience|transport|marital_status|      hijos|birth_year|    interest|      personality|religion|  activity| color|weight|budget|height|
+------+---------+-----------+------+--------------+----------------+--------+---------+--------------+-----------+----------+------------+-----------------+--------+----------+------+------+------+------+
| U1040|18.895187|  -99.18039| false|    abstemious|   no preference| friends|   public|        single|independent|      1994|        none|thrifty-protector|Catholic|   student|  blue|    73|medium|  1.64|
| U1110|18.871678| -99.183263| false|    abstemious|   no preference|  family|car owner|        single|independent|      1993|     variety|thrifty-protector|Catholic|   student

In [30]:
# Aggregate: number of users in each drink_level group.
df_sql = spark.sql(
    "SELECT drink_level, COUNT(*) as count FROM users GROUP BY drink_level"
)
df_sql.show()

+--------------+-----+
|   drink_level|count|
+--------------+-----+
|    abstemious|   51|
|casual drinker|   47|
|social drinker|   40|
+--------------+-----+



In [34]:
# Count users whose marital_status is 'married' (the WHERE clause filters
# before grouping, so the result has a single group).
# NOTE(review): this rebinds df_ori, which an earlier cell set to the full
# "users" query result — confirm that overwriting it here is intended.
df_ori = spark.sql(
    "SELECT marital_status, COUNT(*) as count FROM users WHERE marital_status = 'married' GROUP BY marital_status"
)
df_ori.show()


+--------------+-----+
|marital_status|count|
+--------------+-----+
|       married|   10|
+--------------+-----+



In [35]:
# Keep only the drink_level groups with more than 2 users; HAVING filters
# on the aggregated COUNT(*) value, referenced through its alias `count`.
df_sql = spark.sql(
    "SELECT drink_level, COUNT(*) as count FROM users GROUP BY drink_level HAVING count > 2"
)
df_sql.show()

+--------------+-----+
|   drink_level|count|
+--------------+-----+
|    abstemious|   51|
|casual drinker|   47|
|social drinker|   40|
+--------------+-----+



In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("SQL Example")
    .getOrCreate()
)

# Explicit two-column schema: a string user id and an integer income,
# both nullable.
schema = (
    StructType()
    .add(StructField("userID", StringType(), True))
    .add(StructField("income", IntegerType(), True))
)

# Five (userID, income) records, paired up from two parallel lists.
data = list(zip(
    ["U1001", "U1002", "U1003", "U1004", "U1005"],
    [50000, 45000, 60000, 55000, 48000],
))

# Build the DataFrame against the explicit schema.
df_income = spark.createDataFrame(data, schema=schema)

# Make it queryable from Spark SQL as "income_data".
df_income.createOrReplaceTempView(name="income_data")

In [39]:
# Using JOIN with another DataFrame.
# Re-register the income view so this cell does not depend on the view
# still existing from an earlier cell.
df_income.createOrReplaceTempView(name="income_data")

# Join "users" to "income_data" on userID; only users that have a matching
# income row appear in the result.
df_sql = spark.sql(sqlQuery="""
    SELECT u.userID, u.smoker, u.drink_level, i.income
    FROM users u
    JOIN income_data i ON u.userID = i.userID
""")
df_sql.show()


+------+------+--------------+------+
|userID|smoker|   drink_level|income|
+------+------+--------------+------+
| U1001| false|    abstemious| 50000|
| U1002| false|    abstemious| 45000|
| U1003| false|social drinker| 60000|
| U1004| false|    abstemious| 55000|
| U1005| false|    abstemious| 48000|
+------+------+--------------+------+

