<a href="https://colab.research.google.com/github/mayureshpawashe/ad_spark/blob/main/ad_spark_day5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## UDFs & Future Updates

In [29]:
import urllib.request
from pyspark.sql import SparkSession
# Start the Spark session for this notebook (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName('Ad Spark day3 ')
    .getOrCreate()
)

# Fetch the employee CSV over HTTPS and store it on local disk for Spark to read.
url = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/refs/heads/master/Employee%20data.csv"
file_path = "/tmp/EMP_data.csv"
urllib.request.urlretrieve(url, filename=file_path)

# Treat the first row as column names and let Spark infer each column's type.
df = spark.read.options(header=True, inferSchema=True).csv(file_path)
df.show(n=10)

+----+------+----------+----+--------+-------+--------+-------+-------+--------+
|  id|gender|     bdate|educ|  jobcat| salary|salbegin|jobtime|prevexp|minority|
+----+------+----------+----+--------+-------+--------+-------+-------+--------+
| 1.0|  Male|1952-02-03|  15| Manager|57000.0| 27000.0|   98.0|  144.0|      No|
| 2.0|  Male|1958-05-23|  16|Clerical|40200.0| 18750.0|   98.0|   36.0|      No|
| 3.0|Female|1929-07-26|  12|Clerical|21450.0| 12000.0|   98.0|  381.0|      No|
| 4.0|Female|1947-04-15|   8|Clerical|21900.0| 13200.0|   98.0|  190.0|      No|
| 5.0|  Male|1955-02-09|  15|Clerical|45000.0| 21000.0|   98.0|  138.0|      No|
| 6.0|  Male|1958-08-22|  15|Clerical|32100.0| 13500.0|   98.0|   67.0|      No|
| 7.0|  Male|1956-04-26|  15|Clerical|36000.0| 18750.0|   98.0|  114.0|      No|
| 8.0|Female|1966-05-06|  12|Clerical|21900.0|  9750.0|   98.0|missing|      No|
| 9.0|Female|1946-01-23|  15|Clerical|27900.0| 12750.0|   98.0|  115.0|      No|
|10.0|Female|1946-02-13|  12

## UDF to Convert Gender to Uppercase

In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType
import datetime
def to_uppercase(gender):
    """Return ``gender`` converted to upper case.

    Falsy input (``None`` or an empty string) yields ``None`` so the
    resulting Spark column holds a null instead of raising.
    """
    if not gender:
        return None
    return gender.upper()

# Wrap the plain Python function as a Spark UDF that returns a string column.
gender_udf = udf(to_uppercase, returnType=StringType())

# Add the upper-cased value as a new column next to `gender` and preview it.
df = df.withColumn("gender_upper", gender_udf(df["gender"]))
df.show(n=10)

+----+------+----------+----+--------+-------+--------+-------+-------+--------+------------+
|  id|gender|     bdate|educ|  jobcat| salary|salbegin|jobtime|prevexp|minority|gender_upper|
+----+------+----------+----+--------+-------+--------+-------+-------+--------+------------+
| 1.0|  Male|1952-02-03|  15| Manager|57000.0| 27000.0|   98.0|  144.0|      No|        MALE|
| 2.0|  Male|1958-05-23|  16|Clerical|40200.0| 18750.0|   98.0|   36.0|      No|        MALE|
| 3.0|Female|1929-07-26|  12|Clerical|21450.0| 12000.0|   98.0|  381.0|      No|      FEMALE|
| 4.0|Female|1947-04-15|   8|Clerical|21900.0| 13200.0|   98.0|  190.0|      No|      FEMALE|
| 5.0|  Male|1955-02-09|  15|Clerical|45000.0| 21000.0|   98.0|  138.0|      No|        MALE|
| 6.0|  Male|1958-08-22|  15|Clerical|32100.0| 13500.0|   98.0|   67.0|      No|        MALE|
| 7.0|  Male|1956-04-26|  15|Clerical|36000.0| 18750.0|   98.0|  114.0|      No|        MALE|
| 8.0|Female|1966-05-06|  12|Clerical|21900.0|  9750.0|   98

## UDF to Calculate Age from Birthdate

In [13]:
from pyspark.sql.functions import udf, year
from pyspark.sql.types import IntegerType
import datetime
def calculate_age(birth_year):
    """Return the current calendar year minus ``birth_year``.

    Only the year is compared, so the month and day of birth are not taken
    into account. A falsy ``birth_year`` (``None`` or ``0``) yields ``None``.
    """
    this_year = datetime.datetime.now().year
    if not birth_year:
        return None
    return this_year - birth_year

# Expose calculate_age to Spark as a UDF producing an integer column.
age_udf = udf(calculate_age, returnType=IntegerType())

# Extract the year from `bdate` with Spark's built-in `year`, then hand that
# year to the UDF; the result goes into a separate DataFrame, `df1`.
df1 = df.withColumn("age", age_udf(year(df["bdate"])))

df1.show(n=10)


+----+------+----------+----+--------+-------+--------+-------+-------+--------+------------+---+
|  id|gender|     bdate|educ|  jobcat| salary|salbegin|jobtime|prevexp|minority|gender_upper|age|
+----+------+----------+----+--------+-------+--------+-------+-------+--------+------------+---+
| 1.0|  Male|1952-02-03|  15| Manager|57000.0| 27000.0|   98.0|  144.0|      No|        MALE| 73|
| 2.0|  Male|1958-05-23|  16|Clerical|40200.0| 18750.0|   98.0|   36.0|      No|        MALE| 67|
| 3.0|Female|1929-07-26|  12|Clerical|21450.0| 12000.0|   98.0|  381.0|      No|      FEMALE| 96|
| 4.0|Female|1947-04-15|   8|Clerical|21900.0| 13200.0|   98.0|  190.0|      No|      FEMALE| 78|
| 5.0|  Male|1955-02-09|  15|Clerical|45000.0| 21000.0|   98.0|  138.0|      No|        MALE| 70|
| 6.0|  Male|1958-08-22|  15|Clerical|32100.0| 13500.0|   98.0|   67.0|      No|        MALE| 67|
| 7.0|  Male|1956-04-26|  15|Clerical|36000.0| 18750.0|   98.0|  114.0|      No|        MALE| 69|
| 8.0|Female|1966-05

## UDF to Categorize Salary

In [19]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from datetime import datetime

# Fix the UDF
def calculate_age(bdate):
    """Return the current calendar year minus the birth year of ``bdate``.

    Args:
        bdate: A birth date given either as a string whose first
            ``-``-separated field is the year (e.g. ``"1952-02-03"``) or as a
            ``datetime.date`` / ``datetime.datetime`` object. May be ``None``.

    Returns:
        The year difference as an ``int``, or ``None`` when ``bdate`` is
        ``None``, is of an unsupported type, or is a string whose leading
        field is not an integer.

    Only years are compared; the month and day of birth are ignored.
    """
    # Import under a private alias. This cell binds the name `datetime` to the
    # *class* (`from datetime import datetime`), so `datetime.date` would be
    # the instance method, not the date type, and `isinstance` would raise
    # TypeError. The alias is correct whichever way `datetime` is bound.
    import datetime as _dt

    if bdate is None:
        return None

    if isinstance(bdate, str):
        try:
            birth_year = int(bdate.split("-")[0])
        except ValueError:
            # Unparsable text (empty string, placeholder words, ...) becomes
            # a null age instead of failing the whole Spark job.
            return None
    elif isinstance(bdate, _dt.date):  # also matches datetime.datetime
        birth_year = bdate.year
    else:
        return None

    return _dt.datetime.now().year - birth_year

# Wrap calculate_age as a Spark UDF that produces an integer column
age_udf = udf(calculate_age, returnType=IntegerType())

# Cast `bdate` to string first so the UDF always receives text,
# then derive the `age` column from the cast values
df = df.withColumn("bdate", df["bdate"].cast("string"))
df = df.withColumn("age", age_udf(df["bdate"]))

# Preview the first ten rows
df.show(n=10)


+----+------+----------+----+--------+------+--------+-------+-------+--------+------------+---+---------------+
|  id|gender|     bdate|educ|  jobcat|salary|salbegin|jobtime|prevexp|minority|gender_upper|age|salary_category|
+----+------+----------+----+--------+------+--------+-------+-------+--------+------------+---+---------------+
| 1.0|  Male|1952-02-03|  15| Manager| 57000| 27000.0|   98.0|  144.0|      No|        MALE| 73|         Medium|
| 2.0|  Male|1958-05-23|  16|Clerical| 40200| 18750.0|   98.0|   36.0|      No|        MALE| 67|         Medium|
| 3.0|Female|1929-07-26|  12|Clerical| 21450| 12000.0|   98.0|  381.0|      No|      FEMALE| 96|            Low|
| 4.0|Female|1947-04-15|   8|Clerical| 21900| 13200.0|   98.0|  190.0|      No|      FEMALE| 78|            Low|
| 5.0|  Male|1955-02-09|  15|Clerical| 45000| 21000.0|   98.0|  138.0|      No|        MALE| 70|         Medium|
| 6.0|  Male|1958-08-22|  15|Clerical| 32100| 13500.0|   98.0|   67.0|      No|        MALE| 67|

In [23]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def square_number(n):
    """Return ``n`` multiplied by itself; a ``None`` input stays ``None``."""
    if n is None:
        # Null in, null out, so the UDF never multiplies a missing value.
        return None
    return n * n

# Wrap square_number as a Spark UDF that produces an integer column.
square_udf = udf(square_number, IntegerType())

# Build the small demo frame under its own name. Rebinding `df` here would
# throw away the employee DataFrame, and the prime-check cell further down
# still reads `df["educ"]`, so a top-to-bottom run would fail there.
data = [(1,), (2,), (3,), (4,), (5,)]
numbers_df = spark.createDataFrame(data, ["number"])

# Derive the squared column into a new frame rather than mutating in place,
# so re-running this cell always starts from the same input.
squares_df = numbers_df.withColumn("square", square_udf(numbers_df["number"]))

squares_df.show()


+------+------+
|number|square|
+------+------+
|     1|     1|
|     2|     4|
|     3|     9|
|     4|    16|
|     5|    25|
+------+------+



## Create a UDF to check whether a value in a column is prime

In [28]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

# Define function to check if a number is prime
def is_prime(n):
    """Return ``True`` if ``n`` is a prime number, otherwise ``False``.

    Args:
        n: The value to test. ``None`` is accepted. Floats are accepted only
            when they hold a whole number (``7.0`` is tested as ``7``).

    Returns:
        ``False`` for ``None``, for values below 2, for non-integral floats
        (including NaN and infinity) and for composite numbers; ``True`` for
        primes.
    """
    if n is None:
        return False

    if isinstance(n, float):
        # `is_integer()` is False for fractional values as well as NaN/inf.
        # None of these can be prime, and letting them reach the modulo test
        # would report a value such as 7.5 as prime.
        if not n.is_integer():
            return False
        n = int(n)

    if n < 2:
        return False
    if n < 4:
        # 2 and 3 are prime.
        return True
    if n % 2 == 0:
        return False

    # Trial division by odd candidates only. Comparing `divisor * divisor`
    # with `n` keeps the bound in exact integer arithmetic, with no float
    # square root involved.
    divisor = 3
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 2
    return True

# Expose is_prime to Spark as a UDF that produces a boolean column.
is_prime_udf = udf(is_prime, returnType=BooleanType())

# Flag every value of the `educ` column, then preview only the input column
# and its flag side by side.
df = df.withColumn("is_prime_educ", is_prime_udf(df["educ"]))
df.select("educ", "is_prime_educ").show()


+----+-------------+
|educ|is_prime_educ|
+----+-------------+
|  15|        false|
|  16|        false|
|  12|        false|
|   8|        false|
|  15|        false|
|  15|        false|
|  15|        false|
|  12|        false|
|  15|        false|
|  12|        false|
|  16|        false|
|   8|        false|
|  15|        false|
|  15|        false|
|  12|        false|
|  12|        false|
|  15|        false|
|  16|        false|
|  12|        false|
|  12|        false|
+----+-------------+
only showing top 20 rows

