## Introduction to PySpark String Functions
PySpark string functions are built-in functions in the `pyspark.sql.functions` module that enable efficient manipulation and transformation of text data in distributed DataFrame operations.

### Links and Resources
- [String Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html#string-functions)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Both columns are nullable strings, so derive the schema fields from the column names
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in ("city", "state")]
)

# Deliberately messy rows: stray leading/trailing spaces and mixed letter case
data = [
    (" New York", " ny "),
    ("los angeles ", " CA"),
    (" Chicago", "IL "),
    ("Houston ", " tx"),
]

# Build the DataFrame from the rows using the explicit schema, then preview it
df = spark.createDataFrame(data, schema=schema)
df.show()

+------------+-----+
|        city|state|
+------------+-----+
|    New York|  ny |
|los angeles |   CA|
|     Chicago|  IL |
|    Houston |   tx|
+------------+-----+



In [0]:
from pyspark.sql.functions import trim, initcap, upper, lit, concat, concat_ws, length

In [0]:
# trim: strip surrounding whitespace from city and state, keeping the original column names

df = df.select(*[trim(column_name).alias(column_name) for column_name in ("city", "state")])
df.display()

city,state
New York,ny
los angeles,CA
Chicago,IL
Houston,tx


In [0]:
# initcap / upper: capitalize each word of the city name and upper-case the state code

df = df.select(
    initcap(df["city"]).alias("city"),
    upper(df["state"]).alias("state"),
)
df.display()

city,state
New York,NY
Los Angeles,CA
Chicago,IL
Houston,TX


In [0]:
# lit: add a column that holds the same literal value on every row

df.select(
    df["city"],
    df["state"],
    lit("constant"),
).display()

city,state,constant
New York,NY,constant
Los Angeles,CA,constant
Chicago,IL,constant
Houston,TX,constant


In [0]:
# concat: join columns end to end; the ", " separator is supplied as its own lit() argument

df.select(
    df["city"],
    df["state"],
    concat(df["city"], lit(", "), df["state"]).alias("city_state"),
).display()

city,state,city_state
New York,NY,"New York, NY"
Los Angeles,CA,"Los Angeles, CA"
Chicago,IL,"Chicago, IL"
Houston,TX,"Houston, TX"


In [0]:
# concat_ws: join columns with a separator given once as the first argument

df = df.select(
    df["city"],
    df["state"],
    concat_ws(", ", df["city"], df["state"]).alias("city_state"),
)
df.display()

city,state,city_state
New York,NY,"New York, NY"
Los Angeles,CA,"Los Angeles, CA"
Chicago,IL,"Chicago, IL"
Houston,TX,"Houston, TX"


In [0]:
# length: count the characters in each combined city_state string

df.select(
    df["city_state"],
    length(df["city_state"]).alias("num_chars"),
).display()

city_state,num_chars
"New York, NY",12
"Los Angeles, CA",15
"Chicago, IL",11
"Houston, TX",11
