# Data Types in PySpark
   - In PySpark, data types describe the kind of data stored in the columns of a DataFrame or in a distributed dataset (RDD).

   - PySpark provides several built-in data types that are similar to those in Python, but they are specifically designed to work in a distributed computing environment.

   - These data types are part of the `pyspark.sql.types` module.

In [None]:
# Import Spark packages
from pyspark.sql import SparkSession

In [None]:
# Build a Spark session (getOrCreate reuses the active session if there is one)
spark = (
    SparkSession.builder
    .getOrCreate()
)

# 1. StringType:
   - Represents text or string data.

In [None]:
# One-column rows of Python str values; the column is inferred as a string type
data = [(name,) for name in ("Alice", "Bob", "Charlie")]
df = spark.createDataFrame(data, schema=["Name"])
df.show()

# 2. IntegerType:
   - Represents integer values

In [None]:
from pyspark.sql.types import IntegerType, StructField, StructType

# Python ints are inferred as LongType (bigint) when no schema is given, so the
# schema is declared explicitly to actually obtain an IntegerType column.
schema = StructType([StructField("Age", IntegerType(), True)])
data = [(25,), (30,), (22,)]
df = spark.createDataFrame(data, schema)
df.printSchema()  # confirms the column type is integer, not long
df.show()

# 3. DoubleType:
   - Represents floating-point or double-precision numbers

In [None]:
# One-column rows of Python float values
data = [(number,) for number in (3.14, 2.718, 1.618)]
df = spark.createDataFrame(data, schema=["Value"])
df.show()

# 4. BooleanType:
   - Represents Boolean values (True or False)

In [None]:
# One-column rows of Python bool values
data = [(flag,) for flag in (True, False, True)]
df = spark.createDataFrame(data, schema=["IsMarried"])
df.show()

# 5. DateType:
   - Represents date values.

In [None]:
from pyspark.sql.functions import to_date

spark = SparkSession.builder.appName("DateDataType").getOrCreate()

# The dates start out as ISO-formatted strings; to_date parses them so the
# "Date" column is replaced by a real date column of the same name.
data = [(day,) for day in ("2022-01-15", "2023-03-20", "2021-11-05")]
df = spark.createDataFrame(data, schema=["Date"])
df = df.withColumn("Date", to_date("Date"))
df.show()

# 6. TimestampType:
   - Represents timestamp values with date and time.

In [None]:
from pyspark.sql.functions import to_timestamp

# The values are created as plain strings (StringType). to_timestamp parses
# them so the column really is TimestampType, which is what this section shows.
data = [("2022-01-15 14:30:00",), ("2023-03-20 09:45:00",), ("2021-11-05 18:15:00",)]
df = spark.createDataFrame(data, ["Timestamp"])
df = df.withColumn("Timestamp", to_timestamp(df["Timestamp"]))
df.printSchema()  # confirms the column type is timestamp, not string
df.show()

# 7. ArrayType:
   - Represents arrays or lists of values

In [None]:
from pyspark.sql.functions import split

# Each row holds a comma-separated string; split() turns it into an array
# column, replacing the original "Values" column.
data = [(joined,) for joined in ("1,2,3", "4,5", "6")]
df = spark.createDataFrame(data, schema=["Values"])
df = df.withColumn("Values", split("Values", ","))
df.show()

# 8. MapType:
   - Represents key-value pairs or dictionaries

In [None]:
from pyspark.sql.types import MapType, StringType, StructField, StructType

# A MapType column has exactly one key type and one value type. The ages are
# therefore stored as strings, and the schema is declared explicitly instead of
# relying on inference from dicts whose values mix str and int.
schema = StructType(
    [StructField("Info", MapType(StringType(), StringType()), True)]
)
data = [
    ({"Name": "Alice", "Age": "25"},),
    ({"Name": "Bob", "Age": "30"},),
    ({"Name": "Charlie", "Age": "22"},),
]
df = spark.createDataFrame(data, schema)
df.printSchema()  # confirms map<string,string>
df.show(truncate=False)  # map cells are wider than the default 20-char limit