# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **Examples on manipulating columns** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [35]:
# findspark.init() locates the Spark installation and adds pyspark to
# sys.path, so it has to run before the pyspark import in the next cell.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [61]:
from pyspark.sql import SparkSession

# Build (or reuse) the session that connects this notebook to the Spark master.
spark = (
    SparkSession.builder
    .appName("SparkSQL-Columns")
    .master("spark://078b2e28e517:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

### Load e-commerce dataset

In [None]:
from team_name.spark_utils import SparkUtils

# (column name, type name) pairs describing the layout of the CSV files.
columns_info = [
    ("product", "string"),
    ("price", "double"),
    ("quantity", "integer"),
    ("discount", "float"),
    ("customer_first_name", "string"),
    ("order_date", "date"),
]

# Build the schema from the pairs with the project helper.
schema = SparkUtils.generate_schema(columns_info)

# Read the CSV directory with the explicit schema, treating the first line
# of each file as a header.
ecommerce_df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/e_commerce_dataset/")
)

ecommerce_df.printSchema()

#### Show first 10 rows

In [None]:
# Print the first 10 rows of the dataset.
ecommerce_df.show(10)

In [None]:
# Add a column whose value is the same constant (10) on every row.
from pyspark.sql.functions import lit

df_with_lit = ecommerce_df.withColumn(
    "new_column",
    lit(10),
)
df_with_lit.show(5)

In [None]:
# Dropping Unnecessary Columns
# Drop from df_with_lit, the DataFrame that actually contains 'new_column'.
# Calling drop() on ecommerce_df would be a silent no-op, because that
# DataFrame never had the column.
df_without_lit = df_with_lit.drop('new_column')
df_without_lit.show(n=5)

In [None]:
# Derive a new column from existing ones: total_cost = price * quantity.
ecommerce_df = ecommerce_df.withColumn(
    "total_cost",
    ecommerce_df.price * ecommerce_df.quantity,
)
ecommerce_df.show(5)

In [None]:
# Populate a new column conditionally with when(...).otherwise(...):
# "YES" when total_cost is above 2500, "NO" for every other row.
from pyspark.sql.functions import when

ecommerce_df = ecommerce_df.withColumn(
    "is_high_order",
    when(ecommerce_df.total_cost > 2500, "YES").otherwise("NO"),
)
ecommerce_df.show(5)

In [None]:
# Combine string columns and a literal into one "label" column with concat.
# `lit` is the function imported in an earlier cell.
from pyspark.sql.functions import concat

df = (
    ecommerce_df
    .select("customer_first_name", "product")
    .withColumn(
        "label",
        concat(
            ecommerce_df["customer_first_name"],
            lit(" bought:"),
            ecommerce_df["product"],
        ),
    )
)
df.select("label").show(n=5, truncate=False)

In [None]:
# Extract parts of a date using functions like year, month, day of month.
from pyspark.sql.functions import year, month

# Start from the two columns of interest, then derive each date part from the
# running result so both derived columns end up in the same DataFrame.
# (Restarting from ecommerce_df for the second column would throw away
# 'order_year'.)
df = ecommerce_df.select("product", "order_date")
df = df.withColumn('order_year', year(df['order_date']))
df = df.withColumn('order_month', month(df['order_date']))
df.show(5)

In [None]:
# Change a column's data type: add 'price_int' as price cast to integer.
df = ecommerce_df.withColumn(
    "price_int",
    ecommerce_df["price"].cast("integer"),
)
df.show(n=3)

In [None]:
# Rename 'price_int' to 'price_integer' and print the resulting schema.
df = df.withColumnRenamed(existing="price_int", new="price_integer")
df.printSchema()


#### Manipulating JSON columns

In [None]:
# Sample data with JSON strings
# Sample rows: an id plus a JSON document stored as a plain string.
data = [
    (
        '1',
        '{"name": "Alice", "age": 25, "payments": [34, 433, 54], "address": {"city": "New York", "zip": "10001"}}',
    ),
    (
        '2',
        '{"name": "Bob", "age": 30, "address": {"city": "Los Angeles", "zip": "90001"}}',
    ),
    (
        '3',
        '{"name": "Charlie", "age": 35, "address": {"city": "Chicago", "zip": "60601"}}',
    ),
]

# Build the DataFrame from the rows, naming the two columns.
columns = ["id", "json_col"]
df = spark.createDataFrame(data, schema=columns)
df.printSchema()
df.show(truncate=False)

##### Use get_json_object to extract JSON fields as strings

In [None]:
from pyspark.sql.functions import get_json_object

# Pull the top-level "name" field out of the JSON string into its own column.
df.withColumn(
    "name",
    get_json_object(df["json_col"], "$.name"),
).show()

In [None]:
# Create city column

In [None]:
# Get 1st payment

In [76]:
# Stop the SparkContext (`sc`, taken from the session created at the start of
# the notebook) now that the examples are done.
sc.stop()