<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/ntb_StructField_Ingestao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 - Trabalhando com StructFields
 -

In [7]:
#Create PySpark
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()

simpleData = [("Michael","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Luciana","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Arthur","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

# Define schema for the DataFrame
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("state", StringType(), True),
    StructField("salary", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("bonus", IntegerType(), True)
])

# Create a PySpark DataFrame
df = spark.createDataFrame(data=simpleData, schema=schema)

# Now you can use .show()
df.show()

+-------+----------+-----+------+---+-----+
|   name|department|state|salary|age|bonus|
+-------+----------+-----+------+---+-----+
|Michael|     Sales|   NY| 90000| 34|10000|
|Michael|     Sales|   NY| 86000| 56|20000|
|Luciana|     Sales|   CA| 81000| 30|23000|
|  Maria|   Finance|   CA| 90000| 24|23000|
| Arthur|   Finance|   CA| 99000| 40|24000|
|  Scott|   Finance|   NY| 83000| 36|19000|
|    Jen|   Finance|   NY| 79000| 53|15000|
|   Jeff| Marketing|   CA| 80000| 25|18000|
|  Kumar| Marketing|   NY| 91000| 50|21000|
+-------+----------+-----+------+---+-----+



In [8]:
from pyspark.sql.functions import monotonically_increasing_id

df = df.withColumn("id", monotonically_increasing_id())
display(df)

DataFrame[name: string, department: string, state: string, salary: int, age: int, bonus: int, id: bigint]

In [9]:
df.show(truncate=False)

+-------+----------+-----+------+---+-----+----------+
|name   |department|state|salary|age|bonus|id        |
+-------+----------+-----+------+---+-----+----------+
|Michael|Sales     |NY   |90000 |34 |10000|0         |
|Michael|Sales     |NY   |86000 |56 |20000|1         |
|Luciana|Sales     |CA   |81000 |30 |23000|2         |
|Maria  |Finance   |CA   |90000 |24 |23000|3         |
|Arthur |Finance   |CA   |99000 |40 |24000|8589934592|
|Scott  |Finance   |NY   |83000 |36 |19000|8589934593|
|Jen    |Finance   |NY   |79000 |53 |15000|8589934594|
|Jeff   |Marketing |CA   |80000 |25 |18000|8589934595|
|Kumar  |Marketing |NY   |91000 |50 |21000|8589934596|
+-------+----------+-----+------+---+-----+----------+



In [10]:
## Resetar o id

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

df = df.withColumn("id", row_number().over(Window.orderBy("name")))
display(df)
df.show(truncate=False)


DataFrame[name: string, department: string, state: string, salary: int, age: int, bonus: int, id: int]

+-------+----------+-----+------+---+-----+---+
|name   |department|state|salary|age|bonus|id |
+-------+----------+-----+------+---+-----+---+
|Arthur |Finance   |CA   |99000 |40 |24000|1  |
|Jeff   |Marketing |CA   |80000 |25 |18000|2  |
|Jen    |Finance   |NY   |79000 |53 |15000|3  |
|Kumar  |Marketing |NY   |91000 |50 |21000|4  |
|Luciana|Sales     |CA   |81000 |30 |23000|5  |
|Maria  |Finance   |CA   |90000 |24 |23000|6  |
|Michael|Sales     |NY   |90000 |34 |10000|7  |
|Michael|Sales     |NY   |86000 |56 |20000|8  |
|Scott  |Finance   |NY   |83000 |36 |19000|9  |
+-------+----------+-----+------+---+-----+---+



In [11]:


from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Adiciona uma coluna "id" com valores únicos e ordenados
df = df.withColumn("id", row_number().over(Window.orderBy(monotonically_increasing_id())))

# Move a coluna "id" para a primeira posição
columns = df.columns
columns.remove("id")
columns.insert(0, "id")
df = df.select(columns)

display(df)
df.show(truncate=False)


DataFrame[id: int, name: string, department: string, state: string, salary: int, age: int, bonus: int]

+---+-------+----------+-----+------+---+-----+
|id |name   |department|state|salary|age|bonus|
+---+-------+----------+-----+------+---+-----+
|1  |Michael|Sales     |NY   |90000 |34 |10000|
|2  |Michael|Sales     |NY   |86000 |56 |20000|
|3  |Luciana|Sales     |CA   |81000 |30 |23000|
|4  |Maria  |Finance   |CA   |90000 |24 |23000|
|5  |Arthur |Finance   |CA   |99000 |40 |24000|
|6  |Scott  |Finance   |NY   |83000 |36 |19000|
|7  |Jen    |Finance   |NY   |79000 |53 |15000|
|8  |Jeff   |Marketing |CA   |80000 |25 |18000|
|9  |Kumar  |Marketing |NY   |91000 |50 |21000|
+---+-------+----------+-----+------+---+-----+



## Verificando as colunas

In [13]:
df.columns

['id', 'name', 'department', 'state', 'salary', 'age', 'bonus']

In [14]:
import datetime
dt = datetime.datetime.now()
print(dt)

2025-01-31 20:49:26.608022


In [4]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- bonus: integer (nullable = true)



In [5]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import when

df.withColumn("age", \
  when((df.age < 30), lit(None)) \
    .when((df.age >= 18) & (df.age <= 60), lit("adulto")) \
    .otherwise(lit("idoso")) \
  ).show()

+-------+----------+-----+------+------+-----+
|   name|department|state|salary|   age|bonus|
+-------+----------+-----+------+------+-----+
|Michael|     Sales|   NY| 90000|adulto|10000|
|Michael|     Sales|   NY| 86000|adulto|20000|
|Luciana|     Sales|   CA| 81000|adulto|23000|
|  Maria|   Finance|   CA| 90000|  NULL|23000|
| Arthur|   Finance|   CA| 99000|adulto|24000|
|  Scott|   Finance|   NY| 83000|adulto|19000|
|    Jen|   Finance|   NY| 79000|adulto|15000|
|   Jeff| Marketing|   CA| 80000|  NULL|18000|
|  Kumar| Marketing|   NY| 91000|adulto|21000|
+-------+----------+-----+------+------+-----+



In [6]:
display(df)

DataFrame[name: string, department: string, state: string, salary: int, age: int, bonus: int]