In [32]:
from pyspark.sql import SparkSession

# Spark entry point: getOrCreate() returns the active session or builds a new one
spark = (
    SparkSession.builder
    .appName("Aula Interativa 1 - Engenharia de Dados - Apache Spark")
    .getOrCreate()
)

spark.version


'3.3.2'

In [36]:
# Load the first partial CSV file; the first row is the header and column
# types are inferred from the data.
titanic_df_1 = spark.read.csv(
    'titanic-partial-1.csv',
    header='True',
    inferSchema='True',
)

# Attribute details: https://www.kaggle.com/c/titanic/data?select=train.csv
titanic_df_1.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [37]:
# Highest fare in the first partial file; collect() brings the single
# aggregated Row back to the driver as a Python list.
titanic_df_1.agg({"Fare": "max"}).collect()

[Row(max(Fare)=512.3292)]

In [38]:
# Load the second partial CSV file; the first row is the header and column
# types are inferred from the data.
titanic_df_2 = spark.read.csv(
    'titanic-partial-2.csv',
    header='True',
    inferSchema='True',
)

# Attribute details: https://www.kaggle.com/c/titanic/data?select=train.csv
titanic_df_2.printSchema()


root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [39]:
# The second file names the column "Gender" where the first uses "Sex";
# align the name so both frames share the same schema.
titanic_df_2 = titanic_df_2.withColumnRenamed("Gender", "Sex")

titanic_df_2.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [40]:
# Stack the two partial files. unionByName matches columns by name, so a
# difference in column order between the files cannot silently misalign the
# data (plain union() pairs columns purely by position).
titanic_df = titanic_df_1.unionByName(titanic_df_2)

# Row count of the combined frame, duplicates included.
titanic_df.count()

990

In [41]:
# Row count once fully identical rows are collapsed.
titanic_df.distinct().count()

891

In [42]:
# Keep the de-duplicated rows under their own name for the steps that follow.
titanic_df_3 = titanic_df.distinct()

In [43]:
# Sanity check: number of rows in the de-duplicated frame.
titanic_df_3.count()

891

In [44]:
# Load the JSON file that holds the Survived flag for each PassengerId

survived_df = spark.read.json("titanic-survived.json")
survived_df.printSchema()
survived_df.show()

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)

+-----------+--------+
|PassengerId|Survived|
+-----------+--------+
|          1|       0|
|          2|       1|
|          3|       1|
|          4|       1|
|          5|       0|
|          6|       0|
|          7|       0|
|          8|       0|
|          9|       1|
|         10|       1|
|         11|       1|
|         12|       1|
|         13|       0|
|         14|       0|
|         15|       0|
|         16|       1|
|         17|       0|
|         18|       1|
|         19|       0|
|         20|       1|
+-----------+--------+
only showing top 20 rows



In [None]:
# Schema of titanic_df as currently bound; when the notebook is run top to
# bottom this is the unioned frame, before the Survived column is joined in.
titanic_df.printSchema()

In [45]:
# Join the de-duplicated passenger records with their survival flag.
# NOTE: this rebinds titanic_df, replacing the earlier unioned frame.

titanic_df = titanic_df_3.join(survived_df, on=["PassengerId"], how="inner")

titanic_df.printSchema()


root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: long (nullable = true)



In [None]:
# Preview the first 5 rows of the joined frame.
titanic_df.show(5)


In [None]:
# Preview the first 5 rows (same call as the preceding cell).
titanic_df.show(5)

In [None]:
# Project a handful of columns and display them.
titanic_df.select("Survived", "Name", "Pclass","Embarked").show()

In [None]:
# Basic summary statistics for the frame's columns.
titanic_df.describe().show()


In [None]:
# Basic summary statistics restricted to Age and Fare.
titanic_df.describe('Age', 'Fare').show()

In [None]:
# summary() on Age and Fare; unlike describe(), it also reports quartiles.
titanic_df.select('Age', 'Fare').summary().show()

In [None]:
# Number of passengers per Survived value.
titanic_df.groupBy("Survived").count().show()


In [None]:
# Number of passengers per (Sex, Survived) combination.
titanic_df.groupBy("Sex","Survived").count().show()

In [None]:
# Number of passengers per (Pclass, Survived) combination.
titanic_df.groupBy("Pclass","Survived").count().show()



In [None]:
from pyspark.sql.functions import col, regexp_extract

# Derive an "Initial" column holding the title found in Name: the first run of
# letters that is immediately followed by a dot (e.g. "Mr" in "Mr.").
# The pattern is a raw string so that `\.` is passed to the regex engine as an
# escaped dot instead of being an invalid Python string escape.
titanic_df = titanic_df.withColumn("Initial", regexp_extract(col("Name"), r"([A-Za-z]+)\.", 1))

titanic_df.select("Initial","Name").show()

In [None]:
# List the distinct titles that were extracted into Initial.
titanic_df.select("Initial").distinct().show()


In [None]:
# Fold rare or variant titles into a small set of groups. The mapping is a
# dict so each title sits next to its replacement, and `subset` limits the
# replacement to the Initial column; without it replace() would also rewrite
# matching values in every other string column.
title_replacements = {
    'Mlle': 'Miss',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Jonkheer': 'Other',
    'Col': 'Other',
    'Rev': 'Other',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
}
titanic_df = titanic_df.replace(title_replacements, subset=["Initial"])

In [None]:
# List the distinct titles again, now that the replacement step has run.
titanic_df.select("Initial").distinct().show()

In [None]:
# Average Age per title, collected to the driver as a list of Row objects.
titanic_df.groupby('Initial').avg('Age').collect()


In [None]:
from pyspark.sql.functions import col

# Two ways of writing the same filter: a col() expression ...
titanic_df.filter(col("Initial") == 'Miss').select("Name", "Age", "Initial").show(5)

# ... and attribute access on the DataFrame.
titanic_df.filter(titanic_df.Initial == 'Miss').select("Name", "Age", "Initial").show(5)

In [None]:
from pyspark.sql.functions import when

# Fill missing ages with a fixed value per title. Rows that already have an
# Age, or whose title is not listed here, are left unchanged.
imputed_age_by_title = {"Miss": 22, "Other": 46, "Master": 5, "Mr": 33, "Mrs": 36}

for title, imputed_age in imputed_age_by_title.items():
    needs_fill = (titanic_df["Initial"] == title) & (titanic_df["Age"].isNull())
    titanic_df = titanic_df.withColumn(
        "Age",
        when(needs_fill, imputed_age).otherwise(titanic_df["Age"]),
    )


In [None]:
# Number of passengers per Embarked value (nulls, if any, form their own group).
titanic_df.groupBy("Embarked").count().show()


In [None]:
# Replace missing Embarked values with 'S'.
titanic_df = titanic_df.fillna({"Embarked": 'S'})

In [None]:
# Re-check the Embarked distribution now that nulls have been filled.
titanic_df.groupBy("Embarked").count().show()

In [None]:
# Summary statistics for Cabin; the count row covers non-null values only.
titanic_df.describe("Cabin").show()

In [None]:
# Remove the Cabin column and confirm the resulting schema.
titanic_df = titanic_df.drop("Cabin")

titanic_df.printSchema()


In [None]:
# Family_Size = SibSp + Parch + 1; the +1 means a passenger with
# SibSp == 0 and Parch == 0 gets a Family_Size of 1, not 0.
titanic_df = titanic_df.withColumn("Family_Size",col('SibSp')+col('Parch') + 1)

In [None]:
from pyspark.sql.functions import col


# Number of passengers per Family_Size value.
titanic_df.groupBy("Family_Size").count().show()


In [None]:
# NOTE(review): asc is imported here but the cell sorts with Column.desc().
from pyspark.sql.functions import asc

# Passengers ordered from the largest Family_Size downwards.
titanic_df.select("Name", "Family_Size").orderBy(col("Family_Size").desc()).show()


In [None]:
from pyspark.sql.functions import lit

# Add an 'Alone' column holding the literal 0 for every row.
titanic_df = titanic_df.withColumn('Alone',lit(0))


In [None]:
from pyspark.sql.functions import lit


# Flag passengers travelling alone. Family_Size is computed as
# SibSp + Parch + 1, so it already counts the passenger: someone with no
# relatives aboard has Family_Size == 1. Comparing against 0 would never match
# and would leave Alone at 0 for every row.
titanic_df = titanic_df.withColumn('Alone',lit(0))
titanic_df = titanic_df.withColumn("Alone",when(titanic_df["Family_Size"] == 1, 1).otherwise(titanic_df["Alone"]))



In [None]:
# Column names of the frame after the feature-engineering steps.
titanic_df.columns


In [None]:
# Turn off whole-stage code generation for this session.
# NOTE(review): the reason for disabling it is not stated here - confirm it is still needed.
spark.conf.set("spark.sql.codegen.wholeStage", False)

# Passengers older than 70.
titanic_df.filter(titanic_df.Age > 70).show()


In [None]:
from pyspark.sql.functions import desc


# Oldest passenger: sort by age descending and show the first row.
# The lowercase "age" presumably relies on Spark's default case-insensitive
# column resolution to reach the Age column - verify if that setting changes.
titanic_df.orderBy(desc("age")).show(1)


In [None]:
from pyspark.sql.functions import asc

# First row when sorting by age ascending.
# NOTE(review): an ascending sort places nulls first by default, so a row with
# a still-missing Age would be shown here - confirm none remain.
titanic_df.orderBy(asc("age")).show(1)

In [None]:
# Correlation coefficient between age and fare.
titanic_df.stat.corr("age", "fare")


In [None]:
# Correlation coefficient between age and family size.
titanic_df.stat.corr("age", "family_size")


In [None]:
# Contingency table of Embarked versus passenger class.
# API reference for DataFrame.stat:
# https://spark.apache.org/docs/latest/api/python//reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.html#pyspark.sql.DataFrameStatFunctions
titanic_df.stat.crosstab("Embarked", "PClass").show()

In [None]:
# Average fare per passenger class.
titanic_df.groupBy('pclass').agg({'fare': 'avg'}).show()

In [None]:
# UDFs make it possible to plug arbitrary Python functions into Spark expressions

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
import random

# LGPD (Brazilian data-protection law): hide the real passenger identifier.
def anonymize(passengerId):
    """Return a random integer in [1, 1000000] to stand in for a passenger id.

    The incoming ``passengerId`` is not used: the result is drawn independently
    on every call, so it is neither stable for a given id nor guaranteed to be
    unique across rows.
    """
    return random.randint(1, 1000000)

# The function is random, so the UDF is marked non-deterministic; otherwise
# Spark treats it as deterministic and may drop or repeat invocations while
# optimising the query.
anonymizeUDF = udf(anonymize, IntegerType()).asNondeterministic()

titanic_df.select(anonymizeUDF("PassengerId")).show()

In [None]:
# Save the data as CSV.
# mode("overwrite") lets the notebook be re-run: the default save mode raises
# an error when the target path already exists. The header row is written so
# the column names are kept, matching how this notebook reads its CSV inputs.

titanic_df.write.format("csv").mode("overwrite").option("header", True).save("titanic-final.csv")

In [None]:
# Save a three-column projection as Parquet. mode("overwrite") lets the cell
# be re-run; the default save mode raises an error when the path already exists.
titanic_df.select("name", "age", "survived").write.format("parquet").mode("overwrite").save("titanic-final.parquet")