In [5]:
from pyspark.sql import SparkSession

# Spark entry point
spark = SparkSession \
    .builder \
    .appName("Aula Interativa 1 - Engenharia de Dados - Apache Spark") \
    .getOrCreate()

spark.version


'3.2.1'

In [6]:
# Carregando a planilha CSV
titanic_df_1 = spark.read.csv('titanic-partial-1.csv',header='True',inferSchema='False')

# Detalhes dos atributos em https://www.kaggle.com/c/titanic/data?select=train.csv
titanic_df_1.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [7]:
titanic_df_1.agg({"Fare": "max"}).collect()

[Row(max(Fare)='93.5')]

In [8]:
# Carregando a planilha CSV
titanic_df_2 = spark.read.csv('titanic-partial-2.csv',header='True',inferSchema='True')

# Detalhes dos atributos em https://www.kaggle.com/c/titanic/data?select=train.csv
titanic_df_2.printSchema()


root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
titanic_df_2 = titanic_df_2.withColumnRenamed("Gender", "Sex")

titanic_df_2.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [10]:
titanic_df = titanic_df_1.union(titanic_df_2)

titanic_df.count()

990

In [11]:
titanic_df.distinct().count()

968

In [12]:
titanic_df_3 = titanic_df.distinct()

In [13]:
titanic_df_3.count()

968

In [14]:
# Carregando o arquivo JSON

survived_df = spark.read.json("titanic-survived.json")
survived_df.printSchema()
survived_df.show()

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)

+-----------+--------+
|PassengerId|Survived|
+-----------+--------+
|          1|       0|
|          2|       1|
|          3|       1|
|          4|       1|
|          5|       0|
|          6|       0|
|          7|       0|
|          8|       0|
|          9|       1|
|         10|       1|
|         11|       1|
|         12|       1|
|         13|       0|
|         14|       0|
|         15|       0|
|         16|       1|
|         17|       0|
|         18|       1|
|         19|       0|
|         20|       1|
+-----------+--------+
only showing top 20 rows



In [15]:
titanic_df.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [16]:
# Juntando os dois dataframes.

titanic_df = titanic_df_3.join(survived_df, ["PassengerId"])

titanic_df.printSchema()


root
 |-- PassengerId: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: long (nullable = true)



In [17]:
titanic_df.count()

968

In [18]:
titanic_df.show(5)

+-----------+------+--------------------+------+----+-----+-----+---------------+-----+-----+--------+--------+
|PassengerId|Pclass|                Name|   Sex| Age|SibSp|Parch|         Ticket| Fare|Cabin|Embarked|Survived|
+-----------+------+--------------------+------+----+-----+-----+---------------+-----+-----+--------+--------+
|        357|     1|Bowerman, Miss. E...|female|  22|    0|    1|         113505|   55|  E33|       S|       1|
|        158|     3|     Corn, Mr. Harry|  male|  30|    0|    0|SOTON/OQ 392090| 8.05| null|       S|       0|
|        171|     1|Van der hoef, Mr....|  male|  61|    0|    0|         111240| 33.5|  B19|       S|       0|
|        197|     3| Mernagh, Mr. Robert|  male|null|    0|    0|         368703| 7.75| null|       Q|       0|
|        320|     1|Spedden, Mrs. Fre...|female|  40|    1|    1|          16966|134.5|  E34|       C|       1|
+-----------+------+--------------------+------+----+-----+-----+---------------+-----+-----+--------+--

In [19]:
titanic_df.select("Survived", "Name", "Pclass","Embarked").show()

+--------+--------------------+------+--------+
|Survived|                Name|Pclass|Embarked|
+--------+--------------------+------+--------+
|       1|Bowerman, Miss. E...|     1|       S|
|       0|     Corn, Mr. Harry|     3|       S|
|       0|Van der hoef, Mr....|     1|       S|
|       0| Mernagh, Mr. Robert|     3|       Q|
|       1|Spedden, Mrs. Fre...|     1|       C|
|       1|Mellinger, Miss. ...|     2|       S|
|       0|  Lahoud, Mr. Sarkis|     3|       C|
|       0|Reuchlin, Jonkhee...|     1|       S|
|       0|"Sage, Miss. Doro...|     3|       S|
|       1|McDermott, Miss. ...|     3|       Q|
|       0|   Kantor, Mr. Sinai|     2|       S|
|       0|Pengelly, Mr. Fre...|     2|       S|
|       1|"O'Leary, Miss. H...|     3|       Q|
|       1|"Ryerson, Miss. S...|     1|       C|
|       0|Allum, Mr. Owen G...|     3|       S|
|       0|  Gale, Mr. Shadrach|     2|       S|
|       1|Silvey, Mrs. Will...|     1|       S|
|       0|Downton, Mr. Will...|     2|  

In [20]:
titanic_df.describe().show()

+-------+------------------+------------------+--------------------+------+------------------+------------------+------------------+-----------------+-----------------+-----+--------+-------------------+
|summary|       PassengerId|            Pclass|                Name|   Sex|               Age|             SibSp|             Parch|           Ticket|             Fare|Cabin|Embarked|           Survived|
+-------+------------------+------------------+--------------------+------+------------------+------------------+------------------+-----------------+-----------------+-----+--------+-------------------+
|  count|               968|               968|                 968|   968|               790|               968|               968|              968|              968|  222|     965|                968|
|   mean|414.48553719008265|2.3088842975206614|                null|  null|  29.4746835443038|0.5506198347107438|0.3956611570247934|269679.2738764045|32.26562438016525| null|    null|0

In [21]:
titanic_df.describe('Age', 'Fare').show()

+-------+------------------+-----------------+
|summary|               Age|             Fare|
+-------+------------------+-----------------+
|  count|               790|              968|
|   mean|  29.4746835443038|32.26562438016525|
| stddev|14.625063000563486|49.20622569057534|
|    min|              0.42|                0|
|    max|                 9|             93.5|
+-------+------------------+-----------------+



In [22]:
titanic_df.select('Age', 'Fare').summary().show()

+-------+------------------+-----------------+
|summary|               Age|             Fare|
+-------+------------------+-----------------+
|  count|               790|              968|
|   mean|  29.4746835443038|32.26562438016525|
| stddev|14.625063000563486|49.20622569057534|
|    min|              0.42|                0|
|    25%|              20.0|            7.925|
|    50%|              28.0|             14.5|
|    75%|              38.0|           31.275|
|    max|                 9|             93.5|
+-------+------------------+-----------------+



In [23]:
titanic_df.groupBy("Survived").count().show()


+--------+-----+
|Survived|count|
+--------+-----+
|       0|  594|
|       1|  374|
+--------+-----+



In [24]:
titanic_df.groupBy("Sex","Survived").count().show()

+------+--------+-----+
|   Sex|Survived|count|
+------+--------+-----+
|  male|       0|  505|
|female|       1|  258|
|female|       0|   89|
|  male|       1|  116|
+------+--------+-----+



In [25]:
titanic_df.groupBy("Pclass","Survived").count().show()



+------+--------+-----+
|Pclass|Survived|count|
+------+--------+-----+
|     2|       1|   99|
|     1|       0|   90|
|     3|       1|  131|
|     1|       1|  144|
|     2|       0|  102|
|     3|       0|  402|
+------+--------+-----+



In [26]:
from pyspark.sql.functions import col, regexp_extract

titanic_df = titanic_df.withColumn("Initial", regexp_extract(col("Name"),"([A-Za-z]+)\.",1))

titanic_df.select("Initial","Name").show()

+--------+--------------------+
| Initial|                Name|
+--------+--------------------+
|    Miss|Bowerman, Miss. E...|
|      Mr|     Corn, Mr. Harry|
|      Mr|Van der hoef, Mr....|
|      Mr| Mernagh, Mr. Robert|
|     Mrs|Spedden, Mrs. Fre...|
|    Miss|Mellinger, Miss. ...|
|      Mr|  Lahoud, Mr. Sarkis|
|Jonkheer|Reuchlin, Jonkhee...|
|    Miss|"Sage, Miss. Doro...|
|    Miss|McDermott, Miss. ...|
|      Mr|   Kantor, Mr. Sinai|
|      Mr|Pengelly, Mr. Fre...|
|    Miss|"O'Leary, Miss. H...|
|    Miss|"Ryerson, Miss. S...|
|      Mr|Allum, Mr. Owen G...|
|      Mr|  Gale, Mr. Shadrach|
|     Mrs|Silvey, Mrs. Will...|
|      Mr|Downton, Mr. Will...|
|      Mr|Lindqvist, Mr. Ei...|
|  Master|Hamalainen, Maste...|
+--------+--------------------+
only showing top 20 rows



In [27]:
titanic_df.select("Initial").distinct().show()


+--------+
| Initial|
+--------+
|     Don|
|    Miss|
|Countess|
|     Col|
|     Rev|
|    Lady|
|  Master|
|     Mme|
|    Capt|
|      Mr|
|      Dr|
|     Mrs|
|     Sir|
|Jonkheer|
|    Mlle|
|   Major|
|      Ms|
+--------+



In [28]:
titanic_df = titanic_df.replace(['Mlle','Mme', 'Ms', 'Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
               ['Miss','Miss','Miss','Mr','Mr',  'Mrs',  'Mrs',  'Other',  'Other','Other','Mr','Mr','Mr'])

In [29]:
titanic_df.select("Initial").distinct().show()

+-------+
|Initial|
+-------+
|   Miss|
|  Other|
| Master|
|     Mr|
|    Mrs|
+-------+



In [30]:
titanic_df.groupby('Initial').avg('Age').collect()


AnalysisException: "Age" is not a numeric column. Aggregation function can only be applied on a numeric column.

In [None]:
from pyspark.sql.functions import col

titanic_df.filter(col("Initial") == 'Miss').select("Name", "Age", "Initial").show(5)

titanic_df.filter(titanic_df.Initial == 'Miss').select("Name", "Age", "Initial").show(5)

In [None]:
from pyspark.sql.functions import when

titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Miss") & (titanic_df["Age"].isNull()), 22).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Other") & (titanic_df["Age"].isNull()), 46).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Master") & (titanic_df["Age"].isNull()), 5).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Mr") & (titanic_df["Age"].isNull()), 33).otherwise(titanic_df["Age"]))
titanic_df = titanic_df.withColumn("Age",when((titanic_df["Initial"] == "Mrs") & (titanic_df["Age"].isNull()), 36).otherwise(titanic_df["Age"]))


In [None]:
titanic_df.groupBy("Embarked").count().show()


In [None]:
titanic_df = titanic_df.na.fill({"Embarked" : 'S'})

In [None]:
titanic_df.groupBy("Embarked").count().show()

In [None]:
titanic_df.describe("Cabin").show()

In [None]:
titanic_df = titanic_df.drop("Cabin")

titanic_df.printSchema()


In [None]:
titanic_df = titanic_df.withColumn("Family_Size",col('SibSp')+col('Parch'))

In [None]:
from pyspark.sql.functions import col


titanic_df.groupBy("Family_Size").count().show()


In [None]:
from pyspark.sql.functions import asc

titanic_df.select("Name", "Family_Size").orderBy(col("Family_Size").desc()).show()


In [None]:
from pyspark.sql.functions import lit

titanic_df = titanic_df.withColumn('Alone',lit(0))


In [None]:
from pyspark.sql.functions import lit


titanic_df = titanic_df.withColumn('Alone',lit(0))
titanic_df = titanic_df.withColumn("Alone",when(titanic_df["Family_Size"] == 0, 1).otherwise(titanic_df["Alone"]))



In [None]:
titanic_df.columns


In [None]:
spark.conf.set("spark.sql.codegen.wholeStage", False)

titanic_df.filter(titanic_df.Age > 70).show()


In [None]:
from pyspark.sql.functions import desc


titanic_df.orderBy(desc("age")).show(1)


In [None]:
from pyspark.sql.functions import asc

titanic_df.orderBy(asc("age")).show(1)

In [None]:
titanic_df.stat.corr("age", "fare")


In [None]:
titanic_df.stat.corr("age", "family_size")


In [None]:
# https://spark.apache.org/docs/latest/api/python//reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.html#pyspark.sql.DataFrameStatFunctions
titanic_df.stat.crosstab("Embarked", "PClass").show()

In [None]:
titanic_df.groupBy('pclass').agg({'fare': 'avg'}).show()

In [36]:
# UDFs ajudam plugar funções complexas

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
import random

# LGPD!
def anonymize(passengerId):
    return random.randint(1, 1000000)

anonymizeUDF = udf(lambda passengerId:anonymize(passengerId), IntegerType())   

titanic_df.select(anonymizeUDF("PassengerId")).show()

+---------------------+
|<lambda>(PassengerId)|
+---------------------+
|               233098|
|               958292|
|               882499|
|               375525|
|               892704|
|               286538|
|               372481|
|               692364|
|               217570|
|               957970|
|               385165|
|               384321|
|               265112|
|                59420|
|               508502|
|               508568|
|               442441|
|               491999|
|               855610|
|               139434|
+---------------------+
only showing top 20 rows



In [None]:
# Salvando os dados

titanic_df.write.format("csv").save("titanic-final.csv")

In [None]:
titanic_df.select("name", "age", "survived").write.format("parquet").save("titanic-final.parquet")