In [16]:
from pyspark.sql import SparkSession

# Spark entry point: build the session through the builder and name the app.
spark = (
    SparkSession.builder
    .appName("Aula Interativa 2 - Apache Spark")
    .getOrCreate()
)

# Last expression of the cell, so the notebook displays the Spark version.
spark.version


'3.3.0'

In [17]:
# Source file: https://www.kaggle.com/c/titanic/data?select=train.csv
# Location of the local copy of the CSV; edit this one constant to point elsewhere.
TITANIC_CSV_PATH = '/home/pcalais/XPE/engenharia-dados/aula2/titanic-3.csv'

# header / inferSchema are passed as real booleans rather than the strings 'True'.
titanic_df = spark.read.csv(TITANIC_CSV_PATH, header=True, inferSchema=True)

# Print the column names and the types inferred from the file.
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [18]:
# Total number of rows in the loaded DataFrame.
titanic_df.count()

891

In [19]:
# Passenger count for each value of the 'survived' column (DataFrame API).
survival_groups = titanic_df.groupBy('survived')
survival_groups.count().show()

+--------+-----+
|survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [20]:
# Expose the DataFrame to Spark SQL under the temporary view name 'table'.
titanic_df.createOrReplaceTempView('table')

# Passenger count for each value of 'survived', expressed as a SQL query.
survived_count_query = "SELECT survived, count(*) FROM table GROUP BY survived"
spark.sql(survived_count_query).show()

+--------+--------+
|survived|count(1)|
+--------+--------+
|       1|     342|
|       0|     549|
+--------+--------+



In [21]:
# Mean fare for each value of 'survived', using the DataFrame API.
fare_aggregation = {"Fare": "avg"}
titanic_df.groupBy('survived').agg(fare_aggregation).show()

+--------+------------------+
|survived|         avg(Fare)|
+--------+------------------+
|       1| 48.39540760233917|
|       0|22.117886885245877|
+--------+------------------+



In [22]:
# Mean fare for each value of 'survived', queried through the 'table' view.
avg_fare_query = "SELECT survived, avg(fare) FROM table GROUP BY survived"
spark.sql(avg_fare_query).show()

+--------+------------------+
|survived|         avg(fare)|
+--------+------------------+
|       1| 48.39540760233917|
|       0|22.117886885245877|
+--------+------------------+



In [23]:
# Ages of survivors, ranked by how many survivors share each age.
survivor_ages_query = "SELECT age, count(*) as age_count FROM table WHERE survived == 1 GROUP BY age ORDER BY age_count DESC"
spark.sql(survivor_ages_query).show()


+----+---------+
| age|age_count|
+----+---------+
|null|       52|
|24.0|       15|
|35.0|       11|
|27.0|       11|
|36.0|       11|
|22.0|       11|
|30.0|       10|
|18.0|        9|
|32.0|        9|
|19.0|        9|
|31.0|        8|
|29.0|        8|
| 4.0|        7|
|28.0|        7|
|34.0|        6|
|25.0|        6|
|42.0|        6|
|40.0|        6|
|48.0|        6|
|33.0|        6|
+----+---------+
only showing top 20 rows



In [24]:
# UDFs help plug complex functions into SQL.

def uppercase(str):
    """Return the upper-cased version of a string, passing None through.

    Args:
        str: The text to convert, or None. (The parameter name shadows the
            built-in ``str``; it is kept so existing callers are unaffected.)

    Returns:
        The upper-cased text, or None when the input is None.
    """
    # The columns this is applied to are nullable, so a null value must not
    # blow up with AttributeError on ``None.upper()``.
    if str is None:
        return None
    return str.upper()

def ml_model(data):
    """Apply ``model`` to ``data`` and return whatever it produces.

    NOTE(review): ``model`` is not defined in the visible cells; calling this
    function fails with NameError unless ``model`` exists in the namespace.
    """
    prediction = model(data)
    return prediction


# Register the Python function under the exact name the SQL queries call.
# The registered name and the name used in the query must match, otherwise
# the query fails because the function cannot be resolved.
spark.udf.register("upperUDF", uppercase)

# Show the upper-cased name next to the original for the first 5 rows.
spark.sql("SELECT upperUDF(Name), Name from table").show(5)


22/09/22 21:36:27 WARN SimpleFunctionRegistry: The function upperudf replaced a previously registered function.
+--------------------+--------------------+
|      upperUDF(Name)|                Name|
+--------------------+--------------------+
|BRAUND, MR. OWEN ...|Braund, Mr. Owen ...|
|CUMINGS, MRS. JOH...|Cumings, Mrs. Joh...|
|HEIKKINEN, MISS. ...|Heikkinen, Miss. ...|
|FUTRELLE, MRS. JA...|Futrelle, Mrs. Ja...|
|ALLEN, MR. WILLIA...|Allen, Mr. Willia...|
+--------------------+--------------------+
only showing top 5 rows



In [None]:
# https://sparkbyexamples.com/spark/spark-write-dataframe-to-csv-file/

# mode("overwrite") replaces any previous output so the cell can be re-run;
# the default save mode raises an error when the target path already exists.
(
    spark.sql("SELECT upperUDF(Name), Age from table")
    .write
    .mode("overwrite")
    .option("header", True)
    .format("csv")
    .save("names.csv")
)

In [None]:
# https://sparkbyexamples.com/spark/spark-read-and-write-json-file/

# Survivor counts per age, ordered from most to least frequent.
survivor_ages_df = spark.sql("SELECT age, count(*) as age_count FROM table WHERE survived == 1 GROUP BY age ORDER BY age_count DESC")

# mode("overwrite") replaces any previous output so the cell can be re-run;
# the default save mode raises an error when the target path already exists.
survivor_ages_df.write.mode("overwrite").format("json").save("ages.json")

In [None]:
# https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
    

# Saving data to a JDBC source
#jdbcDF.write \
#    .format("jdbc") \
#    .option("url", "jdbc:postgresql:dbserver") \
#    .option("dbtable", "schema.tablename") \
#    .option("user", "username") \
#    .option("password", "password") \
#    .save()
    