# Cargar datos en un `DataFrame`

In [None]:
// Workaround for https://issues.apache.org/jira/browse/SPARK-22918
// Removes any installed JVM SecurityManager before the Hive-enabled session is created.
// NOTE(review): presumably required so the embedded metastore can start — confirm against the ticket.
System.setSecurityManager(null)

In [None]:
import org.apache.spark.sql.SparkSession

In [None]:
// Obtain (or reuse) the SparkSession for this notebook, with Hive support enabled.
val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

In [None]:
import spark.sql

In [None]:
import spark.implicits._

In [None]:
import org.apache.spark.sql.types._

In [None]:
// Schema applied to the rows produced from the CSV file.
// Only Abstencion, Nulos and Blanco are declared nullable.
val MesaElectoralSchema = (
    new StructType()
        .add("Distrito", IntegerType, nullable = false)
        .add("Barrio", IntegerType, nullable = false)
        .add("Seccion", IntegerType, nullable = false)
        .add("Mesa", StringType, nullable = false)
        .add("Censo", IntegerType, nullable = false)
        .add("Abstencion", IntegerType, nullable = true)
        .add("Nulos", IntegerType, nullable = true)
        .add("Blanco", IntegerType, nullable = true)
        .add("Emitidos", IntegerType, nullable = false)
)

In [None]:
/** String helpers made available through `import StringUtils._`. */
object StringUtils {
    /** Adds a non-throwing integer conversion to `String`. */
    implicit class StringImprovements(val s: String) {
        /**
         * Parses the string as an `Int`.
         *
         * @return `Some(value)` on success, `None` when `toInt` throws a
         *         `NumberFormatException`; any other exception propagates.
         */
        def toIntSafe: Option[Int] =
            try {
                Some(s.toInt)
            } catch {
                case _: NumberFormatException => None
            }
    }
}

In [None]:
import StringUtils._

In [None]:
import org.apache.spark.sql.Row

In [None]:
/**
 * Converts one semicolon-separated line into a `Row` whose values follow the
 * field order of `MesaElectoralSchema`.
 *
 * Columns 0, 1, 2 and 4 are parsed as integers; column 3 is reduced to its
 * first character. Columns 5, 7 and 9 are parsed leniently: a non-numeric
 * value yields `null`. The last value is the sum of columns 11 to 32.
 * NOTE(review): columns 11..32 are presumably the per-candidature vote
 * counts — confirm against the CSV layout.
 *
 * @param row one line of the input file
 * @return the parsed row
 * @throws NumberFormatException if a mandatory numeric column is not an integer
 * @throws ArrayIndexOutOfBoundsException if the line has fewer than 33 fields
 */
def stringToRow(row:String):Row = {
    val campos = row.split(";")

    // Lenient parse: boxed Integer on success, null otherwise, so the value
    // stays an Integer-or-null instead of being widened to Any.
    def enteroONulo(indice: Int): Integer = campos(indice).toIntSafe.map(Int.box).orNull

    Row(campos(0).toInt, campos(1).toInt, campos(2).toInt, campos(3).slice(0, 1), campos(4).toInt,
        enteroONulo(5), enteroONulo(7), enteroONulo(9),
        (11 to 32).map(indice => campos(indice).toInt).sum)
}

In [None]:
// Read the raw CSV from HDFS as an RDD with one String per line.
// Goes through the SparkContext of the `spark` session built earlier in this notebook
// rather than a kernel-injected `sc`, which this notebook never defines.
val elecciones_2015_rdd = spark.sparkContext.textFile("hdfs:///eoi/s4/elecciones/Elecciones_2015_NH.csv")

In [None]:
// Turn every text line into a Row via stringToRow.
val elecciones_2015 = elecciones_2015_rdd.map(stringToRow)

In [None]:
// Attach the explicit schema to the row RDD and mark the resulting DataFrame for caching.
val elecciones_2015_DF = spark.createDataFrame(elecciones_2015, MesaElectoralSchema).cache()

In [None]:
// Print the leading rows of the DataFrame as a quick check of the parse.
elecciones_2015_DF.show()

# Guardar un `DataFrame` con `saveAsTable`

In [None]:
// List the databases known to the session catalog.
spark.catalog.listDatabases().show()

In [None]:
// List the tables of the current database before anything is saved.
spark.catalog.listTables().show()

In [None]:
// Persist the DataFrame as a table named "elecciones_2015" (no database qualifier, no writer options).
elecciones_2015_DF.write.saveAsTable("elecciones_2015")

In [None]:
// List the tables again to compare with the listing above.
spark.catalog.listTables().show()

In [None]:
// Create a separate database called "test".
spark.sql("create database test")

In [None]:
// Save the same DataFrame again, this time qualified with the "test" database.
elecciones_2015_DF.write.saveAsTable("test.elecciones_2015")

In [None]:
spark.catalog.listDatabases().show()

In [None]:
// The current database has not been switched yet at this point.
spark.catalog.listTables().show()

In [None]:
// Make "test" the current database for the catalog calls that follow.
spark.catalog.setCurrentDatabase("test")

In [None]:
spark.catalog.listTables().show()

In [None]:
// Save as a JSON-format table, passing an explicit "path" option of "/warehouse".
// NOTE(review): presumably this makes the table external (files kept at that path) — confirm.
elecciones_2015_DF.write.format("json").option("path", "/warehouse").saveAsTable("test.elecciones_2015_json")

In [None]:
spark.catalog.listTables().show()

In [None]:
// Clean-up: drop both tables created in "test", then the database itself.
spark.sql("drop table test.elecciones_2015_json")

In [None]:
spark.sql("drop table test.elecciones_2015")

In [None]:
spark.sql("drop database test")

In [None]:
// NOTE(review): the current database is still "test" here (it is only reset in the next cell),
// so this listing presumably fails once the database is dropped — confirm that this is the intended demonstration.
spark.catalog.listTables().show()

In [None]:
// Switch back to the "default" database.
spark.catalog.setCurrentDatabase("default")

In [None]:
spark.catalog.listTables().show()

In [None]:
spark.catalog.listDatabases().show()

# Guardar un `DataFrame` con `insertInto`

In [None]:
// Row count of the saved table before inserting.
// The query result is bound first and shown afterwards: `show()` returns Unit, so
// binding its result would leave `resultado` holding nothing useful.
val resultado = spark.sql("select count(*) from elecciones_2015")
resultado.show()

In [None]:
// Insert the DataFrame's rows into the already existing table "elecciones_2015".
// NOTE(review): presumably this appends; the count queries around this cell are there to show the effect.
elecciones_2015_DF.write.insertInto("elecciones_2015")

In [None]:
// Row count of the table after inserting.
// The query result is bound first and shown afterwards: `show()` returns Unit, so
// binding its result would leave `resultado` holding nothing useful.
val resultado = spark.sql("select count(*) from elecciones_2015")
resultado.show()

# Guardar un `DataFrame` con `save`

In [None]:
// Write the DataFrame in CSV format under the path "elecciones" (no save mode given).
elecciones_2015_DF.write.format("csv").save("elecciones")

In [None]:
// Same target path, this time with save mode "append".
elecciones_2015_DF.write.format("csv").mode("append").save("elecciones")

In [None]:
// Check the catalog listing after the path-based writes.
spark.catalog.listTables().show()

In [None]:
// Same target path again, with save mode "overwrite".
elecciones_2015_DF.write.format("csv").mode("overwrite").save("elecciones")

# Guardar un `DataFrame` con `json`, `csv` y `parquet`

In [None]:
// Format-specific shorthand writers: each call targets its own path.
elecciones_2015_DF.write.json("elecciones_json")

In [None]:
elecciones_2015_DF.write.csv("elecciones_csv")

In [None]:
// Parquet output with the "compression" option set to "none".
elecciones_2015_DF.write.option("compression", "none").parquet("elecciones_parquet")

# Guardar un `DataFrame` con `jdbc`

In [None]:
// Connection properties handed to the JDBC writer.
// NOTE(review): the credentials are hard-coded in the notebook — consider reading them from the environment.
val propiedades = new java.util.Properties()
Seq("user" -> "eoi_user", "password" -> "eoi_password").foreach {
    case (clave, valor) => propiedades.setProperty(clave, valor)
}

In [None]:
// Write the DataFrame through JDBC to the table "elecciones" at the given PostgreSQL URL,
// passing the `propiedades` object as connection properties.
elecciones_2015_DF.write.jdbc("jdbc:postgresql://localhost/eoi_db", "elecciones", propiedades)

In [None]:
spark.catalog.listDatabases().show()