# Convertir un RDD en un DataFrame usando un `schema`

In [None]:
import org.apache.spark.sql.SparkSession

In [None]:
// Obtain the SparkSession for this notebook: getOrCreate() hands back an
// already-existing session when there is one instead of building a second one.
val spark = SparkSession.builder().getOrCreate()

In [None]:
import spark.implicits._

In [None]:
import org.apache.spark.sql.types._

In [None]:
// Explicit schema with nine columns, listed in output order. Only Abstencion,
// Nulos and Blanco are declared nullable. The chain is parenthesised so the
// whole definition is parsed as a single statement.
val MesaElectoralSchema = (new StructType()
    .add("Distrito", IntegerType, nullable = false)
    .add("Barrio", IntegerType, nullable = false)
    .add("Seccion", IntegerType, nullable = false)
    .add("Mesa", StringType, nullable = false)
    .add("Censo", IntegerType, nullable = false)
    .add("Abstencion", IntegerType, nullable = true)
    .add("Nulos", IntegerType, nullable = true)
    .add("Blanco", IntegerType, nullable = true)
    .add("Emitidos", IntegerType, nullable = false))

In [None]:
/** String extension methods; bring them into scope with `import StringUtils._`. */
object StringUtils {
    implicit class StringImprovements(val s:String) {
        /**
         * Parses the wrapped string as an `Int` without throwing on bad input.
         *
         * @return `Some(n)` when `s.toInt` succeeds, `None` when it throws
         *         `NumberFormatException`; any other exception propagates
         */
        def toIntSafe: Option[Int] =
            try {
                Some(s.toInt)
            } catch {
                case _: NumberFormatException => None
            }
    }
}

In [None]:
import StringUtils._

In [None]:
import org.apache.spark.sql.Row

In [None]:
/**
 * Builds a Row from one `;`-separated line of text.
 *
 * Values placed in the Row, in order: fields 0, 1 and 2 as Int; at most the first
 * character of field 3; field 4 as Int; fields 5, 7 and 9 as Int, or null when they
 * are not valid integers; and the Int sum of fields 11 through 32.
 *
 * @param row a single input line
 * @return a Row holding the nine values described above
 * @throws NumberFormatException if one of the mandatory numeric fields is not a valid integer
 * @throws ArrayIndexOutOfBoundsException if the split line has fewer fields than are read
 */
def stringToRow(row: String): Row = {
    val fields = row.split(";")

    // Boxed Int when the field parses, otherwise null so the value fits a nullable column.
    def intOrNull(index: Int): Any = fields(index).toIntSafe.getOrElse(null)

    val distrito = fields(0).toInt
    val barrio = fields(1).toInt
    val seccion = fields(2).toInt
    val mesa = fields(3).slice(0, 1)
    val censo = fields(4).toInt
    val abstencion = intOrNull(5)
    val nulos = intOrNull(7)
    val blanco = intOrNull(9)
    // Indexing by position (instead of slicing) keeps the failure on short lines.
    val emitidos = (11 to 32).map(index => fields(index).toInt).sum

    Row(distrito, barrio, seccion, mesa, censo, abstencion, nulos, blanco, emitidos)
}

In [None]:
// Read the CSV as an RDD of raw text lines. The SparkContext is taken from the
// `spark` session built in this notebook, so the cell does not rely on the
// kernel pre-defining an `sc` variable.
val elecciones_2015_rdd = spark.sparkContext.textFile("hdfs:///eoi/s4/elecciones/Elecciones_2015_NH.csv")

In [None]:
// Turn every text line into a Row.
val elecciones_2015 = elecciones_2015_rdd.map(r => stringToRow(r))

In [None]:
// Pair the RDD of Rows with the explicit schema to obtain a DataFrame.
val elecciones_2015_DF = spark.createDataFrame(elecciones_2015, MesaElectoralSchema)

In [None]:
// Print the first rows of the DataFrame as a table.
elecciones_2015_DF.show()

In [None]:
// Print the schema of the DataFrame.
elecciones_2015_DF.printSchema()

In [None]:
// Column names of the DataFrame.
elecciones_2015_DF.columns

In [None]:
// Column names paired with their data type names.
elecciones_2015_DF.dtypes

In [None]:
/**
 * Typed record with nine fields. The three counts that may be missing
 * (Abstencion, Nulos, Blanco) are modelled as `Option[Int]`; every other
 * numeric field is a plain `Int`.
 */
case class MesaElectoral(
    Distrito: Int,
    Barrio: Int,
    Seccion: Int,
    Mesa: String,
    Censo: Int,
    Abstencion: Option[Int],
    Nulos: Option[Int],
    Blanco: Option[Int],
    Emitidos: Int
)

In [None]:
// Convert the DataFrame into a typed Dataset whose elements are MesaElectoral.
val elecciones_2015_DS: org.apache.spark.sql.Dataset[MesaElectoral] =
    elecciones_2015_DF.as[MesaElectoral]

In [None]:
// Print the first rows of the Dataset as a table.
elecciones_2015_DS.show()

In [None]:
// Print the schema of the Dataset.
elecciones_2015_DS.printSchema()

In [None]:
// Column names of the Dataset.
elecciones_2015_DS.columns

In [None]:
// Column names paired with their data type names.
elecciones_2015_DS.dtypes