# Introducción a PySpark

### Cargando el entorno de PySpark en Google Colab

In [121]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [122]:
import os

# Register the locations of the Java 8 JDK and of the unpacked Spark distribution
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [123]:
# List the working directory to check what the previous cells downloaded and extracted
!ls

cars.csv    sample_data			   spark-3.1.1-bin-hadoop3.2.tgz.1
cars.csv.1  spark-3.1.1-bin-hadoop3.2	   spark-3.1.1-bin-hadoop3.2.tgz.2
cars.csv.2  spark-3.1.1-bin-hadoop3.2.tgz


In [124]:
import findspark
findspark.init()  # must run before the pyspark imports below
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

# Configure a local-mode session named "Ejemplo"; getOrCreate() reuses a running
# session if one already exists
session_builder = SparkSession.builder.master("local[*]").appName("Ejemplo")
spark = session_builder.getOrCreate()

# Property used to format output tables better
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

### Importando una base de datos externa

In [125]:
!wget https://jacobceles.github.io/knowledge_repo/colab_and_pyspark/cars.csv

--2025-04-24 23:32:28--  https://jacobceles.github.io/knowledge_repo/colab_and_pyspark/cars.csv
Resolving jacobceles.github.io (jacobceles.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to jacobceles.github.io (jacobceles.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://jacobcelestine.com/knowledge_repo/colab_and_pyspark/cars.csv [following]
--2025-04-24 23:32:28--  https://jacobcelestine.com/knowledge_repo/colab_and_pyspark/cars.csv
Resolving jacobcelestine.com (jacobcelestine.com)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to jacobcelestine.com (jacobcelestine.com)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22608 (22K) [text/csv]
Saving to: ‘cars.csv.3’


2025-04-24 23:32:28 (19.1 MB/s) - ‘cars.csv.3’ saved [22608/22608]



## Creando un Dataframe a partir de un archivo "csv" de entrada

In [72]:
# Read the semicolon-separated file, taking column names from the header row
# and asking Spark to infer each column's type
csv_options = dict(header=True, sep=";", inferSchema=True)
df = spark.read.csv('cars.csv', **csv_options)
df.show(10)

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|  3504|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|  3693|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|  3436|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|  3433|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0|  3449|        10.5|   70|    US|
|    Ford Galaxie 500|15.0|        8|       429.0|     198.0|  4341|        10.0|   70|    US|
|    Chevrolet Impala|14.0|        8|       454.0|     220.0|  4354|         9.0|   70|    US|
|   Plymouth Fury iii|14.0|        8|       440.0|

In [8]:
# Print the number of records in the DataFrame
df.count()

406

In [9]:
# Print the number of columns in the DataFrame
print(len(df.columns))

9


#### Imprimiendo información sobre el tipo de datos y esquema del Dataframe

In [26]:
# Print the (column name, data type) pairs of the DataFrame
df.dtypes

[('Car', 'string'),
 ('MPG', 'double'),
 ('Cylinders', 'int'),
 ('Displacement', 'double'),
 ('Horsepower', 'double'),
 ('Weight', 'decimal(4,0)'),
 ('Acceleration', 'double'),
 ('Model', 'int'),
 ('Origin', 'string')]

In [27]:
# Print the DataFrame schema
df.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



#### Cambiando el esquema del Dataframe

In [28]:
# withColumn combined with cast changes the type of a single column
from pyspark.sql.types import IntegerType, FloatType

# Two equivalent ways to cast the MPG column to float.
# The result is bound to a new name instead of overwriting `df`, so the cells
# below keep working on the DataFrame exactly as it was loaded (MPG as double,
# which is what their displayed schemas show) on a fresh Restart & Run All.
df_mpg_float = df.withColumn('MPG', df['MPG'].cast(FloatType()))
#df_mpg_float = df.withColumn("MPG", df["MPG"].cast("float"))

df_mpg_float.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: float (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [29]:
# When needed, every column can be re-typed in one pass with selectExpr

# Target SQL type for each column, in output order
target_types = [
    ('Car', 'string'),
    ('MPG', 'float'),
    ('Cylinders', 'int'),
    ('Displacement', 'int'),
    ('Horsepower', 'int'),
    ('Weight', 'int'),
    ('Acceleration', 'float'),
    ('Model', 'int'),
    ('Origin', 'string'),
]
# Each entry becomes "cast(<name> as <type>) <name>", keeping the original column name
cast_exprs = [f'cast({name} as {sql_type}) {name}' for name, sql_type in target_types]
df2 = df.selectExpr(*cast_exprs)

df2.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: float (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: integer (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Acceleration: float (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [30]:
# Show the first 10 rows of the re-typed DataFrame without truncating cell contents
df2.show(10, truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevelle Malibu|18.0|8        |307         |130       |3504  |12.0        |70   |US    |
|Buick Skylark 320        |15.0|8        |350         |165       |3693  |11.5        |70   |US    |
|Plymouth Satellite       |18.0|8        |318         |150       |3436  |11.0        |70   |US    |
|AMC Rebel SST            |16.0|8        |304         |150       |3433  |12.0        |70   |US    |
|Ford Torino              |17.0|8        |302         |140       |3449  |10.5        |70   |US    |
|Ford Galaxie 500         |15.0|8        |429         |198       |4341  |10.0        |70   |US    |
|Chevrolet Impala         |14.0|8        |454         |220       |4354  |9.0         |70   |US    |


## Definiendo un esquema para un Dataframe

In [50]:
# The schema is given as a DDL string: comma-separated "<column> <TYPE>" pairs
ddl_fields = [
    "Car STRING",
    "MPG FLOAT",
    "Cylinders INT",
    "Displacement FLOAT",
    "Horsepower FLOAT",
    "Weight FLOAT",
    "Acceleration FLOAT",
    "Model INT",
    "Origin STRING",
]
ddl_schema = ", ".join(ddl_fields)

df3 = spark.read.csv('cars.csv', header=True, sep=";", schema=ddl_schema)
df3.printSchema()


root
 |-- Car: string (nullable = true)
 |-- MPG: float (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: float (nullable = true)
 |-- Horsepower: float (nullable = true)
 |-- Weight: float (nullable = true)
 |-- Acceleration: float (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [51]:
# Show the first 10 rows read with the explicit DDL schema, without truncation
df3.show(10, truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504.0|12.0        |70   |US    |
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693.0|11.5        |70   |US    |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436.0|11.0        |70   |US    |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433.0|12.0        |70   |US    |
|Ford Torino              |17.0|8        |302.0       |140.0     |3449.0|10.5        |70   |US    |
|Ford Galaxie 500         |15.0|8        |429.0       |198.0     |4341.0|10.0        |70   |US    |
|Chevrolet Impala         |14.0|8        |454.0       |220.0     |4354.0|9.0         |70   |US    |


In [56]:
# If the DDL does not match the dataset, null readings can result:
# MPG is declared INT here although the file holds values such as 18.0
ddl_schema = "Car STRING, MPG INT, Cylinders INT, Displacement FLOAT, Horsepower FLOAT, Weight FLOAT, Acceleration FLOAT, Model INT, Origin STRING"

df4 = spark.read.csv('cars.csv', header=True, sep=";", schema=ddl_schema)
df4.show(10, truncate=False)


+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+-------------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevelle Malibu|null|8        |307.0       |130.0     |3504.0|12.0        |70   |US    |
|Buick Skylark 320        |null|8        |350.0       |165.0     |3693.0|11.5        |70   |US    |
|Plymouth Satellite       |null|8        |318.0       |150.0     |3436.0|11.0        |70   |US    |
|AMC Rebel SST            |null|8        |304.0       |150.0     |3433.0|12.0        |70   |US    |
|Ford Torino              |null|8        |302.0       |140.0     |3449.0|10.5        |70   |US    |
|Ford Galaxie 500         |null|8        |429.0       |198.0     |4341.0|10.0        |70   |US    |
|Chevrolet Impala         |null|8        |454.0       |220.0     |4354.0|9.0         |70   |US    |


## Transformaciones aplicables a columnas

#### 1. Seleccion de columnas: select

In [73]:
# A column can be handed to select() using several equivalent syntaxes:
# attribute access, item access, or the plain column name
select_variants = [
    ("Método 1", df.Car),
    ("Método 2", df['Car']),
    ("Método 3", 'Car'),
]
for label, car_column in select_variants:
    print(label)
    df_car = df.select(car_column)
    df_car.show(10, truncate=False)


Método 1
+-------------------------+
|Car                      |
+-------------------------+
|Chevrolet Chevelle Malibu|
|Buick Skylark 320        |
|Plymouth Satellite       |
|AMC Rebel SST            |
|Ford Torino              |
|Ford Galaxie 500         |
|Chevrolet Impala         |
|Plymouth Fury iii        |
|Pontiac Catalina         |
|AMC Ambassador DPL       |
+-------------------------+
only showing top 10 rows

Método 2
+-------------------------+
|Car                      |
+-------------------------+
|Chevrolet Chevelle Malibu|
|Buick Skylark 320        |
|Plymouth Satellite       |
|AMC Rebel SST            |
|Ford Torino              |
|Ford Galaxie 500         |
|Chevrolet Impala         |
|Plymouth Fury iii        |
|Pontiac Catalina         |
|AMC Ambassador DPL       |
+-------------------------+
only showing top 10 rows

Método 3
+-------------------------+
|Car                      |
+-------------------------+
|Chevrolet Chevelle Malibu|
|Buick Skylark 320       

In [76]:
# A column can also be referenced through the `col` function of pyspark.sql.functions
from pyspark.sql.functions import col

print("Método 4")
# Use the column's exact name: col('car') only resolved through case-insensitive
# name matching and relabelled the output header as "car" instead of "Car"
df.select(col('Car')).show(10, truncate=False)

Método 4
+-------------------------+
|car                      |
+-------------------------+
|Chevrolet Chevelle Malibu|
|Buick Skylark 320        |
|Plymouth Satellite       |
|AMC Rebel SST            |
|Ford Torino              |
|Ford Galaxie 500         |
|Chevrolet Impala         |
|Plymouth Fury iii        |
|Pontiac Catalina         |
|AMC Ambassador DPL       |
+-------------------------+
only showing top 10 rows



In [75]:
# Several columns can be selected in a single transformation
# Note: the column-reference syntaxes can be mixed
from pyspark.sql.functions import col

# Selecting the "Car" and "MPG" columns
# Method 1: attribute access
print("Método 1")
df_Car_MPG = df.select(df.Car, df.MPG)
df_Car_MPG.show(10,truncate=False)

# Method 2: item access mixed with attribute access
print("Método 2")
df_Car_MPG = df.select(df['Car'], df.MPG)
df_Car_MPG.show(10,truncate=False)

# Method 3: col(), using the exact column names so the output header stays "Car"
# (col('car') only resolved through case-insensitive name matching)
print("Método 3")
df_Car_MPG = df.select(col('Car'), col('MPG'))
df_Car_MPG.show(10,truncate=False)

Método 1
+-------------------------+----+
|Car                      |MPG |
+-------------------------+----+
|Chevrolet Chevelle Malibu|18.0|
|Buick Skylark 320        |15.0|
|Plymouth Satellite       |18.0|
|AMC Rebel SST            |16.0|
|Ford Torino              |17.0|
|Ford Galaxie 500         |15.0|
|Chevrolet Impala         |14.0|
|Plymouth Fury iii        |14.0|
|Pontiac Catalina         |14.0|
|AMC Ambassador DPL       |15.0|
+-------------------------+----+
only showing top 10 rows

Método 2
+-------------------------+----+
|Car                      |MPG |
+-------------------------+----+
|Chevrolet Chevelle Malibu|18.0|
|Buick Skylark 320        |15.0|
|Plymouth Satellite       |18.0|
|AMC Rebel SST            |16.0|
|Ford Torino              |17.0|
|Ford Galaxie 500         |15.0|
|Chevrolet Impala         |14.0|
|Plymouth Fury iii        |14.0|
|Pontiac Catalina         |14.0|
|AMC Ambassador DPL       |15.0|
+-------------------------+----+
only showing top 10 rows

Método

#### 2. Añadiendo columnas: withColumn

In [77]:
from pyspark.sql.functions import lit

# lit() supplies the constant value used to fill the newly created column
# Case 1: add the column "col_1"
df_newCols = df.withColumn('col_1', lit(1))

# Case 2: add two more columns with chained withColumn calls
df_newCols = (
    df_newCols
    .withColumn('col_2', lit(2))
    .withColumn('col_3', lit('c3'))
)
df_newCols.show(10,truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+-----+-----+-----+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|col_1|col_2|col_3|
+-------------------------+----+---------+------------+----------+------+------------+-----+------+-----+-----+-----+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504  |12.0        |70   |US    |1    |2    |c3   |
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693  |11.5        |70   |US    |1    |2    |c3   |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436  |11.0        |70   |US    |1    |2    |c3   |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433  |12.0        |70   |US    |1    |2    |c3   |
|Ford Torino              |17.0|8        |302.0       |140.0     |3449  |10.5        |70   |US    |1    |2    |c3   |
|Ford Galaxie 500         |15.0|8        |429.0       |1

In [95]:
# Case 3: add a new column derived from existing ones
# 'car_model' is built from the Car and Model columns, separated by a space
from pyspark.sql.functions import concat

car_model_expr = concat(col("Car"), lit(" "), col("Model"))
df_newCols = df_newCols.withColumn('car_model', car_model_expr)

df_newCols.show(10,truncate=False)
print("# registros: ", df_newCols.count())

+-------------------------+----+---------+------------+----------+------+------------+-----+------+-----+-----+-----+----------------------------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|col_1|col_2|col_3|car_model                   |
+-------------------------+----+---------+------------+----------+------+------------+-----+------+-----+-----+-----+----------------------------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504  |12.0        |70   |US    |1    |2    |c3   |Chevrolet Chevelle Malibu 70|
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693  |11.5        |70   |US    |1    |2    |c3   |Buick Skylark 320 70        |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436  |11.0        |70   |US    |1    |2    |c3   |Plymouth Satellite 70       |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433  |12.0        |70   |US    |1    |2    |c3   |A

In [96]:
# Remember that withColumn can also change the data type of an existing column.
# Here col_1 is cast to string, giving the target type by its name.
df_newCols.printSchema()

df_newCols1 = df_newCols.withColumn('col_1', df_newCols['col_1'].cast('string'))

df_newCols1.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- col_1: string (nullable = false)
 |-- col_2: integer (nullable = false)
 |-- col_3: string (nullable = false)
 |-- car_model: string (nullable = true)

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- col_1: string (nullable = false)
 |-- col_2: integer (nullable = false)
 |-- col_3: string (nullable = f

In [97]:
# The cast target can also be given as a pyspark.sql.types object instead of a type name
from pyspark.sql.types import IntegerType, FloatType, StringType
df_newCols.printSchema()
df_newCols1 = df_newCols.withColumn('col_1', df_newCols['col_1'].cast(StringType()))
df_newCols1.printSchema()

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- col_1: string (nullable = false)
 |-- col_2: integer (nullable = false)
 |-- col_3: string (nullable = false)
 |-- car_model: string (nullable = true)

root
 |-- Car: string (nullable = true)
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: double (nullable = true)
 |-- Weight: decimal(4,0) (nullable = true)
 |-- Acceleration: double (nullable = true)
 |-- Model: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- col_1: string (nullable = false)
 |-- col_2: integer (nullable = false)
 |-- col_3: string (nullable = f

#### 3. Renombrando columnas

In [98]:
# Rename the three helper columns one by one (old name -> new name)
renamed_columns = {'col_1': 'col1', 'col_2': 'col2', 'col_3': 'col3'}
for old_name, new_name in renamed_columns.items():
    df_newCols = df_newCols.withColumnRenamed(old_name, new_name)
df_newCols.show(10,truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+----+----+----+----------------------------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|col1|col2|col3|car_model                   |
+-------------------------+----+---------+------------+----------+------+------------+-----+------+----+----+----+----------------------------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504  |12.0        |70   |US    |1   |2   |c3  |Chevrolet Chevelle Malibu 70|
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693  |11.5        |70   |US    |1   |2   |c3  |Buick Skylark 320 70        |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436  |11.0        |70   |US    |1   |2   |c3  |Plymouth Satellite 70       |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433  |12.0        |70   |US    |1   |2   |c3  |AMC Rebel SST 70      

#### 4. Agrupando valores de columnas

In [100]:
# groupBy groups rows by the values of the given column(s); an aggregation
# function (avg, count, max, min) must then say what to compute for each group
# Case 1: grouping by a single column
print("Agrupando a partir de una sola columna")
by_origin = df.groupBy('Origin')
df_grouped = by_origin.count()
df_grouped.show(5)

# Case 2: grouping by two columns
by_origin_and_model = df.groupBy('Origin', 'Model')
df_grouped = by_origin_and_model.count()
df_grouped.show(5)

Agrupando a partir de una sola columna
+------+-----+
|Origin|count|
+------+-----+
|Europe|   73|
|    US|  254|
| Japan|   79|
+------+-----+

+------+-----+-----+
|Origin|Model|count|
+------+-----+-----+
|Europe|   71|    5|
|Europe|   80|    9|
|Europe|   79|    4|
| Japan|   75|    4|
|    US|   72|   18|
+------+-----+-----+
only showing top 5 rows



In [101]:
# Grouping can also be used to obtain statistics.
# For example: what is the average MPG of European, American and Japanese cars?
df.groupBy('Origin').avg('MPG').show()

+------+------------------+
|Origin|          avg(MPG)|
+------+------------------+
|Europe|26.745205479452057|
|    US|19.688188976377948|
| Japan|30.450632911392397|
+------+------------------+



#### 5. Eliminación de columnas: drop

In [105]:
# Remove the three helper columns from the DataFrame

df_newCols.show(10,truncate=False)

helper_columns = ['col1', 'col2', 'col3']
df_newCols1 = df_newCols.drop(*helper_columns)

df_newCols1.show(10,truncate=False)

+-------------------------+----+---------+------------+----------+------+------------+-----+------+----+----+----+----------------------------+
|Car                      |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|col1|col2|col3|car_model                   |
+-------------------------+----+---------+------------+----------+------+------------+-----+------+----+----+----+----------------------------+
|Chevrolet Chevelle Malibu|18.0|8        |307.0       |130.0     |3504  |12.0        |70   |US    |1   |2   |c3  |Chevrolet Chevelle Malibu 70|
|Buick Skylark 320        |15.0|8        |350.0       |165.0     |3693  |11.5        |70   |US    |1   |2   |c3  |Buick Skylark 320 70        |
|Plymouth Satellite       |18.0|8        |318.0       |150.0     |3436  |11.0        |70   |US    |1   |2   |c3  |Plymouth Satellite 70       |
|AMC Rebel SST            |16.0|8        |304.0       |150.0     |3433  |12.0        |70   |US    |1   |2   |c3  |AMC Rebel SST 70      

## Transformaciones aplicables a renglones

#### 1. Filtrando registros: filter

In [109]:
# filter() retrieves the records that satisfy a given condition

# Example 1: retrieve all cars made in Japan
df_japan_filtered = df.filter(df['Origin']=='Japan')
print("Total de registros origen Japones : " + str(df_japan_filtered.count()))
df_japan_filtered.show(10, truncate=False)

Total de registros origen Japones : 79
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                        |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Toyota Corolla Mark ii     |24.0|4        |113.0       |95.0      |2372  |15.0        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |71   |Japan |
|Toyota Corolla             |25.0|4        |113.0       |95.0      |2228  |14.0        |71   |Japan |
|Toyota Corolla 1200        |31.0|4        |71.0        |65.0      |1773  |19.0        |71   |Japan |
|Datsun 1200                |35.0|4        |72.0        |69.0      |1613  |18.0        |71   |Japan |
|Toyota Corolla Hardtop     |24.0|4        

In [110]:
# Example 2: the same filter written with different column-reference syntaxes

# Any of the column-selection syntaxes can be used inside the condition
df.filter(df.Origin=='Japan').show(10, truncate=False)
df.filter(col('Origin')=='Japan').show(10, truncate=False)

+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                        |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Toyota Corolla Mark ii     |24.0|4        |113.0       |95.0      |2372  |15.0        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |71   |Japan |
|Toyota Corolla             |25.0|4        |113.0       |95.0      |2228  |14.0        |71   |Japan |
|Toyota Corolla 1200        |31.0|4        |71.0        |65.0      |1773  |19.0        |71   |Japan |
|Datsun 1200                |35.0|4        |72.0        |69.0      |1613  |18.0        |71   |Japan |
|Toyota Corolla Hardtop     |24.0|4        |113.0       |95.0      |2278  |15.5   

In [111]:
# Example 3: a filter built from a compound condition

is_japanese = df.Origin == 'Japan'
has_four_cylinders = df.Cylinders == 4
japan_4cil = df.filter(is_japanese & has_four_cylinders)
print("Total de registros de carros Japones 4 cilindros: " + str(japan_4cil.count()))
japan_4cil.show(10,truncate=False)

Total de registros de carros Japones 4 cilindros: 69
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                        |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Toyota Corolla Mark ii     |24.0|4        |113.0       |95.0      |2372  |15.0        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |71   |Japan |
|Toyota Corolla             |25.0|4        |113.0       |95.0      |2228  |14.0        |71   |Japan |
|Toyota Corolla 1200        |31.0|4        |71.0        |65.0      |1773  |19.0        |71   |Japan |
|Datsun 1200                |35.0|4        |72.0        |69.0      |1613  |18.0        |71   |Japan |
|Toyota Corolla Hardtop     |

In [112]:
# Example 4: Japanese cars with 4 to 6 cylinders (both bounds included)
made_in_japan = df.Origin == 'Japan'
four_to_six_cylinders = df.Cylinders.between(4, 6)
df_japan_4_6_cil = df.filter(made_in_japan & four_to_six_cylinders)
print("Total de registros de carros Japoneses 4 cilindros a 6 cilindros: " + str(df_japan_4_6_cil.count()))
df_japan_4_6_cil.show(10,truncate=False)

Total de registros de carros Japoneses 4 cilindros a 6 cilindros: 75
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                        |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Toyota Corolla Mark ii     |24.0|4        |113.0       |95.0      |2372  |15.0        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |71   |Japan |
|Toyota Corolla             |25.0|4        |113.0       |95.0      |2228  |14.0        |71   |Japan |
|Toyota Corolla 1200        |31.0|4        |71.0        |65.0      |1773  |19.0        |71   |Japan |
|Datsun 1200                |35.0|4        |72.0        |69.0      |1613  |18.0        |71   |Japan |
|Toyota Corol

#### Eliminando registros duplicados: distinct

In [113]:
# Removing duplicate records

origin_model_pairs = df.select(df.Origin, df.Model)
df_origin_model = origin_model_pairs.distinct()
print("Total de registros en Dataframe origin_model: " + str(df_origin_model.count()))
df_origin_model.show(10,truncate=False)
# For comparison: the row count when the repeated records are kept
print("Total de registros en Dataframe origin_model con duplicados: " + str(origin_model_pairs.count()))

Total de registros en Dataframe origin_model: 39
+------+-----+
|Origin|Model|
+------+-----+
|Europe|71   |
|Europe|80   |
|Europe|79   |
|Japan |75   |
|US    |72   |
|US    |80   |
|Europe|74   |
|Japan |79   |
|Europe|76   |
|US    |75   |
+------+-----+
only showing top 10 rows

Total de registros en Dataframe origin_model con duplicados: 406


#### Ordenando registros: orderBy

In [114]:
# Example: sort the distinct (Origin, Model) pairs by one column, in descending order

df_Origin_Model_sort = df.select(df.Origin, df.Model).distinct().orderBy('Model', ascending=False)
df_Origin_Model_sort.show(10,truncate=False)

+------+-----+
|Origin|Model|
+------+-----+
|Europe|82   |
|Japan |82   |
|US    |82   |
|Japan |81   |
|US    |81   |
|Europe|81   |
|Japan |80   |
|Europe|80   |
|US    |80   |
|Europe|79   |
+------+-----+
only showing top 10 rows



In [115]:
# Records can be sorted by two or more columns, with one ascending flag per column

df_Origin_Model_sort = df.select(df.Origin, df.Model).distinct().orderBy(['Model', 'Origin'], ascending=[False, False])
df_Origin_Model_sort.show(10,truncate=False)

+------+-----+
|Origin|Model|
+------+-----+
|US    |82   |
|Japan |82   |
|Europe|82   |
|US    |81   |
|Japan |81   |
|Europe|81   |
|US    |80   |
|Japan |80   |
|Europe|80   |
|US    |79   |
+------+-----+
only showing top 10 rows



#### Uniendo Dataframes: union

In [116]:
# se crean dos dataframes diferentes con igual esquema
df_japan_4cil = df.filter((df.Origin == 'Japan') & (df.Cylinders == 4))
df_japan_6cil = df.filter((df.Origin == 'Japan') & (df.Cylinders == 6))
# Se unen los dataframes
df_japan_4cil_6cil = df_japan_4cil.union(df_japan_6cil)
df_japan_4cil_6cil.show(10,truncate = False)
df_japan_4cil_6cil.count()

+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Car                        |MPG |Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+---------------------------+----+---------+------------+----------+------+------------+-----+------+
|Toyota Corolla Mark ii     |24.0|4        |113.0       |95.0      |2372  |15.0        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |70   |Japan |
|Datsun PL510               |27.0|4        |97.0        |88.0      |2130  |14.5        |71   |Japan |
|Toyota Corolla             |25.0|4        |113.0       |95.0      |2228  |14.0        |71   |Japan |
|Toyota Corolla 1200        |31.0|4        |71.0        |65.0      |1773  |19.0        |71   |Japan |
|Datsun 1200                |35.0|4        |72.0        |69.0      |1613  |18.0        |71   |Japan |
|Toyota Corolla Hardtop     |24.0|4        |113.0       |95.0      |2278  |15.5   

75

## SQL en Spark

In [117]:
# Register the DataFrame as a temporary view named "temp" so it can be queried with SQL
df.createOrReplaceTempView("temp")
# Print the first 5 records
spark.sql("select * from temp limit 5").show()
# Compute the total number of records in the table
spark.sql("select count(*) as total_count from temp").show()

+--------------------+----+---------+------------+----------+------+------------+-----+------+
|                 Car| MPG|Cylinders|Displacement|Horsepower|Weight|Acceleration|Model|Origin|
+--------------------+----+---------+------------+----------+------+------------+-----+------+
|Chevrolet Chevell...|18.0|        8|       307.0|     130.0|  3504|        12.0|   70|    US|
|   Buick Skylark 320|15.0|        8|       350.0|     165.0|  3693|        11.5|   70|    US|
|  Plymouth Satellite|18.0|        8|       318.0|     150.0|  3436|        11.0|   70|    US|
|       AMC Rebel SST|16.0|        8|       304.0|     150.0|  3433|        12.0|   70|    US|
|         Ford Torino|17.0|        8|       302.0|     140.0|  3449|        10.5|   70|    US|
+--------------------+----+---------+------------+----------+------+------------+-----+------+

+-----------+
|total_count|
+-----------+
|        406|
+-----------+



In [118]:
# Example of a query with a single condition
df_origin_model = spark.sql("select Origin,Model from temp where Origin=='US'")
df_origin_model.show()

+------+-----+
|Origin|Model|
+------+-----+
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
|    US|   70|
+------+-----+
only showing top 20 rows



In [119]:
# Example of a query with a compound (nested and/or) condition
df_japan_4cil_6cil = spark.sql("select Origin, Model from temp where (Origin=='Japan' and (Cylinders==4 or Cylinders==6))")
print("Numero de registros: " + str(df_japan_4cil_6cil.count()))
df_japan_4cil_6cil.show(10, truncate = False)

Numero de registros: 75
+------+-----+
|Origin|Model|
+------+-----+
|Japan |70   |
|Japan |70   |
|Japan |71   |
|Japan |71   |
|Japan |71   |
|Japan |71   |
|Japan |72   |
|Japan |72   |
|Japan |72   |
|Japan |72   |
+------+-----+
only showing top 10 rows



In [120]:
# Otro ejemplo de query estructurado
# Ejemplo en el cual se genera una cadena donde se define la consulta
query = """select Origin, Model from temp where (Origin=='Japan' and (Cylinders==4 or Cylinders==6))"""
df_japan_4cil_6cil = spark.sql(query)
print("Numero de registros: " + str(df_japan_4cil_6cil.count()))
df_japan_4cil_6cil.show(10, truncate = False)

Numero de registros: 75
+------+-----+
|Origin|Model|
+------+-----+
|Japan |70   |
|Japan |70   |
|Japan |71   |
|Japan |71   |
|Japan |71   |
|Japan |71   |
|Japan |72   |
|Japan |72   |
|Japan |72   |
|Japan |72   |
+------+-----+
only showing top 10 rows

