# Transformaciones de DataFrames

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Reuse the active SparkSession if there is one; otherwise build a new one
spark = SparkSession.builder.getOrCreate()

# Directory that holds the data files (relative to the notebook)
path = 'files/'

In [4]:
# Load the DataFrame from the Parquet file under `path`
parquet_file = path + 'dataPARQUET.parquet'
df = spark.read.parquet(parquet_file)

# Print the column names and types of the loaded data
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



## Seleccionar columnas

In [11]:
# First way of referring to columns: pass their names as plain strings
df.select(
    'title',
    'video_id',
).show()

+--------------------+-----------+
|               title|   video_id|
+--------------------+-----------+
|WE WANT TO TALK A...|2kyS6SvSYSE|
|The Trump Preside...|1ZAPwfrtAFY|
|Racist Superman |...|5qpjK5DgCt4|
|Nickelback Lyrics...|puqaWrEC7tY|
|I Dare You: GOING...|d380meD0W0M|
|2 Weeks with iPho...|gHZ1Qz0KiKM|
|Roy Moore & Jeff ...|39idVpFF7NQ|
|5 Ice Cream Gadge...|nc99ccSXST0|
|The Greatest Show...|jr9QtXwC9vc|
|Why the rise of t...|TUmyygCMMGA|
|Dion Lewis' 103-Y...|9wRQljFNDW8|
|(SPOILERS) 'Shiva...|VifQlJit6A0|
|Marshmello - Bloc...|5E4ZBSInqUU|
|Which Countries A...|GgVmn66oK_A|
|SHOPPING FOR NEW ...|TaTleo4cOs8|
|    The New SpotMini|kgaO45SyaO4|
|One Change That W...|ZAQs-ctOqXQ|
|How does your bod...|YVfyYrEmzgM|
|HomeMade Electric...|eNSN6qet1kE|
|Founding An Inbre...|B5HORANmzHw|
+--------------------+-----------+
only showing top 20 rows



In [10]:
# Second way: wrap every column name in col() to get Column objects
df.select(
    *[col(name) for name in ('title', 'video_id', 'trending_date')]
).show()

+--------------------+-----------+-------------+
|               title|   video_id|trending_date|
+--------------------+-----------+-------------+
|WE WANT TO TALK A...|2kyS6SvSYSE|     17.14.11|
|The Trump Preside...|1ZAPwfrtAFY|     17.14.11|
|Racist Superman |...|5qpjK5DgCt4|     17.14.11|
|Nickelback Lyrics...|puqaWrEC7tY|     17.14.11|
|I Dare You: GOING...|d380meD0W0M|     17.14.11|
|2 Weeks with iPho...|gHZ1Qz0KiKM|     17.14.11|
|Roy Moore & Jeff ...|39idVpFF7NQ|     17.14.11|
|5 Ice Cream Gadge...|nc99ccSXST0|     17.14.11|
|The Greatest Show...|jr9QtXwC9vc|     17.14.11|
|Why the rise of t...|TUmyygCMMGA|     17.14.11|
|Dion Lewis' 103-Y...|9wRQljFNDW8|     17.14.11|
|(SPOILERS) 'Shiva...|VifQlJit6A0|     17.14.11|
|Marshmello - Bloc...|5E4ZBSInqUU|     17.14.11|
|Which Countries A...|GgVmn66oK_A|     17.14.11|
|SHOPPING FOR NEW ...|TaTleo4cOs8|     17.14.11|
|    The New SpotMini|kgaO45SyaO4|     17.14.11|
|One Change That W...|ZAQs-ctOqXQ|     17.14.11|
|How does your bod..

## Selección de columnas - Avanzado

### Función select

In [12]:
# Rebind `df` to the second Parquet file; the cells below work on this data
datos_file = path + 'datos.parquet'
df = spark.read.parquet(datos_file)

# Print the column names and types of the newly loaded data
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [16]:
# Select a single column and display its first 5 rows
df.select(
    col('video_id')
).show(n=5)

+-----------+
|   video_id|
+-----------+
|2kyS6SvSYSE|
|1ZAPwfrtAFY|
|5qpjK5DgCt4|
|puqaWrEC7tY|
|d380meD0W0M|
+-----------+
only showing top 5 rows



In [17]:
# Select several columns by name and display the first 5 rows
df.select(
    'video_id',
    'trending_date',
).show(5)

+-----------+-------------+
|   video_id|trending_date|
+-----------+-------------+
|2kyS6SvSYSE|     17.14.11|
|1ZAPwfrtAFY|     17.14.11|
|5qpjK5DgCt4|     17.14.11|
|puqaWrEC7tY|     17.14.11|
|d380meD0W0M|     17.14.11|
+-----------+-------------+
only showing top 5 rows



In [18]:
# Select every column with the '*' wildcard and display the first 5 rows
df.select('*').show(n=5)

+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|       publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+-------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13 12:13:01|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           False| 

In [19]:
# select() alone cannot build expressions out of plain strings:
# 'likes' - 'dislikes' is evaluated by Python itself (str - str) before
# select() is ever called, so it raises TypeError.
# The failure is the point of this cell, so the expected error is caught and
# printed; that way the notebook still runs top to bottom without aborting here.

try:
    df.select(
        'likes',
        'dislikes',
        ('likes' - 'dislikes')
    ).show(5)
except TypeError as error:
    # Show the same message the uncaught exception would have produced
    print('TypeError:', error)

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [21]:
# With col() the operands are Column objects, so arithmetic between them
# builds a column expression. This is the correct way.

# likes minus dislikes, exposed under the column name 'aceptacion'
aceptacion = (col('likes') - col('dislikes')).alias('aceptacion')

df.select(col('likes'), col('dislikes'), aceptacion).show(5)

+------+--------+----------+
| likes|dislikes|aceptacion|
+------+--------+----------+
| 57527|    2966|     54561|
| 97185|    6146|     91039|
|146033|    5339|    140694|
| 10172|     666|      9506|
|132235|    1989|    130246|
+------+--------+----------+
only showing top 5 rows



### Función selectExpr

In [22]:
# selectExpr() takes SQL expression strings, so the subtraction and its alias
# can be written directly as text
df.selectExpr(
    'likes',
    'dislikes',
    '(likes - dislikes) as aceptacion',
).show(5)

+------+--------+----------+
| likes|dislikes|aceptacion|
+------+--------+----------+
| 57527|    2966|     54561|
| 97185|    6146|     91039|
|146033|    5339|    140694|
| 10172|     666|      9506|
|132235|    1989|    130246|
+------+--------+----------+
only showing top 5 rows



In [23]:
# Aggregate through a SQL expression: number of distinct video_id values
videos_distintos = df.selectExpr("count(distinct(video_id)) as videos")
videos_distintos.show(5)


+------+
|videos|
+------+
|  6837|
+------+



In [2]:
# Shut Spark down through the session rather than the context alone:
# SparkSession.stop() stops the underlying SparkContext as well.
spark.stop()