In [82]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [7]:
!pip install findspark

Defaulting to user installation because normal site-packages is not writeable
Collecting findspark
  Using cached findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [7]:
import findspark
findspark.init()

spark = SparkSession.builder.getOrCreate()
spark

In [23]:
df_titles = spark.read.csv('data/title_basics.tsv', header=True, sep='\t')
df_ratings = spark.read.csv('data/title_ratings.tsv', header=True, sep='\t')

In [14]:
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



 ### Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?

In [24]:
df_titles.select('titleType').distinct().show()

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
|     tvPilot|
| radioSeries|
|radioEpisode|
+------------+



In [28]:
df_titles.filter((F.col('startYear') == 2015) & (F.col('titleType').isin('movie', 'tvMovie'))).count()

19987

### Qual o gênero de títulos mais frequente? 
#### Dica: Utilize as funções split e explode. 


In [61]:
(
    df_titles
    .withColumn('genres_array', F.split(F.col('genres'), ','))
    .withColumn('genres_unico', F.explode(F.col('genres_array')))
    .select('primaryTitle','genres_unico')
    .groupBy('genres_unico')
    .count()
    .orderBy(F.col('count').desc())
    .show(10)
)

+------------+-------+
|genres_unico|  count|
+------------+-------+
|       Drama|2247995|
|      Comedy|1653725|
|       Short|1021850|
|   Talk-Show| 900198|
| Documentary| 764885|
|     Romance| 724729|
|          \N| 643012|
|      Family| 571470|
|        News| 524662|
|  Reality-TV| 423455|
+------------+-------+
only showing top 10 rows



###  Pergunta 8
Qual o gênero com a melhor nota média de títulos?  

In [59]:
df_titles_ratings = (
    df_titles
    .join(df_ratings, 'tconst', 'left')
    .filter('averageRating is not null')
)

In [60]:
df_titles_ratings.show(2)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|averageRating|numVotes|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+
|tt0000008|    short|Edison Kinetoscop...|Edison Kinetoscop...|      0|     1894|     \N|             1|Documentary,Short|          5.5|    1965|
|tt0000015|    short| Autour d'une cabine| Autour d'une cabine|      0|     1894|     \N|             2|  Animation,Short|          6.2|     947|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+
only showing top 2 rows



In [None]:
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|averageRating|numVotes|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+
|tt0000008|    short|Edison Kinetoscop...|Edison Kinetoscop...|      0|     1894|     \N|             1|Documentary,Short|          5.5|    1965|
|tt0000015|    short| Autour d'une cabine| Autour d'une cabine|      0|     1894|     \N|             2|  Animation,Short|          6.2|     947|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+-------------+--------+

In [73]:
(
    df_titles_ratings
    .withColumn('genres_array', F.split(F.col('genres'), ','))
    .withColumn('genres_unico', F.explode(F.col('genres_array')))
    .select('genres_unico', 'averageRating')
    .groupBy('genres_unico')
    .agg(F.avg('averageRating').alias('media'))
    .orderBy(F.col('media').desc())
    .show(10)
)

+------------+------------------+
|genres_unico|             media|
+------------+------------------+
|     History| 7.353780102645081|
| Documentary| 7.240198535554571|
|   Biography| 7.175531914893618|
|     Mystery| 7.170086406897934|
|       Crime| 7.159842868485927|
|   Adventure| 7.107629703351722|
|     Fantasy| 7.095145650845382|
|   Animation|7.0893811714832236|
|     Western| 7.080683426568711|
|      Family| 7.070054926034473|
+------------+------------------+
only showing top 10 rows



### Pergunta 9 (1,67 pts)
Qual o vídeo game do gênero aventura mais bem avaliado em 2020?  

In [80]:
(
    df_titles_ratings
    .filter((F.col('startYear') == 2020) & (F.col('titleType') == 'videoGame'))
    .withColumn('genres_array', F.split(F.col('genres'), ','))
    .withColumn('genres_unico', F.explode(F.col('genres_array')))
    .select('originalTitle', 'genres_unico', 'averageRating')
    .filter((F.col('genres_unico') == 'Adventure'))
    .orderBy(F.col('averageRating').desc())
    .show(10)
)

+--------------------+------------+-------------+
|       originalTitle|genres_unico|averageRating|
+--------------------+------------+-------------+
|     Half-Life: Alyx|   Adventure|          9.5|
|   Ghost of Tsushima|   Adventure|          9.3|
|               Omori|   Adventure|          9.2|
|Ori and the Will ...|   Adventure|          9.1|
|Final Fantasy VII...|   Adventure|          9.1|
|There Is No Game:...|   Adventure|          8.9|
|Mega Man Zero/ZX ...|   Adventure|          8.9|
|       Demon's Souls|   Adventure|          8.8|
|Xenoblade Chronic...|   Adventure|          8.8|
|Ryu ga gotoku 7: ...|   Adventure|          8.8|
+--------------------+------------+-------------+
only showing top 10 rows



### Pergunta 11 (1,67 pts)

Qual o percentual de títulos do gênero comédia lançados em 2018 em relação ao total de títulos lançados nesse ano? 

Dica: Utilize as funções Split, explode e uma window function.

In [109]:
total_2018 = (
    df_titles
    .filter(F.col('startYear') == 2018)
    .count()
)
total_2018

402244

In [103]:
df_titles_subset = (
    df_titles
    .filter("cast(startYear as int) == 2018")
#     .sample(fraction = 0.5)
    .withColumn('genre', F.split('genres', ',').getItem(0))
    .filter(F.col('genre') == 'Comedy')
)
df_titles_subset.show(5)

+---------+---------+------------------+------------------+-------+---------+-------+--------------+--------------------+------+
|   tconst|titleType|      primaryTitle|     originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres| genre|
+---------+---------+------------------+------------------+-------+---------+-------+--------------+--------------------+------+
|tt0111414|    movie|       A Thin Life|       A Thin Life|      0|     2018|     \N|            75|              Comedy|Comedy|
|tt0365545|    movie|Nappily Ever After|Nappily Ever After|      0|     2018|     \N|            98|Comedy,Drama,Romance|Comedy|
|tt0380870| tvSeries|     Comedy Circus|     Comedy Circus|      0|     2018|     \N|            30|              Comedy|Comedy|
|tt0640426|tvEpisode| Vidas Brasileiras| Vidas Brasileiras|      0|     2018|     \N|            \N|  Comedy,Drama,Music|Comedy|
|tt0640427|tvEpisode| Vidas Brasileiras| Vidas Brasileiras|      0|     2018|     \N|            

In [111]:
w = Window.partitionBy('genre').orderBy('startYear').rowsBetween(Window.unboundedPreceding, Window.currentRow)
(
    df_titles_subset
    .withColumn('runtimeMinutes', F.col('runtimeMinutes').cast('int'))
    .groupby('genre', 'startYear')
    .agg(F.expr('mean(runtimeMinutes) as media_minutos'))
    .orderBy('genre', 'startYear')
    .withColumn('meadia_movel_3anos', F.round(F.mean('media_minutos').over(w), 3))
    .limit(15)
    .toPandas()
)

Unnamed: 0,genre,startYear,media_minutos,meadia_movel_3anos
0,Comedy,2018,33.659187,33.659


In [None]:
w = Window.partitionBy('genre').orderBy('startYear').rowsBetween(Window.unboundedPreceding, Window.currentRow)
(
    df_titles_subset
    .withColumn('runtimeMinutes', F.col('runtimeMinutes').cast('int'))
    .groupby('genre', 'startYear')
    .agg(F.expr('mean(runtimeMinutes) as media_minutos'))
    .orderBy('genre', 'startYear')
    .withColumn('meadia_movel_3anos', F.round(F.mean('media_minutos').over(w), 3))
    .withColumn('runtimeMinutes', F.col('runtimeMinutes').cast('int'))
    .limit(15)
    .toPandas()
)

##  Pergunta 14 (1,67 pts)
Considere a definição de uma udf abaixo: 

```
def sqr_divide(value): 

    return (value**2)/2 

sqr_divide_udf = udf(sqr_divide, IntegerType()) 
```

A definição de sqr_divide_udf possui um problema. Depois de solucionar o problema, ao executar:

```
(
    df_ratings
    .withColumn('averageRating', F.col('averageRating').cast('double'))
    .select(sqr_divide_udf('averageRating').alias('averageRating'))
    .agg(F.mean('averageRating').alias('averageRating'))
    .show()
)
```

o que retorna é: 


Obs.: Considere 3 casas decimais. 


In [52]:
def sqr_divide(value): 

    return (value**2)/2 

sqr_divide_udf = F.udf(sqr_divide, FloatType())

In [53]:
(
    df_ratings
    .withColumn('averageRating', F.col('averageRating').cast('double'))
    .select(sqr_divide_udf('averageRating').alias('averageRating'))
    .agg(F.mean('averageRating').alias('averageRating'))
    .show()
)

+------------------+
|     averageRating|
+------------------+
|24.899137954281525|
+------------------+

