## 0. Preparación de los ficheros

#### Cargar paquetes y librerías necesarias

In [1]:
import sys
import pyspark
import findspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import explode
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
import pandas as pd
pd.set_option('display.max_rows', None)

In [2]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.stat import Summarizer
import numpy as np

In [3]:
conf = SparkConf().setMaster("local").setAppName("AML")
spark = SparkSession.builder.getOrCreate()

#### Importar los CSV y formatearlos

In [4]:
CSVanime = spark.read.option("quote", "\"").option("escape", "\"").csv('../dataset_valoraciones_anime/anime.csv', inferSchema=True, header=True)

In [5]:
CSVrating = spark.read.option("quote", "\"").option("escape", "\"").csv('../dataset_valoraciones_anime/rating_complete.csv', inferSchema=True, header=True)

In [6]:
CSVep = spark.read.option("quote", "\"").option("escape", "\"").csv('../dataset_valoraciones_anime/valoraciones_EP.csv', inferSchema=True, header=True).toDF("user_id", "anime_id", "rating")

#### Comprobar que maneja bien las comas y las comillas

In [7]:
print("Dos primeros nombres que contienen [,]:")
CSVanime.select("Name", "ID").filter(col("Name").like("%,%")).show(2, truncate=False)
print("Dos primeros nombres que contienen [']:")
CSVanime.select("Name", "ID").filter(col("Name").like("%'%")).show(2, truncate=False)
print("Dos primeros nombres que contienen [\"]:")
CSVanime.select("Name", "ID").filter(col("Name").like('%"%')).show(2, truncate=False)
print("Dos primeros nombres con caracteres japoneses:")
CSVanime.select("Japanese name", "ID").show(2, truncate=False)

Dos primeros nombres que contienen [,]:
+-------------------------------------+---+
|Name                                 |ID |
+-------------------------------------+---+
|Ima, Soko ni Iru Boku                |160|
|Chiisana Obake: Acchi, Kocchi, Socchi|310|
+-------------------------------------+---+
only showing top 2 rows

Dos primeros nombres que contienen [']:
+------------------------------------------------------+---+
|Name                                                  |ID |
+------------------------------------------------------+---+
|Mahou Shoujo Lyrical Nanoha A's                       |77 |
|Mobile Suit Gundam: The 08th MS Team - Miller's Report|83 |
+------------------------------------------------------+---+
only showing top 2 rows

Dos primeros nombres que contienen ["]:
+-----------------------------------------------------+----+
|Name                                                 |ID  |
+-----------------------------------------------------+----+
|Love Hina: Motok

#### Analizar filas de los CSV

In [8]:
print(CSVanime.columns)

['ID', 'Name', 'Score', 'Genres', 'English name', 'Japanese name', 'Type', 'Episodes', 'Aired', 'Premiered', 'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating', 'Ranked', 'Popularity', 'Members', 'Favorites', 'Watching', 'Completed', 'On-Hold', 'Dropped', 'Plan to Watch', 'Score-10', 'Score-9', 'Score-8', 'Score-7', 'Score-6', 'Score-5', 'Score-4', 'Score-3', 'Score-2', 'Score-1']


In [9]:
CSVrating.show(3)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|     430|     9|
|      0|    1004|     5|
|      0|    3010|     7|
+-------+--------+------+
only showing top 3 rows



#### Añadir cabeceras al CSV valoraciones

In [10]:
CSVep = CSVep.toDF(CSVrating.columns[0], CSVrating.columns[1], CSVrating.columns[2])
CSVep.show(3)
print("El dataframe tiene", CSVep.count(), "filas")

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
| 666666|    1358|   7.0|
| 666666|   13601|  10.0|
| 666666|    2001|   9.0|
+-------+--------+------+
only showing top 3 rows

El dataframe tiene 64 filas


## 1. Hacer un análisis exploratorio de los datos con pySpark, mostrando información relevante de los mismos (cuáles son los ítems mejor y peor valorados, la relación entre género y valoraciones, estudios con mejor y peor nota media, etc...).

In [11]:
minValoraciones = 50
cantidadTop = 3
mejorMedia = CSVrating.groupBy("anime_id").agg(F.mean("rating"), F.count("rating"))
mejorMedia = mejorMedia.withColumnRenamed("avg(rating)", "average")
mejorMedia = mejorMedia.withColumnRenamed("count(rating)", "count")
juntar = CSVanime.select("ID", "Name")
mejorMedia = mejorMedia.join(juntar, CSVrating.anime_id==juntar.ID)
mejorMedia = mejorMedia.where(col("count") > minValoraciones)
print("Top", cantidadTop, "items con mayor nota media (con al menos", minValoraciones, "valoraciones):")
mejorMedia.sort("average", ascending=False).withColumnRenamed("count", "Nº ratings").drop("ID", "anime_id").show(cantidadTop, truncate=False)
print("Top", cantidadTop, "items con peor nota media (con al menos", minValoraciones, "valoraciones):")
mejorMedia.sort("average", ascending=True).withColumnRenamed("count", "Nº ratings").drop("ID", "anime_id").show(cantidadTop, truncate=False)

Top 3 items con mayor nota media (con al menos 50 valoraciones):
+-----------------+----------+--------------------------------+
|average          |Nº ratings|Name                            |
+-----------------+----------+--------------------------------+
|9.24564778301433 |20794     |Gintama°                        |
|9.237009769219878|134197    |Fullmetal Alchemist: Brotherhood|
|9.224729815638906|9438      |Ginga Eiyuu Densetsu            |
+-----------------+----------+--------------------------------+
only showing top 3 rows

Top 3 items con peor nota media (con al menos 50 valoraciones):
+------------------+----------+----------------------------+
|average           |Nº ratings|Name                        |
+------------------+----------+----------------------------+
|1.749909974792942 |5554      |Tenkuu Danzai Skelter+Heaven|
|1.842294767162746 |4166      |Utsu Musume Sayuri          |
|1.9944827586206897|725       |Tsui no Sora                |
+------------------+----------+-

##### La categoría con menos items tiene 1690, por lo que no será necesario eliminar ninguna a la hora de futuros estudios

In [12]:
print("Géneros con mejor valoración media:")
CSVgeneros = CSVanime.select("Genres", "ID").filter(CSVanime.Genres != "Unknown")
CSVgeneros = CSVgeneros.join(CSVrating, CSVgeneros.ID==CSVrating.anime_id).drop("ID", "anime_id", "user_id")
CSVgeneros = CSVgeneros.select(split(col("Genres"), ","), "rating").withColumnRenamed("split(Genres, ,, -1)", "Genres")
CSVgeneros = CSVgeneros.select(explode(CSVgeneros.Genres), CSVgeneros.rating).withColumnRenamed("col", "Genero")
CSVgeneros = CSVgeneros.withColumn("Genero", trim(CSVgeneros.Genero))
CSVgeneros.groupBy("Genero").agg(F.mean("rating")).sort("avg(rating)", ascending=False).show()
print("Géneros con peor valoración media:")
CSVgeneros.groupBy("Genero").agg(F.mean("rating")).sort("avg(rating)", ascending=True).show()

Géneros con mejor valoración media:
+-------------+------------------+
|       Genero|       avg(rating)|
+-------------+------------------+
|     Thriller| 8.076588242745846|
|      Samurai| 7.935889107317298|
|   Historical| 7.863716624096956|
|     Military| 7.861498646075896|
|       Police| 7.835031637882817|
|      Mystery| 7.808137348924516|
|Psychological| 7.807657061332771|
|        Drama| 7.797757413110013|
|        Josei| 7.742684443560022|
|       Sports| 7.731615768552727|
|         Cars| 7.718366576096889|
|      Shounen|7.7126795533450245|
| Supernatural|7.6884020511113045|
|  Super Power| 7.687961121608626|
|Slice of Life| 7.680003910870478|
|       Parody|  7.63876670708943|
|    Adventure| 7.616019414129019|
|       Shoujo|  7.60431667784289|
|        Space| 7.598700824975208|
|       Demons|  7.57933957708387|
+-------------+------------------+
only showing top 20 rows

Géneros con peor valoración media:
+------------+------------------+
|      Genero|       avg(rati

In [13]:
CSVestudios = CSVanime.select("Studios", "Score")
CSVestudios = CSVestudios.filter((CSVestudios.Score != "Unknown") & (CSVestudios.Studios != "Unknown"))
CSVestudios = CSVestudios.groupBy("Studios").agg(F.mean("Score"), F.count("Score"))
CSVestudios = CSVestudios.withColumnRenamed("avg(Score)", "Valoracion_media").withColumnRenamed("count(Score)", "Cantidad")
CSVestudios = CSVestudios.filter(CSVestudios.Cantidad >= 30).drop("Cantidad")
print("Estudios con peor valoración media (con al menos 30 valoraciones):")
CSVestudios.orderBy('Valoracion_media', ascending=True).show()
print("Estudios con mejor valoración media (con al menos 30 valoraciones):")
CSVestudios.orderBy('Valoracion_media', ascending=False).show(truncate=False)

Estudios con peor valoración media (con al menos 30 valoraciones):
+--------------------+------------------+
|             Studios|  Valoracion_media|
+--------------------+------------------+
|                 DLE| 5.843207547169812|
|           Magic Bus| 6.241388888888888|
|          Studio 4°C| 6.268596491228069|
|                Arms|6.2762162162162145|
|               Seven| 6.388412698412699|
|     Production Reed| 6.444406779661016|
|                 TNK|6.4730952380952385|
|               Zexcs|             6.535|
|Tatsunoko Production| 6.537155172413793|
|       Studio Hibari| 6.539302325581397|
|                 AIC| 6.567372881355933|
|                PoRO|6.6000000000000005|
|              Gainax| 6.600754716981133|
|                 OLM| 6.631657754010694|
|  Tezuka Productions| 6.720606060606062|
|      Toei Animation| 6.725945485519595|
|               Actas|            6.7275|
|          LIDENFILMS|            6.7455|
|       Kinema Citrus| 6.757714285714287|
|        

## 2. Crear un programa con Spark ML y el algoritmo de recomendación ALS que genere un listado con 5 series de TV y 5 películas para recomendar al usuario EP (con id 666666) para ello se deben incorporar las valoraciones de EP al fichero de valoraciones total, entrenar el algoritmo y pedirle un listado de recomendaciones para ese usuario. Las recomendaciones generadas deben incluir el ID del anime y los títulos original (name) y en inglés (English name). El listado debe aparecer ordenado por valoración media de cada serie/película

In [14]:
def valMedia(df):
    df = df.withColumn("Val_media", col("Score-10")*10+col("Score-9")*9+col("Score-8")*8+col("Score-7")*7+col("Score-6")*6+col("Score-5")*5+col("Score-4")*4+col("Score-3")*3+col("Score-2")*2+col("Score-1"))
    df = df.withColumn("Val_media", col("Val_media")/(col("Score-1")+col("Score-2")+col("Score-3")+col("Score-4")+col("Score-5")+col("Score-6")+col("Score-7")+col("Score-8")+col("Score-9")+col("Score-10")))
    df = df.drop("Score-1", "Score-2", "Score-3", "Score-4", "Score-5", "Score-6", "Score-7", "Score-8", "Score-9", "Score-10")
    return df

In [15]:
def durationNumber(df):
    df = df.withColumn("Duration", regexp_replace("Duration", " per ep.", ""))
    df = df.withColumn("Duration", regexp_replace("Duration", "\.", ""))
    df = df.withColumn("Duration", regexp_replace("Duration", " hr", "h"))
    df = df.withColumn("Duration", regexp_replace("Duration", " min", "m"))
    df = df.withColumn("Duration", reverse(split(col("Duration"), " ")))
    df = df.withColumn("mins", regexp_replace(col("Duration")[0], "m", ""))
    df = df.withColumn("hours", regexp_replace(col("Duration")[1], "h", ""))
    df = df.fillna({"hours": 0})
    df = df.withColumn("DurationN", col("hours")*60+col("mins"))
    df = df.drop("Duration", "mins", "hours")
    return df

#### Limpiar y preparar el dataframe

In [16]:
CSVml = CSVrating.union(CSVep).withColumnRenamed("rating", "User_rating")
CSVml = CSVml.join(CSVanime, CSVml.anime_id==CSVanime.ID)
CSVml = CSVml.select("user_id", "anime_id", "User_rating", "Genres", "name", "English name", "Type", "Episodes", "Studios", "Source", "Ranked", "Duration", "Popularity", "Score-10", "Score-9", "Score-8", "Score-7", "Score-6", "Score-5", "Score-4", "Score-3", "Score-2", "Score-1")
CSVml = CSVml.withColumnRenamed("English name", "English_name")
CSVml = CSVml.withColumn("anime_id", col("anime_id").cast(IntegerType()))
CSVml = valMedia(CSVml)
CSVml = durationNumber(CSVml)
CSVml = CSVml.where(CSVml.Episodes != "Unknown").where(CSVml.English_name != "null").where(CSVml.English_name != "Unknown")
CSVml = CSVml.na.drop(subset=["DurationN", "Val_media"])
print("El dataframe tiene", CSVml.count(), "filas")
CSVml.printSchema()
CSVml.show(5)

El dataframe tiene 46357218 filas
root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- User_rating: double (nullable = true)
 |-- Genres: string (nullable = true)
 |-- name: string (nullable = true)
 |-- English_name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Episodes: string (nullable = true)
 |-- Studios: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Ranked: string (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- Val_media: double (nullable = true)
 |-- DurationN: double (nullable = true)

+-------+--------+-----------+--------------------+--------------------+--------------------+-----+--------+---------------+--------+------+----------+-----------------+---------+
|user_id|anime_id|User_rating|              Genres|                name|        English_name| Type|Episodes|        Studios|  Source|Ranked|Popularity|        Val_media|DurationN|
+-------+--------+-----------+------------

#### DataFrame con películas

In [17]:
EPpelis = CSVml.where(CSVml.Type == "Movie").drop("Type", "Episodes", "Japanese_name")
print("El dataframe tiene", EPpelis.count(), "filas")
EPpelis = EPpelis.withColumn("Ranked", col("Ranked").cast(IntegerType()))
indexer = StringIndexer(inputCol="Genres", outputCol="GenresN")
EPpelis = indexer.fit(EPpelis).transform(EPpelis)
indexer = StringIndexer(inputCol="Studios", outputCol="StudiosN")
EPpelis = indexer.fit(EPpelis).transform(EPpelis)
indexer = StringIndexer(inputCol="Source", outputCol="SourceN")
EPpelis = indexer.fit(EPpelis).transform(EPpelis)
EPpelis.show(5)

El dataframe tiene 6135485 filas
+-------+--------+-----------+--------------------+--------------------+--------------------+----------------+--------+------+----------+-----------------+---------+-------+--------+-------+
|user_id|anime_id|User_rating|              Genres|                name|        English_name|         Studios|  Source|Ranked|Popularity|        Val_media|DurationN|GenresN|StudiosN|SourceN|
+-------+--------+-----------+--------------------+--------------------+--------------------+----------------+--------+------+----------+-----------------+---------+-------+--------+-------+
|      0|     430|        9.0|Military, Comedy,...|Fullmetal Alchemi...|Fullmetal Alchemi...|           Bones|   Manga|  1361|       506|7.587968995355985|    105.0|   29.0|    10.0|    1.0|
|      0|     570|        7.0|Military, Police,...|             Jin-Rou|Jin-Roh:The Wolf ...|  Production I.G|   Manga|   846|      1181|7.796858984381786|    102.0|  126.0|     5.0|    1.0|
|      0|   

In [18]:
EPpelisN = EPpelis.drop("Genres", "Studios", "Source")
EPpelisN.printSchema()
EPpelisN.show(5)

root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- User_rating: double (nullable = true)
 |-- name: string (nullable = true)
 |-- English_name: string (nullable = true)
 |-- Ranked: integer (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- Val_media: double (nullable = true)
 |-- DurationN: double (nullable = true)
 |-- GenresN: double (nullable = false)
 |-- StudiosN: double (nullable = false)
 |-- SourceN: double (nullable = false)

+-------+--------+-----------+--------------------+--------------------+------+----------+-----------------+---------+-------+--------+-------+
|user_id|anime_id|User_rating|                name|        English_name|Ranked|Popularity|        Val_media|DurationN|GenresN|StudiosN|SourceN|
+-------+--------+-----------+--------------------+--------------------+------+----------+-----------------+---------+-------+--------+-------+
|      0|     430|        9.0|Fullmetal Alchemi...|Fullmetal Alchemi...| 

#### DataFrame con series

In [19]:
EPseries = CSVml.where(((CSVml.Type == "ONA") & (CSVml.Episodes > 1)) | (CSVml.Type == "TV")).drop("Japanese_name")
print("El dataframe tiene", EPseries.count(), "filas")
EPseries = EPseries.withColumn("Episodes", col("Episodes").cast(IntegerType()))
EPseries = EPseries.withColumn("Ranked", col("Ranked").cast(IntegerType()))
indexer = StringIndexer(inputCol="Genres", outputCol="GenresN")
EPseries = indexer.fit(EPseries).transform(EPseries)
indexer = StringIndexer(inputCol="Type", outputCol="TypeN")
EPseries = indexer.fit(EPseries).transform(EPseries)
indexer = StringIndexer(inputCol="Studios", outputCol="StudiosN")
EPseries = indexer.fit(EPseries).transform(EPseries)
indexer = StringIndexer(inputCol="Source", outputCol="SourceN")
EPseries = indexer.fit(EPseries).transform(EPseries)
EPseries.show(5)

El dataframe tiene 35896854 filas
+-------+--------+-----------+--------------------+-------------------+--------------------+----+--------+---------------+------------+------+----------+------------------+---------+-------+-----+--------+-------+
|user_id|anime_id|User_rating|              Genres|               name|        English_name|Type|Episodes|        Studios|      Source|Ranked|Popularity|         Val_media|DurationN|GenresN|TypeN|StudiosN|SourceN|
+-------+--------+-----------+--------------------+-------------------+--------------------+----+--------+---------------+------------+------+----------+------------------+---------+-------+-----+--------+-------+
|      0|    3010|        7.0|Adventure, Histor...|     Kaiketsu Zorro|The Magnificent Z...|  TV|      52|Ashi Production|       Other|  2655|      5104| 7.227823867262285|     24.0| 1336.0|  0.0|   302.0|    8.0|
|      0|    1571|       10.0|Mystery, Comedy, ...|         Ghost Hunt|          Ghost Hunt|  TV|      25|    

In [20]:
EPseriesN = EPseries.drop("Genres", "Type", "Studios", "Source")
EPseriesN.printSchema()
EPseriesN.show(5)

root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- User_rating: double (nullable = true)
 |-- name: string (nullable = true)
 |-- English_name: string (nullable = true)
 |-- Episodes: integer (nullable = true)
 |-- Ranked: integer (nullable = true)
 |-- Popularity: integer (nullable = true)
 |-- Val_media: double (nullable = true)
 |-- DurationN: double (nullable = true)
 |-- GenresN: double (nullable = false)
 |-- TypeN: double (nullable = false)
 |-- StudiosN: double (nullable = false)
 |-- SourceN: double (nullable = false)

+-------+--------+-----------+-------------------+--------------------+--------+------+----------+------------------+---------+-------+-----+--------+-------+
|user_id|anime_id|User_rating|               name|        English_name|Episodes|Ranked|Popularity|         Val_media|DurationN|GenresN|TypeN|StudiosN|SourceN|
+-------+--------+-----------+-------------------+--------------------+--------+------+----------+----------

#### Entrenamiento películas

In [21]:
training, test = EPpelisN.randomSplit([0.8, 0.2])
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="anime_id", ratingCol="User_rating", coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="User_rating", predictionCol="prediction")

userRecs = model.recommendForAllUsers(5)
userRecs.show(5, truncate=False)

+-------+---------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                    |
+-------+---------------------------------------------------------------------------------------------------+
|34     |[{32890, 17.97881}, {40811, 17.349768}, {32269, 16.618448}, {3362, 14.566329}, {16480, 13.598028}] |
|53     |[{32890, 21.307026}, {30268, 16.022106}, {16776, 14.56802}, {40811, 13.736708}, {31135, 13.680226}]|
|65     |[{33533, 27.701912}, {17469, 27.217438}, {25921, 20.402641}, {13817, 19.524263}, {29135, 17.34979}]|
|78     |[{25093, 17.741339}, {30268, 14.929622}, {31135, 11.68317}, {6728, 10.904329}, {7366, 10.706601}]  |
|85     |[{8598, 7.55478}, {3784, 7.5107765}, {33049, 7.4911385}, {433, 7.3989887}, {1689, 7.351724}]       |
+-------+---------------------------------------------------------------------------------------------------+
only showi

In [22]:
recomPelis = userRecs.where(col("user_id") == 666666)
recomPelis = recomPelis.select("user_id", explode(recomPelis.recommendations)).withColumnRenamed("col", "seleccion")
recomPelis = recomPelis.select("user_id", "seleccion.*")
recomPelis.printSchema()
recomPelis.show(truncate=False)

root
 |-- user_id: integer (nullable = false)
 |-- anime_id: integer (nullable = true)
 |-- rating: float (nullable = true)

+-------+--------+---------+
|user_id|anime_id|rating   |
+-------+--------+---------+
|666666 |16776   |21.626589|
|666666 |37661   |17.088545|
|666666 |4621    |16.063705|
|666666 |23343   |16.050005|
|666666 |10149   |15.463894|
+-------+--------+---------+



#### Entrenamiento series
#### En el caso de las series, no cabían en la memoria de mi ordenador, así que esta parte la trabajé desde un clúster de Google Cloud. Adjunto capturas del resultado, y el código aaparecerá comentado a continuación

<img src="./gcloud_01.png"/>

<img src="./gcloud_02.png"/>

In [23]:
# training, test = EPseriesN.randomSplit([0.8, 0.2])
# als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="anime_id", ratingCol="User_rating", coldStartStrategy="drop")
# model = als.fit(training)
# predictions = model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="User_rating", predictionCol="prediction")

# userRecs = model.recommendForAllUsers(5)
# userRecs.show(5, truncate=False)

In [24]:
# recomSeries = userRecs.where(col("user_id") == 666666)
# recomSeries = recomSeries.select("user_id", explode(recomSeries.recommendations)).withColumnRenamed("col", "seleccion")
# recomSeries = recomSeries.select("user_id", "seleccion.*")
# recomSeries.printSchema()
# recomSeries.show(truncate=False)

#### Películas recomedadas para EP:

In [25]:
EPpelisElegidas = EPpelis.where((col("anime_id") == 32890) | (col("anime_id") == 33533) | (col("anime_id") == 21129) | (col("anime_id") == 37392) | (col("anime_id") == 4621))
EPpelisElegidas = EPpelisElegidas.dropDuplicates(["anime_id"])
EPpelisElegidas = EPpelisElegidas.select("anime_id", "User_rating", "name", "English_name", "Val_media")
EPpelisElegidas = EPpelisElegidas.sort("Val_media", ascending=False)
EPpelisElegidas.show(truncate=False)

+--------+-----------+--------------------------------------------------------+--------------------------------------+-----------------+
|anime_id|User_rating|name                                                    |English_name                          |Val_media        |
+--------+-----------+--------------------------------------------------------+--------------------------------------+-----------------+
|4621    |7.0        |Arisubyeon-ui Kkumnamu                                  |The Olympic Challenge                 |6.194444444444445|
|32890   |7.0        |Tu Xia Zhi Qing Li Chuanshuo                            |Legend of a Rabbit:The Martial of Fire|5.745762711864407|
|21129   |3.0        |Youtai Nuhai Zai Shanghai                               |A Jewish Girl in Shanghai             |5.621951219512195|
|37392   |2.0        |Xi Yang Yang Yu Hui Tai Lang: Zhi Yang Nian Xi Yang Yang|Amazing Pleasant Goat                 |5.159090909090909|
|33533   |5.0        |Tokyo Onlypic      

<img src="./gcloud_03.png"/>

In [26]:
# EPseriesElegidas = EPseries.where((col("anime_id") == 32890) | (col("anime_id") == 33533) | (col("anime_id") == 21129) | (col("anime_id") == 37392) | (col("anime_id") == 4621))
# EPseriesElegidas = EPseriesElegidas.dropDuplicates(["anime_id"])
# EPseriesElegidas = EPseriesElegidas.select("anime_id", "User_rating", "name", "English_name", "Val_media")
# EPseriesElegidas = EPseriesElegidas.sort("Val_media", ascending=False)
# EPseriesElegidas.show(truncate=False)

## 3. Además se debe recuperar y mostrar información detallada (sinopsis, imagen, trailer, etc.) de cada serie o película extrayendo la información con la API Jikan (https://docs.api.jikan.moe). Por ejemplo, para recuperar la información básica del anime con ID 13601, hay que hacer una llamada a la URL https://api.jikan.moe/v4/anime/13601/full y luego parsear los datos del JSON que devuelve. Mostrar la información recuperada de forma gráfica en el cuaderno

In [29]:
import requests
from IPython.display import HTML
# from IPython.display import Image
# from IPython.core.display import HTML
# import urllib
# from PIL import Image

In [52]:
# ID de las peliculas: 4621 -> 32890 -> 21129 -> 37392 -> 33533
# ID de las series:    3869 -> 8786  -> 28145 -> 19987 -> 14623

api_url = "https://api.jikan.moe/v4/anime/4621/full"
response = requests.get(api_url).json()
sinopsis_peli1 = response["data"]["synopsis"]
imagen_peli1 = response["data"]["images"]["jpg"]["image_url"]
trailer_peli1 = response["data"]["trailer"]
# response
print(trailer_peli1)

{'youtube_id': None, 'url': None, 'embed_url': None, 'images': {'image_url': None, 'small_image_url': None, 'medium_image_url': None, 'large_image_url': None, 'maximum_image_url': None}}


In [53]:
html_code = '''
<p>'''+sinopsis_peli1+'''</p>
<div style="display:flex;">
    <img src='''+imagen_peli1+'''>
    <img src='''+'''>
</div>
'''
HTML(html_code)