In [1]:
# Membuat Spark Session

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [None]:
print(spark)

In [2]:
# Membaca file
df_movies = spark.read.csv("ml-latest/movies.csv", header=True, inferSchema=True)

In [8]:
df_movies.count()

58098

In [3]:
df_movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [4]:
df_ratings = spark.read.csv("ml-latest/ratings.csv", header=True)

In [6]:
df_ratings.count()

27753444

In [5]:
df_ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
|     1|   1590|   2.5|1256677236|
|     1|   1591|   1.5|1256677475|
|     1|   2134|   4.5|1256677464|
|     1|   2478|   4.0|1256677239|
|     1|   2840|   3.0|1256677500|
|     1|   2986|   2.5|1256677496|
|     1|   3020|   4.0|1256677260|
|     1|   3424|   4.5|1256677444|
|     1|   3698|   3.5|1256677243|
|     1|   3826|   2.0|1256677210|
|     1|   3893|   3.5|1256677486|
|     2|    170|   3.5|1192913581|
|     2|    849|   3.5|1192913537|
|     2|   1186|   3.5|1192913611|
|     2|   1235|   3.0|1192913585|
+------+-------+------+----------+
only showing top 20 rows



In [9]:
df_movies.createOrReplaceTempView("movies")
df_ratings.createOrReplaceTempView("ratings")

In [10]:
hasil = spark.sql("SELECT ratings.movieId, movies.title, \
                   AVG(ratings.rating) as avg_rating\
                   FROM ratings \
                   INNER JOIN movies ON ratings.movieId = movies.movieId \
                   GROUP BY ratings.movieId, movies.title")

In [11]:
file_output = hasil.toPandas()
file_output.to_csv("hasil-besar.csv", index=False)

In [12]:
hasil.count()

53889

In [15]:
df_results = df_ratings.join(df_movies, \
                             df_ratings.movieId == df_movies.movieId, \
                             "inner")

In [31]:
df_results.groupBy("title").agg({'rating': 'avg'}).show()

+--------------------+------------------+
|               title|       avg(rating)|
+--------------------+------------------+
|Men in Black (a.k...| 3.578533203874726|
|When We Were King...| 4.128795298726739|
|Snow White and th...|3.5737629736905623|
|       Psycho (1960)| 4.061855244697532|
|   Annie Hall (1977)|  4.03722552312064|
|Voices from the L...| 4.124444444444444|
|       Quills (2000)| 3.529248366013072|
|    Fair Game (1995)|2.3543191800878476|
|Man Bites Dog (C'...|3.7407084785133566|
|O Brother, Where ...|3.8708346493998738|
|Heavenly Creature...|3.8160953978906997|
|    Elizabeth (1998)| 3.847101932045303|
|First Blood (Ramb...|3.5685894004282654|
|Captain Corelli's...| 2.859545836837679|
|Night of the Livi...|3.6201248049921997|
|In the Heat of th...| 4.045863309352518|
|Don't Tell Mom th...|2.7300925925925927|
|Futurama: Into th...|3.7653333333333334|
|  Thumbsucker (2005)| 3.435225618631732|
|22 Jump Street (2...| 3.485540119081372|
+--------------------+------------