In [None]:
!pip install pandas
!pip install pyspark
!pip install pyarrow

In [None]:
from pyspark.sql import SparkSession
 
# Create the SparkSession for this notebook (or reuse one that is already
# running) under the application name "pandas to spark".
spark = (
    SparkSession.builder
    .appName("pandas to spark")
    .getOrCreate()
)

# Switch on the Arrow setting for PySpark on the session's runtime config.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
import pandas as pd

# Movie catalogue: one row per movie.
movies = pd.DataFrame(
    {
        'movie_id': [1, 2, 3],
        'title': ['Avengers', 'Frozen 2', 'Joker'],
    }
).astype({'movie_id': 'Int64', 'title': 'object'})

# Registered users: one row per user.
users = pd.DataFrame(
    {
        'user_id': [1, 2, 3, 4],
        'name': ['Daniel', 'Monica', 'Maria', 'James'],
    }
).astype({'user_id': 'Int64', 'name': 'object'})

# Ratings: one row per (movie, user) pair with the score and the date the
# rating was created. Dates are given as ISO strings and cast to datetimes.
movie_rating = pd.DataFrame(
    [
        [1, 1, 3, '2020-01-12'],
        [1, 2, 4, '2020-02-11'],
        [1, 3, 2, '2020-02-12'],
        [1, 4, 1, '2020-01-01'],
        [2, 1, 5, '2020-02-17'],
        [2, 2, 2, '2020-02-01'],
        [2, 3, 2, '2020-03-01'],
        [3, 1, 3, '2020-02-22'],
        [3, 2, 4, '2020-02-25'],
    ],
    columns=['movie_id', 'user_id', 'rating', 'created_at'],
).astype(
    {
        'movie_id': 'Int64',
        'user_id': 'Int64',
        'rating': 'Int64',
        'created_at': 'datetime64[ns]',
    }
)

In [4]:
# Convert the three pandas frames into Spark DataFrames, rebinding the same
# names so that later cells work on the Spark versions.
# NOTE: because the names are rebound, re-running this cell on its own would
# hand Spark DataFrames back to createDataFrame; re-run the cell that builds
# the pandas frames first.
movies, users, movie_rating = (
    spark.createDataFrame(pandas_frame)
    for pandas_frame in (movies, users, movie_rating)
)

# Print each converted table, in the same order as they were built.
movies.show()

users.show()

movie_rating.show()

+--------+--------+
|movie_id|   title|
+--------+--------+
|       1|Avengers|
|       2|Frozen 2|
|       3|   Joker|
+--------+--------+

+-------+------+
|user_id|  name|
+-------+------+
|      1|Daniel|
|      2|Monica|
|      3| Maria|
|      4| James|
+-------+------+

+--------+-------+------+-------------------+
|movie_id|user_id|rating|         created_at|
+--------+-------+------+-------------------+
|       1|      1|     3|2020-01-12 00:00:00|
|       1|      2|     4|2020-02-11 00:00:00|
|       1|      3|     2|2020-02-12 00:00:00|
|       1|      4|     1|2020-01-01 00:00:00|
|       2|      1|     5|2020-02-17 00:00:00|
|       2|      2|     2|2020-02-01 00:00:00|
|       2|      3|     2|2020-03-01 00:00:00|
|       3|      1|     3|2020-02-22 00:00:00|
|       3|      2|     4|2020-02-25 00:00:00|
+--------+-------+------+-------------------+



In [18]:
from pyspark.sql.functions import count, col, avg

# Part 1: the user who rated the most movies.
# Count ratings per user, attach the user's name, and keep the top row;
# ties on the count are broken by name in ascending order.
top_rater = (
    movie_rating
    .groupby('user_id')
    .agg(count('movie_id').alias('number_of_movies'))
    .join(users, 'user_id', 'inner')
    .sort(['number_of_movies', 'name'], ascending=[False, True])
    .select(col('name').alias('results'))
    .limit(1)
)

# Part 2: the movie with the highest average rating in February 2020.
# The window is half-open [2020-02-01, 2020-03-01): `created_at` is a
# timestamp, so a lower bound of "> 2020-01-31" would also admit ratings made
# later in the day on 31 January. Column expressions are used instead of a SQL
# string so the date literals do not depend on how double quotes are parsed.
in_february_2020 = (
    (col('created_at') >= '2020-02-01') & (col('created_at') < '2020-03-01')
)
top_movie = (
    movie_rating
    .where(in_february_2020)
    .groupby('movie_id')
    .agg(avg('rating').alias('average_rating'))
    .join(movies, 'movie_id', 'inner')
    # Ties on the average are broken by title in ascending order.
    .sort(['average_rating', 'title'], ascending=[False, True])
    .select(col('title').alias('results'))
    .limit(1)
)

# Stack the two single-row answers into one `results` column and print it.
# `union` keeps duplicates, exactly like the `unionAll` alias.
top_rater.union(top_movie).show()

+--------+
| results|
+--------+
|  Daniel|
|Frozen 2|
+--------+

