# Install pyspark

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=960cda47268ae3711769d238122d18beb5fc3fcca182beb7f49deaf671ce116d
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [4]:
!pip install -q findspark

# Analysis of Movie ratings dataset “Movie Lens Dataset”

## Import required libraries and intialize spark context

In [60]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("MovieRating").getOrCreate()
sc=spark.sparkContext


## Q a) How many lines does the ratings file contain?

In [61]:
# Load data from ratings.dat in an rdd
ratings_file_rdd = sc.textFile(r'/content/ratings.dat')
number_of_lines=ratings_file_rdd.count()

In [62]:
print(f'The ratings.dat file contains {number_of_lines} lines')

The ratings.dat file contains 1000209 lines


## Q b) How many times the rating “1” has been given?

In [87]:
# Split the entries in ratings_file_rdd by ::
splitted_ratings_data = ratings_file_rdd.map(lambda line: line.split('::'))

# Rating is present in coulumn 2 (Third column)
ratings_rdd = splitted_ratings_data.map(lambda x: int(x[2]))

# Find number of times rating 1 appeared
count_rating_1 = ratings_rdd.filter(lambda x: x == 1).count()

In [88]:
print(f'Rating 1 was given {count_rating_1} times.')

Rating 1 was given 56174 times.


## Q c) Which are the 10 most popular movies?

In [89]:
# Get top 10 rated movie ids from rating.dat
movie_rating_rdd = splitted_ratings_data.map(lambda item : (item[1], int(item[2])))
movie_total_ratings_rdd = movie_rating_rdd.groupByKey().mapValues(lambda values: sum(values))
movie_top10_rating_rdd = sc.parallelize(movie_total_ratings_rdd.sortBy(lambda x: x[1], ascending=False).take(10))

# Load moving.dat to get movie names
splitted_movies_rdd = sc.textFile(r'/content/movies.dat').map(lambda line1: line1.split('::'))

# Map the movie id with movie names
final_result_rdd = movie_top10_rating_rdd.join(splitted_movies_rdd).map(lambda x: (x[0], x[1][0], x[1][1])).collect()

Print final result

In [68]:
print(f'Following are top 10 most popular moviews based on consolidated user ratings:')
for itr, item in enumerate(final_result_rdd, start=1):
  print(f'#{itr} {item[2]}, {item[1]}')

Following are top 10 most popular moviews based on consolidated user ratings:
#1 Star Wars: Episode V - The Empire Strikes Back (1980), 12836
#2 Star Wars: Episode VI - Return of the Jedi (1983), 11598
#3 Saving Private Ryan (1998), 11507
#4 Raiders of the Lost Ark (1981), 11257
#5 Sixth Sense, The (1999), 10835
#6 American Beauty (1999), 14800
#7 Star Wars: Episode IV - A New Hope (1977), 13321
#8 Silence of the Lambs, The (1991), 11219
#9 Matrix, The (1999), 11178
#10 Terminator 2: Judgment Day (1991), 10751


## Q d) Provide statistical summary of the ratings provided by the user (count, mean, stddev, min, max)

Find out differnet values of count, mean, stddev, min and max

In [84]:
# In ratings data, first column is userid and third column is rating. get count of distinct users
ratings_rdd = splitted_ratings_data.map(lambda x: int(x[2]))

# Total count of user ratings
count_ratings = ratings_rdd.count()

# Find mean of user ratings
mean_rating = ratings_rdd.mean()

# Find standard deviation
std_dev_rating = ratings_rdd.stdev()

# Find min rating
min_rating = ratings_rdd.min()

# Find max rating
max_rating = ratings_rdd.max()


Print the summary

In [86]:
print(f'Following is the statistical summary of the ratings provided by the users')
print(f'Count of ratings   : {count_ratings}')
print(f'Mean of ratings    : {mean_rating}')
print(f'Stddev of ratings  : {std_dev_rating}')
print(f'Minimum of ratings : {min_rating}')
print(f'Maximum of ratings : {max_rating}')


Following is the statistical summary of the ratings provided by the users
Count of ratings   : 1000209
Mean of ratings    : 3.5815644530292317
Stddev of ratings  : 1.1171012869389108
Minimum of ratings : 1
Maximum of ratings : 5
