In [1]:
# Set up the shell environment for the HDFS commands used below.
# NOTE(review): `hdp` is presumably the cluster's Hadoop environment module and
# `cypress-kinit` presumably obtains an authentication ticket -- confirm.
!module add hdp
!cypress-kinit

In [2]:
!hdfs dfs -ls -h /repository/movielens  # the MovieLens dataset was updated in March 2017.

Found 7 items
-rw-r--r--   2 lngo hdfs-user      9.3 K 2017-03-15 09:49 /repository/movielens/README.txt
-rw-r--r--   2 lngo hdfs-user    317.9 M 2017-03-15 09:49 /repository/movielens/genome-scores.csv
-rw-r--r--   2 lngo hdfs-user     17.7 K 2017-03-15 09:49 /repository/movielens/genome-tags.csv
-rw-r--r--   2 lngo hdfs-user    839.2 K 2017-03-15 09:49 /repository/movielens/links.csv
-rw-r--r--   2 lngo hdfs-user      1.9 M 2017-03-15 09:49 /repository/movielens/movies.csv
-rw-r--r--   2 lngo hdfs-user    632.7 M 2017-03-15 09:49 /repository/movielens/ratings.csv
-rw-r--r--   2 lngo hdfs-user     22.9 M 2017-03-15 09:49 /repository/movielens/tags.csv


In [3]:
%%writefile solution/genreMapper.py
#!/usr/bin/env python
"""Streaming mapper: turn rating records into ``genre<TAB>rating`` pairs.

Reads comma-separated rating records from stdin, where the second field is the
movie id and the third field is the rating, looks the movie id up in the local
movies file, and prints one ``genre<TAB>rating`` line for every ``|``-separated
genre of that movie.

The movies file is read from ``./movielens/movies.csv`` relative to the
working directory, so it must be available locally when the script runs.
"""

import sys
import csv

movieFile = "./movielens/movies.csv"


def index_movies(rows):
    """Build ``{movieId: {"title": ..., "genre": ...}}`` from parsed CSV rows.

    Each row is expected to hold the movie id, title and genre string in its
    first three columns. Rows with fewer than three columns (for example blank
    lines) are skipped instead of aborting the load.
    """
    movies = {}
    for row in rows:
        if len(row) < 3:
            continue
        movies[row[0]] = {"title": row[1], "genre": row[2]}
    return movies


def load_movies(path):
    """Read the movies CSV at `path` and return the lookup built by `index_movies`."""
    with open(path, mode='r') as infile:
        return index_movies(csv.reader(infile))


def map_ratings(lines, movies):
    """Yield a ``"genre\\trating"`` string for every genre of every usable rating.

    `lines` is an iterable of raw rating records and `movies` is the lookup
    returned by `index_movies`. A record is skipped when it has too few fields,
    references a movie id missing from `movies`, or carries a rating that is
    not a number. The ratings header line falls into the last category.
    """
    for line in lines:
        fields = line.strip().split(",")
        try:
            genres = movies[fields[1]]["genre"]
            rating = float(fields[2])
        except (IndexError, KeyError, ValueError):
            # One bad record must not kill the whole streaming task.
            continue
        for genre in genres.split("|"):
            yield "%s\t%s" % (genre, rating)


def main():
    """Load the movie lookup and map every stdin record to stdout."""
    movieList = load_movies(movieFile)
    for record in map_ratings(sys.stdin, movieList):
        print(record)


if __name__ == "__main__":
    main()

Overwriting solution/genreMapper.py


In [4]:
%%writefile solution/genreReducer.py 
#!/usr/bin/env python
"""Streaming reducer: mean, median and sample standard deviation per genre.

Reads ``genre<TAB>rating`` lines from stdin. Lines for the same genre must be
adjacent (the input is expected to be sorted by genre). Prints a header line
followed by one tab-separated statistics line per genre.
"""

import sys
import statistics
import csv
import json


def group_ratings(lines):
    """Yield ``(genre, ratings)`` for each run of adjacent lines sharing a genre.

    Lines without a tab separator or with a non-numeric rating are ignored
    entirely, so a bad line can never leak one genre's ratings into the next.
    Genres that end up with no usable rating are not yielded.
    """
    current_genre = None
    ratings = []
    for line in lines:
        parts = line.strip().split("\t", 1)
        if len(parts) != 2:
            continue
        genre, raw_rating = parts
        try:
            rating = float(raw_rating)
        except ValueError:
            continue
        if genre != current_genre:
            if ratings:
                yield current_genre, ratings
            current_genre = genre
            ratings = []
        ratings.append(rating)
    if ratings:
        yield current_genre, ratings


def format_stats(genre, ratings):
    """Return the output line for one genre; `ratings` must be non-empty.

    The sample standard deviation is undefined for a single rating, so it is
    reported as NaN in that case rather than raising StatisticsError.
    """
    rating_mean = sum(ratings) / len(ratings)
    rating_median = statistics.median(ratings)
    if len(ratings) > 1:
        rating_stdev = statistics.stdev(ratings)  # Sample standard deviation of data.
    else:
        rating_stdev = float("nan")
    return "%s\t\t%.4f\t\t%s\t\t%.4f" % (genre, rating_mean, rating_median, rating_stdev)


def reduce_lines(lines):
    """Yield the header line followed by one statistics line per genre."""
    yield "Genre\t\tMean\t\tMedian\t\tStandard Deviation Sample"
    for genre, ratings in group_ratings(lines):
        yield format_stats(genre, ratings)


def main():
    """Reduce stdin to stdout."""
    for output_line in reduce_lines(sys.stdin):
        print(output_line)


if __name__ == "__main__":
    main()
        
    

Overwriting solution/genreReducer.py


In [6]:
# Run the job as a local shell pipeline: stream ratings.csv out of HDFS
# (discarding stderr), map each rating to its genres, sort so that equal
# genres are adjacent, then reduce to mean/median/stdev per genre.
!hdfs dfs -cat /repository/movielens/ratings.csv 2>/dev/null \
    | python ./solution/genreMapper.py \
    | sort \
    | python ./solution/genreReducer.py

Genre		Mean		Median		Standard Deviation Sample
Action		3.4545		3.5		1.0721
Adventure		3.5071		3.5		1.0675
Animation		3.6105		4.0		1.0317
Children		3.4166		3.5		1.1011
Comedy		3.4175		3.5		1.0850
Crime		3.6785		4.0		1.0132
Documentary		3.7228		4.0		1.0220
Drama		3.6743		4.0		1.0025
Fantasy		3.5030		3.5		1.0868
Film-Noir		3.9408		4.0		0.9155
Horror		3.2753		3.5		1.1521
IMAX		3.6371		4.0		1.0275
Musical		3.5439		4.0		1.0627
Mystery		3.6615		4.0		1.0119
(no genres listed)		3.2080		3.5		1.2311
Romance		3.5425		4.0		1.0468
Sci-Fi		3.4552		3.5		1.0916
Thriller		3.5127		3.5		1.0399
War		3.8033		4.0		0.9969
Western		3.5716		4.0		1.0256


In [9]:
%%writefile solution/meanGenreReducer.py
#!/usr/bin/env python
"""Streaming reducer: mean rating per genre.

Reads ``genre<TAB>rating`` lines from stdin. Lines for the same genre must be
adjacent (the input is expected to be sorted by genre). Prints a header line
followed by one ``genre<TAB><TAB>mean`` line per genre.
"""
import sys


def mean_by_genre(lines):
    """Yield ``(genre, mean_rating)`` for each run of adjacent same-genre lines.

    Only a running sum and count are kept, so memory use does not grow with
    the number of ratings. Lines without a tab separator or with a non-numeric
    rating are ignored entirely, so a bad line can never carry one genre's
    totals over into the next. Genres with no usable rating are not yielded.
    """
    current_genre = None
    current_rating_sum = 0.0
    current_rating_count = 0
    for line in lines:
        parts = line.strip().split("\t", 1)
        if len(parts) != 2:
            continue
        genre, raw_rating = parts
        try:
            rating = float(raw_rating)
        except ValueError:
            continue
        if genre != current_genre:
            if current_rating_count:
                yield current_genre, current_rating_sum / current_rating_count
            current_genre = genre
            current_rating_sum = 0.0
            current_rating_count = 0
        current_rating_sum += rating
        current_rating_count += 1
    if current_rating_count:
        yield current_genre, current_rating_sum / current_rating_count


def reduce_lines(lines):
    """Yield the header line followed by one formatted mean line per genre."""
    yield "Genre\t\tMean"
    for genre, rating_average in mean_by_genre(lines):
        yield "%s\t\t%.4f" % (genre, rating_average)


def main():
    """Reduce stdin to stdout."""
    for output_line in reduce_lines(sys.stdin):
        print(output_line)


if __name__ == "__main__":
    main()

Overwriting solution/meanGenreReducer.py


In [10]:
# Same local pipeline as before, but reducing to the mean rating per genre only.
!hdfs dfs -cat /repository/movielens/ratings.csv 2>/dev/null \
    | python ./solution/genreMapper.py \
    | sort \
    | python ./solution/meanGenreReducer.py

Genre		Mean
Action		3.4545
Adventure		3.5071
Animation		3.6105
Children		3.4166
Comedy		3.4175
Crime		3.6785
Documentary		3.7228
Drama		3.6743
Fantasy		3.5030
Film-Noir		3.9408
Horror		3.2753
IMAX		3.6371
Musical		3.5439
Mystery		3.6615
(no genres listed)		3.2080
Romance		3.5425
Sci-Fi		3.4552
Thriller		3.5127
War		3.8033
Western		3.5716


In [11]:
%%writefile solution/medianGenreReducer.py
#!/usr/bin/env python
"""Streaming reducer: median rating per genre.

Reads ``genre<TAB>rating`` lines from stdin. Lines for the same genre must be
adjacent (the input is expected to be sorted by genre). Prints a header line
followed by one ``genre<TAB><TAB>median`` line per genre.
"""
import sys
import statistics


def group_ratings(lines):
    """Yield ``(genre, ratings)`` for each run of adjacent lines sharing a genre.

    Lines without a tab separator or with a non-numeric rating are ignored
    entirely, so a bad line can never leak one genre's ratings into the next.
    Genres that end up with no usable rating are not yielded.
    """
    current_genre = None
    ratings = []
    for line in lines:
        parts = line.strip().split("\t", 1)
        if len(parts) != 2:
            continue
        genre, raw_rating = parts
        try:
            rating = float(raw_rating)
        except ValueError:
            continue
        if genre != current_genre:
            if ratings:
                yield current_genre, ratings
            current_genre = genre
            ratings = []
        ratings.append(rating)
    if ratings:
        yield current_genre, ratings


def reduce_lines(lines):
    """Yield the header line followed by one formatted median line per genre."""
    yield "Genre\t\tMedian"
    for genre, ratings in group_ratings(lines):
        rating_median = statistics.median(ratings)
        yield "%s\t\t%.4f" % (genre, rating_median)


def main():
    """Reduce stdin to stdout."""
    for output_line in reduce_lines(sys.stdin):
        print(output_line)


if __name__ == "__main__":
    main()

Writing solution/medianGenreReducer.py


In [12]:
# Same local pipeline as before, but reducing to the median rating per genre only.
!hdfs dfs -cat /repository/movielens/ratings.csv 2>/dev/null \
    | python ./solution/genreMapper.py \
    | sort \
    | python ./solution/medianGenreReducer.py

Genre		Median
Action		3.5000
Adventure		3.5000
Animation		4.0000
Children		3.5000
Comedy		3.5000
Crime		4.0000
Documentary		4.0000
Drama		4.0000
Fantasy		3.5000
Film-Noir		4.0000
Horror		3.5000
IMAX		4.0000
Musical		4.0000
Mystery		4.0000
(no genres listed)		3.5000
Romance		4.0000
Sci-Fi		3.5000
Thriller		3.5000
War		4.0000
Western		4.0000


In [13]:
%%writefile solution/stdevGenreReducer.py
#!/usr/bin/env python
"""Streaming reducer: sample standard deviation of ratings per genre.

Reads ``genre<TAB>rating`` lines from stdin. Lines for the same genre must be
adjacent (the input is expected to be sorted by genre). Prints a header line
followed by one ``genre<TAB>stdev`` line per genre.
"""
import sys
import statistics


def group_ratings(lines):
    """Yield ``(genre, ratings)`` for each run of adjacent lines sharing a genre.

    Lines without a tab separator or with a non-numeric rating are ignored
    entirely, so a bad line can never leak one genre's ratings into the next.
    Genres that end up with no usable rating are not yielded.
    """
    current_genre = None
    ratings = []
    for line in lines:
        parts = line.strip().split("\t", 1)
        if len(parts) != 2:
            continue
        genre, raw_rating = parts
        try:
            rating = float(raw_rating)
        except ValueError:
            continue
        if genre != current_genre:
            if ratings:
                yield current_genre, ratings
            current_genre = genre
            ratings = []
        ratings.append(rating)
    if ratings:
        yield current_genre, ratings


def sample_stdev(ratings):
    """Return the sample standard deviation, or NaN when it is undefined.

    statistics.stdev needs at least two data points; a genre with a single
    rating is reported as NaN instead of aborting the whole reduce step.
    """
    if len(ratings) < 2:
        return float("nan")
    return statistics.stdev(ratings)


def reduce_lines(lines):
    """Yield the header line followed by one ``genre<TAB>stdev`` line per genre."""
    yield "Genre\t\tStandard Deviation Sample"
    for genre, ratings in group_ratings(lines):
        rating_stdev = sample_stdev(ratings)
        yield "%s\t%s" % (genre, rating_stdev)


def main():
    """Reduce stdin to stdout."""
    for output_line in reduce_lines(sys.stdin):
        print(output_line)


if __name__ == "__main__":
    main()

Writing solution/stdevGenreReducer.py


In [14]:
# Same local pipeline as before, but reducing to the sample standard deviation per genre only.
!hdfs dfs -cat /repository/movielens/ratings.csv 2>/dev/null \
    | python ./solution/genreMapper.py \
    | sort \
    | python ./solution/stdevGenreReducer.py

Genre		Standard Deviation Sample
Action	1.0721365559782599
Adventure	1.0675220659603486
Animation	1.0317178473795843
Children	1.1010527199833091
Comedy	1.0850123560000309
Crime	1.0132314135781613
Documentary	1.0220074103047196
Drama	1.002524350894216
Fantasy	1.0867500247329882
Film-Noir	0.9154781611402477
Horror	1.1520617325923062
IMAX	1.0274731139134385
Musical	1.0627111413546169
Mystery	1.0119406716579022
(no genres listed)	1.2310507794221146
Romance	1.046780167005858
Sci-Fi	1.0916316918987354
Thriller	1.0398685086065347
War	0.9968902171996887
Western	1.0256165325514484
