In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Configure a local-mode Spark application named "PopularMovies".
conf = SparkConf().setMaster("local").setAppName("PopularMovies")
# getOrCreate() hands back the already-running context when there is one, so
# re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
def loadMovieNames(path="ml-100k/u.ITEM"):
    """Build a movie-ID -> title lookup from a pipe-delimited item file.

    Every non-blank line is split on ``|``; the first field is parsed as an
    integer key and the second field is stored as its value.

    Args:
        path: Location of the item file. Defaults to the path this notebook
            has always read, so existing ``loadMovieNames()`` calls behave
            as before.
            NOTE(review): the MovieLens 100k archive appears to ship this
            file as ``u.item``; the upper-case spelling presumably only
            resolves on case-insensitive filesystems - confirm.

    Returns:
        dict mapping int movie ID to its title string.

    Raises:
        OSError: if ``path`` cannot be opened.
        ValueError: if a non-blank line has a non-integer first field.
        IndexError: if a non-blank line contains no ``|`` separator.
    """
    movieNames = {}
    with open(path, encoding='Latin-1') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no record;
            # previously they crashed the load inside int().
            if not line.strip():
                continue
            # Drop the line terminator so a title sitting in the last field
            # does not end with '\n'.
            fields = line.rstrip('\n').split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [4]:
"""
{
    123: 'Star Wars',
    1341: 'Star Trek'
}
"""

"\n{\n    123: 'Star Wars',\n    1341: 'Star Trek'\n}\n"

In [5]:
# Python convention: my_example_variable (snake_case)
# Scala convention: camelCase

In [6]:
# Load the ID -> title dictionary and wrap it in a Spark broadcast variable;
# the dictionary itself is read back through nameDict.value.
nameDict = sc.broadcast(value=loadMovieNames())

In [7]:
# Create an RDD of text lines from the ratings file (one string per line).
lines = sc.textFile(name="ml-100k/u.data")

In [8]:
# Check what kind of object textFile() handed back.
type(lines)

pyspark.rdd.RDD

In [9]:
# Peek at one raw record to see the field layout before parsing it.
lines.first()

'196\t242\t3\t881250949'

In [10]:
# Confirm the Python type of a single record in the RDD.
type(lines.first())

str

In [11]:
# Split every record on whitespace, producing one list of string fields per line.
movies = lines.map(lambda record: record.split())

In [12]:
# Inspect the type of the RDD returned by map().
type(movies)

pyspark.rdd.PipelinedRDD

In [13]:
# Show the first element of `movies` to check the result of the map() above.
movies.first()

['196', '242', '3', '881250949']

In [14]:
# Rebuild `movies` as (key, 1) pairs: the key is the second whitespace-separated
# field of each record converted to int, and the 1 is a per-record count that
# can be summed per key.
movies = lines.map(lambda record: (int(record.split()[1]), 1))
movies.first()

(242, 1)

In [15]:
'''
(242, 1)
(242, 1)
(242, 1)
(242, 1)
(242, 1)

(242, 5)

'''

'\n(242, 1)\n(242, 1)\n(242, 1)\n(242, 1)\n(242, 1)\n\n(242, 5)\n\n'

In [16]:
# Add up the per-record 1s for each key, leaving one (key, total) pair per key.
movieCounts = movies.reduceByKey(lambda runningTotal, nextCount: runningTotal + nextCount)

In [17]:
# Swap each pair to (total, key) so the total becomes the key of the pair.
flipped = movieCounts.map(lambda pair: (pair[1], pair[0]))
flipped.first()

(117, 242)

In [18]:
# Order the (total, key) pairs by total; ascending is sortByKey's default and
# is spelled out here, so the smallest totals come first.
sortedMovies = flipped.sortByKey(ascending=True)
sortedMovies.first()

(1, 1348)

In [19]:
# Replace the numeric ID in each (total, id) pair with its title from the
# broadcast dictionary, yielding (title, total).
sortedMoviesWithNames = sortedMovies.map(
    lambda countAndId: (nameDict.value[countAndId[1]], countAndId[0])
)
sortedMoviesWithNames.first()

('Every Other Weekend (1990)', 1)

In [20]:
# Pull every element of the RDD back to the driver as a Python list.
results = sortedMoviesWithNames.collect()

In [21]:
# Print one (title, total) tuple per line, in the order collect() returned them.
for entry in results:
    print(entry)

('Every Other Weekend (1990)', 1)
('Homage (1995)', 1)
('Window to Paris (1994)', 1)
('Bird of Prey (1996)', 1)
('Modern Affair, A (1995)', 1)
('Power 98 (1995)', 1)
('Farmer & Chase (1995)', 1)
('Great Day in Harlem, A (1994)', 1)
('Fear, The (1995)', 1)
('Substance of Fire, The (1996)', 1)
('Good Morning (1971)', 1)
('Very Natural Thing, A (1974)', 1)
('Paris Was a Woman (1995)', 1)
('Other Voices, Other Rooms (1997)', 1)
('Walk in the Sun, A (1945)', 1)
('Aiqing wansui (1994)', 1)
('T-Men (1947)', 1)
('Lotto Land (1995)', 1)
('Love Is All There Is (1996)', 1)
('Johns (1996)', 1)
('Police Story 4: Project S (Chao ji ji hua) (1993)', 1)
('Damsel in Distress, A (1937)', 1)
('Tigrero: A Film That Was Never Made (1994)', 1)
("I Don't Want to Talk About It (De eso no se habla) (1993)", 1)
('Daens (1992)', 1)
('Promise, The (Versprechen, Das) (1994)', 1)
('Cyclo (1995)', 1)
('Killer: A Journal of Murder (1995)', 1)
('Bloody Child, The (1996)', 1)
('Yankee Zulu (1994)', 1)
("Eye of Vichy, T

In [None]:
# Shut down the SparkContext (`sc`) once the notebook is finished with it.
sc.stop()