# <hr style="clear: both" />

# Running Spark in YARN-client mode

This notebook demonstrates how to set up a SparkContext that runs its executors on SURFsara's Hadoop cluster, which is managed by the [YARN resourcemanager](http://head05.hathi.surfsara.nl:8088/cluster). Note that you need to be authenticated via Kerberos on your machine to visit the resourcemanager link.

First initialize kerberos via a Jupyter terminal. 
In the terminal execute: <BR>
<i>kinit -k -t data/robertop.keytab robertop@CUA.SURFSARA.NL</i><BR>
Print your credentials:


In [2]:
# Show the Kerberos tickets currently held in the credential cache.
! klist

Ticket cache: FILE:/tmp/krb5cc_1000
Default principal: robertop@CUA.SURFSARA.NL

Valid starting       Expires              Service principal
04/17/2016 06:40:43  04/18/2016 06:40:42  krbtgt/CUA.SURFSARA.NL@CUA.SURFSARA.NL
	renew until 04/17/2016 06:40:43


In [3]:
# List the HDFS directory that `hdfs dfs -ls` shows when no path is given.
! hdfs dfs -ls 
# Run bullet.py inside the notebook namespace.
# NOTE(review): presumably this is the script that defines sendNotificationToMattia(),
# which the later cells call -- confirm.
execfile('../spark-scripts/bullet.py')

Found 5 items
drwx------   - robertop hdfs          0 2016-04-16 06:00 .Trash
drwxr-xr-x   - robertop hdfs          0 2016-04-17 06:39 .sparkStaging
drwx------   - robertop hdfs          0 2016-04-06 15:54 .staging
drwxr-xr-x   - robertop hdfs          0 2016-04-15 14:38 mattia
drwxr-xr-x   - robertop hdfs          0 2016-04-13 10:00 recsys2016Competition


Verify that we can browse HDFS:

Next initialize Spark. Note that the code below starts a job on the Hadoop cluster that will remain running while the notebook is active. Please close and halt the notebook when you are done. Starting the SparkContext can take a little while. You can check the YARN resourcemanager to see the current status/usage of the cluster.

In [4]:
import os
# PYSPARK_PYTHON names the Python interpreter PySpark should use.
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python2.7'

HDFS_PATH = "hdfs://hathi-surfsara"

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Every SparkConf setter returns the conf itself, so the whole configuration
# is written as a single chained expression.
sconf = (
    SparkConf()
    # Master is yarn-client. The YARN and hadoop config is read from the environment.
    .setMaster("yarn-client")
    # Many Spark settings can be controlled via the SparkConf.
    # This one determines the amount of executors on the cluster:
    .set("spark.executor.instances", "200")
    #.set("spark.executor.memory", "20g")
    .set("spark.yarn.executor.memoryOverhead", 24000)
    .set("spark.yarn.driver.memoryOverhead", 24000)
    .set("spark.network.timeout", "1500s")
    .set("spark.rpc.askTimeout", "1500s")
    # UFW (firewall) is active on the VM. These ports were opened explicitly,
    # so Spark must use them instead of binding to random ports:
    .set("spark.driver.port", 51800)
    .set("spark.fileserver.port", 51801)
    .set("spark.broadcast.port", 51802)
    .set("spark.replClassServer.port", 51803)
    .set("spark.blockManager.port", 51804)
    .set("spark.authenticate", True)
    .set("spark.yarn.keytab", "/home/jovyan/work/data/robertop.keytab")
    .set("spark.yarn.access.namenodes", HDFS_PATH + ":8020")
)

try:
    sc = SparkContext(conf=sconf)
    sqlCtx = SQLContext(sc)
    sendNotificationToMattia("Spark Context", "Ready!")
except Exception as err:
    # The failure is only reported through the notification and is not
    # re-raised: when this branch runs, `sc` and `sqlCtx` stay undefined.
    sendNotificationToMattia("Fuck you!", str(err))

# <hr style="clear: both" />

# Now you can run your code

Pick a clustering algorithm (name of the file that provides a classify(x,y [,threshold]) function)

In [7]:
# Load the project helper scripts into the notebook namespace, in this order.
execfile('../spark-scripts/conventions.py')
execfile('../spark-scripts/splitCluster.py')
#execfile('../spark-scripts/utils.py')
execfile('../spark-scripts/eval.py')
execfile('../spark-scripts/implicitPlaylistAlgoFunctions.py')
execfile('../spark-scripts/implicitPlaylistAlgoMain.py')

# Name (without the .py suffix) of the script loaded just below; the same name
# is reused as the last path component when the clusters are saved.
CLUSTER_ALGO = 'jaccardBase'

# Per the notebook text, this script is expected to supply classify(x, y [, threshold]).
execfile('../spark-scripts/' + CLUSTER_ALGO + '.py')


# Reading the conf file

In [8]:
# NOTE(review): `copy` is not used in the visible cells; functions loaded with
# execfile() share this namespace and may rely on it -- confirm before removing.
import json
import copy

# Root HDFS directory of the project data.
BASE_PATH = HDFS_PATH + '/user/robertop/mattia'

# Nested configuration dictionary with four sections: split, evaluation, general, algo.
# NOTE(review): none of the visible cells reads `conf`; presumably the scripts
# loaded with execfile() consume it -- confirm.
conf = {}

# --- split section ---
conf['split'] = {}
conf['split']['reclistSize'] = 100
conf['split']['callParams'] = {}
conf['split']['excludeAlreadyListenedTest'] = True
conf['split']['name'] = 'test'
# 'split' mirrors 'name'.
conf['split']['split'] = conf['split']['name']
conf['split']['minEventsPerUser'] = 5
conf['split']['inputData'] = HDFS_PATH + '/user/robertop/mattia/clusterBase.split/SenzaRipetizioni_1'
#conf['split']['inputData'] = 's3n://contentwise-research-poli/30musicdataset/newFormat/relations/sessions.idomaar'
conf['split']['bucketName'] = BASE_PATH
conf['split']['percUsTr'] = 0.05
# The point 75% of the way from 1390209860 to 1421745857, minus 10000
# (presumably Unix epoch seconds -- confirm).
conf['split']['ts'] = int(0.75 * (1421745857 - 1390209860) + 1390209860) - 10000
conf['split']['minEventPerSession'] = 5
conf['split']['onlineTrainingLength'] = 5
conf['split']['GTlength'] = 1
conf['split']['minEventPerSessionTraining'] = 10
conf['split']['minEventPerSessionTest'] = 11
conf['split']['mode'] = 'session'
conf['split']['forceSplitCreation'] = False
# 'prop' repeats the reclistSize set above.
conf['split']["prop"] = {'reclistSize': conf['split']['reclistSize']}
# The value stored here is the builtin `list` type object, not the string 'list'.
conf['split']['type'] = list
conf['split']['out'] = HDFS_PATH + '/user/robertop/mattia/clusterBase.split/'
conf['split']['location'] = '30Mdataset/relations/sessions'

# --- evaluation section ---
conf['evaluation'] = {}
conf['evaluation']['metric'] = {}
conf['evaluation']['metric']['type'] = 'recall'
conf['evaluation']['metric']['prop'] = {}
conf['evaluation']['metric']['prop']['N'] = [1,2,5,10,15,20,25,50,100]
conf['evaluation']['name'] = 'recall@N'

# --- general section ---
conf['general'] = {}
conf['general']['clientname'] = "clusterBase.split"
# This bucket name uses a head02 host string and no hdfs:// scheme, unlike
# BASE_PATH, which is built from HDFS_PATH.
conf['general']['bucketName'] = 'head02.hathi.surfsara.nl/user/robertop/mattia'
conf['general']['tracksPath'] = '30Mdataset/entities/tracks.idomaar.gz'

# --- algo section ---
conf['algo'] = {}
conf['algo']['name'] = 'ClusterBase'
conf['algo']['props'] = {}
# ***** EXAMPLE OF CONFIGURATION *****#
conf['algo']['props']["sessionJaccardShrinkage"] = 5
conf['algo']['props']["clusterSimilarityThreshold"] = 0.1
conf['algo']['props']["expDecayFactor"] = 0.7
# ****** END EXAMPLE ****************#




Pick the list of songs and create clusters


In [None]:
import json
import string

def my_replace_punct(x):
    """Return ``x`` with every '+' character replaced by a single space.

    Despite the name, '+' is the only character that is rewritten; all
    other characters are left untouched.

    Args:
        x: the string to clean (``str`` or ``unicode``).

    Returns:
        A string of the same length as ``x`` in which each '+' became ' '.
    """
    # The built-in replace does the substitution in one pass instead of
    # rebuilding the result character by character with repeated concatenation.
    return x.replace('+', ' ')

# Build (track id, tokens, cleaned name) records from the tracks file:
#   1. read the file and spread it over 2000 partitions;
#   2. split every line on tabs; column 1 becomes the id, column 3 is parsed
#      as JSON and its 'name' value is split on '/';
#   3. keep segments 0 and 2 of the name, joined by a space;
#   4. turn '+' into spaces with my_replace_punct;
#   5. add the tokens returned by tokenize_song in front of the cleaned name.
tracksRDD = (
    sc.textFile(BASE_PATH + '/30Mdataset/entities/tracks.idomaar.gz')
    .repartition(2000)
    .map(lambda line: line.split('\t'))
    .map(lambda fields: (fields[1], json.loads(fields[3])['name'].split('/')))
    .map(lambda rec: (rec[0], " ".join((rec[1][0], rec[1][2]))))
    .map(lambda rec: (rec[0], my_replace_punct(rec[1])))
    .map(lambda rec: (rec[0], tokenize_song(rec[1]), rec[1]))
)

#tracksRDD.saveAsPickleFile(BASE_PATH + '/30Mdataset/entities/tracksPickle')
# One (int id, [id]) record per track, i.e. every track as a one-element group.
tracksIdsRDD = tracksRDD.map(lambda track: (int(track[0]), [track[0]]))

try:
    # count() is an action, so the pipeline above is actually run here.
    tracksRDD.count()
    parts = str(tracksRDD.getNumPartitions())
    sendNotificationToMattia("Tracks loaded", parts + " parts")
except Exception as err:
    # Failures are reported through the notification only; nothing is re-raised.
    sendNotificationToMattia("Fuck you!", str(err))

In [7]:
#tracksRDD = tracksRDD.take(50000)
#tracksRDD = sc.parallelize(tracksRDD)

In [None]:
# Disabled experiment kept as a bare string literal: nothing inside it is executed.
# It refers to `sampleRDD`, which no visible cell defines, and reads the
# tracksPickle output whose save call is commented out in the track-loading cell.
'''
def append_equals(x, part):
    name = x[1]
    part = part.value
    for song in part:
        if classify(name, song[1]):
            x[2].append(song[0])
    return x

tracksEqualsRDD = sampleRDD
for i in range(2000):   
    partRDD = sc.pickleFile(BASE_PATH + '/30Mdataset/entities/tracksPickle/part-' + str(i).zfill(5))
    print 'Loaded part ' + str(i)
    part = sc.broadcast(partRDD.collect())
    print 'Broadcasted part ' + str(i)
    tracksEqualsRDD = tracksEqualsRDD.map(lambda x: append_equals(x, part))
    print 'Mapped part ' + str(i)
    tracksEqualsRDD.count()
    print 'Counted part ' + str(i)
    
tracksEqualsRDD.take(10)
'''

In [None]:
#Build an RDD with ('word' -> (id, name))
wordsRDD = tracksRDD.flatMap(lambda x: [(i, (x[0], x[2])) for i in x[1]] )

#Group by 'word' and keep only the ones with more then 1 song
wordsRDD = wordsRDD.groupByKey().mapValues(list).filter(lambda x: len(x[1]) > 1)

#Eliminate words in top 2% of words
nwords = int(wordsRDD.count()*0.02)
top_words = wordsRDD.map(lambda x: (x[0], len(x[1]))).takeOrdered(nwords, lambda x: -x[1])
wordsRDD = wordsRDD.filter(lambda x: x[0] not in top_words)
print top_words[:10]

#Compute a cartesian product for each list of songs with a common word
def filtered_cartesian(x):
    """Pair up the songs listed for one word and keep the pairs classify() accepts.

    Args:
        x: a ``(word, songs)`` record in which ``songs`` is a sequence of
            ``(id, name)`` tuples.

    Returns:
        ``(word, couples)`` where ``couples`` is a tuple of ``(song_a, song_b)``
        pairs. A pair is kept when the two ids differ and
        ``classify(name_a, name_b)`` is truthy. Each unordered pair is examined
        once, with the song that comes later in ``songs`` placed first.
    """
    word, songs = x[0], x[1]
    matches = set()
    for pos, later in enumerate(songs):
        # Only look back at the songs that precede the current one.
        for earlier in songs[:pos]:
            if later[0] != earlier[0] and classify(later[1], earlier[1]):
                matches.add((later, earlier))
    return (word, tuple(matches))

coupleRDD = wordsRDD.map(filtered_cartesian).filter(lambda x: len(x[1]) > 1)
try:
    print coupleRDD.count()
    sendNotificationToMattia("Words!", "Couples found.")
except Exception, err:
    sendNotificationToMattia("Fuck you!", str(err))

In [None]:
def flat_song_couple(x):
    """Key every couple of a word record by each of its two songs.

    Args:
        x: a ``(word, couples)`` record; every couple is a pair of
            ``(id, name)`` tuples.

    Returns:
        A list with two ``(int(song_id), couple)`` entries per couple: first
        the one keyed by the couple's first song, then by its second song.
    """
    return [
        (int(couple[side][0]), couple)
        for couple in x[1]
        for side in (0, 1)
    ]
    
# Flatten the per-word couples.
# Earlier variant, kept for reference: emit the bare couples without a key.
#flattedCoupleRDD = coupleRDD.flatMap(lambda x: [i for i in x[1]])
# Current variant: for each couple, for each of its songs, yield (int song id, couple).
flattedCoupleRDD = coupleRDD.flatMap(flat_song_couple)

#Group by key (song). Each song has now one cluster
def merge_couples(x, y):
    """Merge two collections into one duplicate-free list.

    Args:
        x: an iterable of hashable items.
        y: an iterable of hashable items.

    Returns:
        A list holding every distinct item of ``x`` and ``y`` once; the order
        is whatever the underlying set yields.
    """
    combined = set(x).union(set(y))
    return list(combined)

try:
    songClusterRDD = flattedCoupleRDD.reduceByKey(merge_couples).map(lambda x: (x[0], [i[0] for i in x[1]]))
    print songClusterRDD.count()
    sendNotificationToMattia("Song to Cluster!", "Found the cluster for each song. Unifying...")
except Exception, err:
    sendNotificationToMattia("Fuck you!", str(err))

In [None]:
#In this way we obtain a complete RDD with song -> group of songs
def reduce_to_biggest(x, y):
    """Return a sorted copy of the longer of two sequences.

    Args:
        x: first sequence.
        y: second sequence.

    Returns:
        A new sorted list built from ``x`` when it is strictly longer than
        ``y``, otherwise from ``y`` (so ``y`` wins a tie).
    """
    if len(y) >= len(x):
        chosen = y
    else:
        chosen = x
    return sorted(chosen)
 
# Every track as a one-element group: (int id, [id]). The track-loading cell
# builds the same RDD; it is rebuilt here from tracksRDD.
tracksIdsRDD = tracksRDD.map(lambda track: (int(track[0]), [track[0]]))
# Add the singletons to the clustered songs and keep, per song id, the larger
# of the groups found for it.
unionRDD = (
    songClusterRDD
    .union(tracksIdsRDD)
    .reduceByKey(reduce_to_biggest)
)
unionRDD.take(3)

sendNotificationToMattia("Tracks unified with all tracks ids", "Inverting and unifying clusters...")

In [None]:
# Invert the mapping to cluster -> songs: the space-joined member ids act as
# the cluster key, the song ids sharing that key are collected into a list,
# and the key is then swapped for a numeric index from zipWithIndex.
clusterSongsRDD = (
    unionRDD
    .map(lambda song_group: (' '.join(song_group[1]), song_group[0]))
    .groupByKey()
    .mapValues(list)
    .zipWithIndex()
    .map(lambda indexed: (indexed[1], indexed[0][1]))
)
clusterSongsRDD.take(3)
sendNotificationToMattia("Unique clusters found!", "Writing...")

In [None]:
#Save clustering
# Write the (cluster index, [song ids]) records as a pickle file under
# BASE_PATH/clusters/<CLUSTER_ALGO>. The RDD method for this is
# saveAsPickleFile; RDD has no method named saveAsPickle.
clusterSongsRDD.saveAsPickleFile(BASE_PATH + '/clusters/' + CLUSTER_ALGO)

sendNotificationToMattia("File Written!", BASE_PATH + '/clusters/' + CLUSTER_ALGO)