# <hr style="clear: both" />

# Running Spark in YARN-client mode

This notebook demonstrates how to set up a SparkContext that runs its executors on SURFsara's Hadoop cluster via the [YARN resourcemanager](http://head05.hathi.surfsara.nl:8088/cluster). Note that you need to be authenticated via Kerberos on your machine to visit the resourcemanager link.

First initialize kerberos via a Jupyter terminal. 
In the terminal execute: <BR>
<i>kinit -k -t data/robertop.keytab robertop@CUA.SURFSARA.NL</i><BR>
Print your credentials:


In [1]:
# List the Kerberos tickets in the current credential cache, to confirm that a valid ticket is present.
! klist

Ticket cache: FILE:/tmp/krb5cc_1000
Default principal: robertop@CUA.SURFSARA.NL

Valid starting       Expires              Service principal
06/04/2016 13:18:14  06/05/2016 13:18:14  krbtgt/CUA.SURFSARA.NL@CUA.SURFSARA.NL
	renew until 06/04/2016 13:18:14


In [2]:
# List HDFS with no path argument; the recorded output shows directories owned by the authenticated user.
! hdfs dfs -ls 
# Execute the helper script in this notebook's namespace (`execfile` exists only in Python 2).
# NOTE(review): presumably this script defines sendNotificationToMattia, which later cells call — confirm.
execfile('../spark-scripts/bullet.py')

Found 5 items
drwx------   - robertop hdfs          0 2016-05-26 06:00 .Trash
drwxr-xr-x   - robertop hdfs          0 2016-06-04 13:38 .sparkStaging
drwx------   - robertop hdfs          0 2016-04-06 15:54 .staging
drwxr-xr-x   - robertop hdfs          0 2016-05-25 06:28 mattia
drwxr-xr-x   - robertop hdfs          0 2016-04-13 10:00 recsys2016Competition


Verify that we can browse HDFS:

Next initialize Spark. Note that the code below starts a job on the Hadoop cluster that will remain running while the notebook is active. Please close and halt the notebook when you are done. Starting the SparkContext can take a little longer. You can check the YARN resourcemanager to see the current status/usage of the cluster.

In [3]:
import os
# Python interpreter the Spark workers should use; PySpark picks this up from the environment.
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python2.7'

# Base URI of the cluster's HDFS; later cells prefix their paths with it.
HDFS_PATH = "hdfs://hathi-surfsara"

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
# The positional False is SparkConf's loadDefaults flag: start from an empty configuration.
sconf = SparkConf(False)

sconf.setAppName('hybrid')

# Master is now yarn-client. The YARN and hadoop config is read from the environment
sconf.setMaster("yarn-client")

# You can control many Spark settings via the SparkConf. This determines the amount of executors on the cluster:
sconf.set("spark.executor.instances", "200")
#sconf.set("spark.executor.memory", "10g")

# UFW (firewall) is active on the VM. We explicitly opened these ports and Spark should not bind to random ports:
sconf.set("spark.driver.port", 51800)
sconf.set("spark.fileserver.port", 51801)
sconf.set("spark.broadcast.port", 51802)
sconf.set("spark.replClassServer.port", 51803)
sconf.set("spark.blockManager.port", 51804)
sconf.set("spark.authenticate", True)
sconf.set("spark.yarn.keytab", "/home/jovyan/work/data/robertop.keytab")
sconf.set("spark.yarn.access.namenodes", HDFS_PATH + ":8020")

# Create the contexts and send a notification either way.
# NOTE: a failure is reported and printed but NOT re-raised, so `sc` and `sqlCtx`
# stay undefined and later cells will fail with a NameError.
try:
    sc = SparkContext(conf=sconf)
    sqlCtx = SQLContext(sc)
    sendNotificationToMattia("Spark Context", "Ready!")
except Exception as err:  # `as` form is valid on Python 2.6+ and Python 3
    sendNotificationToMattia("Fuck you!", str(err))
    print(str(err))

# <hr style="clear: both" />

# Now you can run your code

Pick a clustering algorithm (name of the file that provides a classify(x,y [,threshold]) function)

# Reading the conf file

In [4]:
import json
import copy

# Root folder on HDFS under which this experiment's inputs and outputs live.
BASE_PATH = HDFS_PATH + '/user/robertop/mattia'

# Experiment configuration, written as one nested literal with four sections:
# 'split' (dataset split), 'evaluation' (metric), 'general' (locations), 'algo' (algorithm).
conf = {
    'split': {
        'reclistSize': 100,
        'callParams': {},
        'excludeAlreadyListenedTest': True,
        'name': 'test',
        'minEventsPerUser': 5,
        'inputData': HDFS_PATH + '/user/robertop/mattia/clusterBase.split/SenzaRipetizioni_1',
        # Alternative input kept for reference:
        # 's3n://contentwise-research-poli/30musicdataset/newFormat/relations/sessions.idomaar'
        'bucketName': BASE_PATH,
        'percUsTr': 0.05,
        # Point 75% of the way from 1390209860 to 1421745857, minus 10000
        # (presumably Unix timestamps in seconds — TODO confirm).
        'ts': int(0.75 * (1421745857 - 1390209860) + 1390209860) - 10000,
        'minEventPerSession': 5,
        'onlineTrainingLength': 5,
        'GTlength': 1,
        'minEventPerSessionTraining': 10,
        'minEventPerSessionTest': 11,
        'mode': 'session',
        'forceSplitCreation': False,
        # NOTE(review): this stores the builtin `list` type itself, not the string 'list' — confirm intent.
        'type': list,
        'out': HDFS_PATH + '/user/robertop/mattia/clusterBase.split/',
        'location': '30Mdataset/relations/sessions',
    },
    'evaluation': {
        'metric': {
            'type': 'recall',
            'prop': {'N': [1, 2, 5, 10, 15, 20, 25, 50, 100]},
        },
        'name': 'recall@N',
    },
    'general': {
        'clientname': "clusterBase.split",
        'bucketName': BASE_PATH,
        'tracksPath': '30Mdataset/entities/tracks.idomaar.gz',
    },
    'algo': {
        'name': 'ClusterBase',
        # Example algorithm parameters.
        'props': {
            "sessionJaccardShrinkage": 5,
            "clusterSimilarityThreshold": 0.1,
            "expDecayFactor": 0.7,
        },
    },
}

# Entries derived from other 'split' values, filled in once the literal exists.
conf['split']['split'] = conf['split']['name']
conf['split']['prop'] = {'reclistSize': conf['split']['reclistSize']}



# Load Clusters

In [5]:
# Load the two precomputed clusterings from HDFS. The recorded output shows
# records of the form (cluster_id, [song_id, ...]).
# NOTE: pickle files execute arbitrary code when loaded — only read paths you trust.
collaborativeClusters = sc.pickleFile(BASE_PATH + '/clusters/collaborative/min_j_25_avgShrink2_higherMean')
jaccardClusters = sc.pickleFile(BASE_PATH + '/clusters/jaccardBase9')

# Peek at one record of each; print(...) with one argument behaves the same on Python 2 and 3.
print(collaborativeClusters.take(1))
print(jaccardClusters.take(1))

[(0, [3861236])]
[(0, [1730002, 1730001])]


In [6]:
# Keep only clusters with more than one member.
collaborativeClusters = collaborativeClusters.filter(lambda cluster: len(cluster[1]) > 1)
jaccardClusters = jaccardClusters.filter(lambda cluster: len(cluster[1]) > 1)

# Number of multi-song clusters left in each clustering.
print(collaborativeClusters.count())
print(jaccardClusters.count())

296
107360


In [24]:
def _pair_songs_with_members(cluster):
    """Expand one (cluster_id, members) record into (song_id, members) pairs, one per member."""
    members = cluster[1]
    return [(song_id, members) for song_id in members]


collaborativeClustersSongs = collaborativeClusters.flatMap(_pair_songs_with_members)
jaccardClustersSongs = jaccardClusters.flatMap(_pair_songs_with_members)

unionSongs = collaborativeClustersSongs.union(jaccardClustersSongs)
# Number of (song, members) pairs before merging duplicates of the same song.
print(unionSongs.count())
# Merge the member collections of songs that occur in more than one cluster.
# reduceByKey only calls the function for keys with several values, so a song
# seen once keeps its original list while a merged song ends up with a set.
unionSongs = unionSongs.reduceByKey(lambda a, b: set(a) | set(b))
unionSongs.take(3)

227463


[(2838528, [2838539, 2838528]),
 (2967552, [2967553, 2967552]),
 (2974608, [2974608, 2974607])]

In [30]:
def _key_by_members(pair):
    """Map (song_id, members) to (members_key, song_id).

    members_key is the ascending-sorted member ids joined by single spaces.
    """
    song_id, members = pair
    return (' '.join([str(i) for i in sorted(members)]), song_id)


def _count_mismatched(grouped):
    """Count (members_key, songs) groups whose song count differs from the number of ids in the key."""
    return grouped.filter(lambda kv: len(kv[1]) != len(kv[0].split(' '))).count()


# Group songs that carry exactly the same member set.
newClusters = unionSongs.map(_key_by_members)
newClustersUnique = newClusters.groupByKey().mapValues(set).mapValues(list)
print(newClustersUnique.count())
print(newClustersUnique.take(3))
print(_count_mismatched(newClustersUnique))

# Repeat until every group's key lists exactly the songs grouped under it.
# NOTE(review): the flatMap pairs each song with the songs grouped under the key,
# not with the ids encoded in the key; if overlapping clusters are meant to be
# merged transitively, confirm that this is the intended behaviour.
while True:
    per_song = newClustersUnique.flatMap(lambda kv: [(i, kv[1]) for i in kv[1]])
    merged = per_song.reduceByKey(lambda a, b: set(a) | set(b))
    newClustersUnique = merged.map(_key_by_members).groupByKey().mapValues(set).mapValues(list)
    print(newClustersUnique.count())
    n_bad = _count_mismatched(newClustersUnique)
    print(n_bad)
    if n_bad == 0:
        break




107701
[('1171487 1171488', [1171488, 1171487]), ('3672757 3672758', [3672757, 3672758]), ('554805 554806', [554805, 554806])]
145
107701
0


In [31]:
# Invert the grouping: one (song_id, members_key) pair per song of every group.
songToCluster = newClustersUnique.flatMap(
    lambda group: [(song_id, group[0]) for song_id in group[1]]
)
songToCluster.take(3)

[(1271037, '1271037 1271038'),
 (1271038, '1271037 1271038'),
 (3672757, '3672757 3672758')]

# Load all songs

In [11]:
import json
import string

def my_replace_punct(x):
    """Return the string ``x`` with every '+' replaced by a single space.

    Despite the name, '+' is the only character that is replaced; all other
    characters are returned unchanged.
    """
    # str.replace does the work of the former character-by-character loop
    # without building the result through repeated string concatenation.
    return x.replace('+', ' ')

def _parse_track(line):
    """Parse one tab-separated line into (track_id_string, "<part0> <part2>").

    Column 1 is used as the track id; column 3 is a JSON object whose 'name'
    is split on '/', keeping the first and third pieces.
    """
    fields = line.split('\t')
    name_parts = json.loads(fields[3])['name'].split('/')
    return (fields[1], " ".join((name_parts[0], name_parts[2])))


# Read the tracks file, spread it over 200 partitions, parse it, drop exact
# duplicates and turn '+' into spaces in the names.
tracksRDD = (sc.textFile(BASE_PATH + '/30Mdataset/entities/tracks.idomaar.gz')
             .repartition(200)
             .map(_parse_track)
             .distinct()
             .map(lambda rec: (rec[0], my_replace_punct(rec[1]))))

# Exclusive upper bound on the track ids that are kept.
# NOTE(review): the meaning of this particular id is not visible here — confirm.
ZZ_TOP = 3893303
tracksRDD = tracksRDD.filter(lambda rec: int(rec[0]) < ZZ_TOP)

# (int id, [id as string]): every track starts out as its own one-element group.
tracksIdsRDD = tracksRDD.map(lambda rec: (int(rec[0]), [rec[0]]))

n_tracksID = str(tracksIdsRDD.count())
print("IDs: " + n_tracksID)

IDs: 3893303


# Unify everything

In [32]:
def reduce_to_biggest(x, y):
    """Return the longer of ``x`` and ``y`` as a new ascending-sorted list.

    When both have the same length, ``y`` is the one that is returned (sorted).
    """
    if len(y) >= len(x):
        return sorted(y)
    return sorted(x)
 
def _grouping_key(value):
    """Return the string used to group a track.

    ``value`` is either the one-element ``[track_id_string]`` list of a track
    without a cluster, or a cluster key that is already a joined string.
    Only the list needs joining: calling ' '.join on the string would split
    it into single characters.
    """
    return ' '.join(value) if isinstance(value, list) else value


# Attach the cluster key (if any) to every known track id.
unionJoinRDD = tracksIdsRDD.leftOuterJoin(songToCluster)
# Tracks without a cluster keep their own [track_id_string]; the others take the cluster key.
unionRDD = unionJoinRDD.map(lambda x: (x[0], x[1][0]) if x[1][1] is None else (x[0], x[1][1]))
tot_n = unionRDD.count()
print(tot_n)

#Flip the mapping as cluster->song
clusterSongsRDD = unionRDD.map(lambda x: (_grouping_key(x[1]), x[0])).groupByKey().mapValues(list)
# Replace the textual key with a running index: (cluster_index, [song_ids]).
clusterSongsRDD = clusterSongsRDD.zipWithIndex().map(lambda x: (x[1], x[0][1]))
tot_cl = clusterSongsRDD.count()
# Difference between the number of tracks and the number of resulting clusters.
print(tot_n - tot_cl)


3893303
119707


In [33]:
# Persist the (cluster_index, [song_ids]) records as a pickle file under BASE_PATH.
# NOTE(review): Hadoop output paths normally must not exist yet — confirm before re-running this cell.
clusterSongsRDD.saveAsPickleFile(BASE_PATH + '/clusters/hybrid/coll_J25_S2_jaccard9')