# <hr style="clear: both" />

# Running Spark in YARN-client mode

This notebook demonstrates how to set up a SparkContext whose executors run on SURFsara's Hadoop cluster via the [YARN resourcemanager](http://head05.hathi.surfsara.nl:8088/cluster). Note that you need to be authenticated via Kerberos on your machine to visit the resourcemanager link.

First initialize kerberos via a Jupyter terminal. 
In the terminal execute: <BR>
<i>kinit -k -t data/robertop.keytab robertop@CUA.SURFSARA.NL</i><BR>
Print your credentials:


In [1]:
# Print the current Kerberos ticket cache (shell command run from the notebook).
! klist

Ticket cache: FILE:/tmp/krb5cc_1000
Default principal: robertop@CUA.SURFSARA.NL

Valid starting       Expires              Service principal
06/21/2016 07:57:32  06/22/2016 07:57:31  krbtgt/CUA.SURFSARA.NL@CUA.SURFSARA.NL
	renew until 06/21/2016 07:57:32


In [2]:
# List the HDFS directory of the current user (shell command run from the notebook).
! hdfs dfs -ls 
# Execute the helper script inside this kernel's namespace.
# NOTE(review): presumably this is where sendNotificationToMattia (used by later cells) is defined -- confirm.
execfile('../spark-scripts/bullet.py')

Found 5 items
drwx------   - robertop hdfs          0 2016-06-20 06:00 .Trash
drwxr-xr-x   - robertop hdfs          0 2016-06-21 09:01 .sparkStaging
drwx------   - robertop hdfs          0 2016-04-06 15:54 .staging
drwxr-xr-x   - robertop hdfs          0 2016-05-25 06:28 mattia
drwxr-xr-x   - robertop hdfs          0 2016-04-13 10:00 recsys2016Competition


The listing above verifies that we can browse HDFS.

Next, initialize Spark. Note that the code below starts a job on the Hadoop cluster that keeps running for as long as the notebook is active, so please close and halt the notebook when you are done. Starting the SparkContext can take a while. You can check the YARN resourcemanager to see the current status/usage of the cluster.

In [3]:
import os
# Python interpreter PySpark should use; must be set before the SparkContext is created.
os.environ['PYSPARK_PYTHON'] = '/usr/local/bin/python2.7'

# Base URI of the cluster's HDFS; later cells build their paths from it.
HDFS_PATH = "hdfs://hathi-surfsara"

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# loadDefaults=False: every setting is given explicitly below.
sconf = SparkConf(False)

sconf.setAppName("eval")

# Master is now yarn-client. The YARN and hadoop config is read from the environment
sconf.setMaster("yarn-client")

# You can control many Spark settings via the SparkConf. This determines the amount of executors on the cluster:
sconf.set("spark.executor.instances", "100")
sconf.set("spark.executor.memory", "20g")

# UFW (firewall) is active on the VM. We explicitly opened these ports and Spark should not bind to random ports:
sconf.set("spark.driver.port", 51800)
sconf.set("spark.fileserver.port", 51801)
sconf.set("spark.broadcast.port", 51802)
sconf.set("spark.replClassServer.port", 51803)
sconf.set("spark.blockManager.port", 51804)
# Pass the canonical lowercase boolean instead of relying on str(True) == "True".
sconf.set("spark.authenticate", "true")
# NOTE(review): spark.yarn.keytab is normally paired with spark.yarn.principal, which is
# not set here -- confirm that the keytab is actually picked up.
sconf.set("spark.yarn.keytab", "/home/jovyan/work/data/robertop.keytab")
sconf.set("spark.yarn.access.namenodes", HDFS_PATH + ":8020")

try:
    sc = SparkContext(conf=sconf)
    sqlCtx = SQLContext(sc)
except Exception as err:
    # Report the failure, then re-raise: without `sc` every later cell would only
    # fail with a confusing NameError.
    sendNotificationToMattia("Spark Context failed", str(err))
    print(str(err))
    raise
else:
    sendNotificationToMattia("Spark Context", str(sc))

# <hr style="clear: both" />

# Now you can run your code

Pick a clustering algorithm (name of the file that provides a classify(x,y [,threshold]) function)

In [None]:
execfile('../spark-scripts/evalCluster.py')
execfile('../spark-scripts/utilsCluster.py')


import json
import copy

BASE_PATH = HDFS_PATH + '/user/robertop/mattia'

# Algorithm parameters. They are kept as notebook-level variables as well because
# later cells use them to build result names and output paths.
# ***** EXAMPLE OF CONFIGURATION *****#
sessionJaccardShrinkage = 7.5
clusterSim = 0.2
expDecay = 0.7
# ****** END EXAMPLE ****************#

# Whole evaluation configuration, grouped by section.
conf = {
    'split': {
        'reclistSize': 100,
        'callParams': {},
        # Stored as the string 'True', not the boolean.
        'excludeAlreadyListenedTest': str(True),
        'name': 'SenzaRipetizioni_1',
        'minEventsPerUser': 5,
        'inputData': HDFS_PATH + '/user/robertop/mattia/clusterBase.split/SenzaRipetizioni_1',
        # Alternative input:
        # 's3n://contentwise-research-poli/30musicdataset/newFormat/relations/sessions.idomaar'
        'bucketName': BASE_PATH,
        'percUsTr': 0.05,
        'ts': int(0.75 * (1421745857 - 1390209860) + 1390209860) - 10000,
        'minEventPerSession': 5,
        'onlineTrainingLength': 5,
        'GTlength': 1,
        'minEventPerSessionTraining': 10,
        'minEventPerSessionTest': 11,
        'mode': 'session',
        'forceSplitCreation': False,
        'type': None,
        'out': HDFS_PATH + '/user/robertop/mattia/clusterBase.split/',
        'location': '30Mdataset/relations/sessions',
    },
    'evaluation': {
        'metric': {
            'type': 'recall',
            'prop': {'N': [1, 2, 5, 10, 15, 20, 25, 50, 100]},
        },
        'name': 'recall@N',
    },
    'general': {
        'clientname': "clusterBase.split",
        'bucketName': BASE_PATH,
        'tracksPath': '30Mdataset/entities/tracks.idomaar.gz',
    },
    'algo': {
        'props': {
            "sessionJaccardShrinkage": sessionJaccardShrinkage,
            "clusterSimilarityThreshold": clusterSim,
            "expDecayFactor": expDecay,
        },
    },
}

# Entries derived from values already present in the split section.
conf['split']['split'] = conf['split']['name']
conf['split']["prop"] = {'reclistSize': conf['split']['reclistSize']}



# Original Algo

In [4]:
execfile('../spark-scripts/evalClusterNew.py')
execfile('../spark-scripts/utilsCluster.py')

CLUSTER_ALGO = 'plain'
THRESHOLD = '0.0'
# Drop the first two characters ("0.") of the threshold label.
THRESHOLD_STR = str(THRESHOLD)[2:]

# The same three parameters feed both the algorithm name and the results path.
algo_params = (sessionJaccardShrinkage, clusterSim, expDecay)
name_suffix = '_ImplicitPlaylist_shk_%d_clustSim_%.3f_decay_%.3f' % algo_params
conf['algo']['name'] = CLUSTER_ALGO + THRESHOLD_STR + name_suffix

# Recommendations are passed on exactly as loaded (no json.loads step here).
recRDD = loadRecommendations(conf)
results_path = 'original' + '_' + '#'.join([str(p) for p in algo_params])
computeNewRecallPrecision(conf, recRDD, path = results_path)

newRecall@N successfully written to /home/jovyan/work/data/mattia/resultsNew/original_5#0.1#0.7/recall@N
newPrecision@N successfully written to /home/jovyan/work/data/mattia/resultsNew/original_5#0.1#0.7/precision@N


# Substitute cluster with list of songs and compute metrics

In [None]:
execfile('../spark-scripts/evalClusterNew.py')
execfile('../spark-scripts/utilsCluster.py')

CLUSTER_ALGO = 'collaborative/'
THRESHOLDS = ['0.min_j_25_avgShrink2_Switch42']

# The same three parameters feed both the algorithm name and the results path.
algo_params = (sessionJaccardShrinkage, clusterSim, expDecay)

for THRESHOLD in THRESHOLDS:
    # Drop the first two characters ("0.") of the label.
    THRESHOLD_STR = str(THRESHOLD)[2:]
    algo_id = CLUSTER_ALGO + THRESHOLD_STR

    conf['algo']['name'] = algo_id + '_ImplicitPlaylist_shk_%d_clustSim_%.3f_decay_%.3f' % algo_params
    recRDD = loadRecommendations(conf).map(json.loads)

    cluster_path = BASE_PATH + "/clusters/" + algo_id
    clustersRDD = sc.pickleFile(cluster_path)

    ALGO_PATH = algo_id + '_' + '#'.join([str(p) for p in algo_params])

    # 'plug_songs' mapping -> written under full/flat.
    flat_rec = mapClusterRecToListOfSongs(recRDD, clustersRDD, 'plug_songs')
    computeNewRecallPrecision(conf, flat_rec, path = ALGO_PATH + '/full/flat')

    # 'all_cluster' mapping -> evaluated twice, without and with the loss flag.
    ideal_rec = mapClusterRecToListOfSongs(recRDD, clustersRDD, 'all_cluster')
    computeNewRecallPrecision(conf, ideal_rec, path = ALGO_PATH + '/full/ideal')
    computeNewRecallPrecision(conf, ideal_rec, loss = True, path = ALGO_PATH + '/full/loss')

In [None]:
# Send a "Finished" notification once execution reaches this cell.
# NOTE(review): sendNotificationToMattia is not defined in this notebook; presumably it
# comes from one of the execfile'd scripts -- confirm.
sendNotificationToMattia("Finished", "Check")



# Compute performance with clustering just in Evaluation

In [None]:
import json
from operator import itemgetter


def plug_clusters(x):
    """Rebuild a recommendation row so that its response lists cluster ids.

    Parameters
    ----------
    x : tuple
        ``(key, group)`` where ``group`` is an iterable of
        ``(cluster_id, rank, row_dict)`` triples. ``row_dict`` must contain a
        ``'linkedinfo'`` dict.

    Returns
    -------
    str
        JSON dump of the first triple's ``row_dict`` after its
        ``['linkedinfo']['response']`` has been replaced by one
        ``{"type": "track", "id": cluster_id, "rank": r}`` entry per distinct
        cluster id. Clusters are visited in ascending order of the triples'
        rank and ``r`` is renumbered from 0.

    Notes
    -----
    ``row_dict`` is modified in place. An empty group raises ``IndexError``.
    """
    # Materialise the group exactly once: iterating it twice would silently
    # produce an empty response when it is a one-shot iterator.
    candidates = list(x[1])
    row_dic = candidates[0][2]

    response = []
    plugged = set()
    # sorted() is stable, so triples with equal rank keep their input order.
    for candidate in sorted(candidates, key=itemgetter(1)):
        cl_id = candidate[0]
        if cl_id not in plugged:
            # len(response) is the next free rank (0, 1, 2, ...).
            response.append({"type": "track", "id": cl_id, "rank": len(response)})
            plugged.add(cl_id)

    row_dic['linkedinfo']['response'] = response
    return json.dumps(row_dic)

In [None]:
# Load the 'plain0' recommendations, replace every recommended item by the cluster it
# belongs to, and evaluate the result (the clustering is applied only at evaluation time).
execfile('../spark-scripts/utilsCluster.py')
execfile('../spark-scripts/evalClusterNew.py')

CLUSTER_ALGO = 'hybrid/'
THRESHOLDS = ['0.coll_J25_S2_jaccard9']

for THRESHOLD in THRESHOLDS:
    
    # Drop the first two characters ("0.") of the label.
    THRESHOLD_STR = str(THRESHOLD)[2:]
    # Point conf at the 'plain0' recommendations with hard-coded parameters (5, 0.1, 0.7).
    conf['algo']['name'] = 'plain0_ImplicitPlaylist_shk_%d_clustSim_%.3f_decay_%.3f' % \
                    (5, 0.1, 0.7)
    # NOTE(review): these overwrite conf['algo']['props'] and stay in effect for later cells,
    # while the notebook-level sessionJaccardShrinkage / clusterSim / expDecay variables used
    # further down are not updated -- confirm this mismatch is intended.
    conf['algo']['props']["sessionJaccardShrinkage"] = 5
    conf['algo']['props']["clusterSimilarityThreshold"] = 0.1
    conf['algo']['props']["expDecayFactor"] = 0.7
    
    plainRDD = loadRecommendations(conf)
    plainRDD = plainRDD.map(json.loads)

    # Switch the algo name to the clustered variant before the evaluation calls below.
    conf['algo']['name'] = CLUSTER_ALGO + THRESHOLD_STR + '_ImplicitPlaylist_shk_%d_clustSim_%.3f_decay_%.3f' % \
                    (sessionJaccardShrinkage, clusterSim, expDecay )
        
    cluster_path = BASE_PATH + "/clusters/" + CLUSTER_ALGO + THRESHOLD_STR
    clustersRDD = sc.pickleFile(cluster_path)
    # One (int(member), x[0]) pair per member of x[1]; presumably x[0] is the cluster id
    # and x[1] the songs in that cluster -- confirm against the pickled data.
    songToClusterRDD = clustersRDD.flatMap(lambda x: [(int(i), x[0]) for i in x[1]] )  
    
    # One record per recommended item: (item id, (item rank, whole recommendation row)).
    plainFlatRDD = plainRDD.flatMap(lambda x: [(i['id'], (i['rank'], x)) for i in x['linkedinfo']['response']])
    # After the join each record is (item id, ((rank, row), cluster)); re-key it by the
    # row's own 'id' with value (cluster, rank, row), the triple plug_clusters expects.
    plainJoinRDD = plainFlatRDD.join(songToClusterRDD).map(lambda x: (x[1][0][1]['id'], (x[1][1], x[1][0][0], x[1][0][1]) ))
    # Regroup per recommendation row and rebuild its response with plug_clusters.
    plainGroupRDD = plainJoinRDD.groupByKey().map(plug_clusters)
    # plug_clusters returns JSON strings; parse them back into dicts.
    recPlainClusterRDD = plainGroupRDD.map(json.loads)
      
    # NOTE(review): unlike the other cells, no third argument is passed here, so this relies
    # on mapClusterRecToListOfSongs' default (not visible in this notebook).
    recPlainCluster = mapClusterRecToListOfSongs(recPlainClusterRDD, clustersRDD)
    computeNewRecallPrecision(conf,recPlainCluster, loss = False, plain = True, path = CLUSTER_ALGO + THRESHOLD_STR + "/eval/ideal")
    computeNewRecallPrecision(conf,recPlainCluster, loss = True, plain = True, path = CLUSTER_ALGO + THRESHOLD_STR + "/eval/loss")

In [None]:
# Send a "Finished" notification once execution reaches this cell.
# NOTE(review): sendNotificationToMattia is not defined in this notebook; presumably it
# comes from one of the execfile'd scripts -- confirm.
sendNotificationToMattia("Finished", "Check")

# COVERAGE

In [None]:
import os
import json
execfile('../spark-scripts/utilsCluster.py')

ALGO = 'min_j_25_avgShrink2_higherMean'
# 'collaborative/' (with the slash) matches how the other evaluation cells build the
# algo name (CLUSTER_ALGO = 'collaborative/') and the cluster path used just below.
conf['algo']['name'] = 'collaborative/' + ALGO + '_ImplicitPlaylist_shk_%d_clustSim_%.3f_decay_%.3f' % \
                    (sessionJaccardShrinkage, clusterSim, expDecay)

recRDD = loadRecommendations(conf).map(json.loads)
cluster_path = BASE_PATH + "/clusters/collaborative/" + ALGO
clustersRDD = sc.pickleFile(cluster_path)

# Kept under its own name so that recRDD is not rebound to a different kind of RDD
# and the cell can be re-run safely.
songRecJsonRDD = mapClusterRecToListOfSongs(recRDD, clustersRDD, 'plug_songs')

# (song id, rank) pairs for every recommended song; cached because the loop below
# scans the RDD once per cutoff N.
songsRecRDD = songRecJsonRDD.flatMap(
    lambda x: [(i['id'], i['rank']) for i in json.loads(x)['linkedinfo']['response']]
).cache()

# Denominator of the coverage ratio.
# NOTE(review): presumably the number of tracks in the dataset -- confirm.
TOTAL_SONGS = 3893303

result = []
for n in conf['evaluation']['metric']['prop']['N']:
    # Distinct *song ids* (x[0]) with rank below n. Taking x[1] here would count
    # distinct ranks, which can never exceed n.
    uniqueSongsAtN = (songsRecRDD
                      .filter(lambda x: x[1] < n)
                      .map(lambda x: x[0])
                      .distinct()
                      .count())
    result.append({
        'evaluation': {
            'N': n,
            'value': float(uniqueSongsAtN) / float(TOTAL_SONGS),
        },
        'linkedinfo': {
            'splitName': conf['split']['name'],
            'algoName': conf['algo']['name'],
        },
    })

songsRecRDD.unpersist()

DATA_PATH = '/home/jovyan/work/data/mattia/resultsNew'
directory = os.path.join(DATA_PATH, 'collaborative', ALGO, 'full', 'flat')
if not os.path.exists(directory):
    os.makedirs(directory)

# One JSON record per line, one line per cutoff N.
filename = os.path.join(directory, 'coverage@N')
with open(filename, 'w') as f:
    for record in result:
        f.write(json.dumps(record) + '\n')

    