# <hr style="clear: both" />

# Running Spark in YARN-client mode

This notebook demonstrates how to set up a SparkContext that runs its executors on SURFsara's Hadoop cluster through the [YARN resourcemanager](http://head05.hathi.surfsara.nl:8088/cluster). Note: you need to be authenticated via Kerberos on your machine to open the resourcemanager link.

First initialize kerberos via a Jupyter terminal. 
In the terminal execute: <BR>
<i>kinit -k -t data/robertop.keytab robertop@CUA.SURFSARA.NL</i><BR>
Print your credentials:


In [1]:
# Shell escape: list the Kerberos tickets currently held in the credential cache.
! klist

Ticket cache: FILE:/tmp/krb5cc_1001
Default principal: robertop@CUA.SURFSARA.NL

Valid starting       Expires              Service principal
04/12/2016 13:42:29  04/13/2016 13:42:29  krbtgt/CUA.SURFSARA.NL@CUA.SURFSARA.NL
	renew until 04/12/2016 13:42:29


In [1]:
# Sanity check that a SparkContext is available.
# NOTE(review): `sc` is not created in any cell shown here - presumably it is
# injected by the kernel/profile start-up; confirm before Restart & Run All.
print(sc)

<pyspark.context.SparkContext object at 0x7fbf28040550>


# <hr style="clear: both" />

# Now you can run your code

Pick a clustering algorithm (name of the file that provides a classify(x,y [,threshold]) function)

In [2]:
# Load the project helper scripts straight into the notebook's global
# namespace (Python 2 `execfile`). Order matters: a later script overrides
# any same-named definition made by an earlier one.
execfile('../spark-scripts/conventions.py')
execfile('../spark-scripts/splitCluster2.py')
#execfile('../spark-scripts/utils.py')
execfile('../spark-scripts/eval.py')
execfile('../spark-scripts/implicitPlaylistAlgoFunctions.py')
execfile('../spark-scripts/implicitPlaylistAlgoMain.py')

# Base name (without ".py") of the clustering script to load. According to the
# notebook text, that file must provide a classify(x, y[, threshold]) function.
CLUSTER_ALGO = 'jaccardBase'

# Executed last, so the chosen algorithm's definitions take precedence.
execfile('../spark-scripts/' + CLUSTER_ALGO + '.py')


# Reading the conf file

In [3]:
import json
import copy

# Root directory under which the dataset and all generated artefacts live.
BASE_PATH = "/mnt/space/mattia"

# Whole experiment configuration as one nested dict with four sections:
# 'split', 'evaluation', 'general' and 'algo'. Keys that are derived from
# other keys are filled in right after the literal.
conf = {
    'split': {
        'reclistSize': 100,
        'callParams': {},
        'excludeAlreadyListenedTest': True,
        'name': 'SenzaRipetizioni_1',
        'minEventsPerUser': 5,
        'inputData': BASE_PATH + '/' + CLUSTER_ALGO + '.split/SenzaRipetizioni_1',
        # Alternative input location (S3), kept for reference:
        #'inputData': 's3n://contentwise-research-poli/30musicdataset/newFormat/relations/sessions.idomaar',
        'bucketName': BASE_PATH,
        'percUsTr': 0.05,
        # 75% of the way from timestamp 1390209860 to 1421745857, minus 10000.
        'ts': int(0.75 * (1421745857 - 1390209860) + 1390209860) - 10000,
        'minEventPerSession': 5,
        'onlineTrainingLength': 5,
        'GTlength': 1,
        'minEventPerSessionTraining': 10,
        'minEventPerSessionTest': 11,
        'mode': 'session',
        'forceSplitCreation': False,
        # The builtin `list` type object itself, not a string.
        'type': list,
        'out': BASE_PATH + '/' + CLUSTER_ALGO + '.split',
        'location': '30Mdataset/relations/sessions',
    },
    'evaluation': {
        'metric': {
            'type': 'recall',
            'prop': {
                'N': [1, 2, 5, 10, 15, 20, 25, 50, 100],
            },
        },
        'name': 'recall@N',
    },
    'general': {
        'clientname': CLUSTER_ALGO + '.split',
        'bucketName': BASE_PATH,
        'tracksPath': '30Mdataset/entities/tracks.idomaar.gz',
    },
    'algo': {
        'name': CLUSTER_ALGO,
        # ***** EXAMPLE OF CONFIGURATION *****#
        'props': {
            'sessionJaccardShrinkage': 5,
            'clusterSimilarityThreshold': 0.1,
            'expDecayFactor': 0.7,
        },
        # ****** END EXAMPLE ****************#
    },
}

# Derived entries: 'split' mirrors the split name, 'prop' mirrors reclistSize.
conf['split']['split'] = conf['split']['name']
conf['split']["prop"] = {'reclistSize': conf['split']['reclistSize']}


Pick the list of songs and create clusters


In [44]:
import json
import string

def my_replace_punct(x):
    """Return a copy of `x` in which every '+' character is replaced by a space.

    All other characters are passed through unchanged.
    """
    return "".join(' ' if ch == '+' else ch for ch in x)

# Build (id, tokens, display_name) triples from the tab-separated tracks file:
#   * column 1 is used as the track id, column 3 is a JSON object with a 'name' field;
#   * 'name' is split on '/' and parts 0 and 2 are joined with a space
#     (presumably artist and title - TODO confirm against the dataset spec);
#   * '+' is turned into a space, then the name is tokenized with tokenize_song.
tracksRDD = (
    sc.textFile(BASE_PATH + '/30Mdataset/entities/tracks.idomaar.gz')
    .map(lambda line: line.split('\t'))
    .map(lambda cols: (cols[1], json.loads(cols[3])['name'].split('/')))
    .map(lambda rec: (rec[0], " ".join((rec[1][0], rec[1][2]))))
    .map(lambda rec: (rec[0], my_replace_punct(rec[1])))
    .map(lambda rec: (rec[0], tokenize_song(rec[1]), rec[1]))
)

# Work on the first 5000 tracks only, redistributed as a new RDD.
sampleRDD = sc.parallelize(tracksRDD.take(5000))
sampleRDD.take(3)

[(u'0',
  [u'000003',
   u'music',
   u'instructor',
   u'dj27s',
   u'rock',
   u'da',
   u'house',
   u'c382e28988c386e28988c38a01',
   u'dj',
   u'maxpulemet',
   u'vs',
   u'bomfunk',
   u'mc27s',
   u'electro',
   u'breakdance',
   u'party',
   u'1',
   u'5b20005d',
   u'cd',
   u'onec382e28988c386e28988c38a'],
  u'000003 Music Instructor Dj%27s Rock Da House %C3%82%E2%89%88%C3%86%E2%89%88%C3%8A01 - Dj Max-Pulemet Vs. Bomfunk Mc%27s - Electro Breakdance party 1 %5B2000%5D = CD ONE%C3%82%E2%89%88%C3%86%E2%89%88%C3%8A'),
 (u'1',
  [u'0001', u'd0a2d0b5d0bad181d182'],
  u'00-01 %D0%A2%D0%B5%D0%BA%D1%81%D1%82'),
 (u'2', [u'0005', u'overkill', u'overkill'], u'0005. Overkill Overkill')]


Reduce the quantity of data by building an RDD {word -> songs}.
For each word, keep only the couples of songs that match.


In [45]:
# Build an RDD of (word, (id, name)) pairs: one pair per token of every song.
wordsRDD = sampleRDD.flatMap(lambda x: [(i, (x[0], x[2])) for i in x[1]])

# Group by word and keep only the words shared by more than one song.
# (The former mid-cell `wordsRDD.take(3)` was dropped: its result was never
# displayed or used, yet it triggered an extra Spark job.)
wordsRDD = wordsRDD.groupByKey().mapValues(list).filter(lambda x: len(x[1]) > 1)

#Compute a cartesian product for each list of songs with a common word
def filtered_cartesian(x):
    """Find the matching song couples among the songs that share one word.

    `x` is a (word, songs) pair where `songs` is a sequence of (id, name)
    tuples. Every unordered pair of songs is examined once; a pair is kept
    when the two ids differ and `classify(name_a, name_b)` is truthy.

    Returns (word, couples) where `couples` is a tuple of
    ((id, name), (id, name)) pairs, the later-listed song first. The order of
    the couples inside the tuple is unspecified (they pass through a set).
    """
    word, songs = x[0], x[1]
    matches = set()
    for pos, later in enumerate(songs):
        for earlier in songs[:pos]:
            # Compare ids first so classify() is never called on the same song.
            if later[0] != earlier[0] and classify(later[1], earlier[1]):
                matches.add((later, earlier))
    return (word, tuple(matches))

# For every word compute its matching couples, then keep the words that
# produced more than one couple.
# NOTE(review): `> 1` discards words with exactly one matching couple; if the
# intent is "at least one match" this should probably be `> 0` - confirm.
coupleRDD = (
    wordsRDD
    .map(filtered_cartesian)
    .filter(lambda word_couples: len(word_couples[1]) > 1)
)
coupleRDD.take(10)


[(u'daydreamer',
  (((u'2852', u'10 Years 11.00 AM (Daydreamer)'),
    (u'2851', u'10 Years 11:00 AM (Daydreamer)')),
   ((u'2851', u'10 Years 11:00 AM (Daydreamer)'),
    (u'2850', u'10 Years 11-00 AM (Daydreamer)')),
   ((u'2852', u'10 Years 11.00 AM (Daydreamer)'),
    (u'2850', u'10 Years 11-00 AM (Daydreamer)')))),
 (u'and',
  (((u'3081', u'112 Peaches And Cream'),
    (u'2997', u'112 112 - Peaches and Cream')),
   ((u'2859', u'10 Years ... And All The Other Colors'),
    (u'2858', u'10 Years ...And All the Other Colors')))),
 (u'stones',
  (((u'4154', u'12 Stones Lie to Me (Acoustic)'),
    (u'4153', u'12 Stones Lie to me - acoustic')),
   ((u'4126', u'12 Stones Bulletproof_'),
    (u'4125', u'12 Stones Bulletproof')))),
 (u'plant',
  (((u'2648', u'10 Ft. Ganja Plant Hard Times (feat. Sylford Walker)'),
    (u'2639', u'10 Ft. Ganja Plant Feat. Sylford Walker Hard Times')),
   ((u'2680', u'10 Ft. Ganja Plant Recession (feat. Prince Jazzbo)'),
    (u'2638', u'10 Ft. Ganja Plant Fea

In [47]:
# Merge all couples that share at least one song into a single cluster
def couple_merge(x):
    """Merge the couples of one word into disjoint clusters of songs.

    `x` is a (word, couples) pair where `couples` is an iterable of tuples of
    hashable songs. Couples that share at least one song end up in the same
    cluster, transitively: a couple that overlaps several existing clusters
    fuses all of them. (The previous implementation merged only into the first
    overlapping cluster it met, which could leave overlapping clusters behind.)

    Returns (word, clusters) where `clusters` is a tuple of tuples. A couple
    that never merged with anything is returned unchanged; merged clusters
    have unspecified element order, as does the outer tuple.
    """
    clusters = []  # pairwise-disjoint tuples built so far
    for couple in x[1]:
        members = set(couple)
        kept = []
        absorbed = False
        for cluster in clusters:
            if members.isdisjoint(cluster):
                kept.append(cluster)
            else:
                # Existing clusters are disjoint from each other, so growing
                # `members` here cannot create overlap with a kept cluster.
                members.update(cluster)
                absorbed = True
        kept.append(tuple(members) if absorbed else couple)
        clusters = kept
    # Going through a set drops exact duplicates, as the original did.
    return (x[0], tuple(set(clusters)))


# Cluster the couples of every word, then flatten to one record per cluster
# (dropping the word) and remove exact duplicates.
mergedRDD = (
    coupleRDD
    .map(couple_merge)
    .flatMap(lambda word_clusters: word_clusters[1])
    .distinct()
)

In [77]:
# Index the clusters by member: emit one (song, cluster) pair per song of
# every cluster ...
flippedRDD = mergedRDD.flatMap(
    lambda cluster: [(song, cluster) for song in cluster]
)
# ... then gather, for each song, the list of clusters that contain it.
groupRDD = flippedRDD.groupByKey().mapValues(list)

def my_string_map(x):
    """Label a song with the sorted ids of the first cluster it belongs to.

    `x` is a (key, clusters) pair; `clusters` is a non-empty sequence whose
    items are sequences of (id, name) songs, with `id` convertible by int().
    Only the first cluster, `x[1][0]`, is used; any further clusters are
    ignored.

    Returns (key, label) where `label` is the cluster's ids converted to int,
    sorted ascending and joined with single spaces (e.g. '2 10 33').
    """
    first_cluster = x[1][0]
    # Numeric (not lexicographic) ordering; replaces the manual insertion sort.
    song_ids = sorted(int(song[0]) for song in first_cluster)
    return (x[0], ' '.join(str(song_id) for song_id in song_ids))
        
# Label each song with its cluster's sorted-id string, swap to
# (label, song), and collect the set of songs per label; preview 30 clusters.
# (The former `groupRDD.take(2)` was dropped: its result was discarded.)
groupRDD.map(my_string_map).map(lambda x: (x[1], x[0])).groupByKey().mapValues(set).take(30)

[('2640 2670',
  {(u'2640', u'10 Ft. Ganja Plant Feat. Sylford Walker My Roots'),
   (u'2670', u'10 Ft. Ganja Plant My Roots (feat. Sylford Walker)')}),
 ('3457 3458',
  {(u'3457', u'1200 Micrograms Mescaline - Astrix Remix'),
   (u'3458', u'1200 Micrograms Mescaline (Astrix Remix)')}),
 ('2959 2961',
  {(u'2959', u'10 Years Wasteland (Acoustic Live)'),
   (u'2961', u'10 Years Wasteland (live acoustic)')}),
 ('3076 3077 3078',
  {(u'3076', u'112 Only You (feat. The Notorious B.I.G.)'),
   (u'3077', u'112 Only You - feat. The Notorious B.I.G.'),
   (u'3078', u'112 Only You (feat. the Notorious B.I.G.')}),
 ('680 681',
  {(u'680', u'04LM, James Ruskin M Place James Ruskin Remix'),
   (u'681', u'04LM M Place (James Ruskin Remix)')}),
 ('3602 3603',
  {(u'3602', u'120 Days Come Out, Come Down, Fade Out, Be Gone'),
   (u'3603', u'120 Days Come Out (Come Down, Fade Out, Be Gone)')}),
 ('2323 2353',
  {(u'2323', u'10cc 10CC - Dreadlock Holiday'),
   (u'2353', u'10cc Dreadlock Holiday')}),
 ('

In [None]:
# Run the splitter with the configuration dict `conf`.
# NOTE(review): `splitter` is not defined in this notebook - presumably it comes
# from one of the exec'd spark-scripts; confirm which one.
splitter(conf)

In [None]:
from os import path
# Locations derived from the configuration (not used further in this cell).
basePath = path.join(conf['general']['bucketName'], conf['general']['clientname'])
splitPath = path.join(basePath, conf['split']['name'])

# Hyper-parameter grids; every combination is run by the nested loops below.
clusterSimList = [0.1]
sessionJaccardShrinkageList = [5]
expDecayList = [0.7]

for exclude in [True]:
    # NOTE(review): this stores the *string* 'True'/'False', whereas the
    # configuration cell stores a bool under the same key; 'False' would be
    # truthy. Confirm which type loadDataset and the algorithm expect.
    conf['split']['excludeAlreadyListenedTest'] = str(exclude)
    # Disabled: create a dedicated split per `exclude` value.
    #conf['split']['name'] = 'giroCompletoTestMultipleConfs_exclude%s' % exclude
    #splitter(conf)
    train, test = loadDataset(conf)
    train.cache()
    test.cache()

    for sessionJaccardShrinkage in sessionJaccardShrinkageList:
        conf['algo']['props']["sessionJaccardShrinkage"] = sessionJaccardShrinkage

        for clusterSim in clusterSimList:
            conf['algo']['props']["clusterSimilarityThreshold"] = clusterSim

            # Extracted once per (shrinkage, similarity) pair and reused for
            # every decay factor.
            playlists = extractImplicitPlaylists(train, conf).cache()

            for expDecay in expDecayList:
                conf['algo']['props']["expDecayFactor"] = expDecay
                # The run name encodes the parameter combination.
                conf['algo']['name'] = CLUSTER_ALGO + '_ImplicitPlaylist_shk_%d_clustSim_%.3f_decay_%.3f' % \
                    (sessionJaccardShrinkage, clusterSim, expDecay )

                recJsonRDD = executeImplicitPlaylistAlgo(playlists, test, conf)

                # Best effort: a failed save must not stop the grid search, but
                # the cause is reported instead of being silently swallowed.
                # `except Exception` (not bare) lets KeyboardInterrupt through.
                try:
                    saveRecommendations(conf, recJsonRDD, overwrite=True)
                except Exception as err:
                    print('Error in saving recommndations')
                    print(repr(err))

                # As before, metrics are attempted whether or not the save
                # succeeded.
                try:
                    computeMetrics(conf)
                except Exception as err:
                    print('Error in computing metrics')
                    print(repr(err))