In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession
    .builder
    .appName('użycie implicit data do rekomendacji zespołów na last.fm')
    .getOrCreate()
)

# Load the tab-separated user/artist listening counts; the first line is a header.
# No schema or inferSchema option is given, so the columns are cast explicitly
# to integers in a later cell.
rawData = (
    spark.read.format('csv')
    .option('delimiter', '\t')
    .option('header', 'true')
    .load('../datasets/lastfm/user_artists.dat')
)

# Preview: limit on the Spark side so only five rows are collected to the driver
# instead of converting the whole table to pandas first.
rawData.limit(5).toPandas()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [3]:
from pyspark.sql.functions import col

# Keep the three columns used by the model and cast each one to an integer.
dataset = rawData.select(
    col('userID').cast('int'),
    col('artistID').cast('int'),
    col('weight').cast('int'),
)

# Preview five rows without collecting the whole table to the driver.
dataset.limit(5).toPandas()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [4]:
# Summary statistics (count, mean, stddev, min, max) for the `weight` column only.
dataset.describe('weight').toPandas()

Unnamed: 0,summary,weight
0,count,92834.0
1,mean,745.2439300256372
2,stddev,3751.32208038768
3,min,1.0
4,max,352698.0


In [None]:
### Standaryzacja: wartość skalowana = (wartość - średnia) / odchylenie standardowe

In [5]:
from pyspark.sql.functions import stddev, mean, col

# Standardize `weight`: weight_scaled = (weight - mean) / stddev.
# The one-row frame of global statistics is cross-joined onto every row of
# `dataset`, so each row carries mean_weight and stddev_weight next to its weight.
# NOTE(review): centering on the mean makes every below-average weight negative;
# confirm that this is the intended input for the implicit-feedback model below.
df = (
    dataset
    .select(mean('weight').alias('mean_weight'),
            stddev('weight').alias('stddev_weight'))
    .crossJoin(dataset)
    .withColumn('weight_scaled',
                (col('weight') - col('mean_weight')) / col('stddev_weight'))
)

# Preview five rows without collecting the whole table to the driver.
df.limit(5).toPandas()


Unnamed: 0,mean_weight,stddev_weight,userID,artistID,weight,weight_scaled
0,745.24393,3751.32208,2,51,13883,3.502167
1,745.24393,3751.32208,2,52,11690,2.917573
2,745.24393,3751.32208,2,53,11351,2.827205
3,745.24393,3751.32208,2,54,10300,2.547037
4,745.24393,3751.32208,2,55,8983,2.195961


In [8]:
# 80/20 train/test split. A fixed seed keeps the split (and therefore every
# number reported below) stable across "Restart & Run All".
trainingData, testData = df.randomSplit([0.8, 0.2], seed=42)

In [9]:
from pyspark.ml.recommendation import ALS

# Configure ALS matrix factorization on (userID, artistID, weight_scaled).
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol='userID',
    itemCol='artistID',
    implicitPrefs=True,          # use the implicit-feedback variant of ALS
    ratingCol='weight_scaled',
    coldStartStrategy='drop',    # drop rows the model cannot score (NaN predictions)
    seed=42,                     # fixed seed so repeated fits are reproducible
)

# Fit the factor model on the training split only.
model = als.fit(trainingData)

In [10]:
# Score the held-out rows; `prediction` is appended as a new column.
predictions = model.transform(testData)

# Preview five scored rows without collecting the whole result to the driver.
predictions.limit(5).toPandas()

Unnamed: 0,mean_weight,stddev_weight,userID,artistID,weight,weight_scaled,prediction
0,745.24393,3751.32208,1280,463,316,-0.114425,0.009873
1,745.24393,3751.32208,2055,463,863,0.031391,0.002418
2,745.24393,3751.32208,592,463,544,-0.053646,0.013848
3,745.24393,3751.32208,11,463,1235,0.130556,-0.000658
4,745.24393,3751.32208,1456,471,825,0.021261,0.004051


In [11]:
# Bring only the observed and predicted scores to pandas, then compare their
# distributions side by side.
predictionsPandas = (
    predictions
    .select('weight_scaled', 'prediction')
    .toPandas()
)
predictionsPandas.describe()


Unnamed: 0,weight_scaled,prediction
count,16104.0,16104.0
mean,0.011152,0.041971
std,1.259393,0.09929
min,-0.198395,-0.40229
25%,-0.168539,0.0
50%,-0.126954,0.002338
75%,-0.026989,0.034803
max,93.820991,0.968772


In [12]:
# Load the tab-separated artist metadata (id, name, url, pictureURL per the header).
artistData = (
    spark.read.format('csv')
    .option('delimiter', '\t')
    .option('header', 'true')
    .load('../datasets/lastfm/artists.dat')
)

# Preview five rows without collecting the whole table to the driver.
artistData.limit(5).toPandas()

Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [13]:
from pyspark.sql.types import IntegerType

def getRecommendationsForUser(userId, numRecs):
    """Return the artists that `model` recommends for a single user.

    Parameters
    ----------
    userId : int
        Identifier of the user to generate recommendations for.
    numRecs : int
        Maximum number of recommendations to request from the model.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``name``, ``url`` and ``rating``, ordered by ``rating``
        descending. The frame is empty (rather than raising ``IndexError``)
        when the model produces no recommendations for ``userId``.
    """
    # Name the column exactly like the ALS userCol ('userID') so the lookup does
    # not rely on Spark's case-insensitive column resolution.
    usersDF = spark.createDataFrame([userId], IntegerType()).toDF('userID')
    userRecs = model.recommendForUserSubset(usersDF, numRecs)

    # Flatten the array of (artistID, rating) structs on the Spark side instead
    # of collecting it to the driver and rebuilding a DataFrame from it.
    artistsDF = (
        userRecs
        .selectExpr('explode(recommendations) AS rec')
        .select('rec.artistID', 'rec.rating')
    )

    # Attach artist metadata and present the strongest recommendations first.
    recommendedArtists = (
        artistData
        .join(artistsDF, artistData.id == artistsDF.artistID)
        .orderBy('rating', ascending=False)
        .select('name', 'url', 'rating')
    )
    return recommendedArtists

In [14]:
# Top ten recommended artists for user 939.
getRecommendationsForUser(userId=939, numRecs=10).toPandas()

Unnamed: 0,name,url,rating
0,Paramore,http://www.last.fm/music/Paramore,0.415596
1,30 Seconds to Mars,http://www.last.fm/music/30+Seconds+to+Mars,0.342345
2,My Chemical Romance,http://www.last.fm/music/My+Chemical+Romance,0.304962
3,All Time Low,http://www.last.fm/music/All+Time+Low,0.292554
4,Avenged Sevenfold,http://www.last.fm/music/Avenged+Sevenfold,0.28532
5,Linkin Park,http://www.last.fm/music/Linkin+Park,0.26671
6,Evanescence,http://www.last.fm/music/Evanescence,0.199473
7,The Used,http://www.last.fm/music/The+Used,0.162944
8,Panic! At the Disco,http://www.last.fm/music/Panic%21+At+the+Disco,0.162387
9,Bullet for My Valentine,http://www.last.fm/music/Bullet+for+My+Valentine,0.161737


In [16]:
def getListenedArtistsForUser(userId):
    """Return the artists recorded in `dataset` for one user, highest weight first.

    Parameters
    ----------
    userId : int
        Identifier of the user whose rows are looked up.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``name`` and ``weight``, ordered by ``weight`` descending.
    """
    userRows = dataset.filter(dataset.userID == userId)
    return (
        artistData
        .join(userRows, artistData.id == userRows.artistID)
        .orderBy('weight', ascending=False)
        .select('name', 'weight')
    )


# What user 939 actually listened to, for comparison with the recommendations.
userArtistsInfo = getListenedArtistsForUser(939)

userArtistsInfo.toPandas()

Unnamed: 0,name,weight
0,Avenged Sevenfold,10595
1,30 Seconds to Mars,1646
2,A Skylit Drive,1603
3,Sonic Syndicate,1439
4,Paramore,1184
5,Funeral for a Friend,1127
6,Light This City,1125
7,My Chemical Romance,1055
8,As I Lay Dying,1047
9,Dead by April,1044


In [17]:
# Top ten recommended artists for user 1726.
getRecommendationsForUser(userId=1726, numRecs=10).toPandas()

Unnamed: 0,name,url,rating
0,Britney Spears,http://www.last.fm/music/Britney+Spears,0.750255
1,Christina Aguilera,http://www.last.fm/music/Christina+Aguilera,0.60308
2,Rihanna,http://www.last.fm/music/Rihanna,0.401273
3,Lady Gaga,http://www.last.fm/music/Lady+Gaga,0.365778
4,Beyoncé,http://www.last.fm/music/Beyonc%C3%A9,0.296125
5,Madonna,http://www.last.fm/music/Madonna,0.286955
6,Mariah Carey,http://www.last.fm/music/Mariah+Carey,0.273625
7,Katy Perry,http://www.last.fm/music/Katy+Perry,0.233365
8,Kylie Minogue,http://www.last.fm/music/Kylie+Minogue,0.212886
9,Jennifer Lopez,http://www.last.fm/music/Jennifer+Lopez,0.188941


In [18]:
# Rows of `dataset` that belong to user 1726.
userArtistRaw = dataset.where(dataset.userID == 1726)

# Attach artist names and list this user's artists from highest to lowest weight.
userArtistsInfo = (
    artistData
    .join(userArtistRaw, on=(artistData.id == userArtistRaw.artistID))
    .sort('weight', ascending=False)
    .select('name', 'weight')
)

userArtistsInfo.toPandas()

Unnamed: 0,name,weight
0,Britney Spears,13804
1,Christina Aguilera,1396
2,Rihanna,1056
3,Shakira,1027
4,Katy Perry,651
5,Beyoncé,544
6,Lady Gaga,517
7,Cheryl Cole,478
8,David Guetta,466
9,Ke$ha,446
