In [1]:
from pyspark.sql import SparkSession

In [2]:
# Local-master Spark session with Hive support enabled; getOrCreate() hands
# back an already-running session instead of starting a second one.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum, col, abs, count, desc, asc

In [4]:
# Read both parquet inputs the same way: `data` is the frame the edge weights
# are aggregated from below, `meta` is the accompanying metadata table.
data, meta = (
    sparkSession.read.parquet(path)
    for path in ("/data/sample264", "/data/meta")
)

# Graph based Music Recommender. Task 4

Build the edges of the type “artist-track”. Take the number of times the track has been listened to by all users as the weight of the edge from the artist’s vertex to the track’s vertex.

**Tip**: group the dataframe by the columns artistId and trackId and use the function “count” of the DataFrame API. For each artist take the top-100 tracks and normalize the weights.

Sort the resulting DataFrame in descending order by the column norm_weight, and then in ascending order first by “id1”, then by “id2”. Take the top 40 rows, select only the columns “id1” and “id2”, and print them.

The part of the result on the sample dataset:

    ...
    968017 859321
    968022 852786
    968034 807671
    968038 964150
    968042 835935
    ...

For all subtasks use the same IPython notebook; each subtask should be a continuation of the previous one.

In [5]:
def norm(df, key1, key2, field, n):
    """Keep the top-``n`` rows per ``key1`` group and normalize ``field`` within each group.

    Rows are ranked inside every ``key1`` partition by ``field`` in descending
    order and only the first ``n`` are kept. ``field`` of each kept row is then
    divided by the sum of ``field`` over the kept rows of its group.

    Args:
        df: Spark DataFrame containing at least the columns ``key1`` and ``field``.
        key1: Name of the column that defines the groups.
        key2: Name of the second key column. When it is a column of ``df`` it
            is used as an ascending tie-breaker for the ranking, so that the
            set of kept rows is reproducible when several rows share the same
            ``field`` value; otherwise it is ignored.
        field: Name of the numeric column to rank by and to normalize.
        n: Maximum number of rows to keep per group.

    Returns:
        A cached DataFrame holding the kept rows of ``df`` plus two columns:
        ``"sum_" + field`` (the per-group total) and ``"norm_" + field``
        (``field`` divided by that total).
    """
    # Rank by weight; break ties on key2 so row_number() is deterministic.
    ordering = [col(field).desc()]
    if key2 in df.columns:
        ordering.append(col(key2).asc())
    window = Window.partitionBy(key1).orderBy(*ordering)

    topsDF = (df.withColumn("row_number", row_number().over(window))
              .filter(col("row_number") <= n)
              .drop("row_number"))

    # groupBy() already emits key1, so only the aggregate is requested here.
    # `sum` is pyspark.sql.functions.sum (imported above), not the builtin.
    tmpDF = topsDF.groupBy(key1).agg(sum(col(field)).alias("sum_" + field))

    normalizedDF = (topsDF.join(tmpDF, key1, "inner")
                    .withColumn("norm_" + field, col(field) / col("sum_" + field))
                    .cache())

    return normalizedDF

In [15]:
# Weight of an artist -> track edge: the number of rows in `data` for that
# (artistId, trackId) pair. GroupedData.count() already names its result
# column "count"; DataFrame.alias() would only name the frame itself, so no
# alias is applied. Cached because the frame is reused by the normalization step.
weights = (data
           .groupBy(col("artistId"), col("trackId"))
           .count()
          ).cache()

In [16]:
# Per artist, keep the 100 most-played tracks and normalize their counts.
# Order by normalized weight (descending), then by artistId and trackId
# (both ascending, as the task statement requires), and keep the first 40 edges.
normalized = (norm(weights, "artistId", "trackId", "count", 100)
              .orderBy(col("norm_count").desc(), col("artistId").asc(), col("trackId").asc())
              .limit(40)
             )

In [17]:
# Keep only the two vertex ids of every edge and print one "artistId trackId"
# pair per line.
results = normalized.select("artistId", "trackId")
for artist_id, track_id in results.collect():
    print("{} {}".format(artist_id, track_id))

967993 869415
967998 947428
968004 927380
968017 859321
968022 852786
968034 807671
968038 964150
968042 835935
968043 913568
968046 935077
968047 806127
968065 907906
968073 964586
968086 813446
968092 837129
968118 914441
968125 821410
968140 953008
968148 877445
968161 809793
968163 803065
968168 876119
968189 858639
968221 896937
968224 892880
968232 825536
968237 932845
968238 939177
968241 879045
968242 911250
968248 953554
968255 808494
968259 880230
968265 950148
968266 824437
968269 913243
968272 816049
968278 946743
968285 847460
968286 940006
