In [1]:
from pyspark.sql import Window
from pyspark.sql.functions import row_number, sum, col, abs, count

In [2]:
from pyspark.sql import SparkSession
# Obtain (or reuse) a Hive-enabled session that runs on the "local" master.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [3]:
# Load the two Parquet inputs used by the notebook.
data = sparkSession.read.format("parquet").load("/data/sample264")
meta = sparkSession.read.format("parquet").load("/data/meta")

# Graph based Music Recommender. Task 1

Build edges of the type “track-track”. To do this, compute the collaborative similarity between all tracks: if a user started listening to track B within 7 minutes of starting track A, add 1 to the weight of the edge from vertex A to vertex B (the initial weight is 0).

Example:
    
    userId artistId trackId timestamp
    7        12        1          1534574189
    7        13        4          1534574289 
    5        12        1          1534574389 
    5        13        4          1534594189 
    6        12        1          1534574489 
    6        13        4          1534574689 

Track 1 is similar to track 4 with weight 2 (before normalization): user 7 and user 6 each listened to these two tracks within a 7-minute window:

  - userId 7: 1534574289 - 1534574189 = 100 seconds = 1 min 40 seconds < 7 minutes
  - userId 6: 1534574689 - 1534574489 = 200 seconds = 3 min 20 seconds < 7 minutes

Note that track 4 is similar to track 1 with the same weight 2.

Tip: consider joining the graph to itself on the UserId and removing pairs with the same tracks. For each track, choose the top 50 tracks similar to it, ordered by weight, and normalize the weights of its edges (divide the weight of each edge by the sum of the weights of all its edges). Use rank() to choose top 40 tracks as is done in the demo.

Sort the resulting DataFrame in descending order by the column norm_weight, and then in ascending order first by “id1” and then by “id2”. Take the top 40 rows, select only the columns “id1” and “id2”, and print them.

Output example:

    54719		767867
    54719		767866
    50787		327676

----

Use the same IPython notebook for all subtasks; each subtask should be a continuation of the previous one.

In [4]:
def norm(df, key1, key2, field, n):
    """Keep the top ``n`` rows per ``key1`` and normalize ``field`` within each group.

    Rows are numbered inside every ``key1`` partition by ``field`` in
    descending order. Ties on ``field`` are broken by ``key2`` in ascending
    order, so the rows that survive the cut-off are the same on every run.
    Each surviving row receives two additional columns:

    * ``"sum_" + field``  -- total of ``field`` over the kept rows of its group;
    * ``"norm_" + field`` -- ``field`` divided by that total.

    Args:
        df: Spark DataFrame that contains the columns ``key1``, ``key2`` and
            ``field``.
        key1: Name of the column that defines the groups.
        key2: Name of the column used to break ties between equal ``field``
            values.
        field: Name of the weight column that is ranked, summed and normalized.
        n: Maximum number of rows kept per ``key1`` value.

    Returns:
        The normalized DataFrame; ``.cache()`` has been called on it.
    """
    # Secondary ordering on key2 makes row_number() deterministic when several
    # rows of a group share the same weight.
    ranking = Window.partitionBy(key1).orderBy(col(field).desc(), col(key2).asc())

    topsDF = (
        df.withColumn("row_number", row_number().over(ranking))
        .filter(col("row_number") <= n)
        .drop("row_number")
    )

    sum_field = "sum_" + field

    # groupBy already keeps key1 in the output, so only the aggregate is listed.
    # `sum` is the pyspark.sql.functions.sum imported at the top of the notebook.
    totalsDF = topsDF.groupBy(key1).agg(sum(col(field)).alias(sum_field))

    normalizedDF = (
        topsDF.join(totalsDF, key1, "inner")
        .withColumn("norm_" + field, col(field) / col(sum_field))
        .cache()
    )

    return normalizedDF

In [5]:
# Two projections of the same events, suffixed "1" and "2", so that the
# self-join on userId below has distinct track/timestamp column names.
data1 = data.select(
    data["userId"].alias("userId"),
    data["trackId"].alias("trackId1"),
    data["timestamp"].alias("timestamp1"),
)

data2 = data.select(
    data["userId"].alias("userId"),
    data["trackId"].alias("trackId2"),
    data["timestamp"].alias("timestamp2"),
)

In [6]:
# Two listens by the same user count as "together" when their start
# timestamps differ by at most this many seconds (7 minutes).
MAX_GAP_SECONDS = 7 * 60

# Pair up every two events of the same user, keep pairs of different tracks
# that started close enough in time, and count how often each ordered
# (trackId1, trackId2) pair occurs. The resulting column is named "count".
# Only this aggregated result is cached: it is what later cells reuse, while
# caching the raw self-join would persist every per-user pair of events.
similarity_count = (
    data1.join(data2, "userId")
    .filter(col("trackId1") != col("trackId2"))
    .filter(abs(col("timestamp1") - col("timestamp2")) <= MAX_GAP_SECONDS)
    .groupBy("trackId1", "trackId2")
    .count()
    .cache()
)

In [7]:
# Keep the 40 heaviest neighbours of every track and normalize their weights.
# NOTE(review): the task text mentions both "top 50" and "top 40" per track;
# 40 is the value used here -- confirm which one is intended.
normalized = norm(
    similarity_count,
    key1="trackId1",
    key2="trackId2",
    field="count",
    n=40,
)

In [None]:
# Heaviest normalized edges first; equal weights are ordered by ascending
# trackId1 and then ascending trackId2. Only the first 40 rows are kept.
results = normalized.sort(
    col("norm_count").desc(),
    col("trackId1").asc(),
    col("trackId2").asc(),
).limit(40)

In [None]:
# Only the two track ids are needed for the printed answer.
results = results.select("trackId1", "trackId2")

In [None]:
# Bring the selected edges to the driver and print one tab-separated pair per line.
edge_rows = results.collect()
for first_id, second_id in edge_rows:
    print("{}\t{}".format(first_id, second_id))

798256	923706
798319	837992
798322	876562
798331	827364
798335	840741
798374	816874
798375	810685
798379	812055
798380	840113
798396	817687
798398	926302
798405	867217
798443	905923
798457	918918
798460	891840
798461	940379
798470	840814
798474	963162
798477	883244
798485	955521
798505	905671
798545	949238
798550	936295
798626	845438
798691	818279
798692	898823
798702	811440
798704	937570
798725	933147
798738	894170
798745	799665
798782	956938
798801	950802
798820	890393
798833	916319
798865	962662
798931	893574
798946	946408
799012	809997
799024	935246
