In [1]:
import pyspark
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [2]:
# Create (or attach to) the Spark session used by the rest of the notebook

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("LightFM Features")
    .getOrCreate()
)

In [3]:
# Load train dataframe (ORC files read into a Spark DataFrame)

# Base location of the datasets; later cells reuse it for the candidate inputs
# and the feature outputs.
path_to_df = 'gs://thesis_apc_bucket/df_data2'

df_data = spark.read.orc(path_to_df + '/df_train.orc')

In [4]:
# Drop duplicate rows in pid and tid columns, so every (pid, tid) pair appears once
# and the counting loop further down adds at most 1 per playlist for any track pair.
# (pid / tid are presumably playlist id / track id — TODO confirm against the dataset schema.)

df_data = df_data.dropDuplicates(['pid', 'tid'])

In [5]:
# Get the number of tracks (only the track count is computed in this cell).
# tids are used directly as list indices further down, so the count is max(tid) + 1;
# this assumes tids are non-negative integers starting at 0 — TODO confirm.

tid_max = df_data.agg({'tid': 'max'}).collect()[0]['max(tid)']

num_tracks = tid_max + 1

In [6]:
# Display the track count computed above
num_tracks

995382

In [8]:
# Pull the de-duplicated Spark frame into a local pandas DataFrame;
# the counting loops below iterate over it in plain Python.
data = df_data.toPandas()

In [10]:
%%time

# Count single-track occurrences and pairwise co-occurrences within playlists

from collections import defaultdict
from pyspark.sql.functions import col

# co_occurrence_2[a][b]: how many pid groups contain both track a and track b
# (updated in both directions, so the table is symmetric)
co_occurrence_2 = [defaultdict(int) for _ in range(num_tracks)]
# occurrence[a]: how many rows of `data` reference track a
occurrence = [0] * num_tracks

for playlist_no, (_, playlist_rows) in enumerate(data.groupby('pid')):
    if playlist_no % 10000 == 0:
        print(playlist_no / 10000)  # progress marker every 10,000 playlists
    tids = list(playlist_rows.tid)
    for pos, first in enumerate(tids):
        occurrence[first] += 1
        # Pair `first` with every track that comes after it in this playlist
        for second in tids[pos + 1:]:
            co_occurrence_2[first][second] += 1
            co_occurrence_2[second][first] += 1

0.0
1.0
2.0
3.0
4.0
5.0
6.0
7.0
8.0
9.0
10.0
11.0
12.0
13.0
14.0
15.0
16.0
17.0
18.0
19.0
CPU times: user 6min 47s, sys: 8.8 s, total: 6min 56s
Wall time: 6min 55s


In [30]:
# Group pids by tid: data_tracks[tid] is the list of pids whose rows in `data`
# reference that tid (a pandas Series indexed by tid, read by find_co_occurrence_3).

data_tracks = data.groupby('tid').pid.apply(list)

In [31]:
# Define function to get co-occurrence-3 values from data

def find_co_occurrence_3(i, j, k):
    """Return how many pids appear in the pid lists of all three tids i, j and k.

    The pid lists are looked up in the notebook-level ``data_tracks`` Series;
    a tid missing from that Series raises ``KeyError``.
    """
    shared_pids = set(data_tracks[i]).intersection(data_tracks[j], data_tracks[k])
    return len(shared_pids)

In [37]:
def get_f(i, f):
    """Apply the aggregate ``f`` to the sequence ``i``; an empty sequence yields -1."""
    return f(i) if len(i) > 0 else -1
    
def create_co_occurrence_features(df):
    """Add co-occurrence summary features to every (pid, tid) candidate row.

    For each row, the candidate ``tid`` is compared with the tids that the
    same ``pid`` has in the notebook-level ``data`` frame (its seed tracks):

    * ``co_occurrence_{max,min,mean,median}`` summarise the pairwise counts
      ``co_occurrence_2[tid][seed]`` over all seed tracks.
    * ``co_occurrence_norm_{max,min,mean,median}`` summarise, over consecutive
      seed pairs ``(a, b)``, the ratio
      ``find_co_occurrence_3(tid, a, b) / co_occurrence_2[tid][a]``
      (0 when that pairwise count is 0).

    A row with nothing to summarise (no seed tracks, or fewer than two for the
    normalized features) gets -1 in the affected columns.

    Uses the notebook globals ``data``, ``co_occurrence_2`` and ``spark`` and
    the helpers ``get_f`` and ``find_co_occurrence_3``. The global count
    tables are only read, never modified.

    Parameters
    ----------
    df : Spark DataFrame with at least ``pid`` and ``tid`` columns. It is
        converted with ``toPandas()`` and processed locally.

    Returns
    -------
    Spark DataFrame with the input columns plus the eight feature columns.
    """
    df = df.toPandas()

    # Seed tracks per playlist, restricted to the playlists present in df
    pids = df.pid.unique()
    seed = data[data.pid.isin(pids)]
    tid_seed = seed.groupby('pid').tid.apply(list)

    co_occurrence_seq = []
    print('Creating features...')
    count = 0
    for pid, tid in df[['pid', 'tid']].values:
        # The combined pid/tid array may come back as floats, so restore an int index
        tid = int(tid)
        count += 1
        if (count % 1000000 == 0):
            print(count / 1000000)
        tracks = tid_seed.get(pid, [])
        pair_counts = co_occurrence_2[tid]
        # .get() instead of [] on purpose: indexing a defaultdict stores a 0
        # entry for every missing key, which would keep growing the global table.
        co_occurrence_seq.append(np.array([pair_counts.get(i, 0) for i in tracks]))

    df['co_occurrence_max'] = [get_f(i, np.max) for i in co_occurrence_seq]
    df['co_occurrence_min'] = [get_f(i, np.min) for i in co_occurrence_seq]
    df['co_occurrence_mean'] = [get_f(i, np.mean) for i in co_occurrence_seq]
    df['co_occurrence_median'] = [get_f(i, np.median) for i in co_occurrence_seq]

    co_occurrence_seq = []
    print('Creating normalized features...')
    count = 0
    for pid, tid in df[['pid', 'tid']].values:
        tid = int(tid)
        count += 1
        if (count % 1000000 == 0):
            print(count / 1000000)
        tracks = tid_seed.get(pid, [])
        pair_counts = co_occurrence_2[tid]
        # Collect in a plain list and convert once; np.append copies the whole
        # array on every call.
        ratios = []
        for k in range(len(tracks) - 1):
            pair_count = pair_counts.get(tracks[k], 0)
            if pair_count > 0:
                ratios.append(find_co_occurrence_3(tid, tracks[k], tracks[k + 1]) / pair_count)
            else:
                ratios.append(0)
        # Explicit float dtype matches the float array the values were appended to before
        co_occurrence_seq.append(np.array(ratios, dtype=float))

    df['co_occurrence_norm_max'] = [get_f(i, np.max) for i in co_occurrence_seq]
    df['co_occurrence_norm_min'] = [get_f(i, np.min) for i in co_occurrence_seq]
    df['co_occurrence_norm_mean'] = [get_f(i, np.mean) for i in co_occurrence_seq]
    df['co_occurrence_norm_median'] = [get_f(i, np.median) for i in co_occurrence_seq]

    res = spark.createDataFrame(df)
    return res

In [38]:
# Load candidates: the three (pid, tid) candidate sets that will receive features.
# Note that df_train / df_val here hold the 'ii' / 'iii' candidate files,
# not the training interactions loaded into df_data earlier.

import pandas as pd

path_to_df = 'gs://thesis_apc_bucket/df_data2'

df_train = spark.read.orc(path_to_df + '/ii_candidate.orc')
df_val = spark.read.orc(path_to_df + '/iii_candidate.orc')
df_test = spark.read.orc(path_to_df + '/test_candidate.orc')

In [None]:
%%time

# Replace the 'ii' candidate frame with its feature-augmented version.
# NOTE(review): the name is rebound to the function's own output, so re-running this cell
# feeds the augmented frame back in; the recorded run took several hours.
df_train = create_co_occurrence_features(df_train)

Creating features...
1.0
2.0
3.0
4.0
5.0
6.0
7.0
Creating normalized features...
1.0
2.0
3.0
4.0
5.0
6.0
7.0
CPU times: user 5h 50min 19s, sys: 1min 52s, total: 5h 52min 12s
Wall time: 5h 49min 15s


In [48]:
%%time

# Sanity-check the generated features through Spark's summary statistics
df_train.describe().show()

+-------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+----------------------+----------------------+-----------------------+-------------------------+
|summary|              pid|              tid| co_occurrence_max|co_occurrence_min|co_occurrence_mean|co_occurrence_median|co_occurrence_norm_max|co_occurrence_norm_min|co_occurrence_norm_mean|co_occurrence_norm_median|
+-------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+----------------------+----------------------+-----------------------+-------------------------+
|  count|          7987156|          7987156|           7987156|          7987156|           7987156|             7987156|               7987156|               7987156|                7987156|                  7987156|
|   mean|170564.1963632612|525140.5247017837|101.83391960793054|6.523235429482034|29.011425799467858|  20.682589209475815|  

In [49]:
# Persist the 'ii' candidate features as ORC; mode('overwrite') replaces any existing output at this path
df_train.write.mode('overwrite').orc(path_to_df + '/ii_co_occurrence_features')

In [None]:
%%time

# Replace the 'iii' candidate frame with its feature-augmented version.
# NOTE(review): the name is rebound to the function's own output, so re-running this cell
# feeds the augmented frame back in; the recorded run took several hours.
df_val = create_co_occurrence_features(df_val)

Creating features...
1.0
2.0
3.0
4.0
5.0
6.0
7.0
Creating normalized features...
1.0
2.0
3.0
4.0
5.0
6.0
7.0
CPU times: user 5h 47min 54s, sys: 1min 40s, total: 5h 49min 35s
Wall time: 5h 47min 3s


In [None]:
%%time

# Preview the first rows of the 'iii' candidates together with their new feature columns
df_val.show()

+--------+------+------+-----------------+-----------------+------------------+--------------------+----------------------+----------------------+-----------------------+-------------------------+
|     pid|   tid|target|co_occurrence_max|co_occurrence_min|co_occurrence_mean|co_occurrence_median|co_occurrence_norm_max|co_occurrence_norm_min|co_occurrence_norm_mean|co_occurrence_norm_median|
+--------+------+------+-----------------+-----------------+------------------+--------------------+----------------------+----------------------+-----------------------+-------------------------+
|243154.0|511264| false|               -1|               -1|              -1.0|                -1.0|                  -1.0|                  -1.0|                   -1.0|                     -1.0|
|243154.0|511250| false|               -1|               -1|              -1.0|                -1.0|                  -1.0|                  -1.0|                   -1.0|                     -1.0|
|243154.0|51589

In [None]:
# Persist the 'iii' candidate features as ORC; mode('overwrite') replaces any existing output at this path
df_val.write.mode('overwrite').orc(path_to_df + '/iii_co_occurrence_features')

In [None]:
%%time

# Replace the test candidate frame with its feature-augmented version.
# NOTE(review): the name is rebound to the function's own output, so re-running this cell
# feeds the augmented frame back in; the recorded run took several hours.
df_test = create_co_occurrence_features(df_test)

Creating features...
1.0
2.0
3.0
4.0
5.0
6.0
7.0
Creating normalized features...
1.0
2.0
3.0
4.0
5.0
6.0
CPU times: user 5h 34min 6s, sys: 1min 43s, total: 5h 35min 49s
Wall time: 5h 33min 16s


In [None]:
%%time

# Preview the first rows of the test candidates together with their new feature columns
df_test.show()

+--------+------+-----------------+-----------------+------------------+--------------------+----------------------+----------------------+-----------------------+-------------------------+
|     pid|   tid|co_occurrence_max|co_occurrence_min|co_occurrence_mean|co_occurrence_median|co_occurrence_norm_max|co_occurrence_norm_min|co_occurrence_norm_mean|co_occurrence_norm_median|
+--------+------+-----------------+-----------------+------------------+--------------------+----------------------+----------------------+-----------------------+-------------------------+
|242381.0|519900|              188|                3|              99.4|                93.0|    0.3333333333333333|    0.2465753424657534|    0.28003861288218335|       0.2701228878648233|
|242381.0|512343|               47|                0|              18.8|                12.0|                   0.5|                   0.0|    0.21985815602836878|      0.18971631205673758|
|242381.0|543652|               37|               

In [None]:
%%time

# Write all three feature sets as ORC, replacing any existing output.
# The 'ii' and 'iii' outputs go to the same paths already written by the earlier
# single-frame write cells, so those writes are repeated here.
df_train.write.mode('overwrite').orc(path_to_df + '/ii_co_occurrence_features')
df_val.write.mode('overwrite').orc(path_to_df + '/iii_co_occurrence_features')
df_test.write.mode('overwrite').orc(path_to_df + '/test_co_occurrence_features')

CPU times: user 3.47 ms, sys: 9 µs, total: 3.48 ms
Wall time: 8.11 s
