In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from apyori import apriori
import networkx as nx
%matplotlib inline

In [None]:
# Load the playlist sample from a CSV expected in the current working directory.
data = pd.read_csv("Sample_from_Million_Playlist.csv")

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Parallel lists: one artist URI / artist name per row of `data`.
artist_uri = data["artist_uri"].tolist()
artist_name = data["artist_name"].tolist()

# Lookup from artist name to URI; if a name occurs with several URIs the
# URI from the last matching row is the one that is kept.
name2uri = dict(zip(artist_name, artist_uri))

In [None]:
# Number of entries in artist_uri (one per row of `data`).
len(artist_uri)

In [None]:
# Distinct values of the `pid` column (used below as the playlist key).
data["pid"].unique()

In [None]:
# Count missing values in each column.
data.isnull().sum()

In [None]:
# Number of distinct track names in the sample.
len(data["track_name"].unique())

In [None]:
# Tally rows per artist once and derive both plot axes from that single result.
artist_counts = data['artist_name'].value_counts()
values = list(artist_counts)
names = artist_counts.index.tolist()

# Horizontal bar chart restricted to the first 25 entries of the tally.
ax = sns.barplot(y=names[:25], x=values[:25])
ax.set_title('Artists with most featured songs across all dataset')
ax.set_xlabel('Total songs across dataset')

In [None]:
# Group artist names by playlist id, preserving row order inside each playlist.
# Each value is the list of artists (with repeats) that apriori later treats
# as one transaction.
playlist2artists = {}

# The loop variable is deliberately NOT called `artist_name`: that name is the
# full list of artist names built in an earlier cell and must not be overwritten.
for pid, playlist_artist in zip(data.pid.tolist(), data.artist_name.tolist()):
    playlist2artists.setdefault(pid, []).append(playlist_artist)

In [None]:
# Inspect the artists collected under key 0 (raises KeyError if no row has pid 0).
playlist2artists[0]

In [None]:
import re
import string

def clean_text(text):
    """Normalise an artist name into a lowercase, punctuation-free token.

    Steps, in order:
      1. lowercase the input;
      2. delete every character matched by the punctuation class below;
      3. trim ASCII digits from both ends (interior digits are kept);
      4. delete any remaining apostrophes, double quotes and ampersands.

    Whitespace and underscores are left untouched.
    """
    cleaned = re.sub(r"[-()\"#/$@;:<>{}`+=~|.!?,]", "", text.lower())
    cleaned = cleaned.strip(string.digits)
    # Removed only after the digit trim, so a leading quote still shields
    # the digits that follow it.
    for unwanted in ("'", '"', "&"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

In [None]:
# Quick manual check of clean_text. The literal opens with four quote
# characters, i.e. a triple-quoted string whose first character is a stray `"`;
# clean_text removes that quote along with the `&`.
clean_text(""""macklemore_&_ryan_lewis""")

In [None]:
# Mine pairwise artist co-occurrence rules, treating each playlist as one
# transaction. apriori() hands back a one-shot generator, so the result is
# materialised into a list: cells that loop over `association_rules` can then
# be re-run without silently seeing an exhausted iterator.
association_rules = list(apriori(
    playlist2artists.values(),
    min_support=0.0230,
    min_confidence=0.0230,
    min_lift=3,
    max_length=2,
    # NOTE(review): `min_length` does not appear among apyori's documented
    # keyword arguments -- confirm it has any effect before relying on it.
    min_length=2,
))

In [None]:
# Collect every artist that occurs in at least one mined rule, together with
# its Spotify URI (looked up in name2uri, so a missing name raises KeyError).
artists = set()
related_name2uri = {}

for item in association_rules:
    # item[0] is the itemset of the rule; order inside it is irrelevant here,
    # so simply visit each member instead of indexing a fixed pair.
    for artist in item[0]:
        artists.add(artist)
        related_name2uri[artist] = name2uri[artist]

In [None]:
# Artist names that appear in at least one mined rule.
related_name2uri.keys()

In [None]:
import os
import pickle

import requests

# The bearer token must never live in the notebook: read it from the
# environment instead. (Any token that was previously committed here should be
# treated as leaked and revoked.)
spotify_token = os.environ.get("SPOTIFY_ACCESS_TOKEN")
if not spotify_token:
    raise RuntimeError(
        "Set the SPOTIFY_ACCESS_TOKEN environment variable to a valid "
        "Spotify Web API bearer token before running this cell."
    )
auth_headers = {"Authorization": f"Bearer {spotify_token}"}

# One [uri, name, related_artist_names] entry per artist found in the rules.
related_uri_name_artists = []

for z, (name, uri) in enumerate(related_name2uri.items(), start=1):
    # URIs are split on ":" and the third field is used as the artist id.
    uri_key = uri.split(":")[2]
    response = requests.get(
        f"https://api.spotify.com/v1/artists/{uri_key}/related-artists",
        headers=auth_headers,
        timeout=30,  # do not hang the kernel on a stalled connection
    )
    print(response)
    # Fail with the HTTP error itself (e.g. 401 for an expired token) rather
    # than a confusing KeyError on the missing "artists" field.
    response.raise_for_status()
    artist_list = [artist["name"] for artist in response.json()["artists"]]
    related_uri_name_artists.append([uri, name, artist_list])
    print(len(artist_list))
    print(z)

# Cache the API results on disk so later cells can reload them.
filename = 'spotify_ground_truth_data.pkl'
with open(filename, 'wb') as outfile:
    pickle.dump(related_uri_name_artists, outfile)

In [None]:
import pickle

# Reload the cached related-artist data written by the fetch cell.
# NOTE: pickle.load can execute arbitrary code -- only load a file that this
# notebook itself produced.
with open("spotify_ground_truth_data.pkl", 'rb') as infile:
    new_dict = pickle.load(infile)

# Show the first [uri, name, related_artists] entry as a sanity check.
new_dict[0]

In [None]:
blank = "_"

# Same [uri, name, related_artists] entries as new_dict, but with every space
# in the artist names replaced by an underscore.
clear_dict = [
    [
        entry[0],
        entry[1].replace(" ", blank),
        [related.replace(" ", blank) for related in entry[2]],
    ]
    for entry in new_dict
]


In [None]:
# Flatten the related-artist lists of every entry into one list (duplicates
# kept). This reads the in-memory API results, not the reloaded pickle.
all_artists = [
    related
    for _, _, related_names in related_uri_name_artists
    for related in related_names
]

In [None]:
# How many related artists returned by the API are themselves rule artists.
len(set(all_artists) & set(related_name2uri))

In [None]:
# Re-mine the rules for the graph-building cell below. The result is stored as
# a list because apriori() yields a one-shot generator: iterating a second
# time (e.g. when re-running the next cells) would otherwise produce nothing.
association_rules = list(apriori(
    playlist2artists.values(),
    min_support=0.0230,
    min_confidence=0.0230,
    min_lift=3,
    max_length=2,
    # NOTE(review): `min_length` does not appear among apyori's documented
    # keyword arguments -- confirm it has any effect before relying on it.
    min_length=2,
))

In [None]:
# Show the object produced by the apriori() call in the previous cell.
association_rules

In [None]:

# Edge table for the networkx plot: parallel lists, one entry per mined pair.
node1 = []
node2 = []
weight = []

# A single Cypher CREATE statement is accumulated here. Every fragment ends
# with a trailing comma; the final comma is trimmed by a later cell.
query = "CREATE "
blank = "_"

# (source_id, target_id) -> support, used to emit each sup_rel edge only once.
# Tuple keys avoid the collisions that plain string concatenation allows
# ("ab" + "c" == "a" + "bc").
relations = {}
# Cypher identifiers of the artist nodes already declared in `query`.
nodes = set()

curr = 0  # number of rules visited
z = 0


def _cypher_string(value):
    """Return `value` as a double-quoted Cypher string literal.

    Backslashes and double quotes are escaped so that an artist name
    containing either cannot terminate the literal early.
    """
    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


for item in association_rules:
    curr += 1
    pair = item[0]
    if len(pair) != 2:
        # Only two-artist itemsets map onto a single graph edge.
        continue

    # Sort for a deterministic first/second assignment: iteration order of
    # the itemset itself is arbitrary and may differ between kernel runs.
    raw_first, raw_second = sorted(pair)
    support = float(item[1])

    # Each ordered statistic is (antecedent, consequent, confidence, lift).
    # Index the confidences by their single-artist antecedent so that each
    # conf_rel edge carries the confidence of exactly that direction, instead
    # of assuming the statistics are listed in the same order as the pair.
    confidence_from = {
        next(iter(stat[0])): float(stat[2])
        for stat in item[2]
        if len(stat[0]) == 1
    }
    first_confidence = confidence_from.get(raw_first)
    second_confidence = confidence_from.get(raw_second)

    # `first`/`second` are the display names (spaces -> underscores);
    # `first_item`/`second_item` are their cleaned forms used as identifiers.
    first = blank.join(raw_first.split(" "))
    second = blank.join(raw_second.split(" "))
    first_item = clean_text(first)
    second_item = clean_text(second)

    if first_item not in nodes:
        query += f"({first_item}:artist {{name: {_cypher_string(first)}}}),"
        nodes.add(first_item)
    if second_item not in nodes:
        query += f"({second_item}:artist {{name: {_cypher_string(second)}}}),"
        nodes.add(second_item)

    node1.append(first_item)
    node2.append(second_item)
    # Prefer the first -> second confidence; fall back to the reverse direction
    # if that rule was filtered out, so the three lists stay aligned.
    weight.append(first_confidence if first_confidence is not None else second_confidence)

    # Support is symmetric, so it is stored on an edge in each direction.
    if (first_item, second_item) not in relations:
        query += f"({first_item})-[:sup_rel {{supp: {support}}}]->({second_item}),"
        relations[(first_item, second_item)] = support
    if (second_item, first_item) not in relations:
        query += f"({second_item})-[:sup_rel {{supp: {support}}}]->({first_item}),"
        relations[(second_item, first_item)] = support

    # Confidence is directional: one edge per rule direction that survived
    # the apriori thresholds.
    if first_confidence is not None:
        query += f"({first_item})-[:conf_rel {{conf: {first_confidence}}}]->({second_item}),"
    if second_confidence is not None:
        query += f"({second_item})-[:conf_rel {{conf: {second_confidence}}}]->({first_item}),"


In [None]:
# Drop the trailing comma left by the query builder. rstrip keeps this cell
# idempotent: running it twice no longer eats the closing parenthesis.
query = query.rstrip(",")
query

In [None]:
from neo4j import GraphDatabase

In [None]:
import os

# Connection settings for the Neo4j instance. Each value can be overridden
# through the environment so real credentials never have to be written into
# the notebook; the fallbacks are the original local-development values.
uri             = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
userName        = os.environ.get("NEO4J_USER", "neo4j")
password        = os.environ.get("NEO4J_PASSWORD", "123")
graphDB_Driver  = GraphDatabase.driver(uri, auth=(userName, password))
# Query matching every node labelled `artist`.
cqlNodeQuery          = "MATCH (x:artist) RETURN x"

In [None]:
# Open a session on the driver; the `with` block closes it on exit.
with graphDB_Driver.session() as graphDB_Session:

    # Send the accumulated CREATE statement held in `query` (artist nodes plus
    # the sup_rel / conf_rel relationships between them) to the database.

    graphDB_Session.run(query)



In [None]:
import pandas as pd

# Edge table for networkx, one row per mined artist pair.
# NOTE: the 'distance' column holds the rule confidence appended to `weight`
# by the query-building cell, not a geometric distance.
df = pd.DataFrame({
    'node_1': node1,
    'node_2': node2,
    'distance': weight,
})
df.head()

In [None]:
# Build an undirected graph from the edge table. from_pandas_edgelist already
# adds both endpoints of every edge, so no separate add_nodes_from is needed.
G = nx.from_pandas_edgelist(df=df, source='node_1', target='node_2', edge_attr='distance')

edge_all = list(G.edges())

# Fixed seed so the force-directed layout (and therefore the figure) is the
# same on every run.
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(40, 30))
nx.draw_networkx_nodes(G, pos, node_size=10)                      # draw nodes
nx.draw_networkx_edges(G, pos, edgelist=edge_all, width=0.08)     # draw edges


In [None]:
# This cell originally read from a variable named `edges`, which is never
# defined anywhere in the notebook (NameError on a fresh kernel). The only edge
# table built above is `df`, so the graph is constructed from it.
# NOTE(review): if `edges` was meant to be a filtered subset of `df`, build
# that subset explicitly here.
G = nx.from_pandas_edgelist(df=df, source='node_1', target='node_2', edge_attr='distance')

edge_all = list(G.edges())

# Fixed seed keeps the force-directed layout reproducible between runs.
pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(40, 30))
nx.draw_networkx_nodes(G, pos, node_size=10)                      # draw nodes
nx.draw_networkx_edges(G, pos, edgelist=edge_all, width=0.08)     # draw edges