# Additional Pre-Processing

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./data/lyrics_data.csv")

In [3]:
spot = pd.read_csv("./data/spotify_data.csv",encoding='latin1')
spot = spot.loc[spot.artistName!="Vigiland"] #vigiland songs are broken
spot.reset_index(drop=True,inplace=True)
spot.head(5)

Unnamed: 0.1,Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2,2019-07-24 09:06,Alli Neumann,Banditen,27701
1,5,2019-07-24 09:10,Kraftklub,Kein Liebeslied,229947
2,6,2019-07-24 09:14,Kraftklub,Dein Lied,235546
3,7,2019-07-24 09:17,Kraftklub,Ich will nicht nach Berlin,202426
4,8,2019-07-24 09:21,Khalid,Better,229320


In [4]:
google = pd.read_csv("./data/google_data.csv",encoding="latin1")
#google = google.drop(["Unnamed: 0"], axis=1)
google.columns = ["search", "words", "searchTime"]
google.reset_index(drop=True,inplace=True)
google.head(5)

Unnamed: 0,search,words,searchTime
0,1,google daten,2020-01-29+20:06:55.328
1,2,select only rows if its value in a particular ...,2020-01-29+19:59:39.879
2,3,r select rows with value in column,2020-01-29+19:59:36.708
3,4,Importing data from a JSON file into R - Stack...,2020-01-29+19:53:49.440
4,5,r read json,2020-01-29+19:53:24.214


In [5]:
tracklist = []
for i in spot.trackName:
    if i in df.song.unique():
        tracklist.append(i)

In [6]:
df = df[df['song'].isin(tracklist)]
df.replace("\n", '', regex=True,inplace=True) #For Line Breaks in Lyrics text
df.reset_index(drop=True,inplace=True)

In [7]:
#spot = spot[spot["trackName"].isin(tracklist)]
#spot.reset_index(drop=True,inplace=True)

In [9]:
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import RDF, FOAF

In [10]:
df.song.replace(' ', '+', regex=True,inplace=True)
df.artist.replace(" ","+",regex=True,inplace=True)

In [11]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Dancing+Queen,/a/abba/dancing+queen_20002554.html,"You can dance, you can jive, having the time o..."
1,ABBA,Lay+All+Your+Love+On+Me,/a/abba/lay+all+your+love+on+me_20002834.html,I wasn't jealous before we met Now every woma...
2,ABBA,SOS,/a/abba/sos_20957357.html,And when I see the sign that points one way T...
3,Adele,Hello,/a/adele/hello_21103519.html,"[Verse 1] Hello, it's me I was wondering if ..."
4,Aerosmith,Somebody,/a/aerosmith/somebody_20004214.html,"I need a lady, not somebody shady Need someon..."


In [16]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

for i in range(0,len(df)):
    df.text[i] = re.sub('\W+',' ', df.text[i] )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/markusberger/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
for i in range(0,len(google)):
    google.words[i] = re.sub('\W+',' ', google.words[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# RDF Graph Generation

In [18]:
g = Graph()
a = Namespace("lyrics/artist/")
s = Namespace("lyrics/song/")
t = Namespace("lyrics/text/")
for i in range(0,len(df)):
    artist = a[df.artist[i]]
    track = s[df.song[i]]
    g.add((artist,RDF.type,FOAF.Artist))
    g.add((artist,FOAF.name,Literal(df.artist[i])))
    g.add((track,RDF.type,FOAF.Song))
    g.add((track,FOAF.title,Literal(df.song[i])))
    g.add((artist, FOAF.sings, track))
    filtered = [word for word in df.text[i].split() if word not in stopwords.words('english')]
    for j in filtered:
        word = t[j]
        g.add((word,RDF.type,FOAF.Word))
        g.add((word,FOAF.value,Literal(j)))
        g.add((track, FOAF.song_contains, word))
        
    #This statement provides information about the progress of the loob
    #Shows all Artists. Last Artist = Zebrahead (ordered alphabetically)
    #print(artist)

    
g.bind("foaf", FOAF)
#g.serialize(destination='lyrics.txt', format='xml')
#g.serialize(destination='lyrics.rdf', format='xml')

print("graph has %s statements." % len(g))

graph has 49657 statements.


In [19]:
g = Graph()
s = Namespace("google/search/")
t = Namespace("google/searchTime/")
w = Namespace("google/words/")
for i in range(0,len(google)):
    search = s[google.search[i]]
    searchTime = t[google.searchTime[i]]
    
    g.add((search,RDF.type,FOAF.Search))
    g.add((search,FOAF.id,Literal(google.search[i])))
    
    g.add((searchTime,RDF.type,FOAF.SearchTime))
    g.add((searchTime,FOAF.time,Literal(google.searchTime[i])))
    
    g.add((search,FOAF.searched_at,searchTime))
    
    filtered = [word for word in google.words[i].split() if word not in stopwords.words('english')]
    for j in filtered:
        word = w[j]
        g.add((word,RDF.type,FOAF.Word))
        g.add((word,FOAF.value,Literal(j)))
        g.add((search, FOAF.search_contains, word))
        
    #This statement provides information about the progress of the loob
    #Shows all Artists. Last Artist = Zebrahead (ordered alphabetically)
    #print(artist)

    
g.bind("foaf", FOAF)
#g.serialize(destination='google_data.txt', format='xml')
#g.serialize(destination='google_data.rdf', format='xml')

print("graph has %s statements." % len(g))

graph has 20881 statements.


In [20]:
spot.artistName.replace(' ', '+', regex=True,inplace=True)
spot.trackName.replace(" ","+",regex=True,inplace=True)
spot.trackName.replace('"',"+",regex=True,inplace=True)
spot.trackName.replace(',',"+",regex=True,inplace=True)


#Remove some values by hand because not possible in RDF:
spot = spot[spot.artistName!="Axwell+/\+Ingrosso"]
#spot = spot[spot.trackName!="Friday+Night<U+202C><U+202C><U+202C"]
#spot.loc["Friday+Night" in spot.trackName]
spot.reset_index(drop=True,inplace=True)

spot_artist = pd.DataFrame(spot.groupby("artistName")["msPlayed"].sum()).reset_index()

In [21]:
h = Graph()
a = Namespace("spotify/artist/")
s = Namespace("spotify/song/")
t1 = Namespace("spotify/tracktime/")
t2 = Namespace("spotify/artisttime/")
for i in range(0,len(spot)):
    artist = a[spot.artistName[i]]
    track = s[spot.trackName[i]]
    tracktime = t1[spot.trackName[i]]
    artisttime = t2[spot.artistName[i]]
    
    h.add((artist,RDF.type,FOAF.Artist))
    h.add((artist,FOAF.name,Literal(spot.artistName[i])))
    
    h.add((track,RDF.type,FOAF.Song))
    h.add((track,FOAF.title,Literal(spot.trackName[i])))
    
    h.add((tracktime,RDF.type,FOAF.Song_Played_ms))
    h.add((tracktime,FOAF.Song_ms,Literal(spot.msPlayed[i])))
    
    h.add((artisttime,RDF.type,FOAF.Artist_Played_ms))
    h.add((artisttime,FOAF.Artist_ms,Literal(int(spot_artist.loc[spot_artist.artistName==spot.artistName[i],"msPlayed"]))))
    
    
    h.add((artist, FOAF.sings, track))
    h.add((track, FOAF.time_listened_song,tracktime))
    h.add((artist, FOAF.time_listened_artist,artisttime))

        
    #This statement provides information about the progress of the loob
    #Shows all Artists. Last Artist = Zebrahead (ordered alphabetically)
    #print(artist)

    
h.bind("foaf", FOAF)
#h.serialize(destination='times.txt', format='xml')
#h.serialize(destination='times.rdf', format='xml')

print("graph has %s statements." % len(h))

graph has 8538 statements.


# Full RDF
need to execute both parts!


In [22]:
g = Graph()
a = Namespace("MCSB/artist/")
s = Namespace("MCSB/song/")
w = Namespace("MCSB/word/")
t = Namespace("MCSB/searchTime/")

markus = URIRef("MCSB/Markus")
g.add( (markus, RDF.type, FOAF.User) )
g.add( (markus, FOAF.name, Literal("Markus")))
g.add( (markus, FOAF.surname, Literal("Unknown")))
g.add( (markus, FOAF.height, Literal(185)))
g.add( (markus, FOAF.age, Literal(22)))

for i in range(0,len(df)):
    artist = a[df.artist[i]]
    track = s[df.song[i]]
    g.add((artist,RDF.type,FOAF.Artist))
    g.add((artist,FOAF.name,Literal(df.artist[i])))
    g.add((track,RDF.type,FOAF.Song))
    g.add((track,FOAF.title,Literal(df.song[i])))
    g.add((artist, FOAF.performs, track))
    filtered = [word for word in df.text[i].split() if word not in stopwords.words('english')]
    for j in filtered:
        word = w[j]
        g.add((word,RDF.type,FOAF.Word))
        g.add((word,FOAF.value,Literal(j)))
        g.add((track, FOAF.song_contains, word))

In [23]:
for i in range(0,len(google)):
    search = w[str(google.search[i])]
    searchTime = t[google.searchTime[i]]
    
    g.add((search,RDF.type,FOAF.Search))
    g.add((search,FOAF.id,Literal(google.search[i])))
    g.add((search,FOAF.time,Literal(google.searchTime[i])))
    
    
    g.add((markus,FOAF.searched,search))
    
    filtered = [word for word in google.words[i].split() if word not in stopwords.words('english')]
    for j in filtered:
        word = w[j]
        g.add((word,RDF.type,FOAF.Word))
        g.add((word,FOAF.value,Literal(j)))
        g.add((search, FOAF.search_contains, word))

        
for i in range(0,len(spot)):
    artist = a[spot.artistName[i]]
    track = s[spot.trackName[i]]
    
    g.add((artist,RDF.type,FOAF.Artist))
    g.add((artist,FOAF.name,Literal(spot.artistName[i])))
    g.add((artist,FOAF.artist_ms,Literal(int(spot_artist.loc[spot_artist.artistName==spot.artistName[i],"msPlayed"]))))
    
    g.add((track,RDF.type,FOAF.Song))
    g.add((track,FOAF.title,Literal(spot.trackName[i])))
    g.add((track,FOAF.song_ms,Literal(spot.msPlayed[i])))
    
    
    g.add((artist, FOAF.performs, track))
    g.add((markus, FOAF.song_listened,track))
    g.add((markus, FOAF.artist_listened,artist))


    
g.bind("foaf", FOAF)
#g.serialize(destination='MCSB.txt', format='xml')
g.serialize(destination='MCSB.rdf', format='xml')

print("graph has %s statements." % len(g))

graph has 81855 statements.
