This code can be used to create sqllite db file from the preprocessed csv data files.

In [59]:
import pandas as pd
from sqlalchemy import create_engine

In [60]:
# FILE_NAME = 'lastfm.db'
FILE_NAME = 'lastfm_EuropeOnly.db'

In [61]:
listenings = pd.read_csv('preprocessed_data/listenings_with_years_and_users.csv')
# listenings = pd.read_csv('preprocessed_data/listenings_with_years_and_tags.csv')

In [62]:
europeList = ['United Kingdom', 'Finland', 'Germany', 'Sweden', 'Canada', 'Switzerland',
    'Norway', 'Poland', 'Netherlands', 'Belgium', 'Italy', 'Romania',
       'Russian Federation', 'Spain',
       'Ireland','Czech Republic','France', 'China',
       'Austria', 'Slovakia', 'Malta',
       'Latvia', 'Estonia', 'Lithuania',
      'Iceland', 'Bulgaria',
      'Portugal', 'Croatia',  'Serbia', 'Greece', 'Hungary',
       'Bosnia and Herzegovina', 'Slovenia',
       'Belarus',  'Macedonia']
# we rename the European countries so that we have a bigger geographical group which we can compare to the United States
listenings['country'] = listenings['country'].apply(lambda x: 'Europe' if x in europeList else x)

In [63]:
listenings.country.unique()

array(['Japan', 'Armenia', 'Turkey', 'Europe', nan, 'United States',
       'Mexico', 'United States Minor Outlying Islands', 'Venezuela',
       'Australia', 'Morocco', 'New Zealand', 'Peru', 'Brazil',
       'British Indian Ocean Territory', 'Singapore', 'Argentina',
       'Chile', 'Colombia', 'Israel', 'Thailand', 'Algeria', 'India',
       "Cote D'Ivoire", 'Antarctica', 'Trinidad and Tobago',
       'Congo, the Democratic Republic of the',
       'Northern Mariana Islands', 'Zimbabwe', 'Nicaragua', 'Tunisia',
       "Korea, Democratic People's Republic of", 'Netherlands Antilles'],
      dtype=object)

In [64]:
listenings = listenings[listenings['country']=='Europe']

In [65]:
listenings = listenings[listenings['year']>=2005]
# we create a unique identifier from artist and track name
listenings['art_tra_name'] = listenings['artname'] + '@' + listenings['traname']
listenings = listenings[listenings['year']!=2010]

In [66]:
listenings.head()

Unnamed: 0,userid,timestamp,year,artname,traname,gender,country,art_tra_name
36076,user_000135,2007-08-13 12:18:46+00:00,2006,Alif Tree,Deadly Species,m,Europe,Alif Tree@Deadly Species
36089,user_000135,2009-02-02 14:49:35+00:00,2005,Alif Tree,I Feel Blue,m,Europe,Alif Tree@I Feel Blue
36090,user_000135,2008-10-14 04:34:24+00:00,2005,Alif Tree,I Feel Blue,m,Europe,Alif Tree@I Feel Blue
36091,user_000135,2008-08-29 20:26:05+00:00,2005,Alif Tree,I Feel Blue,m,Europe,Alif Tree@I Feel Blue
36092,user_000135,2008-08-14 09:20:59+00:00,2005,Alif Tree,I Feel Blue,m,Europe,Alif Tree@I Feel Blue


In [67]:
# get unique songs with release years
nodes = listenings[['art_tra_name','year']].drop_duplicates()
nodes.rename({'art_tra_name':'name','year':'realtime'},axis=1,inplace=True)
# we only have years which we use as dates
nodes['realtime'] = pd.to_datetime(nodes['realtime'], format='%Y')
nodes.sort_values('realtime',inplace=True)
nodes.head()

Unnamed: 0,name,realtime
1470902,The Electric Soft Parade@Stupid Mistake,2005-01-01
1376194,Girls Aloud@Not Tonight Santa,2005-01-01
1376196,Girls Aloud@White Christmas,2005-01-01
1376276,The Futureheads@Man Made (A Mistake),2005-01-01
1376282,Jim Noir@Hotsy Wa Wa Blues,2005-01-01


In [68]:
# primary key
nodes['id_nb'] = range(0, len(nodes))

In [69]:
# we create the intrinsictime column
realtimeDistinct = sorted(list(set(nodes['realtime'])))
realtimeDict = {}
for i,realtimeValue in enumerate(realtimeDistinct):
    realtimeDict[realtimeValue] = i
nodes['intrinsictime'] = nodes['realtime'].apply(lambda x: realtimeDict[x])
# since we only have years the bins are the same as the intrinsictime
nodes['timebin'] = nodes['intrinsictime']
nodes = nodes[nodes['timebin']!=5]
nodes.head()

Unnamed: 0,name,realtime,id_nb,intrinsictime,timebin
1470902,The Electric Soft Parade@Stupid Mistake,2005-01-01,0,0,0
1376194,Girls Aloud@Not Tonight Santa,2005-01-01,1,0,0
1376196,Girls Aloud@White Christmas,2005-01-01,2,0,0
1376276,The Futureheads@Man Made (A Mistake),2005-01-01,3,0,0
1376282,Jim Noir@Hotsy Wa Wa Blues,2005-01-01,4,0,0


In [70]:
nameToIdDict = {}
for name,id_nb in zip(nodes['name'],nodes['id_nb']):
    nameToIdDict[name] = id_nb

stream = listenings[['timestamp','art_tra_name']].copy()
# convert the song names to the id used in other table
stream['node'] = stream['art_tra_name'].apply(lambda x: nameToIdDict[x])
stream['timestamp'] = pd.to_datetime(stream['timestamp'], format='%Y-%m-%d %H:%M:%S%z')
stream['node_origin'] = None
stream.rename({'timestamp':'realtime'},axis=1,inplace=True)

stream.drop('art_tra_name',axis=1,inplace=True)
stream.head()

Unnamed: 0,realtime,node,node_origin
36076,2007-08-13 12:18:46+00:00,17365,
36089,2009-02-02 14:49:35+00:00,8036,
36090,2008-10-14 04:34:24+00:00,8036,
36091,2008-08-29 20:26:05+00:00,8036,
36092,2008-08-14 09:20:59+00:00,8036,


In [71]:
# add intrinsictime collumn
realtimeDistinct = sorted(list(set(stream['realtime'])))
realtimeDict = {}
for i,realtimeValue in enumerate(realtimeDistinct):
    realtimeDict[realtimeValue] = i
stream['intrinsictime'] = stream['realtime'].apply(lambda x: realtimeDict[x])

In [72]:
# create timebin based on years
binDistinct = sorted(list(set(stream['realtime'].apply(lambda x: x.year))))
binDict = {}
for i,binValue in enumerate(binDistinct):
    binDict[binValue] = i
stream['timebin'] = stream['realtime'].apply(lambda x: x.year).apply(lambda x: binDict[x])
stream.head()

Unnamed: 0,realtime,node,node_origin,intrinsictime,timebin
36076,2007-08-13 12:18:46+00:00,17365,,311719,2
36089,2009-02-02 14:49:35+00:00,8036,,830245,4
36090,2008-10-14 04:34:24+00:00,8036,,693386,3
36091,2008-08-29 20:26:05+00:00,8036,,642541,3
36092,2008-08-14 09:20:59+00:00,8036,,627149,3


In [73]:
import sqlite3 as sql

conn = sql.connect(f'preprocessed_data/{FILE_NAME}')
nodes.to_sql('nodes',conn)
stream.to_sql('stream', conn)