In [137]:
import hdf5_getters
import sqlite3 as sq
import os
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression

In [6]:
# Read one field from a single track file as a smoke test of hdf5_getters.
h5 = hdf5_getters.open_h5_file_read("TRAAAAW128F429D538.h5")
try:
    # NOTE: despite its name, `duration` holds the artist term weights
    # (get_artist_terms_weight), not the track duration. The name is kept
    # so any cell that already refers to it keeps working.
    duration = hdf5_getters.get_artist_terms_weight(h5)
finally:
    # Release the file handle even if the getter raises.
    h5.close()
print(duration)


[ 1.          0.89793596  0.88426185  0.84262975  0.84256301  0.83239282
  0.82577707  0.79859195  0.7431759   0.73850237  0.72505245  0.71389955
  0.67049417  0.65697231  0.65105613  0.65105612  0.65105597  0.65105592
  0.65105547  0.65105532  0.65105508  0.65105506  0.65105461  0.65105427
  0.65105376  0.65104997  0.6364043   0.63334971  0.61973455  0.61889383
  0.61419433  0.59579116  0.56220197  0.55067233  0.52897541  0.49021215
  0.38341077]


## Additional Files

### Releases per Year per Genre

In [7]:
def db_list_tables(path):
    """Print the CREATE statement of every table in a SQLite database.

    Parameters
    ----------
    path : str
        Filesystem path of the SQLite database file to inspect.

    Returns
    -------
    None
        Everything is written to stdout: a "Connecting" line, a header
        line, then one 1-tuple per table holding its ``sql`` definition as
        stored in ``sqlite_master``.
    """
    print ("Connecting to DB at path {}".format(path))
    conn = sq.connect(path)
    try:
        c = conn.cursor()

        c.execute("select sql from sqlite_master where type = 'table'")

        print ("Tables in database:")
        for row in c.fetchall():
            print (row)
    finally:
        # Always release the connection; previously it stayed open for the
        # lifetime of the kernel (one leaked handle per call).
        conn.close()

In [8]:
# Locations of the three SQLite files shipped in ./AdditionalFiles.
_additional_dir = os.path.join(".", "AdditionalFiles")
path_artists_genres = os.path.join(_additional_dir, "subset_artist_term.db")
path_artists_similarity = os.path.join(_additional_dir, "subset_artist_similarity.db")
path_track_metadata = os.path.join(_additional_dir, "subset_track_metadata.db")

# Print the schema of each database, in the same order as the paths above.
for _db_path in (path_artists_genres, path_artists_similarity, path_track_metadata):
    db_list_tables(_db_path)

Connecting to DB at path .\AdditionalFiles\subset_artist_term.db
Tables in database:
('CREATE TABLE artists (artist_id text PRIMARY KEY)',)
('CREATE TABLE terms (term text PRIMARY KEY)',)
('CREATE TABLE artist_term (artist_id text, term text, FOREIGN KEY(artist_id) REFERENCES artists(artist_id), FOREIGN KEY(term) REFERENCES terms(term) )',)
('CREATE TABLE mbtags (mbtag text PRIMARY KEY)',)
('CREATE TABLE artist_mbtag (artist_id text, mbtag text, FOREIGN KEY(artist_id) REFERENCES artists(artist_id), FOREIGN KEY(mbtag) REFERENCES mbtags(mbtag) )',)
Connecting to DB at path .\AdditionalFiles\subset_artist_similarity.db
Tables in database:
('CREATE TABLE artists (artist_id text PRIMARY KEY)',)
('CREATE TABLE similarity (target text, similar text, FOREIGN KEY(target) REFERENCES artists(artist_id), FOREIGN KEY(similar) REFERENCES artists(artist_id) )',)
Connecting to DB at path .\AdditionalFiles\subset_track_metadata.db
Tables in database:
('CREATE TABLE songs (track_id text PRIMARY KEY, tit

In [12]:
# Get the number of songs per year per artist.
conn = sq.connect(path_track_metadata)
try:
    c = conn.cursor()
    # - WHERE filters rows before grouping; the original HAVING applied the
    #   same non-aggregate condition only after the groups were built.
    # - artist_id is added as a secondary sort key so rows sharing a year
    #   come back in a defined order (ORDER BY year alone leaves ties open).
    # - artist_name is not part of the GROUP BY, so SQLite takes it from one
    #   row of each (artist_id, year) group.
    # - NOTE(review): year > 0 presumably drops tracks with no known year.
    c.execute("""SELECT artist_id, artist_name, year, count(*) as cnt
                FROM songs
                WHERE year > 0
                GROUP BY artist_id, year
                ORDER BY year, artist_id""")

    res_art_song_year = c.fetchall()
finally:
    # The rows are already in memory, so the connection can be released.
    conn.close()

# Preview; slicing (unlike indexing 0..9) is safe with fewer than 10 rows.
for row in res_art_song_year[:10]:
    print (row)

('ARE36MM1187B991E50', 'Blind Lemon Jefferson', 1926, 2)
('ARE36MM1187B991E50', 'Blind Lemon Jefferson', 1927, 1)
('ARRU9GR1187FB43F56', 'Blind Willie McTell', 1927, 1)
('ARWQ2DR1187B98FF28', 'Ma Rainey', 1927, 1)
('ARTDUXM1187B9899ED', 'Charley Patton', 1929, 1)
('ARE2QID1187B98FA19', 'Sleepy John Estes', 1930, 1)
('ARTDUXM1187B9899ED', 'Charley Patton', 1930, 1)
('ARTDUXM1187B9899ED', 'Charlie Patton', 1934, 1)
('ARE2QID1187B98FA19', 'Sleepy John Estes', 1935, 2)
('AR9HABI1187FB3ACAA', 'Red Foley', 1936, 1)


#### Genre from EchoNest

In [13]:
# Get the artist -> genre mapping (EchoNest terms).
conn = sq.connect(path_artists_genres)
try:
    c = conn.cursor()
    c.execute("""SELECT artist_id, term
                  FROM artist_term""")

    res_art_genre = c.fetchall()
finally:
    # The rows are already in memory, so the connection can be released.
    conn.close()

# Preview; slicing is safe even if fewer than 5 rows were returned.
for row in res_art_genre[:5]:
    print (row)

('AR009211187B989185', 'lovers rock')
('AR009211187B989185', 'reggae')
('AR009211187B989185', 'roots reggae')
('AR009211187B989185', 'uk garage')
('AR009211187B989185', 'ballad')


In [14]:
# Merge lists: count number of songs per genre (if an artist has 2 genres,
# the songs are counted once for each genre).
# Passing `columns=` to the constructor also works for an empty result,
# where assigning `.columns` afterwards raises a length-mismatch error.
df_art_song_year = pd.DataFrame(
    res_art_song_year,
    columns=["artist_id", "artist_name", "year", "tracks"],
)
df_art_song_year.head()

Unnamed: 0,artist_id,artist_name,year,tracks
0,ARE36MM1187B991E50,Blind Lemon Jefferson,1926,2
1,ARE36MM1187B991E50,Blind Lemon Jefferson,1927,1
2,ARRU9GR1187FB43F56,Blind Willie McTell,1927,1
3,ARWQ2DR1187B98FF28,Ma Rainey,1927,1
4,ARTDUXM1187B9899ED,Charley Patton,1929,1


In [27]:
# One row per (artist, genre) pair. Passing `columns=` to the constructor
# also works for an empty result, where assigning `.columns` afterwards
# raises a length-mismatch error.
df_art_genre = pd.DataFrame(res_art_genre, columns=["artist_id", "genre"])
df_art_genre.head()

Unnamed: 0,artist_id,genre
0,AR009211187B989185,lovers rock
1,AR009211187B989185,reggae
2,AR009211187B989185,roots reggae
3,AR009211187B989185,uk garage
4,AR009211187B989185,ballad


In [28]:
# Attach every genre of an artist to each of that artist's per-year rows.
df_art_song_year_genre = df_art_song_year.merge(df_art_genre, on="artist_id")

# Discard the two leading columns of the merged frame, keeping the rest.
_leading_two = df_art_song_year_genre.columns[:2]
df_song_year_genre = df_art_song_year_genre.drop(columns=_leading_two)
print (df_song_year_genre.shape)
df_song_year_genre.head()

(109034, 3)


Unnamed: 0,year,tracks,genre
0,1926,2,country blues
1,1926,2,delta blues
2,1926,2,texas blues
3,1926,2,blues
4,1926,2,louisiana blues


In [29]:
# Rows with the highest per-artist/year track counts first.
df_song_year_genre.sort_values(by="tracks", ascending=False).head(5)

Unnamed: 0,year,tracks,genre
98889,2006,8,reggae
98901,2006,8,soft
98893,2006,8,female vocalist
98894,2006,8,funk
98895,2006,8,singer-songwriter


#### Genre from mb

In [30]:
# Get the artist -> genre mapping from the mbtag table.
# This overwrites the term-based `res_art_genre` built earlier.
conn = sq.connect(path_artists_genres)
try:
    c = conn.cursor()
    c.execute("""SELECT artist_id, mbtag
                  FROM artist_mbtag""")

    res_art_genre = c.fetchall()
finally:
    # The rows are already in memory, so the connection can be released.
    conn.close()

# Preview; slicing is safe even if fewer than 5 rows were returned.
for row in res_art_genre[:5]:
    print (row)

('AR00A6H1187FB5402A', 'detroit')
('AR00A6H1187FB5402A', 'punk')
('AR00A6H1187FB5402A', 'michigan')
('AR00A6H1187FB5402A', 'usa')
('AR01VU31187B997DA0', 'hard rock')


In [31]:
# Merge lists: count number of songs per genre (if an artist has 2 genres,
# the songs are counted once for each genre).
# Passing `columns=` to the constructor also works for an empty result,
# where assigning `.columns` afterwards raises a length-mismatch error.
df_art_song_year = pd.DataFrame(
    res_art_song_year,
    columns=["artist_id", "artist_name", "year", "tracks"],
)
df_art_song_year.head()

Unnamed: 0,artist_id,artist_name,year,tracks
0,ARE36MM1187B991E50,Blind Lemon Jefferson,1926,2
1,ARE36MM1187B991E50,Blind Lemon Jefferson,1927,1
2,ARRU9GR1187FB43F56,Blind Willie McTell,1927,1
3,ARWQ2DR1187B98FF28,Ma Rainey,1927,1
4,ARTDUXM1187B9899ED,Charley Patton,1929,1


In [32]:
# One row per (artist, genre) pair. Passing `columns=` to the constructor
# also works for an empty result, where assigning `.columns` afterwards
# raises a length-mismatch error.
df_art_genre = pd.DataFrame(res_art_genre, columns=["artist_id", "genre"])
df_art_genre.head()

Unnamed: 0,artist_id,genre
0,AR00A6H1187FB5402A,detroit
1,AR00A6H1187FB5402A,punk
2,AR00A6H1187FB5402A,michigan
3,AR00A6H1187FB5402A,usa
4,AR01VU31187B997DA0,hard rock


In [33]:
# Attach every genre of an artist to each of that artist's per-year rows.
df_art_song_year_genre = df_art_song_year.merge(df_art_genre, on="artist_id")

# Discard the two leading columns of the merged frame, keeping the rest.
_leading_two = df_art_song_year_genre.columns[:2]
df_song_year_genre = df_art_song_year_genre.drop(columns=_leading_two)
print (df_song_year_genre.shape)
df_song_year_genre.head()

(8366, 3)


Unnamed: 0,year,tracks,genre
0,1926,2,blues
1,1926,2,american
2,1927,1,blues
3,1927,1,american
4,1972,1,blues


In [34]:
# Rows with the highest per-artist/year track counts first.
df_song_year_genre.sort_values(by="tracks", ascending=False).head(5)

Unnamed: 0,year,tracks,genre
7994,2006,8,soul and reggae
3982,1992,5,usa
3976,1992,5,grunge
6925,2002,5,rock and indie
3984,1992,5,american


mb has fewer unique tags.

# Try to iterate over all songs

For the moment we try some methods on the local subset we have, because testing is much easier locally. Later the same code will be run on the cluster.

In [17]:
import os
import glob
def count_all_files(basedir,ext='.h5') :
    """Count the files matching ``*<ext>`` in ``basedir`` and all its subfolders.

    Parameters
    ----------
    basedir : str
        Root directory; it is walked recursively with ``os.walk``.
    ext : str, optional
        Suffix appended to ``*`` to build the glob pattern (default ``'.h5'``).

    Returns
    -------
    int
        Total number of glob matches over every visited directory.
    """
    cnt = 0
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so that '[', '?' or '*' in a folder name
        # is matched literally instead of being read as a glob wildcard
        # (which made files in such folders go uncounted).
        pattern = os.path.join(glob.escape(root), '*' + ext)
        cnt += len(glob.glob(pattern))
    return cnt

In [21]:
# Location of the extracted subset's `data` folder. Set the MSD_DATA_DIR
# environment variable to run on another machine; the default below is the
# original author's local path.
local_path = os.environ.get(
    'MSD_DATA_DIR',
    'C:/Users/Max-Pc/Downloads/millionsongsubset_full/MillionSongSubset/data',
)
count_all_files(local_path)

10000

In [85]:
def get_all_titles(basedir,ext='.h5') :
    """Collect the title of every track file found under ``basedir``.

    Parameters
    ----------
    basedir : str
        Root directory; it is walked recursively with ``os.walk``.
    ext : str, optional
        Suffix appended to ``*`` to build the glob pattern (default ``'.h5'``).

    Returns
    -------
    list
        One ``hdf5_getters.get_title`` result per matched file, in the order
        the files were visited.

    Notes
    -----
    The running file count is printed every 1000 files as a progress
    indicator. An exception raised while reading a file propagates to the
    caller, but that file's handle is closed first.
    """
    titles = []
    iteration = 0
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so glob metacharacters in folder names
        # are matched literally.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        for f in glob.glob(pattern):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                titles.append(hdf5_getters.get_title(h5))
            finally:
                # Never leak the handle, even when the getter fails.
                h5.close()
            iteration += 1
            if iteration % 1000 == 0:
                print(iteration)
    return titles

In [96]:
def get_all_interesting_data(basedir,ext='.h5') :
    """Collect a fixed set of fields from every track file under ``basedir``.

    Parameters
    ----------
    basedir : str
        Root directory; it is walked recursively with ``os.walk``.
    ext : str, optional
        Suffix appended to ``*`` to build the glob pattern (default ``'.h5'``).

    Returns
    -------
    list of list
        One 7-element list per matched file, holding the results of, in
        order: ``get_artist_id``, ``get_artist_name``, ``get_danceability``,
        ``get_duration``, ``get_song_hotttnesss``, ``get_tempo``, ``get_year``.

    Notes
    -----
    The running file count is printed every 1000 files as a progress
    indicator. An exception raised while reading a file propagates to the
    caller, but that file's handle is closed first.
    """
    datas = []
    iteration = 0
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so glob metacharacters in folder names
        # are matched literally.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        for f in glob.glob(pattern):
            h5 = hdf5_getters.open_h5_file_read(f)
            try:
                oneSongData = [
                    hdf5_getters.get_artist_id(h5),
                    hdf5_getters.get_artist_name(h5),
                    hdf5_getters.get_danceability(h5),  # always zero
                    hdf5_getters.get_duration(h5),
                    hdf5_getters.get_song_hotttnesss(h5),
                    hdf5_getters.get_tempo(h5),
                    hdf5_getters.get_year(h5),
                ]
            finally:
                # Never leak the handle, even when a getter fails.
                h5.close()
            datas.append(oneSongData)
            iteration += 1
            if iteration % 1000 == 0:
                print(iteration)
    return datas

In [97]:
# Walk local_path and collect the selected fields of every track file.
# The function prints a running file count every 1000 files.
datas = get_all_interesting_data(local_path)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [130]:
# Column names follow the order in which get_all_interesting_data appends
# the fields of each song.
_song_columns = [
    'ArtistID',
    'ArtistName',
    'Danceability',
    'Duration',
    'SongHotness',
    'Tempo',
    'Year',
]
df = pd.DataFrame(datas, columns=_song_columns)

In [135]:
# Keep only songs with both a positive year and a positive hotness value.
_has_year = df.Year > 0
_has_hotness = df.SongHotness > 0
df = df[_has_year & _has_hotness]
df.head()

Unnamed: 0,ArtistID,ArtistName,Danceability,Duration,SongHotness,Tempo,Year
4,b'ARXR32B1187FB57099',b'Gob',0.0,209.60608,0.604501,129.738,2007
8,b'AR8ZCNI1187B9A069B',b'Planet P Project',0.0,269.81832,0.265861,86.643,1984
15,b'ARD842G1187B997376',b'Blue Rodeo',0.0,491.12771,0.405116,119.826,1987
18,b'ARYKCQI1187FB3B18F',b'Tesla',0.0,290.29832,0.684136,150.062,2004
23,b'ARMAC4T1187FB3FA4C',b'The Dillinger Escape Plan',0.0,207.77751,0.666528,166.862,2004


In [138]:
# Predict song hotness from duration, tempo and release year.
_feature_columns = ['Duration', 'Tempo', 'Year']
_target_columns = ['SongHotness']
X = df[_feature_columns]
y = df[_target_columns]

# NOTE: despite its name, `logistic` is an ordinary least-squares
# LinearRegression model, not a LogisticRegression.
logistic = LinearRegression()
logistic.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [147]:
# predict() expects a 2-D input of shape (n_samples, n_features); a flat list
# is rejected by current scikit-learn. Wrapping the sample in a one-row frame
# with the training column names also keeps the feature names consistent
# with the data the model was fitted on.
logistic.predict(pd.DataFrame([[200, 180, 1500]], columns=['Duration', 'Tempo', 'Year']))



array([[ 0.00093221]])