In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
np.random.seed(42)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = "darkgrid")

from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('../data/showsandsets.csv')

**Building the List of Songs to Model on:**

In [3]:
#the number of unique songs played:
#setlists_df['songid'].nunique()

In [4]:
#setlists_df.loc[setlists_df['is_original'] == 1, 'songid'].nunique()

In [5]:
setlists = df['setlists'].tolist()

In [6]:
setlists_joined = "".join(setlists)
print(setlists_joined[0:982])

['Long Cool Woman in a Black Dress', 'Proud Mary', 'In the Midnight Hour', 'Squeeze Box', 'Roadhouse Blues', 'Happy Birthday to You', 'Scarlet Begonias', 'Fire on the Mountain']['Makisupa Policeman']['In the Midnight Hour', 'Wild Child', 'Jam', 'Bertha', "Can't You Hear Me Knocking", 'St. Stephen', "Can't You Hear Me Knocking", 'Camel Walk', 'Eyes of the World', 'Whipping Post', 'Drums']['Jam', 'Wild Child', 'Bertha', "Can't You Hear Me Knocking", 'Camel Walk', 'Jam', 'In the Midnight Hour', 'Scarlet Begonias', 'Fire', 'Fire on the Mountain', 'Makisupa Policeman', 'Slave to the Traffic Light', 'Spanish Flea', "Don't Want You No More", 'Cities', 'Drums', 'Skippy the Wondermouse', 'Fluffhead', 'Eyes of the World']['Slave to the Traffic Light', "Mike's Song", "Dave's Energy Guide", 'You Enjoy Myself', 'Alumni Blues', 'Letter to Jimmy Page', 'Alumni Blues', 'Prep School Hippie', 'Run Like an Antelope']['Anarchy', 'Camel Walk', 'Fire Up the Ganja', 'Skippy the Wondermouse'


In [7]:
setlists_delim = setlists_joined.replace('][', ', ')
# setlists_delim = setlists_joined.replace('[', ', ')
# setlists_delim = setlists_joined.replace(']', ', ')

#I wish I could do 'jams' or 'intros/outro' to designate a transition

#REVISIT REVISIT REVISIT
#REVISIT REVISIT REVISIT
#REVISIT REVISIT REVISIT

In [8]:
print(setlists_delim[0:982])

['Long Cool Woman in a Black Dress', 'Proud Mary', 'In the Midnight Hour', 'Squeeze Box', 'Roadhouse Blues', 'Happy Birthday to You', 'Scarlet Begonias', 'Fire on the Mountain', 'Makisupa Policeman', 'In the Midnight Hour', 'Wild Child', 'Jam', 'Bertha', "Can't You Hear Me Knocking", 'St. Stephen', "Can't You Hear Me Knocking", 'Camel Walk', 'Eyes of the World', 'Whipping Post', 'Drums', 'Jam', 'Wild Child', 'Bertha', "Can't You Hear Me Knocking", 'Camel Walk', 'Jam', 'In the Midnight Hour', 'Scarlet Begonias', 'Fire', 'Fire on the Mountain', 'Makisupa Policeman', 'Slave to the Traffic Light', 'Spanish Flea', "Don't Want You No More", 'Cities', 'Drums', 'Skippy the Wondermouse', 'Fluffhead', 'Eyes of the World', 'Slave to the Traffic Light', "Mike's Song", "Dave's Energy Guide", 'You Enjoy Myself', 'Alumni Blues', 'Letter to Jimmy Page', 'Alumni Blues', 'Prep School Hippie', 'Run Like an Antelope', 'Anarchy', 'Camel Walk', 'Fire Up the Ganja', 'Skippy the Wondermouse'


In [9]:
setlists_string = setlists_delim.split(', ')
setlists_string[0:5]

["['Long Cool Woman in a Black Dress'",
 "'Proud Mary'",
 "'In the Midnight Hour'",
 "'Squeeze Box'",
 "'Roadhouse Blues'"]

In [10]:
#number of all songs played:
len(setlists_string)

36023

In [11]:
#number of all unique songs played:
len(set(setlists_string))

970

In [12]:
#all unique songs played:
unique_songs = sorted(set(setlists_string))
unique_songs[0::12]

['"Ain\'t Love Funny"',
 '"Cryin\'"',
 '"Everything\'s Right"',
 '"I\'ll Come Running"',
 '"Jumpin\' Jack Flash"',
 '"Olivia\'s Pool"',
 '"Take the \'A\' Train"',
 '"Why Don\'t We Do It in the Road?"',
 "'A Apolitical Blues'",
 "'All Along the Watchtower'",
 "'American Woman'",
 "'Auld Lang Syne'",
 "'Back on the Train'",
 "'Big Alligator'",
 "'Black-Eyed Katy'",
 "'Bobby Jean'",
 "'Brother'",
 "'Cannonball'",
 "'Chalk Dust Torture'",
 "'Coconut'",
 "'Corona'",
 "'Day or Night'",
 "'Dinner and a Movie'",
 "'Down By the River'",
 "'Eclipse'",
 "'Evolve'",
 "'Fire on the Mountain'",
 "'Foreplay/Long Time'",
 "'Frost'",
 "'Girls Girls Girls'",
 "'Goodbye Jam'",
 "'Halfway to the Moon'",
 "'Heavy Rotation'",
 "'Hold Your Head Up'",
 "'I Always Wanted It This Way'",
 "'I Know a Little'",
 "'If I Only Had a Brain'",
 "'Iron Man'",
 "'Julius'",
 "'La Grange'",
 "'Life on Mars?'",
 "'Lonely Trip'",
 "'Lucky Seven'",
 "'Maybe'",
 "'Merry Pranksters Jam'",
 "'Moby Dick'",
 "'Mr. Completely'",
 "

- I want to be able to model on a list of the songs played in order to predict the next song (song N + 1 based on the previous N songs played) so I need to map the song titles to integers.

In [13]:
#https://stackoverflow.com/questions/53801614/map-elements-of-a-list-to-their-index-in-another-list
#from itertools import islice
from itertools import islice

idx_list = {song: index for index, song in enumerate(unique_songs)}
#idx_list #commented out for length

- Applying the index_list mapping to the entire (running) setlist:

In [14]:
encoded_setlists_list = [idx_list[song] for song in setlists_string]
len(encoded_setlists_list)

36023

In [15]:
#first twelve songs played:
encoded_setlists_list[0:12]

[969, 621, 439, 739, 644, 375, 679, 312, 509, 439, 934, 448]

In [16]:
#Phish's first show:
df['setlists'][0]

"['Long Cool Woman in a Black Dress', 'Proud Mary', 'In the Midnight Hour', 'Squeeze Box', 'Roadhouse Blues', 'Happy Birthday to You', 'Scarlet Begonias', 'Fire on the Mountain']"

In [17]:
#Phish's first show as an encoded list:
encoded_setlists_list[0:8]

[969, 621, 439, 739, 644, 375, 679, 312]

In [18]:
#as calculated in '3_EDA' notebook, the avg. length of a Phish setlist is 19.96:
length = 20
model_sets = []

for song in range(length, len(encoded_setlists_list)):
    
    #selecting a range of 20 songs
    songs = encoded_setlists_list[song - length: song + 1]
    
    #building the setlist from songs
    model_sets.append(songs)

In [19]:
model_sets[0:3]

[[969,
  621,
  439,
  739,
  644,
  375,
  679,
  312,
  509,
  439,
  934,
  448,
  155,
  6,
  740,
  6,
  203,
  301,
  924,
  284,
  448],
 [621,
  439,
  739,
  644,
  375,
  679,
  312,
  509,
  439,
  934,
  448,
  155,
  6,
  740,
  6,
  203,
  301,
  924,
  284,
  448,
  934],
 [439,
  739,
  644,
  375,
  679,
  312,
  509,
  439,
  934,
  448,
  155,
  6,
  740,
  6,
  203,
  301,
  924,
  284,
  448,
  934,
  155]]

In [24]:
# #is this necessary?
# ohe = OneHotEncoder() 
# ohe.fit(model_sets)

In [25]:
np.array(model_sets)

array([[969, 621, 439, ..., 924, 284, 448],
       [621, 439, 739, ..., 284, 448, 934],
       [439, 739, 644, ..., 448, 934, 155],
       ...,
       [ 87, 462, 331, ..., 727,  14, 885],
       [462, 331, 251, ...,  14, 885,  22],
       [331, 251, 465, ..., 885,  22, 379]])

In [22]:
len(np.array(model_sets)[1])

21

In [23]:
#https://numpy.org/doc/stable/reference/generated/numpy.save.html
#https://numpy.org/doc/stable/reference/generated/numpy.load.html#numpy.load

np.save('../data/model_sets.npy', np.array(model_sets), allow_pickle = True)