In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import mode
%matplotlib inline

In [2]:
pd.set_option('display.max_columns', 500)


def _split_list_field(raw, drop_chars="'[]"):
    """Turn a stringified list such as "['a', 'b']" into ['a', 'b'].

    Every character in ``drop_chars`` is removed, the remainder is split on
    commas, and each piece is stripped of surrounding whitespace so that the
    first element and the following ones compare equal ('single' rather than
    'single' / ' single'). Empty pieces are dropped, so "[]" yields [].

    Parameters
    ----------
    raw : str
        Cell value read from the CSV. Non-string cells (e.g. NaN for a blank
        field) are treated as an empty list instead of raising.
    drop_chars : str
        Characters deleted before splitting.

    Returns
    -------
    list of str
    """
    if not isinstance(raw, str):
        return []
    cleaned = raw.translate(str.maketrans('', '', drop_chars))
    pieces = (piece.strip() for piece in cleaned.split(','))
    return [piece for piece in pieces if piece]


df = pd.read_csv('playlist_data_final.csv')
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'release_date'], axis=1)
# Rename only the column; the index keeps its default integer labels so that
# integer row lookups resolve by label.
df = df.rename(columns={'popularity': 'song_popularity'})

for col in ['track_ids', 'album_type', 'album_release_date']:
    df[col] = [_split_list_field(cell) for cell in df[col]]

# For genres, double quotes are removed as well as single quotes.
df['artist_genres'] = [_split_list_field(cell, "'\"[]") for cell in df['artist_genres']]


# Columns whose cells hold a stringified list of numbers; the loop over
# `reformat` turns each cell into a list of floats.
reformat = [
    'song_popularity',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'duration_ms',
    'num_artists',
    'num_markets',
    'explicit',
    'mode',
    'key',
    'time_signature',
    'album_popularity',
    'artist_popularity',
    'artist_followers',
]

def isfloat(value):
    """Return True if ``float(value)`` succeeds, False otherwise.

    Strings such as 'nan' and 'inf' count as floats because ``float`` accepts
    them. Values that ``float`` rejects with a TypeError (None, lists, ...) or
    an OverflowError (integers too large for a float) return False instead of
    propagating the exception.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Convert every column named in `reformat` from a stringified list
# ("[1.0, 2.0, ...]") into a list of floats, silently skipping tokens that
# float() cannot parse.
for col in reformat:
    parsed_rows = []
    # Iterate over the cell values directly rather than looking rows up by
    # integer key: integer keys only resolve when the index labels are
    # integers, or through pandas' deprecated positional fallback.
    for raw in df[col]:
        tokens = raw.replace('[', '').replace(']', '').split(',')
        parsed_rows.append([float(tok) for tok in tokens if isfloat(tok)])
    df[col] = parsed_rows

def _parse_year(date_text):
    """Return the first four characters of ``date_text`` as an int year.

    Surrounding spaces are removed first. When those characters are not an
    integer the result is NaN rather than 0, so that a NaN-aware average such
    as np.nanmean skips the entry instead of being pulled towards zero.
    """
    year_text = date_text.strip(' ')[:4]
    try:
        return int(year_text)
    except ValueError:
        return np.nan


# Replace each release-date string by its year, one list per row.
df['album_release_date'] = [
    [_parse_year(date_text) for date_text in dates]
    for dates in df['album_release_date']
]
df = df.rename(columns={'album_release_date': 'album_release_year'})

In [3]:
continuous_var = ['song_popularity','danceability','energy','loudness',
                  'speechiness','acousticness','instrumentalness','liveness',
                  'valence','duration_ms','num_artists','num_markets','album_popularity',
                  'artist_popularity','artist_followers','album_release_year']
categorical_var = ['explicit','mode','key','time_signature','album_type','artist_genres']


def _majority_value(values):
    """Return the most frequent element of ``values``, or None if it is empty.

    Ties go to the smallest element: np.unique returns the distinct values
    sorted and np.argmax picks the first maximum count. This replaces
    ``scipy.stats.mode(values)[0][0]``, which fails on SciPy releases where
    ``mode`` returns scalars by default and rejects non-numeric input.
    """
    if len(values) == 0:
        return None
    uniques, counts = np.unique(values, return_counts=True)
    return uniques[np.argmax(counts)]


# Collapse each per-track list to one number per row: the NaN-ignoring mean.
for col in continuous_var:
    df[col] = [np.nanmean(values) for values in df[col]]

# Collapse each per-track list to its most common value.
for col in categorical_var:
    df[col] = [_majority_value(values) for values in df[col]]

# Prefix the aggregated columns in a single rename; column order is unchanged.
new_names = {col: 'avg_' + col for col in continuous_var}
new_names.update({col: 'majority_' + col for col in categorical_var})
df = df.rename(columns=new_names)

df['majority_album_type'] = [str(i).strip(' ') for i in df['majority_album_type']]

# Move the genre column to position 1.
genre = df.pop('majority_artist_genres')
df.insert(1, 'majority_artist_genres', genre)

df.head()

  


Unnamed: 0,followers,name,owner,track_ids,num_tracks,avg_song_popularity,avg_danceability,avg_energy,avg_loudness,avg_speechiness,avg_acousticness,avg_instrumentalness,avg_liveness,avg_valence,avg_duration_ms,avg_num_artists,avg_num_markets,majority_explicit,majority_mode,majority_key,majority_time_signature,majority_album_type,avg_album_popularity,avg_album_release_year,avg_artist_popularity,avg_artist_followers,majority_artist_genres
0,18129916,Today's Top Hits,spotify,"[0tBbt8CrmxbjRP0pueQkyU, 2amzBJRBPOGszBem4Fed...",50,84.277778,0.624333,0.66,-5.696467,0.085307,0.170962,2.1e-05,0.154687,0.32658,202240.933333,1.833333,38.5,0.0,0.0,11.0,4.0,single,81.444444,2017.0,91.16,2545532.96,pop
1,8323892,RapCaviar,spotify,"[4dVpf9jZjcORqGTLUaeYj9, 3ncgNpxLoBQ65ABk4djD...",51,76.9,0.667125,0.697,-6.476875,0.2952,0.164925,1.8e-05,0.28925,0.35825,183295.125,1.8,38.2,1.0,1.0,5.0,4.0,album,77.8,2017.0,87.538462,1250737.0,rap
2,4600937,mint,spotify,"[3VQDpxMffTaggOHEeur7Tj, 43V1z6QToNNWNldV4FEG...",52,52.8,0.7188,0.6934,-6.5606,0.06406,0.14303,0.032978,0.1001,0.453,235286.2,2.0,38.0,0.0,0.0,6.0,4.0,single,49.0,2016.8,72.5,146571.5,tropical house
3,3787551,Are & Be,spotify,"[6gU9OKjOE7ghfEd55oRO57, 25wStx3LyTjYmHTd3RDu...",51,62.090909,0.6438,0.454,-8.3817,0.11898,0.29918,0.078657,0.11113,0.30648,235682.8,1.090909,34.818182,0.0,1.0,2.0,4.0,single,59.727273,2016.818182,75.2,1603454.8,indie r&b
4,3996091,Rock This,spotify,"[3k7JQg9M5rZJHveFYq0y9J, 5UMD1Iz7yyVK8Q5FLsVt...",50,63.75,0.559,0.846875,-4.375125,0.090525,0.002983,0.032999,0.189312,0.553625,202412.625,1.0,46.75,0.0,1.0,0.0,4.0,single,61.25,2017.0,77.0,1265657.875,modern rock


In [6]:
# Persist the aggregated frame before any dummy encoding. Default options are
# used, so the index is written as an unnamed first column.
df.to_csv(
    path_or_buf='final_dataset_no_dummies.csv',
)

In [4]:
# Reduced predictor set: a copy of df without the listed columns.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument to DataFrame.drop.
df_some = df.drop(
    columns=[
        'avg_danceability',
        'avg_duration_ms',
        'avg_energy',
        'majority_key',
        'majority_time_signature',
    ]
)
df_some.head()

Unnamed: 0,followers,name,owner,track_ids,num_tracks,avg_song_popularity,avg_loudness,avg_speechiness,avg_acousticness,avg_instrumentalness,avg_liveness,avg_valence,avg_num_artists,avg_num_markets,majority_explicit,majority_mode,majority_album_type,avg_album_popularity,avg_album_release_year,avg_artist_popularity,avg_artist_followers,majority_artist_genres
0,18129916,Today's Top Hits,spotify,"[0tBbt8CrmxbjRP0pueQkyU, 2amzBJRBPOGszBem4Fed...",50,84.277778,-5.696467,0.085307,0.170962,2.1e-05,0.154687,0.32658,1.833333,38.5,0.0,0.0,single,81.444444,2017.0,91.16,2545532.96,pop
1,8323892,RapCaviar,spotify,"[4dVpf9jZjcORqGTLUaeYj9, 3ncgNpxLoBQ65ABk4djD...",51,76.9,-6.476875,0.2952,0.164925,1.8e-05,0.28925,0.35825,1.8,38.2,1.0,1.0,album,77.8,2017.0,87.538462,1250737.0,rap
2,4600937,mint,spotify,"[3VQDpxMffTaggOHEeur7Tj, 43V1z6QToNNWNldV4FEG...",52,52.8,-6.5606,0.06406,0.14303,0.032978,0.1001,0.453,2.0,38.0,0.0,0.0,single,49.0,2016.8,72.5,146571.5,tropical house
3,3787551,Are & Be,spotify,"[6gU9OKjOE7ghfEd55oRO57, 25wStx3LyTjYmHTd3RDu...",51,62.090909,-8.3817,0.11898,0.29918,0.078657,0.11113,0.30648,1.090909,34.818182,0.0,1.0,single,59.727273,2016.818182,75.2,1603454.8,indie r&b
4,3996091,Rock This,spotify,"[3k7JQg9M5rZJHveFYq0y9J, 5UMD1Iz7yyVK8Q5FLsVt...",50,63.75,-4.375125,0.090525,0.002983,0.032999,0.189312,0.553625,1.0,46.75,0.0,1.0,single,61.25,2017.0,77.0,1265657.875,modern rock


In [5]:
# Persist the reduced predictor frame. Default options are used, so the index
# is written as an unnamed first column.
df_some.to_csv(
    path_or_buf='spotify_predictors_no_dummies.csv',
)

In [None]:
# One-hot encode the three listed categorical columns. Every level keeps its
# own indicator column (drop_first=False).
df = pd.get_dummies(
    df,
    columns=[
        'majority_key',
        'majority_album_type',
        'majority_time_signature',
    ],
    drop_first=False,
)

df.head()

In [None]:
# Persist the dummy-encoded frame. Default options are used, so the index is
# written as an unnamed first column.
df.to_csv(
    path_or_buf='final_dataset.csv',
)