# IMPORTS

In [11]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pdb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
import requests as req
import re
from imblearn.over_sampling import SMOTE
# import bokeh
# from bokeh.io import output_notebook, show
# from bokeh.resources import INLINE
# import bokeh.plotting as bp
# from bokeh.plotting import figure
#output_notebook(resources=INLINE)



# Default figure size (inches) shared by every plot in the notebook.
PLT_WIDTH_INCH = 20
PLT_HEIGHT_INCH = 15

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6
# and the old name was later removed; fall back when the old name is rejected.
try:
    mpl.style.use('seaborn')
except OSError:
    mpl.style.use('seaborn-v0_8')

# rcParams must be set after style.use(), which would otherwise override them.
# mpl.rcParams and plt.rcParams are the same object, so one assignment is enough.
mpl.rcParams['figure.figsize'] = [PLT_WIDTH_INCH, PLT_HEIGHT_INCH]
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 16
mpl.rcParams['xtick.labelsize'] = 15
mpl.rcParams['ytick.labelsize'] = 15
mpl.rcParams['legend.fontsize'] = 15

# Silence pandas' SettingWithCopy warning and every Python warning.
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

%load_ext autoreload
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# LOAD DATA

In [12]:
# Load the pickled song table, split it by the 'popular' flag, and load the
# two genre vocabularies (first column/label of each pickled object).
all_songs = pd.read_pickle('fixed_all_songs.dat')
popular_songs = all_songs[all_songs['popular'].eq(1)]
unpopular_songs = all_songs[all_songs['popular'].eq(0)]
all_genres = list(pd.read_pickle('all_genres.dat')[0].values)
GENRES = list(pd.read_pickle('spotify_genres.dat')[0].values)

# CONSTS

In [13]:
# Columns extracted for every song (see extract_features_from_spotify).
SPOTIFY_COLUMNS = [
    'artist',
    'artist_id',
    'name',
    'genre',
    'decade',
    'duration_ms',
    'acousticness',
    'danceability',
    'energy',
    'id',
    'year',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'popular',
]

# Numeric columns fed to PCA (see plot_pca_by_decade).
selected_columns = [
    'duration_ms', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'valence', 'genre_id',
]

# FUNCTIONS

In [14]:
# %load spotify_client/spotify_client.py
import spotipy
import json
from spotipy.oauth2 import SpotifyClientCredentials




# Credentials for the Spotify Web API.
# NOTE(review): secrets should not be committed to source. The environment
# variables below take precedence; the literals are kept only as a
# backward-compatible fallback and should be rotated and removed.
credenials = SpotifyClientCredentials(
    client_id=os.environ.get("SPOTIPY_CLIENT_ID", "5659eaf41b194134866170761e2fb293"),
    client_secret=os.environ.get("SPOTIPY_CLIENT_SECRET", "17e2a9fa37c2466baa2ba05752896cdc"),
)

# Module-level client shared by the helper functions below.
spotify_client = spotipy.Spotify(client_credentials_manager=credenials)

"""
we will get all the features from spotify, by song name and artist name.
this function will create a json file with all the data
"""
def get_song_features_and_info_json(song_name, artist_name):
    """Search Spotify for one track and return its info/feature dictionary.

    Uses the module-level ``spotify_client`` instead of rebuilding credentials
    and a client on every call.

    Args:
        song_name: Track title used in the ``track:`` part of the query.
        artist_name: Artist name used in the ``artist:`` part of the query.

    Returns:
        The dictionary built by ``creating_song_dictionary`` for the first
        search hit, or ``None`` (after printing a notice) when the search
        returns no tracks.
    """
    # limit=1: only the first hit is used.
    song_results = spotify_client.search(
        q='artist:' + artist_name + ' track:' + song_name,
        type='track',
        limit=1,
    )
    tracks = song_results['tracks']['items']
    if not tracks:
        print("***no results for the requested track:{}***".format(song_name+":"+artist_name))
        return None

    return creating_song_dictionary(artist_name, tracks[0], spotify_client)
def creating_song_dictionary(artist_name, result_dict, spotify_client):
    """Flatten one Spotify track search result into a song dictionary.

    Args:
        artist_name: Stored as-is under ``'artist'``.
        result_dict: A track item as returned by the search endpoint.
        spotify_client: Client used to look up the track name and the
            audio features.

    Returns:
        dict with keys artist, uri (one-element list), year (first four
        characters of the album release date), artist_id, track_id, name,
        popularity, preview_link and main_features.
    """
    # The same one-element list is stored and passed to audio_features().
    uri_list = [result_dict['uri']]
    return {
        'artist': artist_name,
        'uri': uri_list,
        'year': result_dict['album']['release_date'][:4],
        'artist_id': result_dict['artists'][0]['id'],
        'track_id': result_dict['id'],
        'name': spotify_client.track(result_dict['id'])['name'],
        'popularity': result_dict['popularity'],
        'preview_link': result_dict['preview_url'],
        'main_features': spotify_client.audio_features(uri_list)[0],
    }
    
def get_artist_and_title(df):
    """Return a list of ``(name, artist)`` tuples, one per row of ``df``.

    Note the order: the song name comes first, the artist second.
    """
    names = df.loc[:, 'name'].values
    artists = df.loc[:, 'artist'].values
    return [pair for pair in zip(names, artists)]



def scale_features(data):
    """Standardise every column of ``data`` with ``StandardScaler``.

    Returns a new DataFrame with the same column labels; the original index
    is not carried over (the result gets a default index).
    """
    standardiser = StandardScaler(with_mean=True)
    scaled_values = standardiser.fit_transform(data)
    return pd.DataFrame(scaled_values, columns=data.columns)


def get_artist_genre(spotify_client,artist_id):
    """Return the ``"genres"`` entry of the artist record for ``artist_id``."""
    return spotify_client.artist(artist_id)["genres"]

def get_number_of_apearence(genre,spotify_genres):
    """Count the entries of ``spotify_genres`` in which ``genre`` matches.

    ``genre`` is used as a regular-expression pattern (``re.search``), so it
    matches anywhere inside each entry.
    """
    return sum(1 for candidate in spotify_genres if re.search(genre, candidate))

def get_genres_scores_of_artist(spotify_client,artist_id):
    """Score every entry of ``GENRES`` against the artist's Spotify genres.

    Returns a list aligned with ``GENRES``; each value is the number of the
    artist's genre strings in which that genre matches.
    """
    artist_genres = get_artist_genre(spotify_client, artist_id)
    scores = []
    for genre in GENRES:
        scores.append(get_number_of_apearence(genre, artist_genres))
    return scores

def get_genre_of_artist(spotify_client,artist_id):
    """Return the best-scoring entry of ``GENRES`` for the artist.

    Returns the string ``'na'`` when no genre matches at all. Ties go to the
    first highest score (``np.argmax``).
    """
    genre_scores = get_genres_scores_of_artist(spotify_client, artist_id)
    if np.sum(genre_scores) != 0:
        return GENRES[np.argmax(genre_scores)]
    return 'na'



def remove_dup_songs_from_random(all_songs_data_frame,subset):
    """Return a copy of the frame without rows duplicated on ``subset``.

    The first occurrence of each duplicate group is kept; the input frame is
    not modified.
    """
    return all_songs_data_frame.drop_duplicates(subset=subset, keep='first', inplace=False)


def get_year(artist,title):
    """Look up the earliest release year of a song on Discogs.

    Args:
        artist: Artist name to search for.
        title: Release title to search for.

    Returns:
        The smallest ``year`` among the first page of search results as an
        ``int``, or ``None`` when the request fails, the response cannot be
        parsed, or no result carries a year.
    """
    # Passing values through ``params`` URL-encodes them, so names containing
    # '&', '#', spaces, etc. no longer corrupt the query string.
    params = {
        'artist': artist,
        'release_title': title,
        'per_page': 10,
        'page': 1,
        # NOTE(review): hard-coded API token; move it out of source control.
        'token': 'AVfnfICgSgXIDXAWivYivedRIhMcDRvYCxrIshPk',
    }
    try:
        res = req.get('https://api.discogs.com/database/search', params=params, timeout=10)
        # The builtin int replaces np.int, which was removed in NumPy 1.24;
        # the resulting AttributeError used to be swallowed, so the function
        # always returned None on current NumPy.
        years = [int(r['year']) for r in res.json()['results'] if 'year' in r]
    except (req.RequestException, ValueError, KeyError, TypeError):
        # Best-effort lookup: callers fall back to another year source.
        return None
    return min(years, default=None)
    
def get_all_fixed_years(data):
    """Return the ``get_year`` result for every row of ``data``, in row order.

    ``data`` must have ``name`` and ``artist`` columns; entries are ``None``
    where the lookup found nothing.
    """
    return [get_year(artist, name) for name, artist in get_artist_and_title(data)]

def extract_features_from_spotify(song_dict):
    """Build a one-row DataFrame holding the ``SPOTIFY_COLUMNS`` of a song.

    Each column is read from the top level of ``song_dict``; when that value
    is missing or falsy it is read from ``song_dict['main_features']``
    instead (``None`` if absent there too).
    """
    row = {}
    for column in SPOTIFY_COLUMNS:
        value = song_dict.get(column)
        if not value:
            value = song_dict['main_features'].get(column)
        row[column] = value
    return pd.DataFrame(row, index=[0])

def get_song(artist_name,song_name):
    """Fetch one song from Spotify and return it as a preprocessed DataFrame.

    Args:
        artist_name: Artist to search for.
        song_name: Track title to search for.

    Returns:
        The one-row DataFrame produced by ``spotify_preprocessing``, or
        ``None`` when the Spotify search found no matching track.
    """
    song_features_dict = get_song_features_and_info_json(song_name=song_name, artist_name=artist_name)
    if song_features_dict is None:
        # The search helper already printed a notice; passing None on would
        # only crash inside the preprocessing step.
        return None
    return spotify_preprocessing(song_features_dict)



#preprocessing
#df=pd.read_pickle('billboard.dat')

def cleaning_and_typing(df):
    """Cast the count columns to int and strip apostrophes from text columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``num_words``, ``num_lines``, ``num_dupes``,
            ``title`` and ``artist`` columns.

    Returns:
        The same DataFrame object.
    """
    # The previous version first renamed the columns to 0..n-1, which made
    # every named lookup below raise KeyError; the rename is dropped.
    for count_column in ('num_words', 'num_lines', 'num_dupes'):
        df[count_column] = df[count_column].astype(int)

    # Assign the result back instead of calling replace(inplace=True) on a
    # selected column, which does not update the frame under copy-on-write.
    for text_column in ('title', 'artist'):
        df[text_column] = df[text_column].replace(regex=True, to_replace=r"\'", value=r'')
    return df
    
def create_decade_mappings():
    """Map every year from 1950 through 2015 to the first year of its decade.

    Returns:
        dict of ``{year: decade}``, e.g. ``1987 -> 1980``.
    """
    years = np.arange(1950, 2016)
    # Floor division by 10 drops the last digit of the year.
    return {year: year // 10 * 10 for year in years}



def spotify_preprocessing(song_dict,is_popular=False):
    '''
    Parse a *single* song dictionary from Spotify into a one-row DataFrame.

    Args:
        song_dict: Dictionary as built by ``creating_song_dictionary``.
        is_popular: Flag stored (as 0/1) in the ``popular`` column.

    Returns:
        DataFrame with the ``SPOTIFY_COLUMNS`` (minus ``year``) plus
        ``genre_id``.
    '''
    # Parse the dict into a frame holding SPOTIFY_COLUMNS.
    df = extract_features_from_spotify(song_dict)

    # Prefer the Discogs year when the lookup found one; otherwise keep the
    # Spotify release year.
    discgo_year = get_all_fixed_years(df)[0]
    df['year'] = df['year'].astype(int)
    if discgo_year:
        df['year'] = discgo_year

    # Years outside the mapping (before 1950 or after 2015) fall back to 1950.
    decades = df['year'].map(create_decade_mappings()).fillna(1950)
    df['decade'] = decades.astype(int)

    # Strip apostrophes. Assign back instead of replace(inplace=True) on a
    # selected column, which does not update the frame under copy-on-write.
    df['name'] = df['name'].replace(regex=True, to_replace=r"\'", value=r'')
    df['artist'] = df['artist'].replace(regex=True, to_replace=r"\'", value=r'')

    # Genre comes from the artist's Spotify genres.
    df['genre'] = df['artist_id'].apply(lambda artist_id: get_genre_of_artist(spotify_client, artist_id))

    # NOTE(review): raises ValueError if the genre (including the 'na'
    # fallback) is not an entry of GENRES - confirm GENRES contains 'na'.
    df['genre_id'] = df['genre'].apply(lambda x: GENRES.index(x))

    # Builtin int: np.int was removed in NumPy 1.24.
    df['popular'] = int(is_popular)

    # Keyword form: the positional axis argument was removed in pandas 2.0.
    return df.drop(columns='year')

### Plot Functions

In [21]:
# plot genres (from popular songs)
def plot_genres_distribution(popular_songs):
    """Bar-plot how many of the given songs belong to each genre.

    Args:
        popular_songs: DataFrame with a ``genre`` column.
    """
    # The previous version ignored the argument and indexed an undefined
    # ``df`` with a 'popular_songs' column; count genres of the frame passed in.
    popular_songs['genre'].value_counts().plot(kind='bar',title='Genre Popularity',figsize=(12,8))

# genres distribution
def plot_genre_distribution(df):
    """Stacked bar chart of each genre's share of songs per decade.

    Counts songs per (genre, decade) and divides every genre row by its own
    total, so each bar shows how that genre is spread over the decades.
    """
    counts = df.groupby(['genre','decade']).count()['artist'].unstack()
    row_totals = counts.sum(axis=1).values
    shares = counts.div(row_totals, axis=0)

    axes = shares.plot(kind='bar', stacked='True', figsize=(14,8), colormap='PRGn')
    axes.set_title('Genres Distribution over Decades')
    axes.set_xlabel('Genre')
    axes.set_ylabel('% Songs')
    # Anchor the legend outside the plotting area, to the right.
    axes.legend(bbox_to_anchor=(1.2, 0.5))
    

# decade songs distribution
def plot_decade_distribution(all_songs):
    """Grouped bar chart of song counts per decade, split by ``popular``."""
    plt.figure()
    per_decade = all_songs.groupby(['decade','popular']).count()['artist'].unstack()
    per_decade.plot(kind='bar', figsize=(10,5), title='Decades Songs Distribution')
    plt.ylabel('Popularity Distribution')


def plot_top_artist_with_hit_songs(all_song,top_art=20):
    """Bar-plot the artists with the most popular songs.

    Args:
        all_song: DataFrame with ``popular`` and ``artist`` columns.
        top_art: Number of artists to show.
    """
    # Use the argument; the previous version silently read the global
    # ``all_songs`` and ignored whatever frame was passed in.
    hits = all_song[all_song['popular']==1]
    hits['artist'].value_counts().head(top_art).plot(kind='bar',figsize=(10,5),title='Top {} Aritst with hit songs'.format(top_art))
    plt.ylabel('# Hit Songs')
    
def plot_statistics(df,column,group_by='decade'):
    """Box-plot the summary statistics of ``column`` for each group.

    The ``describe()`` output per group is transposed so that each group
    becomes one box built from its mean, std, min, max and quartiles.
    """
    summary_rows = ['mean','std','min','max','25%','50%','75%']
    described = df.groupby(group_by)[column].describe()
    summary = described.T.loc[summary_rows]
    axes = summary.plot(kind='box', title='{}'.format(column), figsize=(10,5), rot=90)
    axes.set_xlabel(group_by)

# PCA
def plot_pca_by_decade(df,decade=2000,N=3,colors='popular'):
    """Fit a PCA on one decade's songs and scatter-plot the projection.

    Args:
        df: DataFrame with ``decade``, ``popular``, ``genre_id`` and the
            ``selected_columns``.
        decade: Decade to keep.
        N: Number of principal components. A plot is drawn only for N < 4
            (3-D for N == 3, otherwise the first two components).
        colors: ``'popular'`` or ``'genre'`` - what the point colours encode.

    Returns:
        The fitted ``PCA`` object.

    Raises:
        ValueError: if a plot is requested with an unknown ``colors`` value.
    """
    decade_data = df[df['decade']==decade]
    # Standardise the features before PCA.
    pca_data = scale_features(decade_data[selected_columns])
    pca = PCA(n_components=N,svd_solver='full')
    pca.fit(pca_data)
    X_pca = pca.transform(pca_data)
    if N < 4:
        if colors == 'popular':
            # NOTE(review): POPULARITY_COLORS is not defined in the visible
            # part of the file - confirm it exists before using this mode.
            clrs = [POPULARITY_COLORS[pop] for pop in decade_data['popular'].values]
        elif colors == 'genre':
            # One random RGB triple per genre.
            rand_colors = np.random.rand(3,len(GENRES))
            clrs = [rand_colors[:,genre_idx] for genre_idx in decade_data['genre_id'].values]
        else:
            # Previously this fell through and failed later with a NameError.
            raise ValueError("colors must be 'popular' or 'genre', got {!r}".format(colors))

        fig = plt.figure(figsize=(8, 5))
        if N == 3:
            # Axes3D(fig) no longer attaches itself to the figure in current
            # Matplotlib; add_subplot creates and registers the 3-D axes.
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(X_pca[:,0],X_pca[:,1],X_pca[:,2],c=clrs,s=20)
        else:
            plt.scatter(X_pca[:,0],X_pca[:,1],c=clrs)
        plt.title('PCA for decade {}'.format(decade))
    print("Total Explained Variance for decade {}: {}".format(decade,np.sum(pca.explained_variance_ratio_)))
    return pca
    
def plot_feature_freq(df,col,decade=1950,ax=None,legend=False):
    """Overlay the KDE of ``col`` for popular (blue) and other (red) songs.

    Only rows of the given ``decade`` are used. When ``legend`` is true a
    two-entry legend is added to the axes returned by the first KDE call.
    """
    decade_rows = df[df['decade']==decade]
    values = decade_rows[col]
    popular_values = values[(decade_rows["popular"] == 1)]
    unpopular_values = values[(decade_rows["popular"] == 0)]

    g = sns.kdeplot(popular_values, color="Blue", shade=True, ax=ax, legend=False)
    sns.kdeplot(unpopular_values, color="red", shade=True, ax=ax, legend=False)
    if not legend:
        return
    g.legend(["Popular","No Popular"],fontsize=30)
    

def plot_grid_kde(df,features,decades):
    """Draw a decades x features grid of popular/unpopular KDE plots.

    Args:
        df: DataFrame passed through to ``plot_feature_freq``.
        features: Column names, one grid column each.
        decades: Decades, one grid row each.
    """
    # squeeze=False keeps axarr 2-D even for a single decade or feature, so
    # the [row, column] indexing below always works.
    f, axarr = plt.subplots(len(decades), len(features), figsize=(80,40), squeeze=False)
    f.subplots_adjust(hspace=0.5, wspace=0.5)
    f.text(0.5, 0.04, 'Features', ha='center')
    f.suptitle('Features Distribution across Decades')
    f.text(0.04, 0.5, 'Decades', va='center', rotation='vertical')
    for decade_idx,decade in enumerate(decades):
        for feature_idx,feature in enumerate(features):
            ax = axarr[decade_idx,feature_idx]
            # Legend only on the top-left cell. A misspelt variable used to
            # leave the flag True for every cell after the first.
            show_legend = decade_idx == 0 and feature_idx == 0
            if feature_idx == 0:
                ax.set_ylabel(decade,size='50')
            if decade_idx == len(decades)-1:
                ax.set_xlabel(feature,size='50')
            plot_feature_freq(df,col=feature,decade=decade,ax=ax,legend=show_legend)
    f.tight_layout()

a
