In [1]:
#libraries
import pandas as pd
import numpy as np
import seaborn as sns
import pypopulation
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ast import literal_eval


In [2]:
# Saved for later use
'''
all_data_sources.isnull().sum()

test_df = pd.DataFrame()
test_df["audio_features_track_id"] = lyrics_features["track_id"]
test_df["tracks_track_id"] = tracks["id"]

test_df['Match'] = test_df.apply(lambda x: 1 if x['audio_features_track_id'] in x['tracks_track_id'] else 0, axis=1)
'''

'\nall_data_sources.isnull().sum()\n\ntest_df = pd.DataFrame()\ntest_df["audio_features_track_id"] = lyrics_features["track_id"]\ntest_df["tracks_track_id"] = tracks["id"]\n\ntest_df[\'Match\'] = test_df.apply(lambda x: 1 if x[\'audio_features_track_id\'] in x[\'tracks_track_id\'] else 0, axis=1)\n'

In [3]:
albums = pd.read_csv('SpotGenTrack/Data Sources/spotify_albums.csv', index_col = 0)
artists = pd.read_csv('SpotGenTrack/Data Sources/spotify_artists.csv', index_col = 0)
tracks = pd.read_csv('SpotGenTrack/Data Sources/spotify_tracks.csv', index_col = 0, converters={"available_markets": literal_eval})
audio_features = pd.read_csv('SpotGenTrack/Features Extracted/low_level_audio_features.csv', index_col = 0)
lyrics_features = pd.read_csv('SpotGenTrack/Features Extracted/lyrics_features.csv', index_col = 0)
#countries = pd.read_csv('country_population/country_population.csv', header=2)
#countries = countries[["Country Name", "Country Code", "2021"]]
# https://data.worldbank.org/indicator/SP.POP.TOTL

### Dimensionality Reduction

In [4]:
#Separate numeric and string columns (track_id) as PCA can only be performed on numeric columns
numeric_columns = audio_features.select_dtypes(include=['number']).columns
non_numeric_columns = audio_features.select_dtypes(exclude=['number']).columns

numeric_data = audio_features[numeric_columns]
non_numeric_data = audio_features[non_numeric_columns]

#Standardize numeric data
scaler = StandardScaler()
scaled_numeric_data = scaler.fit_transform(numeric_data)

#Redo PCA with number of components that explain 95% of the variance
pca = PCA(n_components=0.95)
principal_components = pca.fit_transform(scaled_numeric_data)
n_components = principal_components.shape[1]
principal_df = pd.DataFrame(data=principal_components, columns=[f'PC{i}' for i in range(1, n_components+1)])
#changed the name
processed_audio_features = pd.concat([non_numeric_data.reset_index(drop=True), principal_df], axis=1)

### Pre Processing

#### Tracks

In [5]:
#check for correlations of numerical features
num_attribs = tracks.select_dtypes(include=['float64', 'int64']).columns
numerical_corr_matrix = tracks[num_attribs].corr().abs()

# Display the correlations between the label (e.g., 'popularity') and other numerical columns
label_correlations = numerical_corr_matrix['popularity'].sort_values(ascending=False)
print(label_correlations)

popularity          1.000000
speechiness         0.304212
loudness            0.239679
duration_ms         0.169125
acousticness        0.154408
energy              0.153979
track_number        0.139460
time_signature      0.121505
danceability        0.092382
tempo               0.061869
liveness            0.059116
disc_number         0.047331
instrumentalness    0.041537
valence             0.034412
mode                0.010003
key                 0.000641
Name: popularity, dtype: float64


In [6]:
# Define a helper function to sum population
def get_sum_population(x):
    if x is not None:
        populations = [pypopulation.get_population(i) for i in x]
        filtered_populations = [pop for pop in populations if pop is not None]
        return sum(filtered_populations)
    else:
        return 0

# Apply the helper function to the column and store the result in a new column
tracks['sum_available_markets_population'] = tracks['available_markets'].apply(get_sum_population)


In [7]:
#include numerical columns with correlation greater than 0.09
include_col_tracks = ["playlist","available_markets","country","album_id","artists_id","id","lyrics", "time_signature","track_number","energy","acousticness","duration_ms","loudness","speechiness","danceability", "sum_available_markets_population", "popularity"]
tracks_filtered = tracks[include_col_tracks]

#### Albums

In [8]:
albums['album_sum_available_markets_population'] = albums['available_markets'].apply(get_sum_population)
albums_include = ['type','album_type', 'artist_id', 'album_sum_available_markets_population','total_tracks', 'track_id', 'release_date',"id"]
albums_filtered = albums[albums_include]

In [9]:
albums.columns

Index(['album_type', 'artist_id', 'available_markets', 'external_urls', 'href',
       'id', 'images', 'name', 'release_date', 'release_date_precision',
       'total_tracks', 'track_id', 'track_name_prev', 'uri', 'type',
       'album_sum_available_markets_population'],
      dtype='object')

#### Artists

In [10]:
artists_include = ['artist_popularity', 'followers', 'genres', 'id', 'name', 'track_id']
artists_filtered = artists[artists_include]

### Merging

In [11]:
#merge all dataframes
all_data_sources = tracks_filtered.merge(albums_filtered, how='left', left_on='album_id', right_on='id', suffixes=('_track','_albums')).merge(artists_filtered, how='left', left_on='artist_id', right_on='id', suffixes=('_track','_artists'))
all_data_sources = all_data_sources.merge(processed_audio_features, how="left", left_on="id_track", right_on='track_id', suffixes=('_track','_audio-features')).merge(lyrics_features, how="left", left_on="id_track", right_on='track_id', suffixes=('_tracks','_lyrics-features'))

In [12]:
all_data_sources.isnull().sum()

playlist                       0
available_markets              0
country                        0
album_id                       0
artists_id                     0
                            ... 
n_sentences                 6985
n_words                     6985
sentence_similarity         6985
track_id_lyrics-features    6985
vocabulary_wealth           6985
Length: 127, dtype: int64

In [13]:
all_data = all_data_sources
corr_num = all_data.corrwith(all_data['popularity']).abs().sort_values(ascending=False)
corr_num

  corr_num = all_data.corrwith(all_data['popularity']).abs().sort_values(ascending=False)


popularity                                1.000000
artist_popularity                         0.649121
speechiness                               0.304212
followers                                 0.286363
loudness                                  0.239679
                                            ...   
PC85                                      0.001489
PC59                                      0.001213
PC21                                      0.001106
PC86                                      0.000936
album_sum_available_markets_population         NaN
Length: 108, dtype: float64

In [16]:
all_data.columns.to_list()
drop_all = ["id_track","track_id_track","id_albums", "track_id_artists", "track_id_tracks", "track_id_lyrics-features", "album_id", "artists_id", "lyrics", "available_markets"]  
all_data = all_data.drop(columns=drop_all)

In [17]:
all_data.columns.to_list()

['playlist',
 'country',
 'time_signature',
 'track_number',
 'energy',
 'acousticness',
 'duration_ms',
 'loudness',
 'speechiness',
 'danceability',
 'sum_available_markets_population',
 'popularity',
 'type',
 'album_type',
 'artist_id',
 'album_sum_available_markets_population',
 'total_tracks',
 'release_date',
 'artist_popularity',
 'followers',
 'genres',
 'id',
 'name',
 'PC1',
 'PC2',
 'PC3',
 'PC4',
 'PC5',
 'PC6',
 'PC7',
 'PC8',
 'PC9',
 'PC10',
 'PC11',
 'PC12',
 'PC13',
 'PC14',
 'PC15',
 'PC16',
 'PC17',
 'PC18',
 'PC19',
 'PC20',
 'PC21',
 'PC22',
 'PC23',
 'PC24',
 'PC25',
 'PC26',
 'PC27',
 'PC28',
 'PC29',
 'PC30',
 'PC31',
 'PC32',
 'PC33',
 'PC34',
 'PC35',
 'PC36',
 'PC37',
 'PC38',
 'PC39',
 'PC40',
 'PC41',
 'PC42',
 'PC43',
 'PC44',
 'PC45',
 'PC46',
 'PC47',
 'PC48',
 'PC49',
 'PC50',
 'PC51',
 'PC52',
 'PC53',
 'PC54',
 'PC55',
 'PC56',
 'PC57',
 'PC58',
 'PC59',
 'PC60',
 'PC61',
 'PC62',
 'PC63',
 'PC64',
 'PC65',
 'PC66',
 'PC67',
 'PC68',
 'PC69',
 'PC70'

### To Do
1. get Dataset with country names and population to calculate the potential listening population then drop available_markets_track. Same for available_markets_albums
2. Think about how the lyrics column could be used (top 3 word count? len of the lyrics?)

### Pre Pro Pipeline

In [18]:
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

In [19]:
#define features and labels 
X = all_data.drop(columns=["popularity"])
y = all_data["popularity"]

In [20]:
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)
train_data.corr().sort_values(by="popularity",ascending=False)["popularity"]

  train_data.corr().sort_values(by="popularity",ascending=False)["popularity"]


popularity                                1.000000
artist_popularity                         0.646580
followers                                 0.285169
loudness                                  0.237553
PC1                                       0.203467
                                            ...   
acousticness                             -0.153994
duration_ms                              -0.164460
PC4                                      -0.208334
speechiness                              -0.302021
album_sum_available_markets_population         NaN
Name: popularity, Length: 108, dtype: float64

In [21]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names]

    
def get_preprocessor(data):
    num_attribs = data.select_dtypes(include=['float64', 'int64']).columns
    cat_attribs = data.select_dtypes(include=['object']).columns
    
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])
    
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs)),
            ('cat_encoder', OneHotEncoder(sparse_output=False)),
        ])
    
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])
    
    return full_pipeline

def create_prepared_dataframe(data, preprocessor, cat_attribs):
    # Get the transformed column names
    transformed_num_cols = preprocessor.transformers_[0][2].tolist()
    transformed_cat_cols = list(preprocessor.named_transformers_['cat']['cat_encoder'].get_feature_names_out(cat_attribs))
    
    # Combine the transformed column names
    transformed_cols = transformed_num_cols + transformed_cat_cols

    # Create a DataFrame with the transformed data and column names
    data_prepared = pd.DataFrame(data, columns=transformed_cols)

    return data_prepared

num_attribs = train_data.select_dtypes(include=['float64', 'int64']).columns
cat_attribs = train_data.select_dtypes(include=['object']).columns

preprocessor = get_preprocessor(train_data)
data_prepared_array = preprocessor.fit_transform(train_data[:500])

# Create a DataFrame with original column names
data_prepared_df = create_prepared_dataframe(data_prepared_array, preprocessor, cat_attribs)



In [22]:
data_prepared_df.corr()["popularity"].sort_values(ascending=False)

popularity                                1.000000
artist_popularity                         0.610101
followers                                 0.266291
loudness                                  0.232791
PC1                                       0.223500
                                            ...   
total_tracks                             -0.218432
speechiness                              -0.272963
genres_[]                                -0.303132
album_sum_available_markets_population         NaN
type_album                                     NaN
Name: popularity, Length: 2706, dtype: float64

### Polynomial Features

In [23]:
#Create polynomial features
X = data_prepared_df.drop(columns="popularity")
y = data_prepared_df["popularity"]

degree = 2  #set to desired degree
poly_features = PolynomialFeatures(degree=degree)
X_poly = poly_features.fit_transform(X)

In [24]:
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [25]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_poly, y_train_poly)

MemoryError: Unable to allocate 10.9 GiB for an array with shape (400, 3662571) and data type float64

### KBinsDiscretizer

##### Continuous variables
- spotify_albums: total_tracks
- spotify_ artists: artist_popularity, followers
- spotify_tracks: acousticness, danceability, duration_ms, energy, instrumentalness, key, liveness, loudness, popularity, speechiness, tempo, time_signature, track_number, valence 

### Model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

X = data_prepared_df.drop(columns="popularity")
y = data_prepared_df["popularity"]

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, error_score="raise")
rnd_search.fit(X, y)

### Model Polynomial Features

In [None]:
forest_reg_poly = RandomForestRegressor(random_state=42)
rnd_search_poly = RandomizedSearchCV(forest_reg_poly, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, error_score="raise")
rnd_search_poly.fit(X_poly, y)

best_params_poly = rnd_search_poly.best_params_
print(f"Best parameters: {best_params_poly}")

# Get the best mean squared error (MSE) score
best_mse_poly = -rnd_search_poly.best_score_
print(f"Best mean squared error: {best_mse_poly}")

# Get the best estimator
best_estimator_poly = rnd_search_poly.best_estimator_

# Calculate R-squared using the best estimator
from sklearn.metrics import r2_score

y_pred_poly = best_estimator_poly.predict(X_poly)
r2_poly = r2_score(y, y_pred_poly)
print(f"R-squared: {r2_poly}")

KeyboardInterrupt: 

### KBinsDiscretizer

##### Continuous variables
- spotify_albums: total_tracks
- spotify_ artists: artist_popularity, followers
- spotify_tracks: acousticness, danceability, duration_ms, energy, loudness, popularity, speechiness, time_signature, track_number

out due to low correlation: instrumentalness, liveness, tempo, valence

popularity          1.000000
speechiness         0.304212
loudness            0.239679
duration_ms         0.169125
acousticness        0.154408
energy              0.153979
track_number        0.139460
time_signature      0.121505
danceability        0.092382

In [None]:
#define features and labels 
X = all_data.drop(columns=["popularity"])
y = all_data["popularity"]
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)
train_data.corr().sort_values(by="popularity",ascending=False)["popularity"]

  train_data.corr().sort_values(by="popularity",ascending=False)["popularity"]


popularity           1.000000
artist_popularity    0.646580
followers            0.285169
loudness             0.237553
PC1                  0.203467
                       ...   
track_number        -0.134224
acousticness        -0.153994
duration_ms         -0.164460
PC4                 -0.208334
speechiness         -0.302021
Name: popularity, Length: 106, dtype: float64

In [27]:
from sklearn.preprocessing import KBinsDiscretizer

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names]
    
    

    
def get_preprocessor(data):
    num_attribs = data.select_dtypes(include=['float64', 'int64']).columns
    cat_attribs = data.select_dtypes(include=['object']).columns
    continuous_features = ['total_tracks', 'artist_popularity', 'followers', 'acousticness', 'danceability', 
                           'duration_ms', 'energy', 'loudness', 'popularity', 
                           'speechiness', 'time_signature', 'track_number']
    kbins_discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
    
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
        ])
    
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs)),
            ('cat_encoder', OneHotEncoder(sparse=False)),
        ])
    
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
        ('kbins', kbins_discretizer, continuous_features)
        ],remainder='passthrough'
    )
    
    return full_pipeline

def create_prepared_dataframe(data, preprocessor, cat_attribs):
    # Get the transformed column names
    transformed_num_cols = preprocessor.transformers_[0][2].tolist()
    transformed_cat_cols = list(preprocessor.named_transformers_['cat']['cat_encoder'].get_feature_names_out(cat_attribs))
    
    # Combine the transformed column names
    transformed_cols = transformed_num_cols + transformed_cat_cols

    # Create a DataFrame with the transformed data and column names
    data_prepared = pd.DataFrame(data, columns=transformed_cols)

    return data_prepared

num_attribs = train_data.select_dtypes(include=['float64', 'int64']).columns
cat_attribs = train_data.select_dtypes(include=['object']).columns

preprocessor = get_preprocessor(train_data)
data_prepared_array = preprocessor.fit_transform(train_data[:500])

# Create a DataFrame with original column names
data_prepared_df = create_prepared_dataframe(data_prepared_array, preprocessor, cat_attribs)



ValueError: Input X contains NaN.
KBinsDiscretizer does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values