In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
import warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

## Models
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor

## Model selection and fine tuning
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

# Tensorflow Keras imports
from tensorflow.keras import Sequential, layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping

from scipy.stats import uniform, randint
data = pd.read_csv("../../data/data_o.csv")
data

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.98200,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878000,10,0.6650,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.9630,1921,0.73200,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,7,0.1600,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936
2,0.0394,1921,0.96100,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913000,3,0.1010,-14.850,1,Gati Bali,5,1921,0.0339,110.339
3,0.1650,1921,0.96700,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,0.000028,5,0.3810,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.2530,1921,0.95700,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,0.000002,3,0.2290,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.0380,101.665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170648,0.6080,2020,0.08460,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,7,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029
170649,0.7340,2020,0.20600,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,7,0.1010,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936
170650,0.6370,2020,0.10100,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,0.000009,4,0.2580,-2.226,0,AYA,76,2020-11-03,0.0809,91.688
170651,0.1950,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,0.000008,2,0.6430,-7.161,1,Darkness,70,2020-01-17,0.3080,75.055


In [2]:
data['artists+name'] = data.apply(lambda row: row['artists'] + row['name'], axis=1)

In [3]:
df = data[data['artists+name'].duplicated()]

In [4]:
df = df.drop(columns= ['id'], axis=1)
df = df.drop(columns= ['artists'], axis=1)
df = df.drop(columns= ['name'], axis=1)
df = df.drop(columns= ['artists+name'], axis=1)
df = df.drop(columns= ['release_date'], axis=1)




In [5]:
data[data['name']=='Gimme Love']

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,artists+name
75102,0.0743,2020,0.696,['Joji'],0.482,214558,0.631,0,2hfoyc7ve6xM4ZEiNIiU1B,0.0242,5,0.209,-6.917,1,Gimme Love,76,2020-04-16,0.0986,126.576,['Joji']Gimme Love
75220,0.0736,2020,0.695,['Joji'],0.484,214558,0.632,0,0HItcI6qN6Dr4MC3CZryQh,0.0243,5,0.209,-6.917,1,Gimme Love,74,2020-09-25,0.0976,126.384,['Joji']Gimme Love
170603,0.0736,2020,0.695,['Joji'],0.484,214558,0.632,0,5jSBnH9NyaNP5zdSB3pwgu,0.0243,5,0.209,-6.917,1,Gimme Love,69,2020-09-24,0.0976,126.384,['Joji']Gimme Love


In [6]:
X = df.sort_values(by='year').drop(columns=['popularity']).copy()
y = df.sort_values(by='year')['popularity'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)

In [7]:
ct = ColumnTransformer([('minmax', MinMaxScaler(), ['year', 'tempo', 'duration_ms']),
                        ('categorical', OneHotEncoder(), ['key']),
                        ],
                       remainder='passthrough')

ct.fit(X_train)

X_train_preprocessed = ct.transform(X_train)
X_val_preprocessed = ct.transform(X_val)
X_test_preprocessed = ct.transform(X_test)

In [8]:
X_train_preprocessed.shape

(9336, 25)

In [9]:
df

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
25,0.0731,1921,0.9930,0.389,218773,0.0880,0,0.527000,1,0.3630,-21.091,0,0,0.0456,92.867
39,0.0594,1921,0.9820,0.279,831667,0.2110,0,0.878000,10,0.6650,-20.096,1,1,0.0366,80.954
84,0.0770,1921,0.9940,0.248,117467,0.0876,0,0.907000,5,0.1650,-25.786,1,0,0.0566,82.025
87,0.2820,1921,0.9890,0.384,221013,0.1710,0,0.820000,7,0.1160,-20.476,0,0,0.0319,107.698
104,0.1860,1921,0.9940,0.263,132400,0.0536,0,0.927000,8,0.0980,-28.147,1,0,0.0386,129.365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170603,0.0736,2020,0.6950,0.484,214558,0.6320,0,0.024300,5,0.2090,-6.917,1,69,0.0976,126.384
170607,0.6310,2020,0.1690,0.844,125853,0.7200,1,0.000000,1,0.0797,-7.418,0,65,0.3000,99.917
170624,0.6970,2020,0.0529,0.735,217240,0.7910,0,0.000146,6,0.1450,-4.462,1,80,0.0679,168.087
170626,0.1550,2020,0.0627,0.778,84000,0.5760,1,0.004400,10,0.1160,-8.698,1,68,0.3010,80.027


In [10]:
## We instantiate the model
lin_reg = LinearRegression()

# We fit the model on preprocessed train data
lin_reg.fit(X_train_preprocessed, y_train)

# We make predictions on the validation set, also preprocessed
y_pred = lin_reg.predict(X_val_preprocessed)

# We output the root mean squared error on the validation set
mean_squared_error(y_val, y_pred, squared=False)

8.017005889851536

In [11]:
lasso=Lasso()

lasso.fit(X_train_preprocessed,y_train)

y_pred = lasso.predict(X_val_preprocessed)

mean_squared_error(y_val, y_pred, squared=False)

10.029501780844463

In [12]:
xgb_regressor = XGBRegressor(n_estimators=100, max_depth=20, learning_rate=0.01)

xgb_regressor.fit(X_train_preprocessed, y_train)

y_pred = xgb_regressor.predict(X_val_preprocessed)

mean_squared_error(y_val, y_pred, squared=False)

23.606209324560727

In [13]:
neigh = KNeighborsRegressor(n_neighbors=7)

neigh.fit(X_train_preprocessed, y_train)

y_pred=neigh.predict(X_val_preprocessed)

mean_squared_error(y_val, y_pred, squared=False)

15.746532446775857

In [14]:
neigh = KNeighborsRegressor()

params = {
    'weights': ['uniform', 'distance'], 
    'n_neighbors': randint(2, 15),
    'algorithm': ['ball_tree', 'kd_tree', 'brute']
}

rnd_search = RandomizedSearchCV(estimator=neigh, 
                                param_distributions=params,
                                n_iter=10, 
                                cv=5,
                                verbose=1,
                                n_jobs=-1)

rnd_search.fit(X_train_preprocessed, y_train)

rnd_search.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.6s finished


-2.2079061522616357

In [15]:
neigh = KNeighborsRegressor(n_neighbors=18)

neigh.fit(X_train_preprocessed, y_train)

y_pred=neigh.predict(X_val_preprocessed)

mean_squared_error(y_val, y_pred, squared=False)

16.56839407220297

In [16]:
def build_nn_model():
    # We instantiate the sequential model.
    model = Sequential()

    # We add several Dense layers with ReLU activation, and 1 Dropout layer to prevent overfitting.
    model.add(layers.Dense(100, activation = 'relu',input_dim=30))
    model.add(layers.Dense(50, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(30, activation = 'relu'))

    # Finally, the last layer will count 1 neuron with linear activation since we are dealing with a regression model. 
    model.add(layers.Dense(1, activation = 'linear'))
    return model
    
model = build_nn_model()

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               3100      
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1530      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 9,711
Trainable params: 9,711
Non-trainable params: 0
_________________________________________________________________


In [17]:
adam = Adam(learning_rate=0.00001)
mse = MeanSquaredError()

# We compile the model with mean squared error as loss and root mean squared error as metric. 
model.compile(loss=mse, optimizer=adam, metrics=[RootMeanSquaredError()])

In [18]:
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)

history = model.fit(X_train_preprocessed, y_train, 
                    validation_data=(X_val_preprocessed, y_val),
                    epochs = 1000, 
                    batch_size = 32, 
                    callbacks = [es], 
                    verbose = 2)

Epoch 1/1000


ValueError: in user code:

    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:975 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /opt/anaconda3/envs/learn-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:212 assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected axis -1 of input shape to have value 30 but received input with shape [None, 25]


In [19]:
def plot_history(history):
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(17, 5))
    
    axes[0].plot(history.history['loss'], color='darkred', label='Train - Loss')
    axes[0].plot(history.history['val_loss'], color='darkblue', label='Validation - Loss')
    axes[0].legend()
    axes[0].set_title('Loss (MSE) on train and validation sets')    
    
    axes[1].plot(history.history['root_mean_squared_error'], color='darkred', label='Train - RMSE')
    axes[1].plot(history.history['val_root_mean_squared_error'], color='darkblue', label='Validation - RMSE')
    axes[1].legend()
    axes[1].set_title('RMSE on train and validation sets')

plot_history(history)

NameError: name 'history' is not defined