In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import os
%matplotlib inline


In [57]:
df = pd.read_csv('../data/all.csv')
df = df.dropna(subset=['lyrics'])
df = df.reset_index(drop=True)

import re

def remove_disclaimer(string):
    s = re.sub(r'(\*)+ This Lyrics is NOT for Commercial use .*\r\n\([0-9]*\)',"", string)
    return s

df['lyrics'] = df['lyrics'].apply(remove_disclaimer)

In [9]:
df.shape

(14270, 34)

In [84]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor


X = df[['lyrics']]
y = df['Age']

#X_train, y_train = shuffle(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

bag_of_words_model = Pipeline([
    ('vectorizer', ColumnTransformer([('counter', CountVectorizer(),'lyrics')])),
    ('regressor', Ridge())     
])


param_grid = {'regressor__alpha':[0, 5, 10, 15, 20], 'vectorizer__counter__min_df': [2], 
              'vectorizer__counter__max_df': [0.9, 0.95]}


gs_est = GridSearchCV(bag_of_words_model, param_grid, cv=3, n_jobs=2, verbose=1, return_train_score = True)
model = gs_est.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [85]:
gs_est.best_estimator_

Pipeline(steps=[('vectorizer',
                 ColumnTransformer(transformers=[('counter',
                                                  CountVectorizer(max_df=0.9,
                                                                  min_df=2),
                                                  'lyrics')])),
                ('regressor', Ridge(alpha=20))])

In [86]:
gs_est.cv_results_['mean_test_score']

array([-1.66442037e+08, -1.66442037e+08,  3.69911289e-02,  3.69911289e-02,
        1.67789105e-01,  1.67789105e-01,  2.21945275e-01,  2.21945275e-01,
        2.52346946e-01,  2.52346946e-01])

In [87]:
alpha = gs_est.param_grid['regressor__alpha']

#plt.plot(alpha, gs_est.cv_results_['mean_test_score'], c='r', label = 'validation score')
#plt.plot(alpha, gs_est.cv_results_['mean_train_score'], c='b', label = 'train score')
#plt.xlabel('regressor_alpha')
#plt.ylabel('score')
#plt.legend(loc='upper right');

In [88]:
gs_est.best_score_

0.2523469464603774

In [89]:
y_pred = gs_est.predict(X_test)

In [90]:
from sklearn.metrics import mean_squared_error, r2_score

print ("Error (RMSE): ", mean_squared_error(y_test, y_pred, squared=False))
print ("R^2 Score: " , r2_score(y_test, y_pred))

Error (RMSE):  3.351922865054057
R^2 Score:  0.3108768002078405
