# Summary: Use song lyrics to predict age ratings.

After basic text preprocessing (tokenization, lemmiztization, removing stop words), the processed lyrics are then feed into a model pipeline consisting of `TfIdfVectorizer` and `RidgeRegressor`. `GridSearchCV` is used on a smaller subset to select the paramters: `min_df` for `TfIdfVectorizer`, and `alpha` for `RidgeRegressor`. These parameteres will be used later for song recommendation using Lyrics with KNN model.

The model achieves an $R^2$ score of 0.4.

#  Load Lyrics and Preprocess

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import os
%matplotlib inline

In [2]:
df = pd.read_csv('../data/all.csv')
df = df.dropna(subset=['lyrics'])
df = df.reset_index(drop=True)

import re

def remove_disclaimer(string):
    s = re.sub(r'(\*)+ This Lyrics is NOT for Commercial use .*\r\n\([0-9]*\)',"", string)
    return s

df['lyrics'] = df['lyrics'].apply(remove_disclaimer)

In [3]:
import spacy
import re

nlp = spacy.load("en_core_web_sm",  disable=['parser','ner', 'textcat'])


def my_lemmatizer(doc):
    doc_cleaned = ' '.join(re.findall(r'\b\w\w\w+\b', doc)) 
    return [ w.lemma_.lower() for w in nlp(doc_cleaned) 
                      if len(w.lemma_)>2 ]

def process_text(text, stop_words=set()):
    if not text:
        return []

    text = ' '.join(re.findall(r'\b\w\w\w+\b', text))   #only keep word with at least 3 chars
    doc = nlp(text)
    result=  [token.lemma_.lower() for token in doc if token.lemma_.lower() not in stop_words]
    result = " ".join(result)
    return result



stopwords = spacy.lang.en.STOP_WORDS.union(['-pron-', 'oh','ooh','la'])
stopwords = set(my_lemmatizer(' '.join(list(stopwords))))



In [4]:
df['processed_lyrics'] = df['lyrics'].apply(process_text, args=(stopwords,))

df['processed_lyrics'].head(3)

0    old transylvania be lad castle be poor be sad ...
1    saw monster mirror wake today monster mirror d...
2    big red car roll street people meet like hello...
Name: processed_lyrics, dtype: object

## Tuning Parameters using a smaller subset

In [5]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import TfidfVectorizer

X = df
y = df['Age']

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)


bag_of_words_model = Pipeline([
    ('vectorizer', ColumnTransformer([('tfidf', TfidfVectorizer(min_df=2), 'processed_lyrics')])),
    ('regressor', Ridge())     
], verbose = True)


param_grid = {'regressor__alpha':[0.1, 1, 5], 
              'vectorizer__tfidf__max_df': [0.95]}


gs_est = GridSearchCV(bag_of_words_model, param_grid, cv=3, n_jobs=2, verbose=5, return_train_score = True)

model = gs_est.fit(X_train[0:len(X_train)//3], y_train[0:len(X_train)//3])   #
print (gs_est.best_estimator_)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   0.2s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   0.1s
Pipeline(steps=[('vectorizer',
                 ColumnTransformer(transformers=[('tfidf',
                                                  TfidfVectorizer(max_df=0.95,
                                                                  min_df=2),
                                                  'processed_lyrics')])),
                ('regressor', Ridge(alpha=1))],
         verbose=True)


In [7]:
gs_est.best_estimator_

Pipeline(steps=[('vectorizer',
                 ColumnTransformer(transformers=[('tfidf',
                                                  TfidfVectorizer(max_df=0.95,
                                                                  min_df=2),
                                                  'processed_lyrics')])),
                ('regressor', Ridge(alpha=1))],
         verbose=True)

In [16]:
for i in range(len(gs_est.cv_results_['params'])):
    print (gs_est.cv_results_['params'][i], "score:", gs_est.cv_results_['mean_test_score'][i])

{'regressor__alpha': 0.1, 'vectorizer__tfidf__max_df': 0.95} score: 0.14854305618087812
{'regressor__alpha': 1, 'vectorizer__tfidf__max_df': 0.95} score: 0.3298979379942056
{'regressor__alpha': 5, 'vectorizer__tfidf__max_df': 0.95} score: 0.28848560142146


# Age-Rating Model, using full dataset

In [17]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction.text import TfidfVectorizer

X = df
y = df['Age']

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)


bag_of_words_model = Pipeline([
    ('vectorizer', ColumnTransformer([('tfidf', TfidfVectorizer(min_df=2, max_df=0.95), 'processed_lyrics')])),
    ('regressor', Ridge(1))     
], verbose = True)


model = bag_of_words_model.fit(X_train, y_train)  


[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=   0.6s
[Pipeline] ......... (step 2 of 2) Processing regressor, total=   0.1s


In [18]:
y_pred = model.predict(X_test)

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

mean = np.mean(y_train)
baseline_score  = r2_score(y_test, [mean for _ in range(len(y_test))])
baseline_error  = mean_squared_error(y_test, [mean for _ in range(len(y_test))],squared=False)

print (f"Baseline model: Use the mean age (age={mean}) as the prediction")
print ("Baseline Error: ", baseline_error)
print ("Baseline R^2 Score: ", baseline_score, "\n")



print ("Lyrics Model: ")
print ("Error (RMSE): ", mean_squared_error(y_test, y_pred, squared=False))
print ("R^2 Score: " , r2_score(y_test, y_pred))

Baseline model: Use the mean age (age=11.431499649614576) as the prediction
Baseline Error:  4.038358889273143
Baseline R^2 Score:  -0.0002732693107434514 

Lyrics Model: 
Error (RMSE):  3.117914328171984
R^2 Score:  0.40373791486012056
