# AnaLyrics Engine: Predicting Music Genres with NLP

Genre Playlists
- https://everynoise.com/engenremap.html

Spotify Metadata
- https://developer.spotify.com/documentation/web-api
- https://spotipy.readthedocs.io/en/2.22.1/

Genius Lyrics
- https://docs.genius.com
- https://lyricsgenius.readthedocs.io/en/master/reference/genius.html

Modeling

- https://www.kaggle.com/code/nilaychauhan/getting-started-with-nlp-pipelines
    - Extracting plain text / Reducing complexity

# NLP

In [None]:
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams


In [None]:
# one shared WordNet lemmatizer, reused for every lyric
lemmer = WordNetLemmatizer()

# lyric-specific filler words layered on top of NLTK's English stopwords
custom_sw = ["i'd", "i'm", 'yeah', 'ah', 'oh']
sw = set(stopwords.words('english')) | set(custom_sw)

In [None]:
def preprocess_text(lyrics):
    """Clean one song's lyrics into a space-joined string of lemmas.

    Steps: replace every non-letter character with a space, collapse
    whitespace, lowercase, tokenize with ``word_tokenize``, drop tokens found
    in the module-level ``sw`` stopword set, lemmatize the rest with
    ``lemmer`` and drop any lemma that is itself a stopword.

    Args:
        lyrics: Raw lyrics text. Non-string values (e.g. the NaN pandas uses
            for a missing lyric) are treated as empty lyrics.

    Returns:
        The surviving lemmas joined by single spaces; ``''`` when nothing
        survives.
    """
    if not isinstance(lyrics, str):
        return ''

    # Replace (rather than delete) digits and punctuation. Deleting them glued
    # neighbouring words together ("yeah,yeah" -> "yeahyeah") and fused
    # contractions into new words ("i'm" -> "im", "don't" -> "dont") that no
    # stopword entry matches. Splitting instead yields pieces such as "i" and
    # "m", which NLTK's English stopword list is expected to contain.
    lyrics = re.sub(r'[^a-zA-Z\s]', ' ', lyrics)
    # collapse every whitespace run (blank lines included) to a single space
    lyrics = re.sub(r'\s+', ' ', lyrics)
    lyrics = lyrics.lower()

    tokens = []
    for word in word_tokenize(lyrics):
        # Filter BEFORE lemmatizing: the default (noun) lemmatizer can mangle
        # stopwords ending in "s" (e.g. "was" -> "wa"), which would then slip
        # past a filter applied only afterwards.
        if word in sw:
            continue
        lemma = lemmer.lemmatize(word)
        # Filter again so plurals that reduce to a stopword are still dropped,
        # as they were when filtering happened only after lemmatization.
        if lemma not in sw:
            tokens.append(lemma)

    return ' '.join(tokens)

In [None]:
# run the cleaning function over every lyric and keep the result as a column
cleaned_lyrics = df['lyrics_text'].apply(preprocess_text)
df['tokens'] = cleaned_lyrics
df['tokens']

# Modeling

In [8]:
# TODO: add hyperparameter tuning and final model analysis

In [None]:

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_recall_fscore_support

from sklearn.dummy import DummyClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# features: cleaned lyric strings; target: genre label
X, y = df['tokens'], df['genre']

# keep 20% aside as a holdout set for validating the final model
X_df, X_hold, y_df, y_hold = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=95,
)
# class balance of the modeling set, in percent
y_df.value_counts(normalize=True) * 100

In [None]:
def analyze_model(model, X, y, cv=5):
    """Print cross-validated metrics for ``model`` and plot its confusion matrix.

    Prints a classification report and overall accuracy computed from
    out-of-fold predictions, prints the per-fold accuracy scores and their
    mean, then shows a row-normalized confusion-matrix heatmap.

    Relies on ``sns`` and ``plt`` being available in the enclosing namespace
    (presumably seaborn and matplotlib.pyplot; they are not imported in the
    cells visible here -- TODO confirm).

    Args:
        model: Estimator or pipeline with ``fit``/``predict``. It is left
            fitted on an 80% split of ``X``/``y`` when the function returns.
        X: Feature values, one entry per sample.
        y: True class labels aligned with ``X``.
        cv: Cross-validation setting passed to both ``cross_val_predict`` and
            ``cross_val_score`` so the two always use the same folds setting.

    Returns:
        None. All output is printed or plotted.
    """
    # Kept from the original workflow: leaves ``model`` fitted on an 80% split.
    # None of the metrics below use this fit; the cross-validation helpers
    # train their own copies of the estimator.
    X_tr, _, y_tr, _ = train_test_split(X, y, test_size=0.2, random_state=95)
    model.fit(X_tr, y_tr)

    pred = cross_val_predict(model, X, y, cv=cv)
    print("Classification Report:")
    print(classification_report(y, pred))

    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"Cross-Validation Scores: {cv_scores}")
    print(f"Mean Accuracy: {cv_scores.mean():.2f}")

    accuracy = accuracy_score(y, pred)
    print(f"Overall Accuracy: {accuracy:.4f}")

    # Take the axis labels from the data and pass the same list to
    # confusion_matrix. Without ``labels`` the matrix is ordered by sorted
    # label value, so a hand-written genre list in any other order would put
    # the wrong genre name on each row and column.
    labels = sorted(set(y) | set(pred))
    cm = confusion_matrix(y, pred, labels=labels, normalize='true')

    sns.heatmap(cm, xticklabels=labels, yticklabels=labels,
                annot=True, fmt='.2f', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

In [None]:
# use dummy classifier as baseline model
# Seeded: the 'uniform' strategy guesses at random, so without a fixed seed the
# baseline numbers change on every run and differ between the two
# cross-validation passes inside analyze_model.
baseline = DummyClassifier(strategy='uniform', random_state=95)
analyze_model(baseline, X_df, y_df)

In [None]:
# bag-of-words counts -> multinomial naive Bayes
cv_mnb_steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
]
pipe_cv_mnb = Pipeline(steps=cv_mnb_steps)

analyze_model(pipe_cv_mnb, X_df, y_df)

In [None]:
# TF-IDF weights -> multinomial naive Bayes
tf_mnb_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
]
pipe_tf_mnb = Pipeline(steps=tf_mnb_steps)

analyze_model(pipe_tf_mnb, X_df, y_df)

In [None]:
# bag-of-words counts -> random forest
# Seeded so the forest is rebuilt identically on every run and in both
# cross-validation passes inside analyze_model; otherwise the reported scores
# are not reproducible.
pipe_cv_rf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier(random_state=95)),
])

analyze_model(pipe_cv_rf, X_df, y_df)

In [None]:
# TF-IDF weights -> random forest
# Seeded so the forest is rebuilt identically on every run and in both
# cross-validation passes inside analyze_model; otherwise the reported scores
# are not reproducible.
pipe_tf_rf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=95)),
])

analyze_model(pipe_tf_rf, X_df, y_df)

In [None]:
# bag-of-words counts -> decision tree
# Seeded so tree construction is repeatable across runs and across the two
# cross-validation passes inside analyze_model.
pipe_cv_dt = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', DecisionTreeClassifier(random_state=95)),
])

analyze_model(pipe_cv_dt, X_df, y_df)

In [None]:
# TF-IDF weights -> decision tree
# Seeded so tree construction is repeatable across runs and across the two
# cross-validation passes inside analyze_model.
pipe_tf_dt = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', DecisionTreeClassifier(random_state=95)),
])

analyze_model(pipe_tf_dt, X_df, y_df)