In [28]:
import time
import os.path
import pickle
import pandas as pd
import mlflow
import numpy as np
from scipy.stats import uniform
from mlflow.models import infer_signature
from sklearn import set_config
from sklearn.metrics import f1_score, classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Load the pre-processed lyrics corpus from disk into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='data/processed_english_lyrics.csv')

In [3]:
# Keep only the rows that have lyrics. The explicit .copy() makes `dataset` an
# independent frame, so adding a column below does not write into a slice of
# the original frame (which raises pandas' SettingWithCopyWarning).
dataset = dataset[dataset['lyrics'].notna()].copy()
# Composite "<artist>#____#<tag>" key, used further down to split the data by
# artist/tag group instead of by individual song.
dataset['artist_tag_combination'] = dataset['artist'] + "#____#" + dataset['tag']
dataset.columns

Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id',
       'language_cld3', 'language_ft', 'language', 'artist_tag_combination'],
      dtype='object')

In this part we will use classical NLP methods for text classification:  
1) vectorize documents (lyrics) using TF-IDF vectorization method  
2) classify document vectors with Logistic Regression  

### Lyrics vectorization

TF-IDF (term frequency - inverse document frequency) vectorization assigns each vocabulary term in a document a real-valued weight between 0 and 1, computed as $tf \cdot idf$.  
$tf$ - term frequency, calculated per document: how often the term appears in that document  
$idf$ - inverse document frequency: the rarer a term is across all documents, the larger its value, e.g. $\log\dfrac{N}{df}$, where $N$ is the number of documents and $df$ is the number of documents containing the term  

In [4]:
# Model input (raw lyrics text) and prediction target (tag) as separate Series.
X = dataset['lyrics']
y = dataset['tag']

In [5]:
def train_test_split_stratified_artist(df: pd.DataFrame,
                                       test_size: float = 0.10,
                                       random_state: int = 42) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
    """Split ``df`` by artist/tag group, stratifying the groups by tag.

    The split is performed on the unique values of ``artist_tag_combination``
    rather than on individual rows, so every row sharing an artist/tag key
    lands on the same side of the split.

    Args:
        df: Frame with ``artist_tag_combination`` (``"<artist>#____#<tag>"``),
            ``lyrics`` and ``tag`` columns.
        test_size: Fraction of unique artist/tag keys (not of rows) assigned
            to the held-out part.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        ``(train_df, X_test, y_test)``: the training rows with all columns,
        followed by the ``lyrics`` and ``tag`` columns of the held-out rows.
    """
    separator = "#____#"
    # Sorting makes the input order, and therefore the seeded split, deterministic.
    artist_tag_unique = sorted(df['artist_tag_combination'].unique())
    # The tag is the part after the LAST separator; rsplit keeps this correct
    # even if an artist name happens to contain the separator itself.
    tags_list = [combination.rsplit(separator, 1)[1] for combination in artist_tag_unique]
    artist_tag_train, _, _, _ = train_test_split(artist_tag_unique,
                                                 tags_list,
                                                 test_size=test_size,
                                                 random_state=random_state,
                                                 stratify=tags_list)
    # Compute the membership mask once and reuse it for both partitions.
    in_train = df['artist_tag_combination'].isin(artist_tag_train)
    train_df = df[in_train]
    test_df = df[~in_train]
    X_test, y_test = test_df['lyrics'], test_df['tag']
    return (train_df, X_test, y_test)

#### Splitting data into 10/20/70 for test, validation and train datasets

In [6]:
# First hold out the test part (default test_size of 0.10), then carve the
# validation part out of what is left: 0.22 of the remaining ~90% is roughly
# 20% of all artist/tag groups.
train_validation_df, X_test, y_test = train_test_split_stratified_artist(df=dataset)
train_df, X_valid, y_valid = train_test_split_stratified_artist(df=train_validation_df, test_size=0.22)
X_train = train_df['lyrics']
y_train = train_df['tag']

In [7]:
# Drop the names of the intermediate frames; only the X_*/y_* Series are used from here on.
del train_validation_df, train_df

In [8]:
# Sanity check: features and labels of each split must have matching lengths.
for split_name, split_features, split_labels in (('test', X_test, y_test),
                                                 ('valid', X_valid, y_valid),
                                                 ('train', X_train, y_train)):
    print(split_name + ' len: ', len(split_features), len(split_labels))

test len:  307203 307203
valid len:  608271 608271
train len:  2151609 2151609


#### Testing pipeline parts individually and finding the best hyperparameters. Then combining them into one prediction pipeline.

In [9]:
# Check whether a pickled vectorizer / SVD already exists in the working
# directory; the cells below use these flags to choose between fitting and loading.
vectorizer_file_exists, svd_file_exists = (
    os.path.isfile(cache_file) for cache_file in ('vectorizer.pk', 'svd.pk')
)

Vectorizing text with TF-IDF

In [10]:
start_time = time.time()
if not vectorizer_file_exists:
    # No cached vectorizer: fit TF-IDF (uni- and bigrams, English stop words
    # removed, vocabulary capped at 200000 terms) on the training lyrics and
    # pickle the fitted object for later runs.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=200000)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    with open('vectorizer.pk', 'wb') as f:
        pickle.dump(vectorizer, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load a file this
    # notebook produced itself.
    with open('vectorizer.pk', 'rb') as f:
        vectorizer = pickle.load(f)
    # Transform after the file is closed so the handle is not held open for
    # the whole (multi-minute) vectorization.
    X_train_vectorized = vectorizer.transform(X_train)

print(X_train_vectorized.shape)
print('Execution time: ', str((time.time()-start_time)/60), 'minutes')

(2151609, 200000)
Execution time:  5.068227597077688 minutes


Reducing dimensionality and sparsity with SVD

In [11]:
start_time = time.time()
if not svd_file_exists:
    svd = TruncatedSVD(n_components=250, n_iter=5, random_state=42)
    # Fit BEFORE pickling: dumping the estimator first would cache an unfitted
    # object, and the `else` branch would then fail on svd.transform() the
    # next time the notebook runs.
    X_train_reduced = svd.fit_transform(X_train_vectorized)
    with open('svd.pk', 'wb') as f:
        pickle.dump(svd, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load a file this
    # notebook produced itself.
    with open('svd.pk', 'rb') as f:
        svd = pickle.load(f)
    X_train_reduced = svd.transform(X_train_vectorized)

# Share of the TF-IDF variance retained by the 250 components.
print(svd.explained_variance_ratio_.sum())
print(X_train_reduced.shape)
print('Execution time: ', str((time.time()-start_time)/60), 'minutes')

0.13002072028513775
Execution time:  0.6455830295880636 minutes


Training the Logistic Regression model; for now the hyperparameters are not optimized

In [13]:
start_time = time.time()
# Baseline elastic-net logistic regression with a fixed l1_ratio of 0.2;
# class_weight='balanced' reweights classes inversely to their frequency.
classifier = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.2,
    class_weight='balanced',
    warm_start=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train_reduced, y_train)
elapsed_minutes = (time.time() - start_time) / 60
print('Execution time: ', str(elapsed_minutes), 'm')
# Mean accuracy on the training data itself (not a generalization estimate).
train_accuracy = classifier.score(X_train_reduced, y_train)
print(train_accuracy)

Execution time:  6.817437199751536 m
0.48780424324307997


Transforming validation dataset

In [14]:
%%time
# Apply the already-fitted vectorizer and SVD to the validation lyrics.
# Only transform() is called here, so nothing is re-fitted on validation data.
X_valid_vectorized = vectorizer.transform(X_valid)
X_valid_reduced = svd.transform(X_valid_vectorized)

CPU times: user 1min 34s, sys: 5.25 s, total: 1min 39s
Wall time: 2min 7s


In [21]:
# Per-class precision / recall / F1 on the training split.
train_predictions = classifier.predict(X_train_reduced)
print(classification_report(y_train, train_predictions))

              precision    recall  f1-score   support

     country       0.11      0.65      0.19     59336
         pop       0.65      0.23      0.33    910019
         rap       0.89      0.76      0.82    653512
          rb       0.17      0.54      0.26    103627
        rock       0.41      0.59      0.49    425115

    accuracy                           0.49   2151609
   macro avg       0.45      0.55      0.42   2151609
weighted avg       0.64      0.49      0.51   2151609



In [22]:
# Per-class precision / recall / F1 on the validation split.
valid_predictions = classifier.predict(X_valid_reduced)
print(classification_report(y_valid, valid_predictions))

              precision    recall  f1-score   support

     country       0.11      0.65      0.18     16028
         pop       0.67      0.23      0.34    263184
         rap       0.89      0.76      0.82    184871
          rb       0.17      0.55      0.26     29011
        rock       0.40      0.60      0.48    115177

    accuracy                           0.48    608271
   macro avg       0.45      0.56      0.42    608271
weighted avg       0.65      0.48      0.50    608271



Optimizing regularization params

In [None]:
%%time
distributions = dict(C=uniform(loc=0, scale=4),
                     l1_ratio=np.arange(0, 1, 0.05))
classifier = LogisticRegression(penalty='elasticnet',
                                class_weight='balanced',
                                solver='saga',
                                random_state=42,
                                warm_start=True,
                                n_jobs=-1)
clf = RandomizedSearchCV(classifier,
                         distributions,
                         random_state=42,
                         scoring='f1_macro',
                         cv=3,
                         verbose=3
                        )
search = clf.fit(X_train_reduced, y_train)
search.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END C=1.49816047538945, l1_ratio=0.7000000000000001;, score=0.416 total time= 5.5min
[CV 2/3] END C=1.49816047538945, l1_ratio=0.7000000000000001;, score=0.419 total time= 5.6min
[CV 3/3] END C=1.49816047538945, l1_ratio=0.7000000000000001;, score=0.401 total time= 5.8min
[CV 1/3] END C=2.9279757672456204, l1_ratio=0.30000000000000004;, score=0.416 total time= 4.9min
[CV 2/3] END C=2.9279757672456204, l1_ratio=0.30000000000000004;, score=0.419 total time= 5.4min
[CV 3/3] END C=2.9279757672456204, l1_ratio=0.30000000000000004;, score=0.401 total time= 5.3min
[CV 1/3] END C=1.7833310114143646, l1_ratio=0.5;, score=0.416 total time= 5.0min
[CV 2/3] END C=1.7833310114143646, l1_ratio=0.5;, score=0.419 total time= 5.1min


In [4]:
X, y = dataset['lyrics'], dataset['tag']
# NOTE(review): this plain random split reassigns X_train / X_test / y_train /
# y_test and is not grouped by artist, unlike the artist-stratified split
# used earlier in the notebook. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# ngram_range must be a (min_n, max_n) tuple; `(1)` is just the integer 1.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=200000)
# The elastic-net penalty is only supported by the 'saga' solver and requires
# an explicit l1_ratio; 0.2 matches the baseline classifier trained above.
classifier = LogisticRegression(penalty='elasticnet',
                                solver='saga',
                                l1_ratio=0.2,
                                class_weight='balanced',
                                random_state=42)

# Cache fitted transformers in a directory relative to the notebook rather
# than in a hard-coded, user-specific absolute path.
pipe = Pipeline(steps=[("vectorizer", vectorizer),
                       ("classifier", classifier)],
                memory='sklearn_cache',
                verbose=True
               )

set_config(display='diagram')
display(pipe)

In [None]:
# Fit the whole pipeline on the training split, then report mean accuracy on the held-out split.
fitted_pipe = pipe.fit(X_train, y_train)
fitted_pipe.score(X_test, y_test)