In [13]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the English-filtered lyrics/genre corpus and print its schema summary.
full = pd.read_csv("../../data/poptrag_lyrics_genres_corpus_filtered_english.csv")
full.info()

# Unigram bag-of-words over the lyrics column. The token pattern allows
# apostrophes inside a token, so contractions are not split at the quote.
vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    token_pattern=r"\b[\w']+\b",
    lowercase=True,
)
matrix = vectorizer.fit_transform(full["full_lyrics"])

# Column sums of the count matrix: one corpus-wide total per vocabulary entry.
sum_words = matrix.sum(axis=0)

# Pair every term with its total (vocabulary_ maps term -> column index),
# order the pairs by total, largest first, and show the 20 most common words.
words_freq = [
    (term, sum_words[0, column])
    for term, column in vectorizer.vocabulary_.items()
]
words_freq.sort(key=lambda pair: pair[1], reverse=True)
print(words_freq[:20])

<class 'pandas.DataFrame'>
RangeIndex: 111938 entries, 0 to 111937
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                111938 non-null  int64  
 1   track.s.id                111938 non-null  str    
 2   track.s.title             111937 non-null  str    
 3   track.s.firstartist.name  111938 non-null  str    
 4   album.s.title             111938 non-null  str    
 5   album.s.releaseyear       111938 non-null  int64  
 6   track.s.popularity        111938 non-null  int64  
 7   track.language            111938 non-null  str    
 8   full_lyrics               111938 non-null  str    
 9   cat5                      111938 non-null  str    
 10  pmax5                     111938 non-null  float64
 11  nmax5                     111938 non-null  float64
 12  cat12                     111938 non-null  str    
 13  pmax12                    111938 non-null  float64
 14 

In [6]:
from helpers.LyricsClassficationExperiment import LyricsClassificationExperiment

# Feature-selection ("fs") experiment on the cat12 genre column.
exp_fs = LyricsClassificationExperiment(
    corpus=full,
    genrecol="cat12",
    lyricscol="full_lyrics",
    artistcol="track.s.firstartist.name",
    # NOTE(review): directory name says "cat5" while genrecol is "cat12" — confirm intended.
    output_dir="cat5_mock_experiment_fs",
    test_size=0.2,
    random_state=42,
    # NOTE(review): presumably restricts the run to ~1% of the corpus for
    # quick debugging — confirm against the helper class.
    subsample_debug=0.01,
)
exp_fs.compute_fs_ngram_features(min_artists=20, top_n=100)

# Column labels of the training feature table (the selected n-grams).
fs_features = exp_fs.X_train.columns

# Quick inspection of the experiment object and the feature container type.
print(exp_fs)
print(type(exp_fs.X_train))

# Fit the fixed-parameter logistic regression, then report evaluation metrics
# and the top coefficients per genre.
exp_fs.train_fixed_parametrer_logistic_regression()
exp_fs.show_model_evaluation()
exp_fs.show_top_coefficients_per_genre()

✓ Extracted unigrams:
  - Unique: 9,057
  - Shape: (893, 9057)
  - Examples: ['cuffs', 'asphyxiated', 'lies', 'inspire', 'hersheys']
✓ Extracted bigrams:
  - Unique: 55,891
  - Shape: (893, 55891)
  - Examples: ['stay or', 'bueno chinga', 'aimlessly but', 'town yeah', 'goodbye the']
✓ Extracted trigrams:
  - Unique: 95,312
  - Shape: (893, 95312)
  - Examples: ['wait in silence', 'challenged all across', 'all made the', "i'm off st", 'hit oh by']
Calculating genre-level TF-IDF for unigrams with genre ...
✓ Calculated TF-IDF for 19,739 genre-ngram pairs
Calculating genre-level TF-IDF for bigrams with genre ...
✓ Calculated TF-IDF for 74,810 genre-ngram pairs
Calculating genre-level TF-IDF for trigrams with genre ...
✓ Calculated TF-IDF for 101,923 genre-ngram pairs
Counting artists per n-gram...
  10% complete
  20% complete
  30% complete
  40% complete
  50% complete
  60% complete
  70% complete
  80% complete
  90% complete
✓ Calculated artist diversity for 9,057 n-grams
Counting ar



Selected model parameters:
  C: 1.000
  l1_ratio: 0.500
  target_ratio: 3.000
F1 macro: 0.169
Precision macro: 0.169
Recall macro: 0.169
Cohen's kappa: 0.130
                  precision    recall  f1-score   support

alternative rock       0.17      0.08      0.11        13
      electronic       0.05      0.05      0.05        19
       hard rock       0.00      0.00      0.00         7
     heavy metal       0.14      0.22      0.17         9
         hip hop       0.43      0.40      0.41        15
      indie rock       0.00      0.00      0.00         5
            jazz       0.00      0.00      0.00         3
           metal       0.33      0.42      0.37        24
             pop       0.35      0.42      0.39        52
        pop rock       0.00      0.00      0.00         2
            rock       0.39      0.34      0.36        77

        accuracy                           0.30       226
       macro avg       0.17      0.18      0.17       226
    weighted avg       0.30 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
from helpers.LyricsClassficationExperiment import LyricsClassificationExperiment

# Informed-feature experiment on the cat12 genre column.
exp_informed = LyricsClassificationExperiment(
    corpus=full,
    genrecol="cat12",
    lyricscol="full_lyrics",
    artistcol="track.s.firstartist.name",
    # NOTE(review): directory name says "cat5" while genrecol is "cat12" — confirm intended.
    output_dir="cat5_mock_experiment_informed",
    test_size=0.2,
    random_state=42,
    # subsample_debug=0.01,  # disabled here: this run does not pass subsample_debug
)
exp_informed.compute_informed_ngram_features(min_artists=20, top_n=100)

# Bound to its own name: the original assigned these labels to `fs_features`,
# which overwrote the feature-selection run's variable of the same name.
informed_features = exp_informed.X_train.columns

# Quick inspection of the experiment object and the feature container type.
print(exp_informed)
print(type(exp_informed.X_train))

# Fit the fixed-parameter logistic regression and report evaluation metrics.
exp_informed.train_fixed_parametrer_logistic_regression()
exp_informed.show_model_evaluation()

In [26]:
# Report the 20 top coefficients per genre for the informed-feature model.
# The matching report for the feature-selection model is left disabled:
# print("FS\n" + "=" * 60)
# exp_fs.show_top_coefficients_per_genre()
print("INFORMED", "=" * 60, sep="\n")
exp_informed.show_top_coefficients_per_genre(top_n=20)

INFORMED
Top 20 coefficients for genre: ALTERNATIVE ROCK
oh (0.820)
life (-0.685)
stars (0.631)
man (0.611)
land (-0.539)
heart (0.519)
lose (0.470)
like (0.464)
around (-0.460)
every (0.436)
say (-0.433)
wonder (0.427)
falling (0.423)
eyes (-0.409)
somebody (0.406)
live (0.399)
who's (-0.391)
bout (0.382)
keeps (0.376)
stop (-0.366)


Top 20 coefficients for genre: ELECTRONIC
hear (-0.790)
leave (0.602)
believe (0.591)
turn (0.590)
ya (0.582)
dreams (0.556)
fly (-0.553)
since (0.542)
call (-0.542)
girl (-0.521)
day (0.515)
side (-0.504)
walk (-0.500)
window (-0.486)
break (0.478)
pain (-0.472)
always (-0.467)
dead (0.466)
without (-0.464)
remember (0.450)


Top 20 coefficients for genre: HARD ROCK
street (0.589)
moment (0.543)
tonight (0.497)
left (0.480)
head (0.464)
ready (0.455)
blind (0.448)
pass (0.429)
last (0.416)
wild (0.403)
stand (0.392)
way (0.384)
woman (0.378)
people (0.376)
fool (0.372)
full (0.349)
line (0.348)
dreams (0.344)
boy (0.342)
gone (0.333)


Top 20 coefficien

In [21]:
# Print every column label of the informed experiment's training features,
# one label per line.
for feature_name in exp_informed.X_train.columns:
    print(feature_name)



a dream
a little
a man
ah
ain't
air
alive
alone
along
always
another
apart
around
ask
ass
away
baby
back
back to
bad
bed
behind
believe
better
big
black
blind
blood
blow
blue
body
born
bout
boy
break
breath
bright
bring
broken
burn
burning
call
came
can see
can't
car
care
cause
cause i
cause you
child
city
clear
close
cold
come
come on
come to
comes
coming
control
could
could be
cry
dark
darkness
day
days
dead
death
deep
die
don't know
don't need
don't wanna
don't want
done
door
dream
dreams
em
empty
end
enough
even
ever
every
every day
everything
eye
eyes
face
faith
fall
falling
far
fear
feel
feel like
feel the
feeling
fight
find
find a
fine
fire
fly
fool
forever
forget
found
free
friend
fuck
full
full of
fun
future
game
games
gave
get
get it
gets
getting
girl
give
give me
give you
go
god
goin
going
gold
gone
gonna
good
goodbye
got
got a
got the
got to
gotta
ground
guess
hand
hands
hard
hard to
hate
head
hear
heard
heart
heaven
hell
hey
high
hit
hold
home
hot
house
hurt
i ain't
i can'

In [23]:
# Print every column label of the feature-selection experiment's training
# features, one label per line.
for feature_name in exp_fs.X_train.columns:
    print(feature_name)

name
it's a
the one
was a
when you
i said
i think
am i
we
see
you in
i've been
hard to
together
you have
yeah i
alive
with your
you will
the door
things
an
baby
in
and we
must be
to
just a
lonely
close
wanna
i
end
could
you're
me in
future
a dream
going
do
my own
the time
tell me
i have
i've got
and the
but you
and
and i
we have
so i
em
when you're
as i
i see
forever
a
we can
into
with
way i
more than
know that
know you
about
when the
through
run
not a
you i
did
words
and all
leave me
down and
i ain't
but i'm
think
get it
right
of your
in love
i could
let
it's all
me you
all of
when
don't need
i can't
more
those
am
sky
other
had a
you you
can't
mind
of a
just like
never
me i
how to
is it
had
i'm in
i feel
land
these
with my
dark
there's
your face
own
ever
has
there's no
and i don't
why
nothing
stay
will be
what
they say
but
and when
cause you
my heart
tried
alone
left
sun
days
yeah
there is no
had to
sweet
but the
up your
got to
be the
light
just to
live
my mind
so much
door
all your
y