In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import joblib

# Read the lyrics dataset and keep only the rows that have a value in 'Cleaned_Lyrics'.
data = pd.read_csv('12000_lyrics_dataset.csv').dropna(subset=['Cleaned_Lyrics'])

# Build trigram counts: English stop words are filtered out, and a trigram is
# kept only if it appears in at least 2 documents and in at most 95% of them.
vectorizer = CountVectorizer(
    ngram_range=(3, 3),
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(data['Cleaned_Lyrics'])

# Fit a 10-topic LDA model on the trigram counts (fixed seed for repeatability).
lda = LatentDirichletAllocation(random_state=0, n_components=10).fit(X)

# Persist the fitted model and vectorizer so the prediction cells can reload them.
joblib.dump(lda, 'lda_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Per-document topic mixture, then the index of the strongest topic for each
# song, stored as a new 'topic_dominant' column.
topic_distributions = lda.transform(X)
dominant_topics = topic_distributions.argmax(axis=1)
data = data.assign(topic_dominant=dominant_topics)

# Count songs per (genre, dominant topic) and pivot the topics into columns,
# filling combinations that never occur with 0.
topic_genre_distribution = (
    data
    .groupby(['Genre', 'topic_dominant'])
    .size()
    .unstack(fill_value=0)
)

# Write the genre-by-topic table and the annotated dataset to CSV.
topic_genre_distribution.to_csv('topic_genre_distribution_for_predict.csv')
data.to_csv('12000_lyrics_with_topics_for_predict.csv', index=False)


In [4]:
import pandas as pd
import joblib
import numpy as np

# Load the fitted LDA model and vectorizer written by the training cell.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced
# yourself or otherwise trust.
lda = joblib.load('lda_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Load the genre-by-topic count table (genres become the row index).
topic_genre_distribution = pd.read_csv('topic_genre_distribution_for_predict.csv', index_col=0)
# read_csv returns the header as strings ('0', '1', ...), whereas the table was
# built with integer topic ids as columns. Restore the integer labels so that a
# lookup with the integer topic index produced by argmax is label-based instead
# of relying on positional fallback.
topic_genre_distribution.columns = topic_genre_distribution.columns.astype(int)

# Funcție pentru a prelucra și clasifica noile versuri
def classify_lyrics(lyrics):
    """Predict a genre for a piece of lyrics from its dominant LDA topic.

    The lyrics are flattened to a single line, vectorized with the
    module-level ``vectorizer`` and projected onto topics with the
    module-level ``lda``. The most probable topic is then mapped to the genre
    that has the highest count for that topic in the module-level
    ``topic_genre_distribution`` table (genres as rows, topics as columns).

    Parameters
    ----------
    lyrics : str
        Raw lyrics; may span several lines.

    Returns
    -------
    The row label (genre) holding the largest count in the dominant topic's
    column.

    Raises
    ------
    KeyError
        If the dominant topic has no column in ``topic_genre_distribution``.
    """
    # Join the lines into one space-separated string. No other cleaning
    # (e.g. stripping special characters) is performed here.
    lyrics = " ".join(lyrics.split("\n")).strip()

    # Vectorize with the already-fitted vectorizer and get the topic mixture.
    # NOTE(review): lyrics that share no term with the fitted vocabulary
    # presumably vectorize to an all-zero row, making the prediction
    # uninformative -- confirm and consider handling that case explicitly.
    X_new = vectorizer.transform([lyrics])
    topic_distribution = lda.transform(X_new)

    # Index of the most probable topic for the single input document.
    dominant_topic = int(topic_distribution.argmax(axis=1)[0])

    # For every topic column, the genre (row label) with the highest count.
    genre_by_topic = topic_genre_distribution.idxmax(axis=0)

    # Column labels are strings after a CSV round trip but integers when the
    # table comes straight from groupby/unstack. Normalise them to strings so
    # the lookup is always by topic label and never by position (a positional
    # lookup picks the wrong column as soon as one topic is absent).
    genre_by_topic.index = genre_by_topic.index.astype(str)
    topic_label = str(dominant_topic)
    if topic_label not in genre_by_topic.index:
        raise KeyError(
            f"Dominant topic {dominant_topic} has no column in topic_genre_distribution"
        )
    return genre_by_topic[topic_label]

# Example usage: classify a sample lyric supplied as a multi-line string.
new_lyrics = """
I give him all my love
That's all I do
And if you saw my love
You'd love him too and I love him
He gives my everything
And tenderly
The kiss my lover brings
He brings to me and I love him
A love like ours
Could never die
As long as I
Have you near me
Bright are the stars that shine
Dark is the sky
I know this love of mine
Will never die and I love him
"""
# Run the classifier on the sample and report the predicted genre.
predicted_genre = classify_lyrics(new_lyrics)
print(f"The predicted genre is: {predicted_genre}")

The predicted genre is: Jazz


In [3]:
# Sanity check: report how many topics the LDA model was configured with.
# Relies on `lda` and `topic_genre_distribution` having been defined by another
# cell; this cell does not load them itself.
print(f"Number of topics in LDA model: {lda.n_components}")

# Show the shape and contents of the genre-by-topic table
# (rows are genres, columns are topics).
print(f"Shape of topic_genre_distribution: {topic_genre_distribution.shape}")
print(topic_genre_distribution)


Number of topics in LDA model: 10
Shape of topic_genre_distribution: (4, 10)
          0     1     2     3     4     5     6     7     8     9
Genre                                                            
Jazz   1466  1118  1097   995  1011  1356  1353  1145  1164  1295
Metal  2394  1078  1075  1131  1031  1050  1057  1065  1064  1055
Pop    1600  1149  1118  1227  1110  1176  1073  1083  1238  1226
Rap    1248  1153  1184  1246  1093  1238  1172  1198  1205  1261


In [7]:
import pandas as pd
import joblib
import numpy as np

# Load the fitted LDA model and vectorizer written by the training cell.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced
# yourself or otherwise trust.
lda = joblib.load('lda_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Load the genre-by-topic count table (genres become the row index).
topic_genre_distribution = pd.read_csv('topic_genre_distribution_for_predict.csv', index_col=0)
# read_csv returns the header as strings ('0', '1', ...), whereas the table was
# built with integer topic ids as columns. Restore the integer labels so that a
# lookup with the integer topic index produced by argmax is label-based instead
# of relying on positional fallback.
topic_genre_distribution.columns = topic_genre_distribution.columns.astype(int)

# Funcție pentru a prelucra și clasifica noile versuri
def classify_lyrics(lyrics):
    """Predict a genre for a piece of lyrics from its dominant LDA topic.

    The lyrics are flattened to a single line, vectorized with the
    module-level ``vectorizer`` and projected onto topics with the
    module-level ``lda``. The most probable topic is then mapped to the genre
    that has the highest count for that topic in the module-level
    ``topic_genre_distribution`` table (genres as rows, topics as columns).

    Parameters
    ----------
    lyrics : str
        Raw lyrics; may span several lines.

    Returns
    -------
    The row label (genre) holding the largest count in the dominant topic's
    column.

    Raises
    ------
    KeyError
        If the dominant topic has no column in ``topic_genre_distribution``.
    """
    # Join the lines into one space-separated string. No other cleaning
    # (e.g. stripping special characters) is performed here.
    lyrics = " ".join(lyrics.split("\n")).strip()

    # Vectorize with the already-fitted vectorizer and get the topic mixture.
    # NOTE(review): lyrics that share no term with the fitted vocabulary
    # presumably vectorize to an all-zero row, making the prediction
    # uninformative -- confirm and consider handling that case explicitly.
    X_new = vectorizer.transform([lyrics])
    topic_distribution = lda.transform(X_new)

    # Index of the most probable topic for the single input document.
    dominant_topic = int(topic_distribution.argmax(axis=1)[0])

    # For every topic column, the genre (row label) with the highest count.
    genre_by_topic = topic_genre_distribution.idxmax(axis=0)

    # Column labels are strings after a CSV round trip but integers when the
    # table comes straight from groupby/unstack. Normalise them to strings so
    # the lookup is always by topic label and never by position (a positional
    # lookup picks the wrong column as soon as one topic is absent).
    genre_by_topic.index = genre_by_topic.index.astype(str)
    topic_label = str(dominant_topic)
    if topic_label not in genre_by_topic.index:
        raise KeyError(
            f"Dominant topic {dominant_topic} has no column in topic_genre_distribution"
        )
    return genre_by_topic[topic_label]

# Example usage: classify a sample lyric supplied as a multi-line string.
new_lyrics = """
This is the world you're free to explore
Life is the paradise you've been searching for
You are the god you try to implore
You're living in a hell that none
But you have bore
Born as you were in deadly sin
You are to lose you cannot win
Words meant to build walls around your skin
To restrain the powers kept within
Converted you were told to find the way
To reach the land of everyday
Lobotomized drowning internal dismay
I live while you decay
Penetrated soul weakened and misled
Beholder of a life that perish
Soon you'll wake up dead
A lie have built unto itself
A throne in your head
Beholder of a life that perished
You are living dead!
Defiled the cross hanging around
Your neck
With blood from never concluded lives
The symbol of spiritual lobotomy tells you're
A slave to repeated lies
I can see right through you will
Cease to exist
And fade away, into your minds wasteland
I live while you decay
I gaze into your eyes
Two wells of extinguished life
Mental winter living lie
You're searching the long lost and long to die
Illuminate your inner shadows the darkness
Kept from light
You walk one way but your heads on backwards
And end up being what you fight
"""
# Run the classifier on the sample and report the predicted genre.
predicted_genre = classify_lyrics(new_lyrics)
print(f"The predicted genre is: {predicted_genre}")

The predicted genre is: Metal


In [13]:
import pandas as pd
import joblib
import numpy as np

# Load the fitted LDA model and vectorizer written by the training cell.
# NOTE: joblib.load unpickles arbitrary objects; only load files you produced
# yourself or otherwise trust.
lda = joblib.load('lda_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Load the genre-by-topic count table (genres become the row index).
topic_genre_distribution = pd.read_csv('topic_genre_distribution_for_predict.csv', index_col=0)
# read_csv returns the header as strings ('0', '1', ...), whereas the table was
# built with integer topic ids as columns. Restore the integer labels so that a
# lookup with the integer topic index produced by argmax is label-based instead
# of relying on positional fallback.
topic_genre_distribution.columns = topic_genre_distribution.columns.astype(int)

# Funcție pentru a prelucra și clasifica noile versuri
def classify_lyrics(lyrics):
    """Predict a genre for a piece of lyrics from its dominant LDA topic.

    The lyrics are flattened to a single line, vectorized with the
    module-level ``vectorizer`` and projected onto topics with the
    module-level ``lda``. The most probable topic is then mapped to the genre
    that has the highest count for that topic in the module-level
    ``topic_genre_distribution`` table (genres as rows, topics as columns).

    Parameters
    ----------
    lyrics : str
        Raw lyrics; may span several lines.

    Returns
    -------
    The row label (genre) holding the largest count in the dominant topic's
    column.

    Raises
    ------
    KeyError
        If the dominant topic has no column in ``topic_genre_distribution``.
    """
    # Join the lines into one space-separated string. No other cleaning
    # (e.g. stripping special characters) is performed here.
    lyrics = " ".join(lyrics.split("\n")).strip()

    # Vectorize with the already-fitted vectorizer and get the topic mixture.
    # NOTE(review): lyrics that share no term with the fitted vocabulary
    # presumably vectorize to an all-zero row, making the prediction
    # uninformative -- confirm and consider handling that case explicitly.
    X_new = vectorizer.transform([lyrics])
    topic_distribution = lda.transform(X_new)

    # Index of the most probable topic for the single input document.
    dominant_topic = int(topic_distribution.argmax(axis=1)[0])

    # For every topic column, the genre (row label) with the highest count.
    genre_by_topic = topic_genre_distribution.idxmax(axis=0)

    # Column labels are strings after a CSV round trip but integers when the
    # table comes straight from groupby/unstack. Normalise them to strings so
    # the lookup is always by topic label and never by position (a positional
    # lookup picks the wrong column as soon as one topic is absent).
    genre_by_topic.index = genre_by_topic.index.astype(str)
    topic_label = str(dominant_topic)
    if topic_label not in genre_by_topic.index:
        raise KeyError(
            f"Dominant topic {dominant_topic} has no column in topic_genre_distribution"
        )
    return genre_by_topic[topic_label]

# Example usage: classify a sample lyric supplied as a multi-line string.
new_lyrics = """
Intro
Hit me baby thats right
Hurt me baby thats right
Thatthatthat funk
Thatthatthatthat
Thatthatsthats funkdafiedlike fonk
Thats that retardedlike fonk
Like you put your armpits in the drum machine
Good God
Break me off some of that
Come on

Verse 
Easy Mo just let the beat rock
From your sweet stock
To bounce the complete block
Make everyone on the street flock
To this here rhythm
Musicll hit em get em
My lyricsll fit em when I get with em
Theyll admit Im smokin like bags of ism
The way that I keep amazin
Yall dont even know who youre facin pa
The Kane has flipped more tracks than a racing car
The one who created it and many others imitated it
I heard your rap style kid  I hated it
Let me show you how look here now check my style
What I spit out itll raise your brow make you say wow
Badabowbow come on now lets get down
And sway this way when you hear the music play
Hiphop hooray this is what I want you to say
Hey
Bridge
Im in love with Big Daddy Kane
He makes the party swing
He turns the mother out
And rips apart things

Then I go

Hook
Dadadadaaada
Yeah I like the way it sound
And I love the way its goin down
Dadadadaaada
You know I like the way it sound
And I just love the way its goin down

Verse 
Now whats this BS youre sayin
And dont try to act like Martin now with all that I was just playin
No need to grief or mourn cause now The Beef is On
Boombap boobowbow  kid your teeth is gone
Just cause you rap that dont mean that ya catchin wreck with me
You step to this I give your mic a vasectomy
I only know one person that can come next to me
No thats a tattle
Cause I cant count my own shadow
A battle I gots to have it
Unless youre gonna rob me like they did Whittaker when he fought Chavez
Cause tryin to go against the Kane rappin
Is like a pimp tryna pull a nun  aint nothin happenin
Clear the way for the one champion true black don
Who begun gettin the job done take a look hon
Back up son you know you cant get none
Come on Im on a whole nother level of rap
And its like that now show me where the party at
Uh huh
Hook
Dadadadaaada
Yeah I like the way it sound
And I love the way its goin down
Dadadadaaada
You know I like the way it sound
And I love how its goin down

Verse 
The BIG DAD Y no back up and add another D
Then back to the K to the A to the N to the E
Live from New York the one and only
I give it to you raw for my homies
Uh and to the ladies I take em lookin somethin fine
And dont mind if we bump n grind
If youre with me jump in line
Because if in my wallet I can find one prophylactic
Then you better believe girl that you gonna get your ass dicked
Hard type of rappers extinct like a dinosaur
The kind you saw the rhyme before
But now you never find no more
Steppin to the Kane with some drama to be startin
Because I put em all on ice like Tonya Harding
Back up boy I got the whole convoy
Rollin with me on a mission thats to seek and destroy
So to all the people thats been tryin to talk about me
You better change your name to  cause youre Audi
And if you bring on your crew Im steppin to them too
Just put the beat on and watch how I swim through
The groove with more style than a backstroke
Drivin past my competition like cab drivers do black folks
Thats the way I move I always stayed the Smooth
Operator with data watin for you to play a groove
To turn it out without a doubt and show what Im about
Good lookin Brooklyn yeah we in the house
Sham on
Hook
Dadadadaaada
Yeah I like the way it sound
And I love the way its goin down
Dadadadaaada
You know I like the way it sound
And I just love the way its goin down
"""
# Run the classifier on the sample and report the predicted genre.
predicted_genre = classify_lyrics(new_lyrics)
print(f"The predicted genre is: {predicted_genre}")

The predicted genre is: Rap
