In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read the lyrics and keep only the rows that have cleaned text
data = pd.read_csv('12000_lyrics_dataset.csv').dropna(subset=['Cleaned_Lyrics'])

# Bag-of-bigrams counts: English stop words are dropped, and a bigram must
# occur in at least 2 documents but in no more than 95% of them
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(data['Cleaned_Lyrics'])

# 10-topic LDA with a fixed random_state; fit on the counts, then score the
# same documents to get one row of topic weights per lyric
lda = LatentDirichletAllocation(random_state=0, n_components=10)
topic_distributions = lda.fit(X).transform(X)

# The dominant topic of a document is the column holding its largest weight
dominant_topics = topic_distributions.argmax(axis=1)

# Attach the dominant topic id to each lyric row
data['topic_dominant'] = dominant_topics

# Show lyrics, genre and assigned topic side by side
print(data[['Cleaned_Lyrics', 'Genre', 'topic_dominant']])

# Function to get the top words for each topic
def get_top_words(model, feature_names, n_top_words):
    """Map each topic index to its highest-weighted feature names.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            feature weights per topic.
        feature_names: Sequence of feature names, indexed by the same
            positions as the weights in each ``components_`` row.
        n_top_words: Number of feature names to keep per topic.

    Returns:
        dict: ``{topic_idx: [feature_name, ...]}`` with the names ordered
        from the largest weight to the smallest.
    """
    return {
        # argsort is ascending, so reverse it before taking the first n entries
        topic_idx: [feature_names[i] for i in weights.argsort()[::-1][:n_top_words]]
        for topic_idx, weights in enumerate(model.components_)
    }

# Look up the fitted vocabulary and collect the 20 highest-weighted bigrams of every topic
feature_names = vectorizer.get_feature_names_out()
top_bigrams = get_top_words(lda, feature_names, n_top_words=20)

# Show one line per topic listing its bigrams
for topic in top_bigrams:
    bigrams = top_bigrams[topic]
    print(f"Topic {topic}: {', '.join(bigrams)}")

# Save the top bigrams for each topic to a CSV file.
# Each row's bigram list is itself comma-separated, so the file is written
# through pandas, which quotes any field containing a comma. Writing the lines
# by hand spread one topic over many columns and no longer matched the
# two-column "Topic,Bigrams" header.
top_bigrams_table = pd.DataFrame(
    {
        "Topic": list(top_bigrams.keys()),
        "Bigrams": [', '.join(bigrams) for bigrams in top_bigrams.values()],
    }
)
# UTF-8 is requested explicitly so non-ASCII tokens do not depend on the
# platform's default text encoding
top_bigrams_table.to_csv('top_bigrams.csv', index=False, encoding='utf-8')

# Count the rows for every (genre, dominant topic) pair, then pivot the topic
# level into columns; pairs that never occur are filled with 0
rows_per_genre_topic = data.groupby(['Genre', 'topic_dominant']).size()
topic_genre_distribution = rows_per_genre_topic.unstack(fill_value=0)

# Show the genre-by-topic table
print(topic_genre_distribution)

# Persist the genre-by-topic table (its Genre index is written as the first column)
topic_genre_distribution.to_csv('bigrams_12000_topic_genre_distribution.csv')

# Persist the lyrics together with their dominant topic, without the row index
data.to_csv('bigrams_12000_lyrics_with_topics.csv', index=False)


                                          Cleaned_Lyrics  Genre  \
0      Red dragon from the first morning of time\nRed...  Metal   
1      Broad incision sits across the evening\nA vict...  Metal   
2      I dont wanna see\nI dont wanna say\nTime stand...  Metal   
3      Weve walked together down this winding road\nI...  Metal   
4      Brothers the battle is raging choose your side...  Metal   
...                                                  ...    ...   
47995  Intro\nYou did it baby Congratulations homie\n...    Rap   
47996  Tah Murdah\nPerminently dedicated to the stree...    Rap   
47997  Intro\nCompton in the house\n\n\nChorus DJ Qui...    Rap   
47998  Bizzy\nThere is no way in hell marching factio...    Rap   
47999  Produced by Tae Beast\n\nIntro\nMake sure the ...    Rap   

       topic_dominant  
0                   7  
1                   4  
2                   4  
3                   9  
4                   6  
...               ...  
47995               5  
479

In [2]:
                        import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read the lyrics and keep only the rows that have cleaned text
data = pd.read_csv('12000_lyrics_dataset.csv').dropna(subset=['Cleaned_Lyrics'])

# Bag-of-trigrams counts: English stop words are dropped, and a trigram must
# occur in at least 2 documents but in no more than 95% of them
vectorizer = CountVectorizer(
    ngram_range=(3, 3),
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
X = vectorizer.fit_transform(data['Cleaned_Lyrics'])

# 10-topic LDA with a fixed random_state; fit on the counts, then score the
# same documents to get one row of topic weights per lyric
lda = LatentDirichletAllocation(random_state=0, n_components=10)
topic_distributions = lda.fit(X).transform(X)

# The dominant topic of a document is the column holding its largest weight
dominant_topics = topic_distributions.argmax(axis=1)

# Attach the dominant topic id to each lyric row
data['topic_dominant'] = dominant_topics

# Show lyrics, genre and assigned topic side by side
print(data[['Cleaned_Lyrics', 'Genre', 'topic_dominant']])

# Function to get the top words for each topic
def get_top_words(model, feature_names, n_top_words):
    """Map each topic index to its highest-weighted feature names.

    This redefines the ``get_top_words`` from the bigram cell with the same
    behavior.

    Args:
        model: Object exposing ``components_``, an iterable with one array of
            feature weights per topic.
        feature_names: Sequence of feature names, indexed by the same
            positions as the weights in each ``components_`` row.
        n_top_words: Number of feature names to keep per topic.

    Returns:
        dict: ``{topic_idx: [feature_name, ...]}`` with the names ordered
        from the largest weight to the smallest.
    """
    return {
        # argsort is ascending, so reverse it before taking the first n entries
        topic_idx: [feature_names[i] for i in weights.argsort()[::-1][:n_top_words]]
        for topic_idx, weights in enumerate(model.components_)
    }

# Look up the fitted vocabulary and collect the 20 highest-weighted trigrams of every topic
feature_names = vectorizer.get_feature_names_out()
top_trigrams = get_top_words(lda, feature_names, n_top_words=20)

# Show one line per topic listing its trigrams
for topic in top_trigrams:
    trigrams = top_trigrams[topic]
    print(f"Topic {topic}: {', '.join(trigrams)}")

# Save the top trigrams for each topic to a CSV file.
# Each row's trigram list is itself comma-separated, so the file is written
# through pandas, which quotes any field containing a comma. Writing the lines
# by hand spread one topic over many columns and no longer matched the
# two-column "Topic,Trigrams" header.
top_trigrams_table = pd.DataFrame(
    {
        "Topic": list(top_trigrams.keys()),
        "Trigrams": [', '.join(trigrams) for trigrams in top_trigrams.values()],
    }
)
# UTF-8 is requested explicitly so non-ASCII tokens do not depend on the
# platform's default text encoding
top_trigrams_table.to_csv('top_trigrams.csv', index=False, encoding='utf-8')

# Count the rows for every (genre, dominant topic) pair, then pivot the topic
# level into columns; pairs that never occur are filled with 0
rows_per_genre_topic = data.groupby(['Genre', 'topic_dominant']).size()
topic_genre_distribution = rows_per_genre_topic.unstack(fill_value=0)

# Show the genre-by-topic table
print(topic_genre_distribution)

# Persist the genre-by-topic table (its Genre index is written as the first column)
topic_genre_distribution.to_csv('trigrams_12000_topic_genre_distribution.csv')

# Persist the lyrics together with their dominant topic, without the row index
data.to_csv('trigrams_12000_lyrics_with_topics.csv', index=False)
                                                             

                                          Cleaned_Lyrics  Genre  \
0      Red dragon from the first morning of time\nRed...  Metal   
1      Broad incision sits across the evening\nA vict...  Metal   
2      I dont wanna see\nI dont wanna say\nTime stand...  Metal   
3      Weve walked together down this winding road\nI...  Metal   
4      Brothers the battle is raging choose your side...  Metal   
...                                                  ...    ...   
47995  Intro\nYou did it baby Congratulations homie\n...    Rap   
47996  Tah Murdah\nPerminently dedicated to the stree...    Rap   
47997  Intro\nCompton in the house\n\n\nChorus DJ Qui...    Rap   
47998  Bizzy\nThere is no way in hell marching factio...    Rap   
47999  Produced by Tae Beast\n\nIntro\nMake sure the ...    Rap   

       topic_dominant  
0                   6  
1                   3  
2                   2  
3                   3  
4                   6  
...               ...  
47995               3  
479