# Topic Modeling

## Topic Modeling - Attempt #1 (All Text)

In [292]:
# Read in the document-term matrix
import pandas as pd

data_clean = pd.read_csv("lyrics_clean.csv",index_col=0)
data_clean

Unnamed: 0,lyrics
ABBA,ive love thought would manage hit ceiling stil...
David_Bowie,small jean genie snuck city strung laser slash...
Janis_Joplin,oh come come come come didnt make feel like ma...
Michael_Jackson,butt mine gon na tell right show face broad da...
Queen,dim light sing song full sad thing tango two s...
Rolling_Stones,drag get old kid different today hear every mo...
The_Clash,stay around dont play around old town seem lik...
Bob_Dylan,go away window leave choose speed im one want ...
Elton_John,hear distance sense far away old rudolph reind...
Led_Zeppeling,hey thats right ask sweet mama let kid say mig...


## LSA



In [281]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.lyrics)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aaaah,aaaahaaah,aaah,aah,aahah,aaow,abandon,abe,abel,abide,...,zombie,zombies,zone,zoo,zoomin,zuma,zwei,élysées,über,überm
ABBA,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,1,0,0
David_Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Janis_Joplin,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Michael_Jackson,3,0,0,1,2,8,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
Queen,0,0,0,7,0,0,1,0,0,0,...,0,0,0,2,0,0,0,0,0,0
Rolling_Stones,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,3,0,0,0,0
The_Clash,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,3,0,0,0,0,0
Bob_Dylan,0,0,0,0,0,0,1,4,1,0,...,0,0,0,0,0,0,0,0,0,0
Elton_John,0,0,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,0,0,0
Led_Zeppeling,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [336]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()#min_df=.2, max_df=.6)
tfidf = vectorizer.fit_transform(data_clean.lyrics)
len(vectorizer.get_feature_names())

7070

In [337]:
data_tfidf = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names())
data_tfidf.index = data_clean.index
data_tfidf

Unnamed: 0,aaaah,aaaahaaah,aaah,aah,aahah,aaow,abandon,abe,abel,abide,...,zombie,zombies,zone,zoo,zoomin,zuma,zwei,élysées,über,überm
ABBA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012863,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006431,0.0,0.0
David_Bowie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Janis_Joplin,0.0,0.0,0.0,0.006696,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Michael_Jackson,0.008317,0.0,0.0,0.002036,0.006406,0.025624,0.002467,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003203,0.003203
Queen,0.0,0.0,0.0,0.02445,0.0,0.0,0.004232,0.0,0.0,0.0,...,0.0,0.0,0.0,0.010991,0.0,0.0,0.0,0.0,0.0,0.0
Rolling_Stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00606,0.0,0.0,0.0,0.018179,0.0,0.0,0.0,0.0
The_Clash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005672,0.0,0.009818,0.0,0.017015,0.0,0.0,0.0,0.0,0.0
Bob_Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.00427,0.022177,0.005544,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Elton_John,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.015687,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Led_Zeppeling,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [339]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=5)

svd_model.fit(data_tfidf)

print(svd_model.components_.shape)
print(svd_model.singular_values_)


(5, 7070)
[2.71317978 0.91797529 0.85821445 0.82409314 0.77620707]


In [340]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print("%.2f*%s "% (t[1], t[0]) ,end='')
    print("")

Topic 0: 
0.28*get 0.23*oh 0.22*love 0.21*go 0.21*im 0.21*dont 0.19*know 
Topic 1: 
0.31*oh 0.28*la 0.25*yeah 0.24*baby 0.20*love 0.13*want 0.11*honey 
Topic 2: 
0.41*na 0.22*oh 0.19*bird 0.14*rock 0.11*wart 0.11*wan 0.11*number 
Topic 3: 
0.22*rudie 0.19*rock 0.18*dont 0.18*bird 0.17*want 0.17*get 0.14*baby 
Topic 4: 
0.38*li 0.24*whop 0.21*im 0.19*american 0.16*la 0.16*modern 0.15*fame 


In [200]:
#import sys
#!conda install --yes --prefix {sys.prefix} umap-learn

## LDA

In [342]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')#,min_df=.1, max_df=.9)
data_cv = cv.fit_transform(data_clean.lyrics)
data = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data.index = data_clean.index
data

Unnamed: 0,aaaah,aaaahaaah,aaah,aah,aahah,aaow,abandon,abe,abel,abide,...,zombie,zombies,zone,zoo,zoomin,zuma,zwei,élysées,über,überm
ABBA,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,1,0,0
David_Bowie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Janis_Joplin,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Michael_Jackson,3,0,0,1,2,8,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
Queen,0,0,0,7,0,0,1,0,0,0,...,0,0,0,2,0,0,0,0,0,0
Rolling_Stones,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,3,0,0,0,0
The_Clash,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,3,0,0,0,0,0
Bob_Dylan,0,0,0,0,0,0,1,4,1,0,...,0,0,0,0,0,0,0,0,0,0
Elton_John,0,0,0,0,0,0,0,0,0,0,...,0,0,3,0,0,0,0,0,0,0
Led_Zeppeling,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [343]:
from gensim import matutils, models
import scipy.sparse


In [344]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,ABBA,David_Bowie,Janis_Joplin,Michael_Jackson,Queen,Rolling_Stones,The_Clash,Bob_Dylan,Elton_John,Led_Zeppeling,Pink_Floyd,Ramones,The_Beatles,The_Doors
aaaah,0,0,0,3,0,0,0,0,0,0,0,0,6,0
aaaahaaah,0,0,0,0,0,0,0,0,0,0,0,0,1,0
aaah,0,0,0,0,0,0,0,0,0,0,0,0,3,0
aah,0,0,1,1,7,0,0,0,0,0,0,2,1,0
aahah,0,0,0,2,0,0,0,0,0,0,0,0,0,0


In [348]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [349]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [350]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2)
lda.print_topics()

[(0,
  '0.019*"love" + 0.017*"oh" + 0.016*"im" + 0.012*"na" + 0.012*"dont" + 0.012*"come" + 0.011*"know" + 0.010*"yeah" + 0.010*"want" + 0.009*"baby"'),
 (1,
  '0.015*"dont" + 0.014*"know" + 0.013*"yeah" + 0.013*"na" + 0.012*"oh" + 0.012*"im" + 0.011*"say" + 0.010*"baby" + 0.009*"time" + 0.009*"want"')]

In [351]:
# LDA for num_topics = 3
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3)
lda.print_topics()

[(0,
  '0.017*"dont" + 0.014*"know" + 0.012*"na" + 0.011*"love" + 0.010*"oh" + 0.010*"say" + 0.010*"want" + 0.010*"im" + 0.009*"yeah" + 0.008*"like"'),
 (1,
  '0.015*"love" + 0.015*"dont" + 0.014*"oh" + 0.014*"im" + 0.013*"baby" + 0.013*"yeah" + 0.012*"know" + 0.012*"want" + 0.011*"na" + 0.009*"come"'),
 (2,
  '0.018*"oh" + 0.017*"im" + 0.016*"love" + 0.013*"na" + 0.012*"yeah" + 0.012*"know" + 0.011*"come" + 0.010*"time" + 0.010*"dont" + 0.009*"say"')]

In [352]:
# LDA for num_topics = 4
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4)
lda.print_topics()

[(0,
  '0.017*"im" + 0.016*"na" + 0.014*"dont" + 0.014*"oh" + 0.010*"love" + 0.010*"yeah" + 0.010*"know" + 0.009*"say" + 0.009*"come" + 0.009*"youre"'),
 (1,
  '0.016*"love" + 0.015*"dont" + 0.015*"oh" + 0.015*"im" + 0.013*"know" + 0.013*"want" + 0.012*"baby" + 0.011*"yeah" + 0.010*"time" + 0.010*"say"'),
 (2,
  '0.019*"love" + 0.015*"oh" + 0.015*"yeah" + 0.014*"na" + 0.013*"im" + 0.012*"know" + 0.012*"dont" + 0.010*"come" + 0.009*"baby" + 0.009*"like"'),
 (3,
  '0.014*"oh" + 0.014*"know" + 0.012*"na" + 0.012*"dont" + 0.012*"come" + 0.011*"love" + 0.010*"im" + 0.009*"yeah" + 0.009*"say" + 0.009*"make"')]

## Topic Modeling - Attempt #2 (Nouns Only)

In [353]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [354]:
# Read in the cleaned data, before the CountVectorizer step
import pandas as pd

data_clean = pd.read_csv("lyrics_clean.csv",index_col=0)
data_clean

Unnamed: 0,lyrics
ABBA,ive love thought would manage hit ceiling stil...
David_Bowie,small jean genie snuck city strung laser slash...
Janis_Joplin,oh come come come come didnt make feel like ma...
Michael_Jackson,butt mine gon na tell right show face broad da...
Queen,dim light sing song full sad thing tango two s...
Rolling_Stones,drag get old kid different today hear every mo...
The_Clash,stay around dont play around old town seem lik...
Bob_Dylan,go away window leave choose speed im one want ...
Elton_John,hear distance sense far away old rudolph reind...
Led_Zeppeling,hey thats right ask sweet mama let kid say mig...


In [355]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.lyrics.apply(nouns))
data_nouns

Unnamed: 0,lyrics
ABBA,love thought ceiling feeling dont think make d...
David_Bowie,jean city strung laser slash eat razor pull wa...
Janis_Joplin,didnt feel man yeah everything woman honey tim...
Michael_Jackson,butt mine gon show face im tell gon mind shoot...
Queen,dim light thing tango serenade heart valentino...
Rolling_Stones,drag kid today mother need something today she...
The_Clash,stay play town lotta people tonight lotta peop...
Bob_Dylan,window speed babe im need youre someone defend...
Elton_John,distance sense rudolph reindeer santa head eas...
Led_Zeppeling,hey thats mama kid keep hid baby see im rider ...


In [356]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words, min_df=.1, max_df=.9)
data_cvn = cvn.fit_transform(data_nouns.lyrics)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaaah,aah,abuse,ace,ache,act,action,actor,advertising,advice,...,yawn,yell,yellow,yes,yesterday,yo,york,youd,youll,youth
ABBA,0,0,0,1,0,0,1,0,0,0,...,0,0,0,7,0,1,0,0,4,0
David_Bowie,0,0,0,0,1,1,1,1,0,0,...,1,1,0,1,1,0,2,0,2,0
Janis_Joplin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,1,0,0,0,0,0
Michael_Jackson,2,1,5,0,1,1,0,0,0,1,...,1,0,0,2,2,0,0,1,13,0
Queen,0,4,0,0,2,1,1,0,0,0,...,0,0,1,9,1,0,0,1,3,3
Rolling_Stones,0,0,0,0,1,0,0,0,0,0,...,0,0,1,10,2,0,1,1,1,1
The_Clash,0,0,0,0,0,0,0,0,2,1,...,0,0,0,4,0,12,1,0,4,3
Bob_Dylan,0,0,0,1,3,1,0,0,1,1,...,0,0,0,9,3,0,1,4,5,0
Elton_John,0,0,1,0,3,2,3,0,0,0,...,0,0,0,0,0,0,5,1,7,0
Led_Zeppeling,0,0,2,0,1,0,0,0,0,0,...,0,0,0,5,2,0,0,0,0,0


In [357]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [358]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.014*"aint" + 0.010*"cause" + 0.008*"rock" + 0.008*"ah" + 0.008*"ooh" + 0.008*"number" + 0.007*"babe" + 0.006*"place" + 0.006*"honey" + 0.006*"stop"'),
 (1,
  '0.020*"rock" + 0.019*"bird" + 0.013*"dance" + 0.012*"tonight" + 0.010*"wan" + 0.010*"cause" + 0.008*"care" + 0.008*"ah" + 0.008*"family" + 0.007*"bitch"')]

In [360]:
# Let's try topics = 3
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.016*"aint" + 0.015*"cause" + 0.011*"ooh" + 0.010*"ah" + 0.009*"honey" + 0.009*"dance" + 0.008*"somebody" + 0.008*"rock" + 0.008*"babe" + 0.008*"wan"'),
 (1,
  '0.013*"rock" + 0.009*"law" + 0.008*"cause" + 0.008*"roll" + 0.007*"change" + 0.007*"city" + 0.007*"ya" + 0.007*"shatter" + 0.006*"ah" + 0.006*"aint"'),
 (2,
  '0.018*"bird" + 0.016*"number" + 0.015*"rock" + 0.010*"aint" + 0.009*"dance" + 0.008*"care" + 0.008*"ah" + 0.007*"babe" + 0.007*"blow" + 0.007*"wan"')]

In [361]:
# Let's try 4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn)
ldan.print_topics()

[(0,
  '0.014*"aint" + 0.012*"rock" + 0.012*"cause" + 0.009*"ooh" + 0.009*"ah" + 0.007*"wan" + 0.006*"dance" + 0.006*"stop" + 0.006*"tonight" + 0.005*"god"'),
 (1,
  '0.017*"rock" + 0.012*"aint" + 0.010*"cause" + 0.010*"bird" + 0.009*"dance" + 0.009*"ah" + 0.007*"wan" + 0.007*"tonight" + 0.006*"babe" + 0.006*"change"'),
 (2,
  '0.012*"cause" + 0.010*"number" + 0.010*"aint" + 0.007*"rock" + 0.007*"child" + 0.006*"ah" + 0.006*"place" + 0.006*"babe" + 0.006*"change" + 0.005*"dance"'),
 (3,
  '0.014*"number" + 0.012*"honey" + 0.008*"babe" + 0.008*"lord" + 0.008*"rock" + 0.008*"ah" + 0.007*"ooh" + 0.007*"somebody" + 0.006*"aint" + 0.006*"dance"')]

## Topic Modeling - Attempt #3 (Nouns and Adjectives)

In [362]:
# Let's create a function to pull out nouns from a string of text
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [363]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns_adj = pd.DataFrame(data_clean.lyrics.apply(nouns_adj))
data_nouns_adj

Unnamed: 0,lyrics
ABBA,ive love thought ceiling strange old feeling d...
David_Bowie,small jean genie snuck city strung laser slash...
Janis_Joplin,come didnt feel man yeah everything woman hone...
Michael_Jackson,butt mine gon show face broad daylight im tell...
Queen,dim light song full sad thing tango serenade h...
Rolling_Stones,drag old kid different today mother mother nee...
The_Clash,stay dont play old town travel lotta people su...
Bob_Dylan,window choose speed babe im need youre someone...
Elton_John,hear distance sense old rudolph reindeer santa...
Led_Zeppeling,hey thats sweet mama kid dont keep hid baby da...


In [369]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, min_df=.1, max_df=.7)
data_cvna = cvna.fit_transform(data_nouns_adj.lyrics)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaaah,aah,abandon,able,abraham,abuse,accept,ace,ache,act,...,writ,write,writer,yawn,yearn,yell,yellow,yo,york,youth
ABBA,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
David_Bowie,0,0,0,0,0,0,0,0,1,1,...,2,5,0,1,0,1,0,0,2,0
Janis_Joplin,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Michael_Jackson,3,1,1,1,1,5,0,0,1,2,...,0,0,0,1,1,0,0,1,0,0
Queen,0,5,1,0,0,0,0,0,2,1,...,0,4,0,0,0,0,1,0,0,3
Rolling_Stones,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,1
The_Clash,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,1,12,3,3
Bob_Dylan,0,0,0,1,1,0,2,1,3,2,...,0,4,1,0,0,0,1,0,1,0
Elton_John,0,0,0,0,0,1,0,0,3,2,...,0,1,0,0,0,0,8,0,5,0
Led_Zeppeling,0,0,0,0,0,2,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0


In [370]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [371]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.006*"ta" + 0.006*"til" + 0.005*"okay" + 0.005*"whoa" + 0.004*"annie" + 0.004*"rhythm" + 0.004*"force" + 0.004*"bitch" + 0.004*"cmon" + 0.004*"radio"'),
 (1,
  '0.012*"bird" + 0.011*"number" + 0.007*"law" + 0.006*"fame" + 0.006*"american" + 0.005*"shoot" + 0.005*"white" + 0.005*"shatter" + 0.005*"diamond" + 0.004*"afraid"')]

In [372]:
# Let's try 3 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna)
ldana.print_topics()

[(0,
  '0.014*"bird" + 0.006*"radio" + 0.005*"family" + 0.005*"whoa" + 0.005*"ta" + 0.005*"number" + 0.004*"law" + 0.004*"theyll" + 0.004*"fame" + 0.003*"white"'),
 (1,
  '0.006*"okay" + 0.006*"til" + 0.006*"ta" + 0.005*"annie" + 0.005*"rhythm" + 0.005*"force" + 0.004*"hoo" + 0.004*"number" + 0.004*"kiss" + 0.004*"american"'),
 (2,
  '0.012*"number" + 0.006*"white" + 0.005*"shoot" + 0.005*"law" + 0.004*"diamond" + 0.004*"yellow" + 0.003*"fame" + 0.003*"ta" + 0.003*"ha" + 0.003*"bird"')]

In [373]:
# Let's try 4 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna)
ldana.print_topics()

[(0,
  '0.007*"number" + 0.006*"bird" + 0.005*"white" + 0.004*"kiss" + 0.004*"gim" + 0.004*"american" + 0.004*"diamond" + 0.004*"family" + 0.004*"til" + 0.003*"fame"'),
 (1,
  '0.008*"number" + 0.005*"bird" + 0.005*"til" + 0.005*"ta" + 0.005*"okay" + 0.004*"whoa" + 0.004*"radio" + 0.003*"shatter" + 0.003*"family" + 0.003*"rhythm"'),
 (2,
  '0.012*"bird" + 0.008*"number" + 0.005*"radio" + 0.005*"shoot" + 0.004*"ta" + 0.004*"white" + 0.004*"family" + 0.004*"theyll" + 0.004*"law" + 0.004*"whoa"'),
 (3,
  '0.006*"ta" + 0.005*"fame" + 0.005*"law" + 0.005*"white" + 0.004*"american" + 0.004*"afraid" + 0.003*"number" + 0.003*"gold" + 0.003*"kiss" + 0.003*"ha"')]

## Identify Topics in Each Document

In [374]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna)
ldana.print_topics()

[(0,
  '0.012*"number" + 0.010*"bird" + 0.005*"white" + 0.004*"shoot" + 0.004*"yellow" + 0.004*"radio" + 0.004*"ta" + 0.003*"family" + 0.003*"ha" + 0.003*"theyll"'),
 (1,
  '0.006*"til" + 0.006*"okay" + 0.005*"rhythm" + 0.005*"gim" + 0.005*"force" + 0.005*"kiss" + 0.005*"hoo" + 0.005*"law" + 0.005*"annie" + 0.004*"ta"'),
 (2,
  '0.009*"number" + 0.005*"ta" + 0.004*"bitch" + 0.004*"white" + 0.004*"shoot" + 0.003*"radio" + 0.003*"diamond" + 0.003*"saturday" + 0.003*"wonder" + 0.003*"freedom"'),
 (3,
  '0.009*"bird" + 0.005*"fame" + 0.005*"ta" + 0.005*"law" + 0.005*"american" + 0.005*"white" + 0.004*"radio" + 0.004*"family" + 0.003*"til" + 0.003*"afraid"')]

In [375]:
corpus_transformed = ldana[corpusna]
for i,j in enumerate(list(corpus_transformed)):
    print(data_dtmna.index[i],j)

ABBA [(1, 0.99921584)]
David_Bowie [(3, 0.99952286)]
Janis_Joplin [(1, 0.99665684)]
Michael_Jackson [(1, 0.9995713)]
Queen [(1, 0.09932316), (2, 0.46819854), (3, 0.43142626)]
Rolling_Stones [(2, 0.86347115), (3, 0.13224414)]
The_Clash [(1, 0.018301507), (3, 0.9771132)]
Bob_Dylan [(2, 0.9860181)]
Elton_John [(2, 0.9985938)]
Led_Zeppeling [(0, 0.4943678), (2, 0.4580378), (3, 0.047334626)]
Pink_Floyd [(2, 0.9989018)]
Ramones [(0, 0.5742816), (3, 0.42511645)]
The_Beatles [(0, 0.96598035), (2, 0.033525664)]
The_Doors [(3, 0.99922305)]
