In [1]:
import gensim
from gensim.corpora import Dictionary
from gensim import corpora, models

In [3]:
docs = [
    ['cat', 'dog', 'possum', 'wolf', 'rat'],  # topic 0
    ['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'],  # topic 0
    ['tuna', 'whale', 'shark', 'salmon', 'stringray'],  # topic 1
    ['tuna', 'shark', 'salmon', 'eel', 'stingray'],  # topic 1
    ['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'],  # topic 2
    ['pidgeon', 'crow', 'raven', 'parrot', 'eagle']   # topic 2
]

In [4]:
dct = Dictionary(docs)
print(' '.join('%s'%(i,) for i in dct.items()))

(0, 'cat') (1, 'dog') (2, 'possum') (3, 'rat') (4, 'wolf') (5, 'fox') (6, 'rabbit') (7, 'salmon') (8, 'shark') (9, 'stringray') (10, 'tuna') (11, 'whale') (12, 'eel') (13, 'stingray') (14, 'crow') (15, 'hawk') (16, 'parrot') (17, 'pidgeon') (18, 'sparrow') (19, 'eagle') (20, 'raven')


In [5]:
docs_bow = [dct.doc2bow(doc) for doc in docs]
docs_bow

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(7, 1), (8, 1), (10, 1), (12, 1), (13, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1)],
 [(14, 1), (16, 1), (17, 1), (19, 1), (20, 1)]]

In [6]:
lda_model = gensim.models.LdaModel(docs_bow, num_topics=3, id2word=dct)

In [13]:
# Seems like all the documents are assigned to a single topic!
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', lda_model[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.05577975), (1, 0.88824743), (2, 0.05597283)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.04783169), (1, 0.90415514), (2, 0.048013188)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.05868661), (1, 0.05589025), (2, 0.8854231)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.8647225), (1, 0.05618122), (2, 0.07909632)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.8878725), (1, 0.055951286), (2, 0.05617614)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.8878262), (1, 0.055959612), (2, 0.056214225)]


In [7]:
lda_model.print_topics()

[(0,
  '0.105*"parrot" + 0.104*"crow" + 0.104*"pidgeon" + 0.061*"salmon" + 0.061*"shark" + 0.060*"tuna" + 0.060*"hawk" + 0.060*"stingray" + 0.060*"sparrow" + 0.060*"eel"'),
 (1,
  '0.127*"cat" + 0.127*"possum" + 0.127*"wolf" + 0.126*"rat" + 0.072*"rabbit" + 0.072*"dog" + 0.072*"fox" + 0.021*"tuna" + 0.020*"salmon" + 0.020*"shark"'),
 (2,
  '0.108*"tuna" + 0.108*"shark" + 0.107*"salmon" + 0.107*"whale" + 0.107*"stringray" + 0.030*"crow" + 0.030*"pidgeon" + 0.030*"rat" + 0.030*"raven" + 0.029*"wolf"')]

In [14]:
# Try TFIDF
tfidf = models.TfidfModel(docs_bow)
docs_tfidf = tfidf[docs_bow]
tfidf_model = gensim.models.LdaMulticore(docs_tfidf, num_topics=3, id2word=dct)

In [15]:
# Finds 'birds' (topic 0) and 'animals' (topic 1), but wrongly assigns one of the 'sea creatures' doc to 'animals'.
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', tfidf_model[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.056555387), (1, 0.88662046), (2, 0.056824137)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.04854632), (1, 0.9026757), (2, 0.048777964)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.057446647), (1, 0.87425447), (2, 0.06829886)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.056884717), (1, 0.060229216), (2, 0.8828861)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.8869137), (1, 0.056311637), (2, 0.05677466)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.8869467), (1, 0.05627355), (2, 0.056779798)]


In [11]:
tfidf_model.print_topics()  # Looks better

[(0,
  '0.073*"shark" + 0.072*"tuna" + 0.072*"salmon" + 0.065*"stingray" + 0.065*"raven" + 0.064*"stringray" + 0.064*"eagle" + 0.064*"eel" + 0.063*"whale" + 0.051*"crow"'),
 (1,
  '0.097*"dog" + 0.075*"cat" + 0.074*"wolf" + 0.074*"possum" + 0.073*"rat" + 0.040*"salmon" + 0.039*"tuna" + 0.039*"parrot" + 0.038*"crow" + 0.038*"whale"'),
 (2,
  '0.075*"sparrow" + 0.072*"fox" + 0.072*"hawk" + 0.070*"rabbit" + 0.058*"parrot" + 0.058*"pidgeon" + 0.057*"crow" + 0.056*"cat" + 0.056*"rat" + 0.056*"wolf"')]

In [None]:
# Running LDA with the same parameters produces different results. I.e. the documents are distributed
# to different clusters
# LDA2: cluster 0 = animals & sea creatures, cluster 1 = birds
# LDA3: cluster 2 = animals, cluster 0 = sea creatures, cluster 1 = birds
# LDA4: 

In [16]:
# Setting the alpha parameter for LDA makes it a lot better.
lda_model2 = gensim.models.LdaModel(docs_bow, num_topics=3, id2word=dct, alpha=[0.33, 0.33, 0.33])

# Seems like all the documents are assigned to a single topic!
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', lda_model2[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.88830477), (1, 0.055536024), (2, 0.05615925)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.90415084), (1, 0.047622196), (2, 0.048226975)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.8879269), (1, 0.05563086), (2, 0.056442298)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.8879263), (1, 0.05564019), (2, 0.05643354)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.055285085), (1, 0.88846135), (2, 0.056253582)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.055278324), (1, 0.88870174), (2, 0.05601993)]


In [17]:
# Setting the alpha parameter for LDA makes it a lot better.
lda_model3 = gensim.models.LdaModel(docs_bow, num_topics=3, id2word=dct, alpha=[0.33, 0.33, 0.33])

# Seems like all the documents are assigned to a single topic!
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', lda_model3[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.055462886), (1, 0.055627797), (2, 0.88890934)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.0475648), (1, 0.04772295), (2, 0.90471226)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.88911754), (1, 0.055556968), (2, 0.0553255)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.88913167), (1, 0.05554036), (2, 0.05532796)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.055435132), (1, 0.88655597), (2, 0.058008883)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.055749834), (1, 0.081180245), (2, 0.86306995)]


In [18]:
# Setting the alpha parameter for LDA makes it a lot better.
lda_model4 = gensim.models.LdaModel(docs_bow, num_topics=3, id2word=dct, alpha=[0.33, 0.33, 0.33])

# Seems like all the documents are assigned to a single topic!
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', lda_model4[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.055444412), (1, 0.88919723), (2, 0.055358402)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.04753296), (1, 0.9050161), (2, 0.04745087)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.8882601), (1, 0.05539308), (2, 0.056346808)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.88526106), (1, 0.055563055), (2, 0.059175946)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.05552601), (1, 0.055446446), (2, 0.8890276)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.055518735), (1, 0.055454127), (2, 0.8890271)]


In [19]:
# Setting the alpha parameter for LDA makes it a lot better.
lda_model5 = gensim.models.LdaModel(docs_bow, num_topics=3, id2word=dct, alpha=[0.33, 0.33, 0.33])

# Seems like all the documents are assigned to a single topic!
for i, db in enumerate(docs_bow):
    print(docs[i], '>>>>>', lda_model5[db])

['cat', 'dog', 'possum', 'wolf', 'rat'] >>>>> [(0, 0.055993363), (1, 0.8886343), (2, 0.055372357)]
['cat', 'possum', 'wolf', 'fox', 'rabbit', 'rat'] >>>>> [(0, 0.048065916), (1, 0.90445805), (2, 0.047475997)]
['tuna', 'whale', 'shark', 'salmon', 'stringray'] >>>>> [(0, 0.05614015), (1, 0.05539772), (2, 0.8884621)]
['tuna', 'shark', 'salmon', 'eel', 'stingray'] >>>>> [(0, 0.05617918), (1, 0.05540006), (2, 0.88842076)]
['pidgeon', 'hawk', 'sparrow', 'crow', 'parrot'] >>>>> [(0, 0.056796912), (1, 0.8809422), (2, 0.06226087)]
['pidgeon', 'crow', 'raven', 'parrot', 'eagle'] >>>>> [(0, 0.056672018), (1, 0.061162397), (2, 0.88216555)]


In [None]:
# Now trying this using NMF
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

text_docs = [' '.join(e for e in d) for d in docs]

text_docs

tfidf = tfidf_vectorizer.fit_transform(text_docs)
cv = count_vectorizer.fit_transform(text_docs)


tfidf.toarray()

cv.toarray()

tfidf_transformer = TfidfTransformer()

tfidf_v = tfidf_transformer.fit_transform(cv)

tfidf_v.toarray()

In [20]:
text_docs = [' '.join(e for e in d) for d in docs]

text_docs


['cat dog possum wolf rat',
 'cat possum wolf fox rabbit rat',
 'tuna whale shark salmon stringray',
 'tuna shark salmon eel stingray',
 'pidgeon hawk sparrow crow parrot',
 'pidgeon crow raven parrot eagle']

In [54]:
# Try using TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

tfidf_vectorizer = TfidfVectorizer(#max_df=1, min_df=0,
                                   #max_features=3,  # Ha! this was causing the problems!
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(text_docs)


In [55]:
tfidf

<6x21 sparse matrix of type '<class 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [56]:
tfidf.toarray()

array([[0.42690011, 0.        , 0.5206008 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.42690011,
        0.        , 0.42690011, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.42690011],
       [0.37865978, 0.        , 0.        , 0.        , 0.        ,
        0.46177217, 0.        , 0.        , 0.        , 0.37865978,
        0.46177217, 0.37865978, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37865978],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.40912489, 0.40912489,
        0.        , 0.        , 0.49892408, 0.40912489, 0.49892408,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.49892408,
        0.        , 0.        , 0.        , 0.       

In [57]:
tfidf_vectorizer.get_feature_names()

['cat',
 'crow',
 'dog',
 'eagle',
 'eel',
 'fox',
 'hawk',
 'parrot',
 'pidgeon',
 'possum',
 'rabbit',
 'rat',
 'raven',
 'salmon',
 'shark',
 'sparrow',
 'stingray',
 'stringray',
 'tuna',
 'whale',
 'wolf']

In [46]:
tfidf_vectorizer.fit(text_docs)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [58]:
tfidf_vectorizer.vocabulary_

{'cat': 0,
 'dog': 2,
 'possum': 9,
 'wolf': 20,
 'rat': 11,
 'fox': 5,
 'rabbit': 10,
 'tuna': 18,
 'whale': 19,
 'shark': 14,
 'salmon': 13,
 'stringray': 17,
 'eel': 4,
 'stingray': 16,
 'pidgeon': 8,
 'hawk': 6,
 'sparrow': 15,
 'crow': 1,
 'parrot': 7,
 'raven': 12,
 'eagle': 3}

In [26]:
# Also try it out step by step.
# First just get a count of the words.
count_vectorizer = CountVectorizer()
cv = count_vectorizer.fit_transform(text_docs)


In [27]:
cv.toarray()
# Ok the counts look good.

array([[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [28]:
tfidf_transformer = TfidfTransformer()

tfidf_v = tfidf_transformer.fit_transform(cv)

tfidf_v.toarray()
# Not sure how the below tf-idf terms are calculated!

array([[0.42690011, 0.        , 0.5206008 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.42690011,
        0.        , 0.42690011, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.42690011],
       [0.37865978, 0.        , 0.        , 0.        , 0.        ,
        0.46177217, 0.        , 0.        , 0.        , 0.37865978,
        0.46177217, 0.37865978, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37865978],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.40912489, 0.40912489,
        0.        , 0.        , 0.49892408, 0.40912489, 0.49892408,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.49892408,
        0.        , 0.        , 0.        , 0.       

In [52]:
# manual tf-idf calculation for the first word. 
# Unsure how above is done, probably using some different normalization for idf.
# But relatively they look ok.
import math
_tf = (1.0/5)
_idf = math.log(6.0/2, math.e)
_idf2 = math.log(6.0/2, 10) + 1
_idf3 = math.log(((1+6)/(2+1)), 10) + 1
print('TF: ', _tf)
print('IDF using ln:', _idf, ' then TF*IDF: ', _tf*_idf)
print('IDF using base10: ', _idf2, ' then TF*IDF: ', _tf*_idf2)
print('IDF using doc formula: ', _idf3, ' then TF*IDF: ', _tf*_idf3)

# Ah. The exact details are in the documentation. There is normalization as well.
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction


TF:  0.2
IDF using ln: 1.0986122886681098  then TF*IDF:  0.21972245773362198
IDF using base10:  1.4771212547196624  then TF*IDF:  0.2954242509439325
IDF using doc formula:  1.3679767852945943  then TF*IDF:  0.27359535705891885


In [59]:
# Try without any normalization.
tfidf_transformer2 = TfidfTransformer(norm=None)
tfidf_v2 = tfidf_transformer2.fit_transform(cv)
tfidf_v2.toarray()

array([[1.84729786, 0.        , 2.25276297, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.84729786,
        0.        , 1.84729786, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.84729786],
       [1.84729786, 0.        , 0.        , 0.        , 0.        ,
        2.25276297, 0.        , 0.        , 0.        , 1.84729786,
        2.25276297, 1.84729786, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.84729786],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.84729786, 1.84729786,
        0.        , 0.        , 2.25276297, 1.84729786, 2.25276297,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 2.25276297,
        0.        , 0.        , 0.        , 0.       

In [60]:
# Hmmm ok leave this for now. Need to read the documentation more.

In [65]:
# Use NMF to see if can get the documents classified into 3 correct topics.
# References:
# https://machinelearningmastery.com/introduction-to-matrix-decompositions-for-machine-learning/
# https://stackoverflow.com/questions/39367597/how-to-deal-with-missing-values-in-python-scikit-nmf
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html#sklearn.decomposition.NMF
from sklearn.decomposition import NMF
import numpy as np

In [80]:
nmf = NMF(n_components=3)
nmf_v = nmf.fit_transform(tfidf_v)

In [81]:
nmf_v  # a little hard to read

array([[8.00999056e-01, 0.00000000e+00, 0.00000000e+00],
       [8.00999056e-01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 7.01689253e-01, 0.00000000e+00],
       [0.00000000e+00, 7.01689253e-01, 7.40228630e-18],
       [0.00000000e+00, 0.00000000e+00, 6.81305492e-01],
       [0.00000000e+00, 0.00000000e+00, 6.81305492e-01]])

In [75]:
np.round(nmf_v, 2)

array([[0.8 , 0.  , 0.  ],
       [0.8 , 0.  , 0.  ],
       [0.  , 0.71, 0.  ],
       [0.  , 0.71, 0.  ],
       [0.  , 0.  , 0.67],
       [0.  , 0.  , 0.67]])

In [None]:
# Nice!

In [68]:
tfidf_v.shape

(6, 21)

In [69]:
# 6 x 3 . 3 x 21
nmf_v.shape
# So I have the 6x3 (nmf_v), where is the 3x21? I.e. the word embeddings?


(6, 3)

In [78]:
print(nmf.components_.shape)
np.round(nmf.components_.transpose(), 2)

(3, 21)


array([[0.5 , 0.  , 0.  ],
       [0.  , 0.  , 0.61],
       [0.32, 0.  , 0.  ],
       [0.  , 0.  , 0.37],
       [0.  , 0.35, 0.  ],
       [0.29, 0.  , 0.  ],
       [0.  , 0.  , 0.37],
       [0.  , 0.  , 0.61],
       [0.  , 0.  , 0.61],
       [0.5 , 0.  , 0.  ],
       [0.29, 0.  , 0.  ],
       [0.5 , 0.  , 0.  ],
       [0.  , 0.  , 0.37],
       [0.  , 0.57, 0.  ],
       [0.  , 0.57, 0.  ],
       [0.  , 0.  , 0.37],
       [0.  , 0.35, 0.  ],
       [0.  , 0.35, 0.  ],
       [0.  , 0.57, 0.  ],
       [0.  , 0.35, 0.  ],
       [0.5 , 0.  , 0.  ]])

In [91]:
cv.toarray()

array([[1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [82]:
# Actually we should just try NMF without TFIDF
nmf2 = NMF(n_components=3)
nmf_v2 = nmf2.fit_transform(cv)

In [84]:
np.round(nmf_v2, 2)

array([[1.16, 0.  , 0.  ],
       [1.32, 0.  , 0.  ],
       [0.  , 1.17, 0.  ],
       [0.  , 1.17, 0.  ],
       [0.  , 0.  , 1.21],
       [0.  , 0.  , 1.21]])

In [85]:
np.round(nmf2.components_.transpose(), 2)

array([[0.8 , 0.  , 0.  ],
       [0.  , 0.  , 0.83],
       [0.38, 0.  , 0.  ],
       [0.  , 0.  , 0.41],
       [0.  , 0.43, 0.  ],
       [0.43, 0.  , 0.  ],
       [0.  , 0.  , 0.41],
       [0.  , 0.  , 0.83],
       [0.  , 0.  , 0.83],
       [0.8 , 0.  , 0.  ],
       [0.43, 0.  , 0.  ],
       [0.8 , 0.  , 0.  ],
       [0.  , 0.  , 0.41],
       [0.  , 0.86, 0.  ],
       [0.  , 0.86, 0.  ],
       [0.  , 0.  , 0.41],
       [0.  , 0.43, 0.  ],
       [0.  , 0.43, 0.  ],
       [0.  , 0.86, 0.  ],
       [0.  , 0.43, 0.  ],
       [0.8 , 0.  , 0.  ]])

In [87]:
# Just to check :)
np.round(np.dot(nmf_v2, nmf2.components_), 2)

array([[0.93, 0.  , 0.44, 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.93, 0.5 ,
        0.93, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.93],
       [1.06, 0.  , 0.5 , 0.  , 0.  , 0.56, 0.  , 0.  , 0.  , 1.06, 0.56,
        1.06, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.06],
       [0.  , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 1.  , 1.  , 0.  , 0.5 , 0.5 , 1.  , 0.5 , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 1.  , 1.  , 0.  , 0.5 , 0.5 , 1.  , 0.5 , 0.  ],
       [0.  , 1.  , 0.  , 0.5 , 0.  , 0.  , 0.5 , 1.  , 1.  , 0.  , 0.  ,
        0.  , 0.5 , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 1.  , 0.  , 0.5 , 0.  , 0.  , 0.5 , 1.  , 1.  , 0.  , 0.  ,
        0.  , 0.5 , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ]])