In [1]:
# functions to open and read files.
def read_in(path, fn):
    """Return the full text of the file named `fn` inside the directory `path`.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    fn : str
        Name of the file within `path`.

    Returns
    -------
    str
        Everything read from the file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    from os.path import join

    # The context manager closes the handle even when read() raises.
    with open(join(path, fn)) as f:
        return f.read()

def get_file_list(folder):
    """Return the sorted names of the regular files directly inside `folder`.

    Sub-directories (for example Jupyter's `.ipynb_checkpoints`) are skipped so
    that every returned name can be opened as a file, and the names are sorted
    so the result does not depend on the order the OS lists them in.

    Parameters
    ----------
    folder : str
        Directory to list.

    Returns
    -------
    list of str
        File names (not full paths), in ascending order.
    """
    from os import listdir
    from os.path import isfile, join

    return sorted(name for name in listdir(folder) if isfile(join(folder, name)))

In [2]:
# Build the corpus as a dictionary that maps each file name to its text:
# {filename: text, filename: text, ...}
corpus = {name: read_in("sample", name) for name in get_file_list("sample")}
corpus

{'ai.txt': 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.[1] Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".[2]',
 'harrypotter.txt': "Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizar

In [3]:
# Inspect the dictionary's keys; each key is the name of a file that was read into the corpus.
corpus.keys()

dict_keys(['ai.txt', 'harrypotter.txt', 'magicorder.txt'])

In [4]:
# Turn the corpus dictionary into a one-column DataFrame:
# the file names become the (sorted) index and the text goes in a 'text' column.

import pandas as pd

data_df = pd.DataFrame.from_dict(corpus, orient="index").sort_index()
data_df.columns = ['text']
data_df

Unnamed: 0,text
ai.txt,"In computer science, artificial intelligence (..."
harrypotter.txt,Harry Potter is a series of fantasy novels wri...
magicorder.txt,The Magic Order is a comic book series written...


In [5]:
# remove punctuations

import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Removed spans are replaced by the empty string, so
        whitespace around them is left as it was.
    '''
    text = text.lower()
    # Raw strings keep the backslashes as regex escapes instead of invalid
    # string escapes (which newer Python versions warn about).
    text = re.sub(r'\[.*?\]', '', text)  # bracketed spans such as "[1]"
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)  # any word containing a digit
    return text


# Alias used with DataFrame/Series .apply(); a plain reference avoids a needless lambda wrapper.
round1 = clean_text_round1

In [6]:
# Run the first cleaning pass over every document and preview the cleaned text
data_clean = data_df["text"].apply(round1).to_frame()
data_clean

Unnamed: 0,text
ai.txt,in computer science artificial intelligence ai...
harrypotter.txt,harry potter is a series of fantasy novels wri...
magicorder.txt,the magic order is a comic book series written...


In [7]:
# Build a document-term matrix with CountVectorizer, excluding common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.text)
print(data_cv.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. tolist() prints it as a plain list.
print(cv.get_feature_names_out().tolist())

[[1 0 1 0 1 2 0 2 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0
  1 0 1 1 0 0 0 0 0 0 1 2 0 0 0 5 1 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 1 0 0 1
  1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 1 1
  1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1
  0 1 0 0 1 1 2 1 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0
  0 1 1 0 0 1 2 0 0 1 1 0 0 2 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0
  0 1 0 1 1 3 1 1 1 1]
 [0 1 0 1 0 0 0 0 0 0 0 2 1 0 0 0 0 0 1 0 5 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 2 3 0
  0 0 0 0 2 0 0 1 1 0 0 0 1 0 0 1 2 1 0 0 1 0 0 1 4 0 1 0 0 0 0 0 0 0 0 0
  0 0 2 0 0 0 0 0 1 0]]
['achieving', 'acquired', 'actions', 'adult', 'agents', 'ai', 'arc', 'artificial', 'associate', 'author', 'body', 'book', 'bought', 'british', 'called', 'chance', 'chronicle', 'cognitive', 'coipel', 'colloquially', 'comic', 'comics', 'computer', 'computers', 'conc

In [8]:



# Convert to a document-term DataFrame: one row per document, one column per vocabulary term

data_dtm = pd.DataFrame(
    data_cv.toarray(),
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    columns=cv.get_feature_names_out(),
    index=data_clean.index,
)
data_dtm

Unnamed: 0,achieving,acquired,actions,adult,agents,ai,arc,artificial,associate,author,...,used,voldemort,volume,weasley,witchcraft,wizard,wizardry,wizards,written,young
ai.txt,1,0,1,0,1,2,0,2,1,0,...,1,0,0,0,0,0,0,0,0,0
harrypotter.txt,0,0,0,0,0,0,1,0,0,1,...,0,1,0,1,1,3,1,1,1,1
magicorder.txt,0,1,0,1,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,1,0


# NMF Proper :)

In [9]:
from sklearn.decomposition import NMF

# random_state fixes the seed NMF uses for any random initialisation, so that
# re-running the notebook yields the same factorization.
# NOTE(review): 4 components are requested for what appears to be a 3-document
# corpus — confirm that this topic count is intended.
infiler = NMF(n_components=4, solver="mu", random_state=42)
W = infiler.fit_transform(data_dtm)  # document x topic
H = infiler.components_  # topic x word



In [10]:
# Print the highest-weighted words of every topic.
# Each row of H holds one topic's weight for every vocabulary word, so pairing
# the vocabulary with a row and sorting by weight (descending) ranks the words
# by how strongly they belong to that topic.
N_TOP_WORDS = 7

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
words = cv.get_feature_names_out().tolist()
for i, comp in enumerate(H):
    ranked = sorted(zip(words, comp), key=lambda pair: pair[1], reverse=True)
    top_words = [word for word, _weight in ranked[:N_TOP_WORDS]]
    # One line per topic. The former Python-2 style `print(...),` trailing
    # commas have no effect in Python 3 and put every word on its own line.
    print("Topic {}: {}".format(i, " ".join(top_words)))

Topic 0: 
series
magic
written
comic
fantasy
millarworld
netflix
 
Topic 1: 
comic
series
millarworld
book
millar
volume
published
 
Topic 2: 
wizard
harry
potter
novels
arc
subjugate
hermione
 
Topic 3: 
intelligence
ai
artificial
humans
machines
achieving
actions
 


In [11]:
# List the topic indices (one per row of H)
for i in range(len(H)):
    print(i)

0
1
2
3


In [12]:
# For every topic, print the weight of each vocabulary word
for topic_idx, weights in enumerate(H):
    print("Topic {}\n".format(topic_idx))
    for word, weight in zip(words, weights):
        print("{} -> {}".format(word, weight))
    print("\n")
    

Topic 0

achieving -> 0.0
acquired -> 0.034124841868053125
actions -> 0.0
adult -> 0.018221638574070907
agents -> 0.0
ai -> 0.0
arc -> 0.0005594271253144123
artificial -> 0.0
associate -> 0.0
author -> 0.01160122095254283
body -> 0.012590544832774995
book -> 0.05394934316235199
bought -> 0.03550522519729466
british -> 0.010652008650384091
called -> 0.0
chance -> 0.0
chronicle -> 0.012789127282810848
cognitive -> 0.0
coipel -> 0.03501554837513109
colloquially -> 0.0
comic -> 0.17630913509414323
comics -> 0.03318059702458815
computer -> 0.0
computers -> 0.0
concerns -> 0.012400438634817063
confirmed -> 0.027660335175419562
consist -> 0.03324929405960857
contrast -> 0.0
dark -> 0.012744550836049327
define -> 0.0
demonstrated -> 0.0
development -> 0.007512953936645481
device -> 0.0
displayed -> 0.0
environment -> 0.0
fantasy -> 0.12150246181830819
field -> 0.0
friends -> 0.01284183283550316
functions -> 0.0
goals -> 0.0
governing -> 0.012242892468049701
granger -> 0.011719730789257258
harr

In [13]:
# Document-topic weight matrix returned by fit_transform (rows: documents, columns: topics)
W

array([[0.00000000e+00, 0.00000000e+00, 1.47716317e-72, 8.69726730e-01],
       [1.23319714e-01, 1.63581576e-15, 1.54039447e+00, 0.00000000e+00],
       [2.41873874e-01, 1.11566718e+00, 9.51105189e-16, 8.53081794e-84]])

In [14]:
# Same topic indices, this time driven by the number of rows in H
for i in range(H.shape[0]):
    print(i)

0
1
2
3


In [15]:
# W holds the document-to-topic weights: one row per document, one value per topic
for doc_idx, topic_weights in enumerate(W):
    print("document {}\n".format(doc_idx))
    for topic_idx, weight in enumerate(topic_weights):
        print("topic {}: {}".format(topic_idx, weight))
    print("\n")

document 0

topic 0: 0.0
topic 1: 0.0
topic 2: 1.4771631656180612e-72
topic 3: 0.8697267298076584


document 1

topic 0: 0.12331971394952676
topic 1: 1.635815757497484e-15
topic 2: 1.5403944685623732
topic 3: 0.0


document 2

topic 0: 0.24187387425672377
topic 1: 1.1156671760702406
topic 2: 9.511051889459387e-16
topic 3: 8.530817939476471e-84




References
https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df
https://datascience.stackexchange.com/questions/10299/what-is-a-good-explanation-of-non-negative-matrix-factorization
https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/
