In [113]:
from sklearn.cross_decomposition import CCA
# Toy data: four samples, three X features and two Y targets per sample.
X = [
    [0., 0., 1.],
    [1., 0., 0.],
    [2., 2., 2.],
    [3., 5., 4.],
]
Y = [
    [0.1, -0.2],
    [0.9, 1.1],
    [6.2, 5.9],
    [11.9, 12.3],
]

# Fit a single canonical component; the bare fit() call is left as the last
# expression so the notebook echoes the fitted estimator.
cca = CCA(n_components=1)
cca.fit(X, Y)

CCA(copy=True, max_iter=500, n_components=1, scale=True, tol=1e-06)
# Project both data sets with the fitted model, then print the X scores
# followed by the Y scores.
X_c, Y_c = cca.transform(X, Y)
for scores in (X_c, Y_c):
    print (scores)

[[-1.3373174 ]
 [-1.10847164]
 [ 0.40763151]
 [ 2.03815753]]
[[-0.85511537]
 [-0.70878547]
 [ 0.26065014]
 [ 1.3032507 ]]


In [36]:
# functions to open and read files.
def read_in(path, fn):
    """Return the entire text content of file ``fn`` located in directory ``path``.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    fn : str
        Name of the file inside ``path``.

    Returns
    -------
    str
        Everything read from the file (opened in default text mode).

    Raises
    ------
    OSError
        Propagated from ``open`` when the file cannot be opened.
    """
    from os.path import join

    # The context manager closes the handle even when read() raises; the
    # previous explicit close() was skipped on any error.
    with open(join(path, fn)) as handle:
        return handle.read()

def get_file_list(folder):
    """Return the names of the entries that ``os.listdir`` reports for ``folder``."""
    import os

    entries = os.listdir(folder)
    return entries

In [37]:
# Assemble the corpus as a dictionary that maps each file name in the
# "sample" folder to the text of that file:
# {filename: text, filename: text, ...}
corpus = {name: read_in("sample", name) for name in get_file_list("sample")}
corpus

{'ai.txt': 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.[1] Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".[2]',
 'harrypotter.txt': "Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The novels chronicle the lives of a young wizard, Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry. The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizar

In [38]:
# Check the keys of the corpus dictionary; in this case they are the file names.
corpus.keys()

dict_keys(['ai.txt', 'harrypotter.txt', 'magicorder.txt'])

In [39]:
# Turn the corpus dictionary into a DataFrame: one row per file (the file name
# is the index), sorted by file name, with the document text in a single
# column called "text".

import pandas as pd

data_df = pd.DataFrame.from_dict(corpus, orient="index").sort_index()
data_df.columns = ["text"]
data_df

Unnamed: 0,text
ai.txt,"In computer science, artificial intelligence (..."
harrypotter.txt,Harry Potter is a series of fantasy novels wri...
magicorder.txt,The Magic Order is a comic book series written...


In [40]:
# remove punctuations

import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Removed spans are deleted, not replaced, so the
        whitespace that surrounded them is left as it was.
    '''
    text = text.lower()
    # Bracketed spans such as "[1]" must go before the punctuation pass,
    # because that pass would strip the brackets and leave their content behind.
    # Raw strings are used for the patterns: '\[' , '\w' and '\d' are invalid
    # escapes in ordinary string literals and trigger warnings on newer Pythons.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop every token that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Alias used with Series.apply; a direct reference replaces the former
# lambda wrapper, which only forwarded its argument.
round1 = clean_text_round1

In [41]:
# Apply the first cleaning round to every document and look at the result.
data_clean = data_df["text"].apply(round1).to_frame()
data_clean

Unnamed: 0,text
ai.txt,in computer science artificial intelligence ai...
harrypotter.txt,harry potter is a series of fantasy novels wri...
magicorder.txt,the magic order is a comic book series written...


In [42]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.text)
print(data_cv.toarray())
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when the installed version has
# it and fall back to the old one otherwise. tolist() keeps the printed value
# a plain Python list in both cases.
if hasattr(cv, "get_feature_names_out"):
    print(cv.get_feature_names_out().tolist())
else:
    print(cv.get_feature_names())

[[1 0 1 0 1 2 0 2 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0
  1 0 1 1 0 0 0 0 0 0 1 2 0 0 0 5 1 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 1 0 0 1
  1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 1 1
  1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1
  0 1 0 0 1 1 2 1 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0
  0 1 1 0 0 1 2 0 0 1 1 0 0 2 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0
  0 1 0 1 1 3 1 1 1 1]
 [0 1 0 1 0 0 0 0 0 0 0 2 1 0 0 0 0 0 1 0 5 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 2 3 0
  0 0 0 0 2 0 0 1 1 0 0 0 1 0 0 1 2 1 0 0 1 0 0 1 4 0 1 0 0 0 0 0 0 0 0 0
  0 0 2 0 0 0 0 0 1 0]]
['achieving', 'acquired', 'actions', 'adult', 'agents', 'ai', 'arc', 'artificial', 'associate', 'author', 'body', 'book', 'bought', 'british', 'called', 'chance', 'chronicle', 'cognitive', 'coipel', 'colloquially', 'comic', 'comics', 'computer', 'computers', 'conc

In [43]:
# Convert the count matrix to a labelled document-term DataFrame:
# rows carry the index of data_clean, columns are the vectorizer's terms.

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only fall back on older installations.
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,achieving,acquired,actions,adult,agents,ai,arc,artificial,associate,author,...,used,voldemort,volume,weasley,witchcraft,wizard,wizardry,wizards,written,young
ai.txt,1,0,1,0,1,2,0,2,1,0,...,1,0,0,0,0,0,0,0,0,0
harrypotter.txt,0,0,0,0,0,0,1,0,0,1,...,0,1,0,1,1,3,1,1,1,1
magicorder.txt,0,1,0,1,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,1,0


In [92]:
from sklearn.decomposition import NMF
# Factorise the document-term matrix into 4 topics with the multiplicative-update solver.
# random_state is pinned so that re-running the notebook reproduces the same
# W and H. NOTE(review): n_components=4 is larger than the three documents
# shown above; on recent scikit-learn versions the default initialisation
# appears to be random in that situation -- confirm against the NMF docs.
infiler = NMF(n_components=4, solver="mu", random_state=0)
W = infiler.fit_transform(data_dtm)  # document x topic
H = infiler.components_  # topic x word

In [65]:
# Show the highest-weighted words of every topic.
# Each row of H holds one weight per vocabulary term; pairing the terms with a
# row and sorting by weight (descending) puts the most characteristic words of
# that topic first.

N_TOP_WORDS = 7  # how many words to list per topic

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installations.
if hasattr(cv, "get_feature_names_out"):
    words = cv.get_feature_names_out().tolist()
else:
    words = cv.get_feature_names()

for i, comp in enumerate(H):
    words_comp = zip(words, comp)
    sorted_words = sorted(words_comp, key=lambda x: x[1], reverse=True)[:N_TOP_WORDS]
    # One line per topic. The former `print(...),` trailing commas were the
    # Python 2 way to suppress the newline; in Python 3 they do nothing, so
    # every word ended up on its own line.
    print("Topic {}: {}".format(i, " ".join(t[0] for t in sorted_words)))

Topic 0: 
intelligence
humans
machines
mind
contrast
problem
chance
 
Topic 1: 
comic
series
millarworld
book
millar
netflix
published
 
Topic 2: 
artificial
ai
mimic
takes
successfully
perceives
learning
 
Topic 3: 
wizard
harry
novels
potter
arc
author
body
 


In [70]:
# Quick check: print the index of each row of H.
for topic_idx in range(len(H)):
    print (topic_idx)

0
1
2
3


In [89]:
# For every row of H, list each vocabulary word next to its weight in that row.
for topic_idx, weights in enumerate(H):
    print ("Topic {}\n".format(topic_idx))
    for word, weight in zip(words, weights):
        print ("{} -> {}".format(word, weight))
    print ("\n")
    

Topic 0

achieving -> 0.7139186821536381
acquired -> 0.0
actions -> 0.9163709922961593
adult -> 0.0
agents -> 0.5037149643158062
ai -> 0.8852525170459598
arc -> 0.0
artificial -> 0.8525598543779437
associate -> 0.650506893535826
author -> 0.0
body -> 0.0
book -> 0.0
bought -> 0.0
british -> 0.0
called -> 0.5589577535761843
chance -> 1.0329763330470012
chronicle -> 0.0
cognitive -> 0.8241197014538879
coipel -> 0.0
colloquially -> 0.81443570347802
comic -> 0.0
comics -> 0.0
computer -> 0.8053412038297215
computers -> 0.39710820868145436
concerns -> 0.0
confirmed -> 0.0
consist -> 0.0
contrast -> 1.1339361713303122
dark -> 0.0
define -> 0.7015653820706429
demonstrated -> 0.8239181427975407
development -> 0.0
device -> 0.6686054473973536
displayed -> 0.6580328434579018
environment -> 0.41217713596580413
fantasy -> 0.0
field -> 0.699929890456111
friends -> 0.0
functions -> 0.9256134839377148
goals -> 0.3965270243271649
governing -> 0.0
granger -> 0.0
harry -> 0.0
harrys -> 0.0
hermione -> 0

In [93]:
# Display W, the document x topic matrix returned by infiler.fit_transform(data_dtm).
W

array([[8.98476709e-01, 0.00000000e+00, 4.94790372e-01, 0.00000000e+00],
       [0.00000000e+00, 1.86449549e-03, 1.30982251e-18, 9.96861546e-01],
       [0.00000000e+00, 1.76878200e+00, 6.24070719e-21, 3.71373760e-04]])

In [95]:
# Quick check: print the index of each row of H.
for topic_idx, _ in enumerate(H):
    print (topic_idx)

0
1
2
3


In [106]:
# W holds the weight of each topic for each document: one row per document,
# one entry per topic within the row.
for doc_idx, topic_weights in enumerate(W):
    print ("document {}\n".format(doc_idx))
    for topic_idx, weight in enumerate(topic_weights):
        print ("topic {}: {}".format(topic_idx, weight))
    print ("\n")

document 0

topic 0: 0.8984767087663442
topic 1: 0.0
topic 2: 0.4947903724014674
topic 3: 0.0


document 1

topic 0: 0.0
topic 1: 0.001864495490067072
topic 2: 1.309822511711639e-18
topic 3: 0.9968615464298617


document 2

topic 0: 0.0
topic 1: 1.7687819960530864
topic 2: 6.2407071906226244e-21
topic 3: 0.0003713737599270476


