In [1]:
# functions to open and read files.
def read_in(path, fn):
    f=open(path + "/" + fn, "r", encoding="utf8")
    contents = f.read()
    f.close()
    return contents

def get_file_list(folder):
    from os import listdir
    return listdir(folder)

In [2]:
# assemble the corpus into a dictionary format
# < filename : text, filename : text, ...}
corpus = {}
for f in get_file_list("sample"):
    corpus[f] = read_in("sample", f)
corpus

{'ai.txt': 'In computer science, artificial intelligence (AI), sometimes called machine intelligence, is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.[1] Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving".[2]',
 'beast.txt': 'Beast (Dr. Henry Philip "Hank" McCoy) is a fictional superhero appearing in American comic books published by Marvel Comics and is a founding member of the X-Men. Originally called "The Beast", the character was introduced as a mutant possessing ape-like superhuman physical strength and agility, oversized hands and feet, a genius-level intellect, and othe

In [3]:
# check the key terms of the dictionary.  In this case, they are the file names
corpus.keys()

dict_keys(['ai.txt', 'harrypotter.txt', 'magicorder.txt'])

In [4]:
# Convert to dataframe

import pandas as pd
#data_df = pd.DataFrame.from_dict(corpus).transpose()
data_df = pd.DataFrame.from_dict(corpus,orient="index")
data_df.columns = ['text']
data_df= data_df.sort_index()
data_df

Unnamed: 0,text
ai.txt,"In computer science, artificial intelligence (..."
harrypotter.txt,Harry Potter is a series of fantasy novels wri...
magicorder.txt,The Magic Order is a comic book series written...


In [5]:
# remove punctuations

import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)


In [6]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.text.apply(round1))
data_clean

Unnamed: 0,text
ai.txt,in computer science artificial intelligence ai...
harrypotter.txt,harry potter is a series of fantasy novels wri...
magicorder.txt,the magic order is a comic book series written...


In [7]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.text)
print(data_cv.toarray())
print(cv.get_feature_names())

[[1 0 1 0 1 2 0 2 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0
  1 0 1 1 0 0 0 0 0 0 1 2 0 0 0 5 1 0 0 0 0 0 1 1 0 0 1 2 0 0 0 0 1 0 0 1
  1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 1 1
  1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1
  0 1 0 0 1 1 2 1 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0
  0 1 1 0 0 1 2 0 0 1 1 0 0 2 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 0 1 0 0 0 0
  0 1 0 1 1 3 1 1 1 1]
 [0 1 0 1 0 0 0 0 0 0 0 2 1 0 0 0 0 0 1 0 5 1 0 0 0 1 1 0 0 0 0 1 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 2 3 0
  0 0 0 0 2 0 0 1 1 0 0 0 1 0 0 1 2 1 0 0 1 0 0 1 4 0 1 0 0 0 0 0 0 0 0 0
  0 0 2 0 0 0 0 0 1 0]]
['achieving', 'acquired', 'actions', 'adult', 'agents', 'ai', 'arc', 'artificial', 'associate', 'author', 'body', 'book', 'bought', 'british', 'called', 'chance', 'chronicle', 'cognitive', 'coipel', 'colloquially', 'comic', 'comics', 'computer', 'computers', 'conc

In [8]:
# Convert to document-term matrix

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,achieving,acquired,actions,adult,agents,ai,arc,artificial,associate,author,...,used,voldemort,volume,weasley,witchcraft,wizard,wizardry,wizards,written,young
ai.txt,1,0,1,0,1,2,0,2,1,0,...,1,0,0,0,0,0,0,0,0,0
harrypotter.txt,0,0,0,0,0,0,1,0,0,1,...,0,1,0,1,1,3,1,1,1,1
magicorder.txt,0,1,0,1,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,1,0


# FINALLY, PCA!

In [1]:
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA

cca = CCA(n_components=2)
cca.fit(data_cv.toarray(), cv.get_feature_names())

#to use PCA for dimensionality reduction
pca = PCA(n_components=2)
pca.fit(data_dtm)
t_pca = pca.transform(data_dtm)

NameError: name 'X' is not defined

In [22]:
# this illustrates how PCA reduces dimensions of the data.
print(data_dtm.shape) # we used to have a 3 by 118 matrix of data 
print(t_pca.shape) # we now have a 3 by 2 matrix of components

(3, 118)
(3, 2)


In [25]:
# to see how the components are weighted
print(t_pca)
# my understanding is that each item is one document. 
# ai.txt is negatively weighted on components 0 and 1
# harrypotter.txt is negatively weighted on component 0 but postiively weighted on component 1
# this means harrypotter.txt is better described by component 1
# magicorder.txt is positively weighted on component 0 but negatively weighted in component 1
# does this mean that ai.txt and magicorder.txt differ mainly on component 0?

[[-6.45811751 -3.53764617]
 [-0.30461056  6.61786229]
 [ 6.76272807 -3.08021612]]


In [26]:
# to see how much of the variance is explained by each component
print(pca.explained_variance_)
# the two components explain 75% of the variance

[43.76728013 32.89938654]


References
https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/
https://github.com/adashofdata/nlp-in-python-tutorial/blob/master/1-Data-Cleaning.ipynb
https://gdcoder.com/nlp-tutorial-topic-modeling-with-svd/
https://www.analyticsvidhya.com/blog/2018/10/stepwise-guide-topic-modeling-latent-semantic-analysis/
https://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn
https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html