In [None]:
import os
import string
import numpy as np
from math import log
from scipy import sparse
from sklearn import datasets
from scipy import linalg as la
from collections import Counter
from matplotlib import pyplot as plt
from scipy.sparse import linalg as spla

%matplotlib inline

In [None]:
# Resolution (dots per inch) applied to every figure drawn in this notebook.
FIGURE_DPI = 300
plt.rcParams["figure.dpi"] = FIGURE_DPI

## Problem 1

Recreate Figure 18.4 by performing PCA on the iris dataset, keeping the first two principal components.

In [None]:
# Load the iris measurements and translate each feature (column) to mean zero.
iris = datasets.load_iris()
X = iris.data
Y = X - X.mean(axis=0)

# Compact SVD of the centered data; the rows of VT are the principal axes.
U, S, VT = la.svd(Y, full_matrices=False)

# Keep the first two principal components: project the centered data onto
# the first two right singular vectors.
Yhat = Y.dot(VT[:2].T)

# Scatter plot of the projected samples, one color per species.
# NOTE(review): the styling of the reference Figure 18.4 is not visible from
# this notebook -- adjust markers/labels to match if needed.
fig, ax = plt.subplots()
for label, name in enumerate(iris.target_names):
    mask = iris.target == label
    ax.scatter(Yhat[mask, 0], Yhat[mask, 1], s=10, label=name)
ax.set_xlabel("First principal component")
ax.set_ylabel("Second principal component")
ax.set_title("Iris data projected onto its first two principal components")
ax.legend()
plt.show()

# Fraction of the total variance captured by each principal component
# (multiply by 100 for percentages).
S**2/(S**2).sum()

## Problem 2

Using the techniques of LSI, applied to the word count matrix $X$, and keeping the first 7 principal components, find the most similar and least similar speeches to both Bill Clinton's 1993 speech and to Ronald Reagan's 1984 speech.

In [None]:
# Get list of filepaths to each text file in the folder.
# os.listdir() returns names in arbitrary order, so the list is sorted: the
# position of a path in `paths` is used as the document (row) index of the
# word-count matrices built below, and that mapping should be the same on
# every run and every machine.
folder = "./Addresses/"
paths = sorted(folder + name for name in os.listdir(folder)
               if name.endswith(".txt"))

# Helper function to get list of words in a string.
def extractWords(text):
    """Split a string into lowercase words with punctuation and digits removed.

    Every ASCII punctuation character and digit is deleted (not replaced by a
    space), so "don't" becomes "dont" and "well-known" becomes "wellknown".
    The remaining text is lowercased and split on whitespace.

    Parameters:
        text (str): the text to tokenize.

    Returns:
        (list of str): the words of `text`, in order of appearance.
    """
    unwanted = set(string.punctuation + string.digits)
    kept_chars = (ch for ch in text.strip() if ch not in unwanted)
    return "".join(kept_chars).lower().split()

# Collect every distinct word that appears in any of the documents.
vocab = set()
for path in paths:
    with open(path, 'r') as infile:
        # Tokenize line by line so a whole file is never held in memory.
        vocab.update(word for line in infile for word in extractWords(line))

In [None]:
# Load the stopwords (one per line), stripped of whitespace and lowercased.
with open("stopwords.txt", 'r') as f:
    stops = {w.strip().lower() for w in f}

# Remove stopwords from the vocabulary and map each remaining word to its
# column index.
#  * set(vocab) works whether `vocab` is still the set built above or already
#    the dict produced by a previous run of this cell (iterating a dict yields
#    its keys), so re-running the cell no longer raises AttributeError.
#  * sorted() fixes the word -> column assignment; iterating a set of strings
#    directly gives an order that can change between kernel sessions.
vocab = {w: i for i, w in enumerate(sorted(set(vocab).difference(stops)))}

In [None]:
# Parallel lists describing the nonzero entries of X in coordinate form:
# entry k says that word `word_index[k]` occurs `counts[k]` times in
# document `doc_index[k]`.
counts = []
doc_index = []
word_index = []

for doc, path in enumerate(paths):
    # Count every word of the document in a single pass over the file.
    with open(path, 'r') as f:
        ctr = Counter(w for line in f for w in extractWords(line))

    # Record only the words that survived stopword removal.
    for word, count in ctr.items():
        if word not in vocab:
            continue
        doc_index.append(doc)
        word_index.append(vocab[word])
        counts.append(count)

# Create sparse matrix holding these word counts (rows are documents, columns
# are vocabulary words).
# The builtin `float` replaces `np.float`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# Both spell the same 64-bit float dtype, so the matrix is unchanged.
X = sparse.csr_matrix((counts, (doc_index, word_index)),
                      shape=(len(paths), len(vocab)), dtype=float)

## Problem 3

Repeat Problem 2 using the matrix $A$.
Do your answers seem more reasonable than before?

In [None]:
# t[j] accumulates the total number of occurrences of word j over all documents.
t = np.zeros(len(vocab))

# Coordinate-form description of the document-term counts: entry k says that
# word `word_index[k]` occurs `counts[k]` times in document `doc_index[k]`.
counts, doc_index, word_index = [], [], []

# get doc-term counts and global term counts
for doc, path in enumerate(paths):
    # Count every word of the document in a single pass over the file.
    with open(path, 'r') as f:
        ctr = Counter(w for line in f for w in extractWords(line))

    # Keep only vocabulary words; update the per-document and global tallies.
    for word, count in ctr.items():
        if word not in vocab:
            continue
        word_ind = vocab[word]
        doc_index.append(doc)
        word_index.append(word_ind)
        counts.append(count)
        t[word_ind] += count

# Global weight of each word j:
#     g[j] = 1 + sum_i  p_ij * log(p_ij + 1) / log(m),
# where p_ij = (count of word j in document i) / t[j] and m = len(paths).
# Only the stored (nonzero) counts are visited; each adds its term to g.
g = np.ones(len(vocab))
logM = log(len(paths))
for cnt, col in zip(counts, word_index):
    frac = cnt / float(t[col])
    g[col] += frac * log(frac + 1) / logM

# Globally weighted counts: each stored count c for word j becomes
# g[j] * log(c + 1), kept in the same order as `counts`.
gwcounts = [g[col] * log(cnt + 1) for cnt, col in zip(counts, word_index)]

# Create sparse matrix holding these globally weighted word counts (rows are
# documents, columns are vocabulary words).
# The builtin `float` replaces `np.float`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# Both spell the same 64-bit float dtype, so the matrix is unchanged.
A = sparse.csr_matrix((gwcounts, (doc_index, word_index)),
                      shape=(len(paths), len(vocab)), dtype=float)
