In [None]:
import numpy as np
import math


def normalize(input_matrix):
    """
    Normalizes the rows of a 2d input_matrix so they sum to 1
    """

    row_sums = input_matrix.sum(axis=1)
    try:
        assert (np.count_nonzero(row_sums)==np.shape(row_sums)[0]) # no row should sum to zero
    except Exception:
        raise Exception("Error while normalizing. Row(s) sum to zero")
    new_matrix = input_matrix / row_sums[:, np.newaxis]
    return new_matrix

       
class Corpus(object):

    """
    A collection of documents.
    """

    def __init__(self, documents_path):
        """
        Initialize empty document list.
        """
        self.documents = []
        self.vocabulary = []
        self.likelihoods = []
        self.documents_path = documents_path
        self.term_doc_matrix = None 
        self.document_topic_prob = None  # P(z | d) - INITIALIZE TO RANDOM
        self.topic_word_prob = None  # P(w | z) - INITiALIZE TO RANDOM
        self.topic_prob = None  # P(z | d, w) - NORMALIZED document_topic_prob * topic_word_prob for each doc, each word 

        self.number_of_documents = 0
        self.vocabulary_size = 0


    def build_vocabulary(self):
        bad_chars = [';', ':', '!', '*', '"', ',', '.', '?', '-', '_', '@', '[', ']', '(', ')', '{', '}', '/', ',', "'"]
        """
        Construct a list of unique words in the whole corpus. Put it in self.vocabulary
        for example: ["rain", "the", ...]

        Update self.vocabulary_size
        """
        stopwords = dict()
        with open("stopwords.txt") as swf:
            for line in swf:
                if not stopwords.get(line):
                    stopwords[line] = 1
        print(len(stopwords))
        vocab = dict()
        docnumber = 0
        self.documents.clear()

        print(self.documents_path)
        datadate = "2000.11.05"
        count = 0
        for subdir, dirs, files in os.walk(self.documents_path):
            for filename in files:
                filepath = subdir + os.sep + filename
                #print (filepath)        
                if (filepath.find(datadate) != -1)  :
                    count = count + 1
                    with open(filepath) as f:
                        for line in f:
                            line.strip(",;.?!-:@[](){}_*/'")
                            words = line.split()
                            self.documents.append([])
                            for word in words:
                                word = word.lower()
                                word = ''.join((filter(lambda i: i not in bad_chars, word)))
                                if not stopwords.get(word):
                                    self.documents[docnumber].append(word)
                                    if vocab.get(word):
                                        vocab[word] += 1
                                    else:
                                        vocab[word] = 1
                            docnumber = docnumber + 1
        self.number_of_documents = len(self.documents)
        self.vocabulary = list(vocab.keys())
        #log = open("vocab.csv", "w")
        #print(self.vocabulary, file = log) 
        #print(self.vocabulary) 
        self.vocabulary_size = len(self.vocabulary)
        print(count)     
        
        """
        
        with open(self.documents_path, encoding='utf8') as f:
            for line in f:
                line.strip()
                words = line.split()
                self.documents.append([])
                for word in words:
                    if word != "0" and word != "1": # ignore the first word, it is the 0 or 1
                        self.documents[docnumber].append(word)
                        if vocab.get(word):
                            vocab[word] += 1
                        else:
                            vocab[word] = 1
                docnumber = docnumber + 1
            self.number_of_documents = len(self.documents)
            print(self.number_of_documents)
            self.vocabulary = list(vocab.keys())
            self.vocabulary_size = len(self.vocabulary)
        """

    def build_term_doc_matrix(self):
        """
        Construct the term-document matrix where each row represents a document, 
        and each column represents a vocabulary term.

        self.term_doc_matrix[i][j] is the count of term j in document i
        """
        doccount = 0
        mymatrix = []
        for document in self.documents:
            # initialize the variables for this doc
            mymatrix.append([])
            # count the words for this doc
            vocab = dict()
            for word in document:
                if vocab.get(word):
                    vocab[word] += 1
                else:
                    vocab[word] = 1
            wordcount = 0
            for uniqueword in self.vocabulary:
                if vocab.get(uniqueword):
                    mymatrix[doccount].append(vocab.get(uniqueword))
                else:
                    mymatrix[doccount].append(0)
            
            doccount = doccount + 1
        self.term_doc_matrix = mymatrix

        
        #pass    # REMOVE THIS

    def initialize_randomly(self, number_of_topics):
        """
        Randomly initialize the matrices: document_topic_prob and topic_word_prob
        which hold the probability distributions for P(z | d) and P(w | z): self.document_topic_prob, and self.topic_word_prob

        Don't forget to normalize! 
        HINT: you will find numpy's random matrix useful [https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.random.html]
        #np.random.random_sample((3, 2)) 
        """
        np.random.RandomState()
        self.document_topic_prob = np.random.random_sample((self.number_of_documents, number_of_topics))
        #print(self.document_topic_prob)
        self.document_topic_prob = normalize(self.document_topic_prob)
        #print(self.document_topic_prob)
        #print(self.document_topic_prob)

        self.topic_word_prob = np.random.random_sample((number_of_topics, len(self.vocabulary)))
        self.topic_word_prob = normalize(self.topic_word_prob)
        #print(self.topic_word_prob)

        #pass    # REMOVE THIS
        

    def initialize_uniformly(self, number_of_topics):
        """
        Initializes the matrices: self.document_topic_prob and self.topic_word_prob with a uniform 
        probability distribution. This is used for testing purposes.

        DO NOT CHANGE THIS FUNCTION
        """
        self.document_topic_prob = np.ones((self.number_of_documents, number_of_topics))
        self.document_topic_prob = normalize(self.document_topic_prob)

        self.topic_word_prob = np.ones((number_of_topics, len(self.vocabulary)))
        self.topic_word_prob = normalize(self.topic_word_prob)

    def initialize(self, number_of_topics, random=False):
        """ Call the functions to initialize the matrices document_topic_prob and topic_word_prob
        """
        #print("Initializing...")

        if random:
            self.initialize_randomly(number_of_topics)
        else:
            self.initialize_uniformly(number_of_topics)


    def iterate(self, number_of_topics):
        #print("E step:")
        
        #self.topic_prob = np.ones((self.number_of_documents, number_of_topics, self.vocabulary_size))
        for docindex in range(0, self.number_of_documents):
            for wordindex in range(0, self.vocabulary_size):
                mysum = 0
                #print(self.topic_prob[docindex,topicindex])
                for topicindex in range(0, number_of_topics):
                    self.topic_prob[docindex][topicindex][wordindex] = self.topic_word_prob[topicindex, wordindex] * self.document_topic_prob[docindex, topicindex]
                    mysum += self.topic_prob[docindex][topicindex][wordindex]
                for topicindex in range(0, number_of_topics):
                    self.topic_prob[docindex,topicindex,wordindex] = self.topic_prob[docindex,topicindex,wordindex] / mysum

        #print("M step:")
        for docindex in range(0, self.number_of_documents):
            for topicindex in range(0, number_of_topics):
                mysum = 0
                for wordindex in range(0, self.vocabulary_size):
                    mysum += self.topic_prob[docindex,topicindex,wordindex] * self.term_doc_matrix[docindex][wordindex]
                self.document_topic_prob[docindex][topicindex] = mysum
        self.document_topic_prob = normalize(self.document_topic_prob)
        #print(self.document_topic_prob)
            
        # update P(z | d) self.document_topic_prob
        for topicindex in range(0, number_of_topics):
            for wordindex in range(0, self.vocabulary_size):
                mysum = 0
                for docindex in range(0, self.number_of_documents):
                    mysum += self.topic_prob[docindex,topicindex,wordindex] * self.term_doc_matrix[docindex][wordindex]
                self.topic_word_prob[topicindex][wordindex] = mysum
        self.topic_word_prob = normalize(self.topic_word_prob)
        #print(self.topic_word_prob)

        


    def calculate_likelihood(self, number_of_topics):
        """ Calculate the current log-likelihood of the model using
        the model's updated probability matrices
        
        Append the calculated log-likelihood to self.likelihoods

        Likelihood:
        For each doc sum:
        C(w,d) * log (sum(Prob of that topic * prob of that word in topic)
        
        loop over docs (variable in self) - docnumber
            loop over words (variable in self) - wordnumber
                multiply Prob of that topic * prob of that word in topic
        log of this
                
       """
        newlikely = 0
        #print(self.document_topic_prob)
        #print(self.topic_word_prob)
        for docindex in range(0, self.number_of_documents):
            docsum = 0
            for wordindex in range(0, self.vocabulary_size):
                mysum = 0
                for topicindex in range(0, number_of_topics):
                    mysum += self.document_topic_prob[docindex][topicindex] * self.topic_word_prob[topicindex][wordindex]
                mysum = math.log10(mysum)
                mysum = mysum * self.term_doc_matrix[docindex][wordindex]
                docsum += mysum
            #print(docsum)
            newlikely += docsum
        self.likelihoods.append(newlikely)
        


    def plsa(self, number_of_topics, max_iter, epsilon):

        """
        Model topics.
        """
        print ("EM iteration begins...")
        
        # build term-doc matrix
        self.build_term_doc_matrix()
        
        # Create the counter arrays.
        
        # P(z | d, w)
        self.topic_prob = np.zeros([self.number_of_documents, number_of_topics, self.vocabulary_size], dtype=np.float)

        # P(z | d) P(w | z)
        self.initialize(number_of_topics, random=True)

        # Run the EM algorithm
        self.calculate_likelihood(number_of_topics)

        current_likelihood = self.likelihoods[-1]

        for iteration in range(0, max_iter):
            print("Iteration #" + str(iteration + 1) + "...")
            self.iterate(number_of_topics)
            self.calculate_likelihood(number_of_topics)
            
            new_likelihood = self.likelihoods[-1]
            print(current_likelihood)
            print(new_likelihood)
            #return
            newepsilon = abs(new_likelihood - current_likelihood)
            #print(newepsilon)
            if (newepsilon <= epsilon):
                #return
                print("Iteration #" + str(iteration + 1) + "...")
                print("Converge")
                break
            current_likelihood = new_likelihood

       

def main():
    #documents_path = 'data/test.txt' 
    documents_path = "C:\\programs\\CS410_Pres_raw"
    
    #documents_path = 'data/DBLP2.txt'
    #documents_path = 'data/test5.txt'
    corpus = Corpus(documents_path)  # instantiate corpus
    corpus.build_vocabulary()
    #return
    #print(corpus.vocabulary)
    print("Vocabulary size:" + str(len(corpus.vocabulary)))
    print("Number of documents:" + str(len(corpus.documents)))
    #corpus.build_term_doc_matrix()  # testing only REMOVE
    number_of_topics = 2
    #max_iterations = 500
    max_iterations = 500
    epsilon = 0.001
    corpus.plsa(number_of_topics, max_iterations, epsilon)
    print (corpus.document_topic_prob)
    print (corpus.topic_word_prob)


if __name__ == '__main__':
    main()


442
C:\programs\CS410_Pres_raw
402
Vocabulary size:4529
Number of documents:402
EM iteration begins...
Iteration #1...
-80934.80831038229
-62778.4261389351
Iteration #2...
-62778.4261389351
-62609.07987376448
Iteration #3...
-62609.07987376448
-62463.21926458114
Iteration #4...
-62463.21926458114
-62316.30274538726
Iteration #5...
-62316.30274538726
-62174.82464710215
Iteration #6...
-62174.82464710215
-62049.20093570307
Iteration #7...
-62049.20093570307
-61944.64427717762
Iteration #8...
-61944.64427717762
-61860.135137253
Iteration #9...
-61860.135137253
-61791.08147854571
Iteration #10...
-61791.08147854571
-61733.29813397204
Iteration #11...
-61733.29813397204
-61684.30681785084
Iteration #12...
-61684.30681785084
-61642.53926257291
Iteration #13...
-61642.53926257291
-61607.40860448986
Iteration #14...
-61607.40860448986
-61578.66692031708
Iteration #15...
-61578.66692031708
-61555.623574401936
Iteration #16...
-61555.623574401936
-61537.039403162096
Iteration #17...
-61537.03940

-61342.284744305834
-61342.24887893736
Iteration #148...
-61342.24887893736
-61342.19879184287
Iteration #149...
-61342.19879184287
-61342.107366460776
Iteration #150...
-61342.107366460776
-61341.92406399444
Iteration #151...
-61341.92406399444
-61341.634928859065
Iteration #152...
-61341.634928859065
-61341.35561360549
Iteration #153...
-61341.35561360549
-61341.15748431979
Iteration #154...
-61341.15748431979
-61340.982721193395
Iteration #155...
-61340.982721193395
-61340.80832825446
Iteration #156...
-61340.80832825446
-61340.65723277653
Iteration #157...
-61340.65723277653
-61340.54545128968
Iteration #158...
-61340.54545128968
-61340.46660382794
Iteration #159...
-61340.46660382794
-61340.407678821146
Iteration #160...
-61340.407678821146
-61340.35992235823
Iteration #161...
-61340.35992235823
-61340.31903096357
Iteration #162...
-61340.31903096357
-61340.282960882854
Iteration #163...
-61340.282960882854
-61340.25062798533
Iteration #164...
-61340.25062798533
-61340.22138473034

In [1215]:
import xml.etree.ElementTree as ET 
import os

class CleanData(object):

    """
    A collection of documents.
    """

    def __init__(self, documents_path):
        """
        Initialize empty document list.
        """
        self.documents_path = documents_path
        
    def cleanxml(self):
        print(self.documents_path)
        count = 0;
        for subdir, dirs, files in os.walk(self.documents_path):
            for filename in files:
                filepath = subdir + os.sep + filename
                #print (filepath)        
                day = 0
                month = 0
                year = 0
                docs = []
                # create element tree object 
                tree = ET.parse(filepath) 
                # get root element 
                root = tree.getroot() 
                head = tree.find('head')
                meta = head.findall('meta')
                for metadata in meta:
                    if metadata.attrib['name'] == 'publication_day_of_month' :
                        day = metadata.attrib['content']
                    if metadata.attrib['name'] == 'publication_month' :
                        month = metadata.attrib['content']
                    if metadata.attrib['name'] == 'publication_year' :
                        year = metadata.attrib['content']
                #print(year, month, day) 

                body = root.find('body')
                content = body.find('body.content')
                for block in content:
                    if block.attrib['class'] == 'full_text' :
                        for para in block :
                            if (para.text.find('Gore') != -1) or (para.text.find('Bush') != -1) :
                                mystring = 'Pres\\' + str(year) + "."  + str(month).zfill(2) + "." + str(day).zfill(2) + "." + str(count).zfill(8) + '.txt'
                                f = open(mystring, "w")
                                f.write(para.text)
                                f.close()                                
                                count = count + 1
                                #docs.append(para.text)
                            #print(para.text)
        print(count)


def main():
    #documents_path = "C:\\programs\\CS410_data\\2000\\07\\01"
    documents_path = "C:\\programs\\CS410_data"
    #documents_path = '1211543.xml' 
    cleandata = CleanData(documents_path)  # instantiate cleandata
    cleandata.cleanxml()

#from IPython.display import Javascript
#Javascript("Jupyter.notebook.execute_cells([0])")
if __name__ == '__main__':
    main()

C:\programs\CS410_data


FileNotFoundError: [Errno 2] No such file or directory: 'Pres\\2000.07.01.00000000.txt'

In [1098]:
from IPython.display import Javascript
Javascript("Jupyter.notebook.execute_cells([0])")

<IPython.core.display.Javascript object>