In [420]:
import numpy as np
import string
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize

In [421]:
# from newspaper import Article
# # get article
# url1 = "http://www.nytimes.com/2016/05/05/technology/moores-law-running-out-of-room-tech-looks-for-a-successor.html?smid=pl-share"
# article1 = Article(url1)
# article1.download()
# article1.parse()
# raw1 = article1.text.lower()
# raw1 = raw1.encode("utf-8")
# raw1.replace(".", "")
# raw1.replace(",", "")

In [431]:
class CooccurrenceMatrix(object):
    """
    Sentence-level word cooccurrence counts for a plain-text corpus.

    corpus        --  path of the text file, declared when object created
    data          --  string (lower-cased corpus text, punctuation removed)
    sents         --  list of strings (sentences, punctuation still present)
    tokens        --  list of strings
    unique_tokens --  sorted list of distinct strings
    num_tokens    --  int (number of distinct tokens)
    fdist         --  fdist object
    word_indices  --  dict mapping token -> row/column index in cooc_array
    cooc_array    --  2D numpy array of shape (num_tokens, num_tokens)
    """
    def __init__(self, corpus):
        """Stores the corpus path and initialises every attribute to an empty value."""
        self.corpus = corpus
        self.data = ""
        self.sents = []
        self.tokens = []
        self.unique_tokens = []
        self.num_tokens = 0
        self.fdist = None
        self.word_indices = {}
        # Empty (0 x 0) until prepareCorpus() knows the vocabulary size.
        self.cooc_array = np.zeros((self.num_tokens, self.num_tokens))

    def prepareCorpus(self):
        """Reads the corpus file and populates data, sents, tokens, unique_tokens, num_tokens,
        fdist, word_indices and a zeroed cooc_array. Returns None."""
        with open(self.corpus, 'r') as myfile:
            # Line breaks become spaces so that words on adjacent lines are not fused together.
            self.data = myfile.read().replace('\n', ' ')
        self.data = self.data.lower()
        # Sentences are split before punctuation is removed, since the splitter relies on it.
        self.sents = sent_tokenize(self.data)
        self.data = self.removePunctiation(self.data)
        self.tokens = word_tokenize(self.data)
        # Sorted so that the token -> index mapping is the same on every run.
        self.unique_tokens = sorted(set(self.tokens))
        self.num_tokens = len(self.unique_tokens)
        self.fdist = FreqDist(self.tokens)
        self._resetMatrix()
        return

    def _resetMatrix(self):
        """Rebuilds word_indices from unique_tokens and replaces cooc_array with zeros of matching size."""
        self.word_indices = dict((token, position) for position, token in enumerate(self.unique_tokens))
        size = len(self.word_indices)
        self.cooc_array = np.zeros((size, size))

    @staticmethod
    def removePunctiation(text):
        """Given a piece of text, removes all punctuation (replacing periods, semicolons, colons with spaces).

        Commas and every other character in string.punctuation are deleted outright.
        Returns the cleaned string.
        """
        # Replace the separators first; stripping first would delete them before
        # they could be turned into spaces.
        for separator in ".;:":
            text = text.replace(separator, " ")
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def examineSentence(self, sent):
        """Given a sentence (list of words), updates cooccurrence array with number of cooccurrences within sentence.

        Every ordered pair of different words in the sentence adds 1 to
        cooc_array[index of first word, index of second word]; repeated words
        are counted once per occurrence. Raises KeyError if a word is not in
        word_indices.
        """
        for word1 in sent:
            index1 = self.word_indices[word1]
            for word2 in sent:
                #ignore cooccurrences with self
                if word1 != word2:
                    self.cooc_array[index1, self.word_indices[word2]] += 1
        return

    def fillCooccurrences(self):
        """Zeroes cooc_array, then counts cooccurrences for every sentence in sents.

        Call after prepareCorpus(). Each sentence is cleaned with removePunctiation
        and tokenized before being passed to examineSentence. Returns cooc_array.
        """
        # Start from zeros so that calling this twice does not double the counts.
        self._resetMatrix()
        for sentence in self.sents:
            self.examineSentence(word_tokenize(self.removePunctiation(sentence)))
        return self.cooc_array

In [436]:
# Testing on obama.txt
# Build the object for the corpus file, then call prepareCorpus(), which reads the
# file and fills in the object's attributes; the call itself returns nothing.
# NOTE(review): the set printed below this cell is not produced by these two
# statements -- presumably left over from an earlier version of the cell; confirm.
obama = CooccurrenceMatrix('obama.txt')
obama.prepareCorpus()


{'a',
 'abandoned',
 'ability',
 'accept',
 'account',
 'achieve',
 'across',
 'act',
 'action',
 'advance',
 'advancing',
 'adversaries',
 'afford',
 'afghanistan',
 'again',
 'against',
 'age',
 'ages',
 'ago',
 'aims',
 'alarmed',
 'all',
 'allfor',
 'alliances',
 'alone',
 'alongside',
 'already',
 'also',
 'always',
 'ambitions',
 'america',
 'americafor',
 'american',
 'americans',
 'americansthat',
 'americas',
 'amidst',
 'an',
 'ancestors',
 'and',
 'answer',
 'apologize',
 'apply',
 'are',
 'arguments',
 'arlington',
 'around',
 'as',
 'aside',
 'ask',
 'assure',
 'at',
 'back',
 'bad',
 'badly',
 'band',
 'based',
 'be',
 'because',
 'been',
 'before',
 'began',
 'begin',
 'believe',
 'beneath',
 'bestowed',
 'better',
 'between',
 'big',
 'bigger',
 'bind',
 'birth',
 'bitter',
 'blame',
 'bless',
 'blood',
 'bodies',
 'bold',
 'borders',
 'born',
 'borne',
 'brave',
 'break',
 'bridges',
 'brings',
 'broken',
 'build',
 'bush',
 'business',
 'businesses',
 'but',
 'by',
 '

In [414]:
# Unpack five values from a module-level prepareCorpus(path) call.
# NOTE(review): no such free function is defined in the visible cells (the class
# method of the same name takes no path and returns None) -- presumably it comes
# from an earlier cell that is no longer in the notebook; confirm before re-running.
tokens, sents, unique_tokens, num_tokens, data  = prepareCorpus('obama.txt')

In [415]:
# make cooccurrence array: square array of zeros, num_tokens rows by num_tokens columns
cooc_array = np.zeros((num_tokens, num_tokens))
# word indices dict: key->word, value->index in array,
# assigned 0, 1, 2, ... in the iteration order of unique_tokens
# NOTE(review): if unique_tokens is a set, that order is arbitrary, so the
# word -> index mapping may differ between runs -- confirm.
word_indices = {}
counter = 0
for t in unique_tokens:
    word_indices[t] = counter
    counter+=1

In [416]:
# fill cooccurrence array: for each sentence, strip punctuation, split it into
# words, and pass the word list to examineSentence
# NOTE(review): removePunctiation and examineSentence are called here as
# module-level functions; presumably examineSentence updates the global
# cooc_array through word_indices -- confirm against the cell that defines them.
for s in sents:
    s = removePunctiation(s)
    words = word_tokenize(s)
    examineSentence(words)

In [417]:
# NOTE(review): fdist is not among the names unpacked from prepareCorpus above, so it
# presumably comes from an earlier cell. Slicing items() also assumes it returns a
# list already ordered by frequency, which depends on the Python/NLTK version in use;
# on current NLTK, fdist.most_common(5) is the documented way to get this -- confirm.
fdist.items()[:5] #5 most frequent items in fdist

[('the', 133), ('and', 114), ('of', 82), ('to', 70), ('our', 66)]

In [419]:
# Show the cooccurrence matrix (numpy abbreviates large arrays with "...").
# The function-call form of print with a single argument gives the same output
# on Python 2 and also runs on Python 3, where the print statement is a syntax error.
print(cooc_array)

[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]]


In [400]:
# TO DO:
# make into class