In [27]:
import numpy as np
import string
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize

In [28]:
# from newspaper import Article
# # get article
# url1 = "http://www.nytimes.com/2016/05/05/technology/moores-law-running-out-of-room-tech-looks-for-a-successor.html?smid=pl-share"
# article1 = Article(url1)
# article1.download()
# article1.parse()
# raw1 = article1.text.lower()
# raw1 = raw1.encode("utf-8")
# raw1.replace(".", "")
# raw1.replace(",", "")

In [29]:
class CooccurrenceMatrix:
    """
    corpus        --  text file, declared when object created
    data          --  string
    sents         --  list of strings, all the sentences in text file (with punctiation)
    tokens        --  list of strings, all tokens in text file (with repetitions)
    unique_tokens --  list of strings, all unique tokens
    num_tokens    --  int, number of unique tokens in text
    fdist         --  fdist object
    cooc_matrix   --  2D list, initialized to all 0's in prepareCorpus
    word_indices  --  dictionary (key=word, value=index in cooc_matrix)
    """
    def __init__(self, c):
        self.corpus = c 
        self.data = ""
        self.sents = []
        self.tokens = []
        self.unique_tokens = []
        self.num_tokens = 0
        self.fdist = None
        self.cooc_matrix = None
        self.word_indices = {}
        
    def prepareCorpus(self):
        """Given a text file, removes punctuation, returns the list of all tokens, list of unique tokens, 
        list of sentences, the number of unique tokens, and the frequency distribution. 
        Also initializies cooc_matrix to all 0's with size = num_tokens"""
        with open(self.corpus, 'r') as myfile:
            self.data = myfile.read().replace('\n', '')
        self.data = self.data.lower()
        self.sents = sent_tokenize(self.data)
        self.data = self.removePunctuation(self.data)
        self.tokens = word_tokenize(self.data)
        self.unique_tokens = set(self.tokens)
        self.num_tokens = len(self.unique_tokens)
        self.fdist = FreqDist(self.tokens)
        self.cooc_matrix = np.zeros((self.num_tokens, self.num_tokens))
        return

    def fillMatrix(self):
        """Fills the cooccurrence matrix"""
        self.findIndices()
        for s in self.sents:
            s = self.removePunctuation(s)
            words = word_tokenize(s)
            self.examineSentence(words)
        print "Cooccurrence Matrix:\n", self.cooc_matrix 
        return 
    
    def examineSentence(self, sentence): #problem in here
        """Given a sentence (list of words), updates cooccurrence matrix with number of cooccurrences within sentence
        Ignores cooccurrences of word with itself"""
        for word1 in sentence:
            for word2 in sentence:
                index1 = self.word_indices[word1]
                index2 = self.word_indices[word2]
                if word1 != word2:
                    self.cooc_matrix[index1][index2] += 1
        return
    
    def findIndices(self):
        """Sets word_indices, where each key is a unique token and each value is a index in the cooc_matrix"""
        counter = 0
        for t in self.unique_tokens:
            self.word_indices[t] = counter
            counter += 1
        return

    def removePunctuation(self, text):
        """Given a piece of text, removes all punctuation (replacing periods, semicolons, colons with spaces)"""
        text = text.replace(".", " ")
        text = text.replace(";", " ")
        text = text.replace(":", ".")
        text = text.replace(",", "")
        exclude = set(string.punctuation)
        text = ''.join(ch for ch in text if ch not in exclude)
        return text

In [35]:
# Testing on obama.txt
obama = CooccurrenceMatrix('obama.txt')
obama.prepareCorpus()
obama.fillMatrix()

# test = "this is a test piece of text. does the punct remove?"
# print obama.removePunctuation(test)

# data
# sents = []
# tokens = []
# unique_tokens = []
# num_tokens
# fdist
# cooc_matrix = np.zeros((num_tokens, num_tokens))
# word_indices


Cooccurrence Matrix:
[[ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  1. ...,  0.  0.  0.]]


893

In [414]:
tokens, sents, unique_tokens, num_tokens, data  = prepareCorpus('obama.txt')

In [439]:
# make cooccurrence array
cooc_matrix = np.zeros((num_tokens, num_tokens))
# word indices dict: key->word, value->index in matrix (0 to m)
word_indices = {}
counter = 0
for t in unique_tokens:
    word_indices[t] = counter
    counter+=1

In [522]:
# fill cooccurrence array
for s in sents:
    s = removePunctiation(s)
    words = word_tokenize(s)
    examineSentence(words)

NameError: name 'removePunctuation' is not defined

In [441]:
fdist.items()[:5] #5 most frequent items in fdist

[('the', 133), ('and', 114), ('of', 82), ('to', 70), ('our', 66)]

In [442]:
print cooc_matrix

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [400]:
# TO DO:
# make into class