In [70]:
import numpy as np
import string
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords


In [71]:
# from newspaper import Article
# # get article
# url1 = "http://www.nytimes.com/2016/05/05/technology/moores-law-running-out-of-room-tech-looks-for-a-successor.html?smid=pl-share"
# article1 = Article(url1)
# article1.download()
# article1.parse()
# raw1 = article1.text.lower()
# raw1 = raw1.encode("utf-8")
# raw1.replace(".", "")
# raw1.replace(",", "")

In [81]:
class CooccurrenceMatrix:
    """
    corpus        --  text file, declared when object created
    data          --  string
    sents         --  list of strings, all the sentences in text file (with punctiation)
    tokens        --  list of strings, all tokens in text file (with repetitions)
    unique_tokens --  list of strings, all unique tokens
    num_tokens    --  int, number of unique tokens in text
    fdist         --  fdist object
    cooc_matrix   --  2D list, initialized to all 0's in prepareCorpus
    word_indices  --  dictionary (key=word, value=index in cooc_matrix)
    """
    def __init__(self, c):
        self.corpus = c 
        self.data = ""
        self.sents = []
        self.tokens = []
        self.unique_tokens = []
        self.num_tokens = 0
        self.fdist = None
        self.cooc_matrix = None
        self.word_indices = {}
        
    def prepareCorpus(self):
        """Given a text file, removes punctuation, returns the list of all tokens, list of unique tokens, 
        list of sentences, the number of unique tokens, and the frequency distribution. 
        Also initializies cooc_matrix to all 0's with size = num_tokens"""
        with open(self.corpus, 'r') as myfile:
            self.data = myfile.read().replace('\n', '')
        self.data = self.data.lower()
        data_no_stop = [i for i in self.data.split() if i not in stopwords.words('english')]
        self.data = " ".join(data_no_stop)
        self.sents = sent_tokenize(self.data)
        self.data = self.removePunctuation(self.data)
        self.tokens = word_tokenize(self.data)
        self.unique_tokens = set(self.tokens)
        self.num_tokens = len(self.unique_tokens)
        self.fdist = FreqDist(self.tokens)
        self.cooc_matrix = np.zeros((self.num_tokens, self.num_tokens))
        return

    def fillMatrix(self):
        """Fills the cooccurrence matrix"""
        self.findIndices()
        for s in self.sents:
            s = self.removePunctuation(s)
            words = word_tokenize(s)
            self.examineSentence(words)
        print "Cooccurrence Matrix:\n", self.cooc_matrix 
        return 
    
    def examineSentence(self, sentence):
        """Given a sentence (list of words), updates cooccurrence matrix with number of cooccurrences within sentence
        Ignores cooccurrences of word with itself"""
        for word1 in sentence:
            for word2 in sentence:
                index1 = self.word_indices[word1]
                index2 = self.word_indices[word2]
                if word1 != word2:
                    self.cooc_matrix[index1][index2] += 1
        return
    
    def findIndices(self):
        """Sets word_indices, where each key is a unique token and each value is a index in the cooc_matrix"""
        counter = 0
        for t in self.unique_tokens:
            self.word_indices[t] = counter
            counter += 1
        return

    def removePunctuation(self, text):
        """Given a piece of text, removes all punctuation (replacing periods, semicolons, colons with spaces)"""
        text = text.replace(".", " ")
        text = text.replace(";", " ")
        text = text.replace(":", ".")
        text = text.replace(",", "")
        exclude = set(string.punctuation)
        text = ''.join(ch for ch in text if ch not in exclude)
        return text
    
    def findCooccurrences(self, target_word):
        """returns the cooccurrences of a given word"""
        if target_word not in self.word_indices[target_word]:
            return "Target word not in corpus. Try another word."
        index_target_word = self.word_indices[target_word]
        max_index = 0
        for col in range(self.num_tokens):
            if self.cooc_matrix[index_target_word][col] > self.cooc_matrix[index_target_word][max_index]:
                max_index = col
        top_cooccurrence = (list(self.word_indices.keys())[list(self.word_indices.value()).index(max_index)])
        return top_cooccurrence

In [84]:
# Testing on obama.txt
obama = CooccurrenceMatrix('obama.txt')
obama.prepareCorpus()
obama.fillMatrix()
print obama.cooc_matrix.shape #dimensions of matrix
print obama.tokens


Cooccurrence Matrix:
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(1217, 1217)
['fellow', 'citizensi', 'stand', 'today', 'humbled', 'task', 'us', 'grateful', 'trust', 'bestowed', 'mindful', 'sacrifices', 'borne', 'ancestors', 'thank', 'president', 'bush', 'service', 'nation', 'well', 'generosity', 'cooperation', 'shown', 'throughout', 'transition', 'fortyfour', 'americans', 'taken', 'presidential', 'oath', 'words', 'spoken', 'rising', 'tides', 'prosperity', 'still', 'waters', 'peace', 'yet', 'every', 'often', 'oath', 'taken', 'amidst', 'gathering', 'clouds', 'raging', 'storms', 'moments', 'america', 'carried', 'simply', 'skill', 'vision', 'high', 'office', 'people', 'remained', 'faithful', 'ideals', 'forbearers', 'true', 'founding', 'documents', 'so', 'been', 'must', 'generation', 'americans', 'that', 'midst', 'crisis', 'well', 'unde

In [80]:
# TO DO:
# input word in system, look up word in matrix and return (n)-most cooccurring words
# add a bigger corpus, or put together lots of obama's speeches together 
# what words to obama use similarly together (maybe use other political figures) 
# for this word obama uses like this, this other politician uses it like this

In [68]:
#OBAMA TEST: for given word ('america' here), return top cooccurrence
obama.word_indices['america'] #604

max_index = 0
# for row in range(obama.num_tokens):
#     for col in range(obama.num_tokens):
        
for col in range(obama.num_tokens):
    if obama.cooc_matrix[604][col] > obama.cooc_matrix[604][max_index]:
        max_index = col
# max_index is 244

top_cooccurrence = (list(obama.word_indices.keys())[list(obama.word_indices.values()).index(max_index)])
print top_cooccurrence

advancing


In [69]:
#TRUMP TEST
trump = CooccurrenceMatrix('trump.txt')
trump.prepareCorpus()
trump.fillMatrix()
print trump.cooc_matrix.shape #dimensions of matrix

#for given word ('america' here), return top cooccurrence
trump.word_indices['america'] #498

max_index = 0
for col in range(trump.num_tokens):
    if trump.cooc_matrix[498][col] > trump.cooc_matrix[498][max_index]:
        max_index = col
# max_index is 241

top_cooccurrence = (list(trump.word_indices.keys())[list(trump.word_indices.values()).index(max_index)]) 
print top_cooccurrence

Cooccurrence Matrix:
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(1046, 1046)
president
