# Word Co-Occurrence Matrix

## This notebook demonstrates how to build a word co-occurrence matrix and compute positive pointwise mutual information (PPMI)

#### Install necessary packages

In [None]:
!pip install sklearn pandas numpy

#### Import the necessary packages

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import nltk
import math
import string
from nltk.tokenize import word_tokenize

In [None]:
from nltk.corpus import gutenberg
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: the Gutenberg texts, the Punkt tokenizer
# models needed by word_tokenize, and the stop-word lists.
nltk.download('gutenberg')
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from the 'punkt_tab' resource instead of
# 'punkt'; without it word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set for O(1) membership tests while filtering tokens.
english_stop_words = set(stopwords.words('english'))

# String of ASCII punctuation characters.
punctuation = string.punctuation

In [None]:
# Load Shakespeare's "Macbeth" from the NLTK Gutenberg corpus
# (presumably one list of tokens per sentence — confirm against the NLTK corpus reader docs).
# NOTE(review): `corpus` is rebound to a small hand-written example in the next cell,
# so this text is only used for the length displayed below — confirm that is intended.
corpus = gutenberg.sents('shakespeare-macbeth.txt')

# Last expression of the cell, so the notebook displays the length of the loaded corpus.
len(corpus)

In [None]:
# Two short example passages used as a toy corpus.
sentences = [
    ('I am not sleeping, I am waking, Would you know what I am making? '
     'I am boiling warm beer with butter,  Will you be my guest for supper?'),
    ('Home! home! look at the shoe! Princess! the shoe was made for you! '
     'Prince! prince! take home thy bride, For she is the true one that sits by thy side!'),
]

# Represent each passage as a list of space-separated chunks.
corpus = [passage.split(' ') for passage in sentences]

In [None]:
# Normalise every sentence: tokenize, lowercase, then drop stop words and punctuation.
# Each cleaned sentence is stored as one space-joined string.
tokenized_sentence = []

for each_sentence in corpus:
    temp_sentence = []
    for word in word_tokenize(' '.join(each_sentence)):
        token = word.lower()
        if token in english_stop_words:
            continue
        # `punctuation` is a string, so `token in punctuation` is a substring test and
        # lets multi-character punctuation tokens such as '--' or '...' through.
        # Checking character by character drops every punctuation-only token.
        if all(char in punctuation for char in token):
            continue
        temp_sentence.append(token)
    tokenized_sentence.append(' '.join(temp_sentence))

print(tokenized_sentence)

In [None]:
# Create a vocabulary from the corpus: map each distinct word to an integer id,
# assigned in order of first appearance.

vocabulary = {}

for each_sentence in tokenized_sentence:
    # split() with no argument returns [] for an empty sentence, whereas split(' ')
    # returns [''] and would register the empty string as a vocabulary word.
    for each_word in each_sentence.split():
        if each_word not in vocabulary:
            vocabulary[each_word] = len(vocabulary)

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Collect unigram counts and windowed co-occurrence ("bigram") counts.
#
# bigram_stats  : 'word context' -> number of times `context` appears within
#                 `context_window` positions of `word` (ordered pairs, so both
#                 'a b' and 'b a' are recorded)
# unigram_stats : word -> number of occurrences
# count         : total number of (word, context) pairs observed

bigram_stats = {}
unigram_stats = {}
context_window = 2
count = 0

for each_sentence in tokenized_sentence:
    # split() with no argument yields [] for an empty sentence instead of [''].
    sentence = each_sentence.split()
    for word_index, word in enumerate(sentence):
        # Start from 0 so the first occurrence counts once, not twice.
        unigram_stats[word] = unigram_stats.get(word, 0.0) + 1.0

        # The window is centred on the current word and clipped to the sentence.
        window_start = max(0, word_index - context_window)
        window_end = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_end):
            if context_index == word_index:
                continue  # a word is not its own context
            word_pair = word + ' ' + sentence[context_index]
            bigram_stats[word_pair] = bigram_stats.get(word_pair, 0.0) + 1.0
            count += 1


In [None]:
# Positive PMI for every observed word pair:
#   PPMI(w1, w2) = max(log(P(w1, w2) / (P(w1) * P(w2))), 0)
bigram_pmi = {}

# Each distribution is normalised by the total of its own count table so that it
# sums to 1. Dividing unigram counts by the number of co-occurrence pairs would
# understate P(w) and inflate every PMI value.
total_unigrams = sum(unigram_stats.values())
total_bigrams = sum(bigram_stats.values())

for word_pair, pair_count in bigram_stats.items():
    # Keys have the form 'word context'; the words themselves contain no spaces.
    word_1, word_2 = word_pair.split(' ', 1)

    word_1_prob = unigram_stats[word_1] / total_unigrams
    word_2_prob = unigram_stats[word_2] / total_unigrams
    word_1_word_2_prob = pair_count / total_bigrams

    pmi = math.log(word_1_word_2_prob / (word_1_prob * word_2_prob))
    # Clip negative associations to zero (the "positive" in PPMI).
    bigram_pmi[word_pair] = max(pmi, 0.0)

In [None]:
# Rebuild the dict in ascending order of PPMI score (dicts keep insertion order).
bigram_pmi = {
    pair: score
    for pair, score in sorted(bigram_pmi.items(), key=lambda entry: entry[1])
}

In [None]:
# Emit one "pair<TAB>score" line per word pair, in the dict's current order.
for key, score in bigram_pmi.items():
    print(key, score, sep='\t')