# Word Co-Occurrence Matrix

## This notebook demonstrates how to build a word co-occurrence matrix

#### Install necessary packages

In [None]:
!pip install sklearn pandas numpy

#### Let us import necessary packages

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [None]:
# Toy corpus: two text passages, one string per passage.
# Adjacent string literals are concatenated by Python, so each entry is a single string.
sentences = [
    (
        'I am not sleeping, I am waking, Would you know what I am making? '
        'I am boiling warm beer with butter,  Will you be my guest for supper?'
    ),
    (
        'Home! home! look at the shoe! Princess! the shoe was made for you! '
        'Prince! prince! take home thy bride, '
        'For she is the true one that sits by thy side!'
    ),
]

## Co-Occurrence Matrix without Stop-Word Removal

In [None]:
# Tokenise each sentence, lower-case every token, and re-join the tokens with
# single spaces so later cells can recover them with str.split(' ').
tokenized_sentence = [
    ' '.join(token.lower() for token in word_tokenize(raw_sentence))
    for raw_sentence in sentences
]

In [None]:
# Display the lower-cased, space-joined token strings (one entry per input sentence)
tokenized_sentence

In [None]:
# Build the vocabulary: each distinct token gets an integer id,
# assigned in order of first appearance in the corpus.

vocabulary = {}

for sentence_text in tokenized_sentence:
    for token in sentence_text.split(' '):
        # setdefault inserts only when the token is new, so existing ids never change
        vocabulary.setdefault(token, len(vocabulary))

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Create an n x n matrix (n = vocabulary size) in which entry [i, j] counts how
# often word j appears within `context_window` positions, on either side, of word i.

cooccur_matrix = np.zeros(( len(vocabulary), len(vocabulary) ))
context_window = 3  # number of neighbours considered on each side of the centre word


# for every sentence
for each_sentence in tokenized_sentence:
    sentence = each_sentence.split(' ')
    # for every word in the sentence
    for word_index in range(len(sentence)):
        # The window is centred on the current word and clipped to the sentence
        # bounds. A separate loop variable is used so `context_window` keeps its
        # meaning as the window size.
        window_start = max(0, word_index - context_window)
        window_end = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_end):
            if context_index == word_index:
                continue  # a word is not counted as its own context
            cooccur_matrix[ vocabulary[sentence[word_index]], vocabulary[sentence[context_index]] ] += 1.0


In [None]:
# Wrap the count matrix in a DataFrame labelled with the vocabulary words on both axes
df = pd.DataFrame(
    cooccur_matrix,
    index=list(vocabulary),
    columns=list(vocabulary),
)
df

In [None]:
# Show the co-occurrence rows for a handful of words of interest
df.loc[
    ['prince', 'princess', 'sleeping', 'waking', 'beer', 'butter']
]

## Co-Occurrence Matrix with Stop-Word and Punctuation Removal

In [None]:
# Imports used for stop-word and punctuation filtering
import string

import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available before it is read below
nltk.download('stopwords')

# Stored as a set so membership checks do not scan a list
english_stop_words = set(stopwords.words('english'))

# The ASCII punctuation characters, as one string
punctuation = string.punctuation

In [None]:
# Tokenise each sentence, lower-case the tokens, and drop stop words and punctuation.
tokenized_sentence = []

for each_sentence in sentences:
    temp_sentence = []
    for word in word_tokenize(each_sentence):
        token = word.lower()  # normalise once instead of on every check
        if token in english_stop_words:
            continue
        # Drop tokens made up solely of punctuation characters. A plain
        # `token in punctuation` is a substring test on a str, so multi-character
        # tokens such as '...' or "''" would be kept as if they were words.
        if all(char in punctuation for char in token):
            continue
        temp_sentence.append(token)
    tokenized_sentence.append( ' '.join(temp_sentence) )

tokenized_sentence

In [None]:
# Rebuild the vocabulary from the current tokenised corpus: each distinct token
# gets an integer id, assigned in order of first appearance.

vocabulary = {}

for sentence_text in tokenized_sentence:
    for token in sentence_text.split(' '):
        # setdefault inserts only when the token is new, so existing ids never change
        vocabulary.setdefault(token, len(vocabulary))

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Create an n x n matrix (n = vocabulary size) in which entry [i, j] counts how
# often word j appears within `context_window` positions, on either side, of word i.

cooccur_matrix = np.zeros(( len(vocabulary), len(vocabulary) ))
context_window = 3  # number of neighbours considered on each side of the centre word


# for every sentence
for each_sentence in tokenized_sentence:
    sentence = each_sentence.split(' ')
    # for every word in the sentence
    for word_index in range(len(sentence)):
        # The window is centred on the current word and clipped to the sentence
        # bounds. A separate loop variable is used so `context_window` keeps its
        # meaning as the window size.
        window_start = max(0, word_index - context_window)
        window_end = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_end):
            if context_index == word_index:
                continue  # a word is not counted as its own context
            cooccur_matrix[ vocabulary[sentence[word_index]], vocabulary[sentence[context_index]] ] += 1.0


In [None]:
# Wrap the count matrix in a DataFrame labelled with the vocabulary words on both axes
df = pd.DataFrame(
    cooccur_matrix,
    index=list(vocabulary),
    columns=list(vocabulary),
)
df

In [None]:
# Show the co-occurrence rows for a handful of words of interest
df.loc[
    ['prince', 'princess', 'sleeping', 'waking', 'beer', 'butter']
]