# Word Co-Occurrence Matrix

## This notebook demonstrates how to build a word co-occurrence matrix

#### Install necessary packages

In [None]:
!pip install scikit-learn pandas numpy matplotlib

#### Let us import necessary packages

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [None]:
# Toy corpus: four short sentences used to build the co-occurrence matrix.
sentences = [
    "I like dosa",
    "I like idli",
    "I hate spicy food",
    "I hate sweets",
]

## Co-Occurrence Matrix without Stop-word Removal

In [None]:
# Tokenize each sentence, lower-case every token, and re-join the tokens
# with single spaces so later cells can split on ' '.
tokenized_sentence = [
    ' '.join(token.lower() for token in word_tokenize(raw_sentence))
    for raw_sentence in sentences
]

In [None]:
# Display the lower-cased, space-joined sentences built in the previous cell.
tokenized_sentence

In [None]:
# Build the vocabulary: each distinct token gets the next free integer id,
# assigned in order of first appearance in the corpus.

vocabulary = {}

for sentence_text in tokenized_sentence:
    for token in sentence_text.split(' '):
        # setdefault only inserts when the token has not been seen before
        vocabulary.setdefault(token, len(vocabulary))

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Build an n x n co-occurrence matrix, where n is the vocabulary size.
# Entry [i, j] counts how often word j appears within `context_window`
# positions (to the left or right) of word i, summed over all sentences.

cooccur_matrix = np.zeros((len(vocabulary), len(vocabulary)))
context_window = 3  # number of neighbours considered on each side of a word


# for every sentence
for each_sentence in tokenized_sentence:
    sentence = each_sentence.split(' ')
    # for every word in the sentence
    for word_index, center_word in enumerate(sentence):
        # The window is centred on the current word and clipped to the
        # sentence boundaries, so every position (including the first word)
        # gets its neighbours counted.
        window_start = max(0, word_index - context_window)
        window_end = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_end):
            # a word does not co-occur with itself at the same position
            if context_index == word_index:
                continue
            context_word = sentence[context_index]
            cooccur_matrix[vocabulary[center_word], vocabulary[context_word]] += 1.0


In [None]:
# Wrap the count matrix in a DataFrame whose rows and columns are both
# labelled with the vocabulary words (in vocabulary insertion order).
df = pd.DataFrame(cooccur_matrix, index=list(vocabulary), columns=list(vocabulary))
df

## SVD of the Word Co-Occurrence Matrix

In [None]:
from scipy.linalg import svd
from numpy import diag
from numpy import dot
from numpy import array

# Convert the labelled co-occurrence DataFrame into a plain NumPy array
# and print it under a short header.
A = array(df)
print('Matrix A is: \n', A, sep='\n')
# SVD

In [None]:
# Truncated SVD: factorise A, then keep only the k leading components.

# Top k
k = 3

full_U, full_Sigma, full_VT = svd(A)

U = full_U[:, :k]         # first k left singular vectors (one row per word)
Sigma = full_Sigma[:k]    # first k singular values
VT = full_VT[:k, :]       # first k right singular vectors


In [None]:
# One row per vocabulary word, one column per retained SVD component.
# The index is the list of vocabulary words in insertion order.
dimReduced_df = pd.DataFrame(data=U, index=list(vocabulary))
dimReduced_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline


def display_pca_scatterplot(model, words=None):
    """Scatter-plot word vectors projected onto their first two principal components.

    Args:
        model: Object indexable by word (``model[word]``) that returns the
            word's vector, e.g. a dict of word -> list of floats.
        words: Iterable of words to plot. When omitted, every key obtained
            by iterating over ``model`` is plotted (assumes ``model`` is
            iterable over its words, as a dict is).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The original default of None could not be iterated; fall back to all words.
    if words is None:
        words = list(model)
    else:
        # Materialise once: `words` is walked twice (vector lookup, labelling).
        words = list(words)

    word_vectors = np.array([model[w] for w in words])
    # Fit a full PCA and keep the two leading components for plotting.
    twodim = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(5, 5))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # small offset so the label does not sit on top of the marker
        plt.text(x + 0.01, y + 0.01, word)
    plt.show()
        
        
def display_tsne_scatterplot(model, perplexity=1.0, words=None, random_state=None):
    """Scatter-plot word vectors embedded in two dimensions with t-SNE.

    Args:
        model: Object indexable by word (``model[word]``) that returns the
            word's vector, e.g. a dict of word -> list of floats.
        perplexity: Passed straight to ``TSNE(perplexity=...)``.
        words: Iterable of words to plot. When omitted, every key obtained
            by iterating over ``model`` is plotted (assumes ``model`` is
            iterable over its words, as a dict is).
        random_state: Passed straight to ``TSNE(random_state=...)``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            int to get the same layout on every run.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # The original default of None could not be iterated; fall back to all words.
    if words is None:
        words = list(model)
    else:
        # Materialise once: `words` is walked twice (vector lookup, labelling).
        words = list(words)

    word_vectors = np.array([model[w] for w in words])
    tsne = TSNE(perplexity=perplexity, random_state=random_state)
    twodim = tsne.fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(5, 5))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # small offset so the label does not sit on top of the marker
        plt.text(x + 0.01, y + 0.01, word)
    plt.show()


In [None]:
# Map every word to its reduced vector (as a plain list of floats),
# then plot all words with t-SNE.
weights = {word: vector.tolist() for word, vector in dimReduced_df.iterrows()}

display_tsne_scatterplot(weights, words=list(dimReduced_df.index), perplexity=1.0)

## Word Similarity

In [None]:
# Second toy corpus: "dosa" now appears after both "like" and "hate".
sentences = [
    "I like dosa",
    "I like idli",
    "I hate dosa",
    "I hate sweets",
]

## Co-Occurrence Matrix without Stop-word Removal

In [None]:
# Tokenize each sentence, lower-case every token, and re-join the tokens
# with single spaces so later cells can split on ' '.
tokenized_sentence = [
    ' '.join(token.lower() for token in word_tokenize(raw_sentence))
    for raw_sentence in sentences
]

In [None]:
# Display the lower-cased, space-joined sentences built in the previous cell.
tokenized_sentence

In [None]:
# Build the vocabulary: each distinct token gets the next free integer id,
# assigned in order of first appearance in the corpus.

vocabulary = {}

for sentence_text in tokenized_sentence:
    for token in sentence_text.split(' '):
        # setdefault only inserts when the token has not been seen before
        vocabulary.setdefault(token, len(vocabulary))

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Build an n x n co-occurrence matrix, where n is the vocabulary size.
# Entry [i, j] counts how often word j appears within `context_window`
# positions (to the left or right) of word i, summed over all sentences.

cooccur_matrix = np.zeros((len(vocabulary), len(vocabulary)))
context_window = 3  # number of neighbours considered on each side of a word


# for every sentence
for each_sentence in tokenized_sentence:
    sentence = each_sentence.split(' ')
    # for every word in the sentence
    for word_index, center_word in enumerate(sentence):
        # The window is centred on the current word and clipped to the
        # sentence boundaries, so every position (including the first word)
        # gets its neighbours counted.
        window_start = max(0, word_index - context_window)
        window_end = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_end):
            # a word does not co-occur with itself at the same position
            if context_index == word_index:
                continue
            context_word = sentence[context_index]
            cooccur_matrix[vocabulary[center_word], vocabulary[context_word]] += 1.0


In [None]:
# Wrap the count matrix in a DataFrame whose rows and columns are both
# labelled with the vocabulary words (in vocabulary insertion order).
df = pd.DataFrame(cooccur_matrix, index=list(vocabulary), columns=list(vocabulary))
df

## SVD of the Word Co-Occurrence Matrix

In [None]:
# Convert the labelled co-occurrence DataFrame into a plain NumPy array
# and print it under a short header.
A = array(df)
print('Matrix A is: \n', A, sep='\n')
# SVD

In [None]:
# Truncated SVD: factorise A, then keep only the k leading components.

# Top k
k = 2

full_U, full_Sigma, full_VT = svd(A)

U = full_U[:, :k]         # first k left singular vectors (one row per word)
Sigma = full_Sigma[:k]    # first k singular values
VT = full_VT[:k, :]       # first k right singular vectors

In [None]:
# One row per vocabulary word, one column per retained SVD component.
# The index is the list of vocabulary words in insertion order.
dimReduced_df = pd.DataFrame(data=U, index=list(vocabulary))
dimReduced_df

In [None]:
# Map every word to its reduced vector (as a plain list of floats),
# then plot all words with t-SNE.
weights = {word: vector.tolist() for word, vector in dimReduced_df.iterrows()}

# NOTE(review): perplexity=0.2 is below 1; perplexity is conventionally
# 2**entropy, which cannot drop below 1 — confirm this value is intentional.
display_tsne_scatterplot(weights, words=list(dimReduced_df.index), perplexity=0.2)