# Word Co-Occurrence Matrix

## This notebook demonstrates how to build a word co-occurrence matrix

#### Install necessary packages

In [None]:
!pip install scikit-learn pandas numpy matplotlib

#### Let us import necessary packages

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [None]:
# Toy corpus: six sentences, each containing "gravy" and/or "train".
sentences = [
    'The top executives were on the gravy train with their huge bonuses.',
    'His father worked hard to build the company, but all Percy has to do is sit back and ride the gravy train.',
    'Speaking of which, let’s take our little gravy bowl and slather this plate with curry.',
    'Don’t add water the moisture from the meat is generally enough to make gravy.',
    'Could be that fast train is going somewhere after all.',
    'The military uses all sorts of games to train troops.',
]

## Co-Occurrence Matrix without Stop-Word Removal

In [None]:
# One string per sentence: its tokens, lower-cased and joined by single spaces.
# Stop words and punctuation are kept in this version.
tokenized_sentence = [
    ' '.join(token.lower() for token in word_tokenize(raw_sentence))
    for raw_sentence in sentences
]

In [None]:
# Display the tokenized sentences (lower-cased tokens joined by single spaces).
tokenized_sentence

In [None]:
# Build the vocabulary: each distinct token gets an integer id, assigned in
# order of first appearance in the corpus.

vocabulary = {}

for joined_tokens in tokenized_sentence:
    for token in joined_tokens.split(' '):
        # setdefault inserts only unseen tokens, so ids stay contiguous (0..n-1)
        vocabulary.setdefault(token, len(vocabulary))

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Build the n x n co-occurrence matrix (n = vocabulary size). Entry [i][j]
# counts how often word j occurs within `context_window` positions of word i.

cooccur_matrix = np.zeros((len(vocabulary), len(vocabulary)))
context_window = 3  # number of neighbours counted on each side of the centre word

# for every sentence
for each_sentence in tokenized_sentence:
    sentence = each_sentence.split(' ')
    # for every word in the sentence
    for word_index, centre_word in enumerate(sentence):
        # The window is centred on the current word and clipped to the sentence
        # bounds; a separate loop variable keeps `context_window` intact.
        window_start = max(0, word_index - context_window)
        window_stop = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_stop):
            if context_index == word_index:
                continue  # a word is not counted as its own context
            cooccur_matrix[vocabulary[centre_word], vocabulary[sentence[context_index]]] += 1.0


In [None]:
# Label rows and columns with the vocabulary words; the dict's insertion order
# matches the integer ids used to index the matrix.
vocab_words = list(vocabulary)
df = pd.DataFrame(cooccur_matrix, index=vocab_words, columns=vocab_words)
df

In [None]:
# Show the co-occurrence rows for a handful of selected words.
probe_words = ['gravy', 'train', 'executives', 'fast', 'troops', 'military']
df.loc[probe_words]

## Co-Occurrence Matrix with Stop-Word Removal

In [None]:
from nltk.corpus import stopwords
import string

# ASCII punctuation characters, as one string.
punctuation = string.punctuation

# NLTK's English stop-word list, held in a set for fast membership tests.
english_stop_words = set(stopwords.words('english'))

In [None]:
# Tokenize each sentence, lower-case the tokens, and drop stop words and
# punctuation-only tokens. One space-joined string is stored per sentence.
tokenized_sentence = []

for each_sentence in sentences:
    temp_sentence = []
    for word in word_tokenize(each_sentence):
        token = word.lower()
        # Keep only tokens that contain at least one letter or digit. Testing
        # `token in punctuation` would be a substring check against an
        # ASCII-only string, which lets through multi-character tokens such as
        # '...' and non-ASCII marks such as the curly apostrophe.
        if token in english_stop_words or not any(ch.isalnum() for ch in token):
            continue
        temp_sentence.append(token)
    tokenized_sentence.append(' '.join(temp_sentence))

tokenized_sentence

In [None]:
# Rebuild the vocabulary from the current tokenized corpus: each distinct token
# gets an integer id, assigned in order of first appearance.

vocabulary = {}

for joined_tokens in tokenized_sentence:
    for token in joined_tokens.split(' '):
        # setdefault inserts only unseen tokens, so ids stay contiguous (0..n-1)
        vocabulary.setdefault(token, len(vocabulary))

print('Read {0} number of unique words'.format(len(vocabulary)))

In [None]:
# Build the n x n co-occurrence matrix (n = vocabulary size). Entry [i][j]
# counts how often word j occurs within `context_window` positions of word i.

cooccur_matrix = np.zeros((len(vocabulary), len(vocabulary)))
context_window = 3  # number of neighbours counted on each side of the centre word

# for every sentence
for each_sentence in tokenized_sentence:
    sentence = each_sentence.split(' ')
    # for every word in the sentence
    for word_index, centre_word in enumerate(sentence):
        # The window is centred on the current word and clipped to the sentence
        # bounds; a separate loop variable keeps `context_window` intact.
        window_start = max(0, word_index - context_window)
        window_stop = min(len(sentence), word_index + context_window + 1)
        for context_index in range(window_start, window_stop):
            if context_index == word_index:
                continue  # a word is not counted as its own context
            cooccur_matrix[vocabulary[centre_word], vocabulary[sentence[context_index]]] += 1.0


In [None]:
# Label rows and columns with the vocabulary words; the dict's insertion order
# matches the integer ids used to index the matrix.
vocab_words = list(vocabulary)
df = pd.DataFrame(cooccur_matrix, index=vocab_words, columns=vocab_words)
df

In [None]:
# Show the co-occurrence rows for a handful of selected words.
probe_words = ['gravy', 'train', 'executives', 'fast', 'troops', 'military']
df.loc[probe_words]

## SVD of Word Co-Occurrence Matrix

In [None]:
from scipy.linalg import svd
from numpy import diag
from numpy import dot
from numpy import array

# Turn the labelled co-occurrence DataFrame into a plain NumPy array.
A = array(df)
# Header line, a blank line, then the matrix itself.
print('Matrix A is: \n', A, sep='\n')

In [None]:
# Perform SVD: A = U @ diag(s) @ VT. `s` is a 1-D array of singular values.

U, s, VT = svd(A)

# Keep only the top-k singular values and their singular vectors.
k = 8

U = U[:, :k]
# svd returns the singular values as a vector, so the k x k diagonal matrix
# has to be built from it explicitly.
Sigma = diag(s[:k])
VT = VT[:k, :]


In [None]:
# Rows of the truncated U matrix, labelled with the vocabulary words
# (in id order, which is the dict's insertion order).
dimReduced_df = pd.DataFrame(data=U, index=list(vocabulary))
dimReduced_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline


def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors projected onto their first two PCA components.

    Args:
        model: Object supporting ``model[word]`` lookup that yields the word's
            vector (for example a dict of word -> list of floats).
        words: Iterable of words to plot. If ``None``, every key obtained by
            iterating over ``model`` is plotted.
        sample: Unused; kept so existing calls that pass it keep working.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Materialise the words once: the default of None would otherwise fail on
    # iteration, and a one-shot iterator would be exhausted before labelling.
    words = list(model) if words is None else list(words)
    word_vectors = np.array([model[w] for w in words])
    twodim = PCA().fit_transform(word_vectors)[:, :2]
    plt.figure(figsize=(15, 15))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # small offset so the label does not sit on top of its marker
        plt.text(x + 0.01, y + 0.01, word)
    plt.show()
        
        
def display_tsne_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors embedded in two dimensions with t-SNE.

    Args:
        model: Object supporting ``model[word]`` lookup that yields the word's
            vector (for example a dict of word -> list of floats).
        words: Iterable of words to plot. If ``None``, every key obtained by
            iterating over ``model`` is plotted.
        sample: Unused; kept so existing calls that pass it keep working.

    Returns:
        None. The figure is written to ``test.png`` (relative to the current
        working directory) and then shown with ``plt.show()``.
    """
    # Materialise the words once: the default of None would otherwise fail on
    # iteration, and a one-shot iterator would be exhausted before labelling.
    words = list(model) if words is None else list(words)
    word_vectors = np.array([model[w] for w in words])
    twodim = TSNE().fit_transform(word_vectors)[:, :2]
    plt.figure(figsize=(15, 15))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        # small offset so the label does not sit on top of its marker
        plt.text(x + 0.01, y + 0.01, word)
    # save before show(): the figure may be cleared once it has been displayed
    plt.savefig("test.png")
    plt.show()


In [None]:
# word -> list of that word's values in the reduced matrix
weights = dimReduced_df.transpose().to_dict(orient='list')
plot_words = list(dimReduced_df.index)

display_tsne_scatterplot(weights, plot_words)