# Recipe 3-4. Generating Co-occurrence Matrix
Let’s discuss one more method of feature engineering called a co-occurrence matrix.

In [8]:
import numpy as np
import pandas as pd
import nltk
from nltk import bigrams
import itertools

The co_occurrence_matrix function is below.

In [25]:
def co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_to_index = { word:i for i, word in enumerate(vocab) }
    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))
    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
    # Initialise co-occurrence matrix
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
    # Loop through the bigrams taking the current and previous word: co_occurrence_matrix[current][previous]
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_to_index[current]
        pos_previous = vocab_to_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
    # return the matrix and the index
    return co_occurrence_matrix,vocab_to_index

Here are the sentences for testing:

In [26]:
sentences = [['I', 'love', 'nlp'],
            ['I', 'love','to' 'learn'],
            ['nlp', 'is', 'future'],
            ['nlp', 'is', 'cool']]

create one list using many lists

In [27]:
merged = list(itertools.chain.from_iterable(sentences))
merged

['I',
 'love',
 'nlp',
 'I',
 'love',
 'tolearn',
 'nlp',
 'is',
 'future',
 'nlp',
 'is',
 'cool']

In [28]:
matrix, vocab_to_index = co_occurrence_matrix(merged)
matrix

matrix([[0., 1., 0., 0., 0., 1., 1.],
        [0., 0., 0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [2., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 2., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0.]])

In [29]:
vocab_to_index

{'nlp': 0, 'future': 1, 'I': 2, 'is': 3, 'cool': 4, 'love': 5, 'tolearn': 6}

generate the matrix

In [30]:
CoMatrixFinal = pd.DataFrame(matrix, index=vocab_to_index, columns=vocab_to_index)
print(CoMatrixFinal)

         nlp  future    I   is  cool  love  tolearn
nlp      0.0     1.0  0.0  0.0   0.0   1.0      1.0
future   0.0     0.0  0.0  1.0   0.0   0.0      0.0
I        1.0     0.0  0.0  0.0   0.0   0.0      0.0
is       2.0     0.0  0.0  0.0   0.0   0.0      0.0
cool     0.0     0.0  0.0  1.0   0.0   0.0      0.0
love     0.0     0.0  2.0  0.0   0.0   0.0      0.0
tolearn  0.0     0.0  0.0  0.0   0.0   1.0      0.0
