Implementing a Marathi Language Model


In [1]:
from collections import Counter, defaultdict

from nltk import ngrams, word_tokenize, sent_tokenize

import scipy
from scipy.sparse import dok_matrix, csr_matrix
from scipy.spatial.distance import euclidean, cosine
import numpy as np
import gc

In [2]:
import nltk
# fetch the "punkt" data package through NLTK's downloader; presumably this is what
# word_tokenize / sent_tokenize below rely on — TODO confirm for the installed NLTK version
nltk.download("punkt")

[nltk_data] Downloading package punkt to C:\Users\Om
[nltk_data]     Ambaye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

We will use the preprocessed corpus that we have already saved to disk (`Corpus.txt`).

In [3]:
# path of the corpus file; the cells below open it as UTF-8 text
input_file = 'Corpus.txt'

In [4]:
# count how often every token (unigram) occurs in the corpus file
vocab_counts = Counter()
with open(input_file, 'r', encoding="utf-8") as f:
    # iterate over the file object so that only one line is held in memory at a time
    for line in f:
        for sent in sent_tokenize(line):
            # a 1-gram is just the token itself, so the tokens can be counted directly
            vocab_counts.update(word_tokenize(sent))

In [5]:
# the vocabulary size is the number of distinct tokens counted in vocab_counts
print("Size of vocabulary is",len(vocab_counts))

Size of vocabulary is 831270


In [6]:
# stop words for the Marathi language, written as one space-separated string
a="अधिक अनेक अशी असलयाचे असलेल्या असा असून असे आज आणि आता आपल्या आला आली आले आहे आहेत एक एका कमी करणयात करून का काम काय काही किवा की केला केली केले कोटी गेल्या घेऊन जात झाला झाली झाले झालेल्या टा डॉ तर तरी तसेच ता ती तीन ते तो त्या त्याचा त्याची त्याच्या त्याना त्यानी त्यामुळे त्री दिली दोन न नाही निर्ण्य पण पम परयतन पाटील म मात्र माहिती मी मुबी म्हणजे म्हणाले म्हणून या याचा याची याच्या यांच्या याना यांचे यांची यांना यांनी यांच्या यानी येणार येत येथील येथे लाख व व्यकत सर्व सागित्ले सुरू हजार हा ही हे होणार होत होता होती होते"
# split the space-separated string into a list and keep a set of it for fast membership tests
a = a.split(" ")
stp = set(a)
# reduced vocabulary: every word that is not a stop word and occurs at least 100 times
vocab = {
    word
    for word, freq in vocab_counts.items()
    if word not in stp and freq >= 100
}


In [7]:
# number of words left after dropping stop words and words counted fewer than 100 times
print("Size of the reduced vocabulary is",len(vocab))

Size of the reduced vocabulary is 12035


Creating a dictionary that acts as a two-way index: integer keys map to the corresponding word, and word keys map to the corresponding integer.

In [8]:
# number to word: the words are sorted so that every word gets the same index on
# every run (the iteration order of a set of strings can change between
# interpreter sessions because string hashing is randomised)
vocab_list = sorted(vocab)

# corresponding word to number
vocab_pos = {word: idx for idx, word in enumerate(vocab_list)}

# compiling both directions into one dictionary:
# str keys map word -> index, int keys map index -> word
vocab_idx = vocab_pos.copy()
vocab_idx.update(enumerate(vocab_list))

## Building the co-occurrence matrix

The function below returns the co-occurrence matrix as a compressed sparse row matrix (SciPy `csr_matrix`).

It first creates a defaultdict of Counters to keep track of the co-occurrences: the entry stored under key i is a Counter that acts as the sparse vector of the word with index i.

Then it creates an empty dok matrix and populates it with the entries of the above defaultdict.

Finally, it converts the dok matrix to a csr matrix and returns it.

In [9]:
def co_occurences(file, ramp=(0, 4, 3, 2, 1)):
    """Build the weighted word/word co-occurrence matrix of a corpus file.

    Every line of ``file`` is tokenized and cut into padded n-grams of
    ``len(ramp)`` tokens.  Each n-gram is read once left-to-right and once
    right-to-left.  When the first token of a reading is in the global
    ``vocab``, every in-vocabulary token at position ``idx`` of that reading
    adds ``ramp[idx]`` to the cell (first token, that token).  Row and column
    numbers are the word indices stored in the global ``vocab_idx``.

    Parameters
    ----------
    file : str
        Path of the text file to read; it is opened with UTF-8 encoding.
    ramp : sequence of int, optional
        Weight for each position inside an n-gram; its length is the n-gram
        size.  The default ``(0, 4, 3, 2, 1)`` gives a word no weight with
        itself and weights 4..1 to the words 1..4 positions away.

    Returns
    -------
    scipy.sparse.csr_matrix
        ``len(vocab) x len(vocab)`` matrix of int64 weights.
    """
    # to keep track of co-occurences: occurences[i] is the sparse row of word i
    occurences = defaultdict(Counter)

    with open(file, 'r', encoding="utf-8") as corpus:
        # iterate over the file object so that only one line is held in memory at a time
        for line in corpus:
            # all n-grams of the line, padded on both sides so that words near the
            # line boundaries still appear as the first token of some reading
            all_grams = ngrams(word_tokenize(line), len(ramp), pad_right=True, pad_left=True)

            for grams in all_grams:
                # same procedure for the n-gram as written and for its reversal
                for window in (grams, grams[::-1]):
                    head = window[0]
                    # padding symbols and out-of-vocabulary words are skipped
                    if head not in vocab:
                        continue
                    row = occurences[vocab_idx[head]]
                    for idx, gram in enumerate(window):
                        weight = ramp[idx]
                        # zero weights are skipped so no empty entries are recorded
                        if weight and gram in vocab:
                            # increase the co-occurence according to the distance between the words
                            row[vocab_idx[gram]] += weight

    # creating an empty sparse matrix
    mat = dok_matrix((len(vocab), len(vocab)), dtype=np.int64)

    # populating the matrix from the rows that were actually observed
    for i, row in occurences.items():
        for j, weight in row.items():
            mat[i, j] = weight

    # converting to csr matrix
    return mat.tocsr()


In [10]:
# co-occurrence matrix of the whole corpus, as returned by co_occurences (a scipy CSR matrix)
co_occurence_matrix = co_occurences(input_file)

## Building the Word Embedding matrix

In [11]:
import math
import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
import math

def word_embedding(matrix):
    """Turn a sparse co-occurrence matrix into a sparse embedding matrix.

    For every stored entry ``m[a, b]`` the function computes::

        num = total * m[a, b] - row_sum[a] * col_sum[b]
        den = row_sum[a] * (total - row_sum[a]) * col_sum[b] * (total - col_sum[b])
        val = num / sqrt(den)

    where ``total`` is the sum of all entries.  The result stores
    ``sqrt(val)`` where ``val`` is positive; all other cells are left empty
    (zero).  Entries whose denominator is zero are treated as zero instead of
    being divided.

    All arithmetic is done in float64 on whole arrays, so large counts cannot
    overflow an integer product and no per-element Python loop is needed.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Co-occurrence weights; expected to be non-negative.

    Returns
    -------
    scipy.sparse.csr_matrix
        float64 matrix with the same shape as ``matrix``.
    """
    # work on a private COO copy: one (row, col, value) triple per stored entry
    coo = matrix.tocoo(copy=True)
    coo.sum_duplicates()
    rows, cols = coo.row, coo.col
    counts = coo.data.astype(np.float64)

    total = float(counts.sum())
    # np.asarray handles both np.matrix and plain ndarray results of .sum(axis=...)
    row_sum = np.asarray(matrix.sum(axis=1), dtype=np.float64).ravel()
    col_sum = np.asarray(matrix.sum(axis=0), dtype=np.float64).ravel()

    a_sum = row_sum[rows]
    b_sum = col_sum[cols]
    num = total * counts - a_sum * b_sum
    den = a_sum * (total - a_sum) * b_sum * (total - b_sum)

    # only positive values with a usable denominator are stored
    keep = (den > 0) & (num > 0)
    values = np.sqrt(num[keep] / np.sqrt(den[keep]))

    return csr_matrix((values, (rows[keep], cols[keep])), shape=matrix.shape, dtype=np.float64)


# embedding matrix of the corpus; the cells below read row vocab_idx[word] as the vector of a word
embmat = word_embedding(co_occurence_matrix)


## Similarity measure

In [12]:
def word_dist(x, y):
    """Return the cosine distance between the embedding rows of two words.

    ``x`` and ``y`` are looked up in the global ``vocab_idx``; the matching
    rows of the global ``embmat`` are converted to dense 1-D arrays and passed
    to ``scipy.spatial.distance.cosine``.
    """
    dense_rows = [embmat[vocab_idx[word], :].toarray().flatten() for word in (x, y)]
    return cosine(*dense_rows)

In [13]:
#an example
# an example: cosine distance between the vectors of two vocabulary words
word_dist('आई', 'बाबा')

0.797710450044582

This function will find the k closest words to the given word.

The norms of all the vectors are pre calculated

It takes the vector of the given word, computes the cosine distance between it and the vector of every word in the vocab, and collects the (word, distance) pairs in a list.

Then we sort the list by distance and return the top k elements, leaving out the given word itself (which would otherwise come first, as its distance to itself is zero).

In [40]:
# precomputing the Euclidean norms of all the embedding vectors:
# norms[i] is the norm of row i of embmat, i.e. of the vectors that are compared below
norms = np.sqrt(np.asarray(embmat.multiply(embmat).sum(axis=1), dtype=np.float64)).ravel()

def closest_words(x, k=10):
    """Return the ``k`` vocabulary words closest to ``x`` by cosine distance.

    The distance between two words is ``1 - dot(u, v) / (|u| * |v|)`` for
    their rows ``u`` and ``v`` of the global ``embmat``, clipped to
    ``[0, 2]``.  The word ``x`` itself is never part of the result.  Words
    whose distance is undefined (an all-zero vector on either side) are left
    out, so fewer than ``k`` entries can be returned.

    Parameters
    ----------
    x : str
        Query word; must be a key of the global ``vocab_idx``
        (``KeyError`` otherwise).
    k : int, optional
        Maximum number of neighbours to return.

    Returns
    -------
    list of (str, float)
        ``(word, distance)`` pairs ordered by increasing distance.
    """
    target = vocab_idx[x]

    # dot product of every embedding row with the target row in one sparse product
    dots = embmat.dot(embmat[target, :].T).toarray().ravel()

    # zero norms produce NaN here on purpose; those rows are filtered out below
    with np.errstate(divide="ignore", invalid="ignore"):
        dists = 1.0 - dots / (norms * norms[target])
    # rounding can push a distance slightly outside its valid range
    dists = np.clip(dists, 0.0, 2.0)

    # the query word is not its own neighbour
    dists[target] = np.nan

    candidates = np.flatnonzero(~np.isnan(dists))
    ranked = candidates[np.argsort(dists[candidates], kind="stable")][:k]

    return [(vocab_idx[int(i)], float(dists[i])) for i in ranked]

In [41]:
#one example
# one example, using the default k=10
closest_words('बाबा')

[('श्री', 0.6607320716186517),
 ('आमटे', 0.6640443953675746),
 ('महाराज', 0.6848877552895938),
 ('संत', 0.6877123492122967),
 ('गाडगे', 0.7170886870711456),
 ('ऊर्फ', 0.7239856673416889),
 ('नारायण', 0.7283487325929674),
 ('गुरू', 0.7306084832046923),
 ('मुरलीधर', 0.7438177965225354),
 ('प्रकाश', 0.7452581440965179)]

## Listing similar words

The nouns picked :

In [42]:
# target words whose closest words are collected in the cells below
nouns = ['कामगार', 'सचिन', 'आंबा', 'गुरुवार','काश्मीर']

Computing the similar words and storing them as a list of lists, one inner list per noun, where the first element of each inner list is the target word and the rest are its similar words.

In [43]:
# similar[i] holds nouns[i] followed by the words returned by closest_words(nouns[i], k=5)
similar = []
for noun in nouns:
    neighbours = [word for word, _ in closest_words(noun, k=5)]
    similar.append([noun, *neighbours])

In [44]:
def print_similar_words(i):
    """Print the target word stored in ``similar[i]`` and the words listed after it.

    ``similar`` is the global list of lists; element 0 of each inner list is
    the target word and the remaining elements are printed as its similar words.
    """
    entry = similar[i]
    target, neighbours = entry[0], entry[1:]
    print("For the word", target)
    print("The top 5 similar words are", str(neighbours))

In [45]:
# similar words collected for nouns[0]
print_similar_words(0)

For the word कामगार
The top 5 similar words are ['शेतमजूर', 'वीटभट्टीमजूर', 'नोकर', 'शेतकरी', 'सरकारी']


In [46]:
# similar words collected for nouns[1]
print_similar_words(1)

For the word सचिन
The top 5 similar words are ['तेंडुलकर', 'कुलकर्णी', 'राहुल', 'प्रिया', 'रमेश']


In [47]:
# similar words collected for nouns[2]
print_similar_words(2)

For the word आंबा
The top 5 similar words are ['काजू', 'नारळ', 'भात', 'केळी', 'ऊस']


In [48]:
# similar words collected for nouns[3]
print_similar_words(3)

For the word गुरुवार
The top 5 similar words are ['सोमवार', 'मंगळवार', 'शुक्रवार', 'रविवार', 'बुधवार']


In [49]:
# similar words collected for nouns[4]
print_similar_words(4)

For the word काश्मीर
The top 5 similar words are ['जम्मू', 'लडाख', 'काश्मीरमधील', 'पंजाब', 'केंद्रशासित']
