### Reading XML and TXT corpus simultaneously

In [1]:
'''
    Generates word2vec for Nepali corpus
    
    Author - Oyesh Mann Singh
    Date - 02/26/2019
    
    For NNC corpus data:
        https://www.sketchengine.eu/nepali-national-corpus/
        or,
        Contact - osingh1@umbc.edu
'''

import sys, os
import unicodedata
import nltk
import nltk.corpus as nc
import NNCCorpus as nnc
import string
import itertools
import csv
import numpy as np

# NNC corpus: XML files read through the project-local NNCCorpus reader.
# The fileids pattern accepts names ending in ".xml" that do not begin with a
# dot (presumably to skip hidden files -- confirm against the data directory).
xmlDir = "../data/nnc/"
xml = nnc.NNCCorpusReader(xmlDir, fileids=r'(?!\.).*\.xml')

# Single text-only file read with NLTK's IndianCorpusReader.
combinedDir = "../data/ner/bal/text_tag_only/"
combineTxt = nc.IndianCorpusReader(combinedDir, fileids='text_only.txt')

# Additional corpora, currently disabled.
# newsDir = "../data/clean_corpus/"
# news = nc.IndianCorpusReader(newsDir, fileids=r'(?!\.).*\.txt')

# suryaDir = "../data/ner/NER_surya_bam/raw_data/"
# surya = nc.IndianCorpusReader(suryaDir, fileids=r'(?!\.).*\.txt')

### Prepare dictionary to remove unnecessary unicode

In [2]:
'''
    XML corpus clean up
    
    Need to look more closely on Nepali Preprocessing
    for word2vec.
    
    Currently removing only cased letters (Lu, Ll) and format/unassigned/control
    code points (Cf, Cn, Cc); hyphen, period and danda are always kept

    Unicode general categories referenced here:
    Lu = Letter, uppercase
    Ll = Letter, lowercase
    P = Punctuation
    N = Number
    S = Symbol
    Cf = Other, format
    Cn = Other, not assigned
    Cc = Other, control
    Lo = Letter, other

'''

# Not removing HYPHEN and FULL STOP
# For Danda i != 2404

# Load the postposition list used by stemmer(): the whitespace-separated
# entries on the first line of the file. The context manager closes the handle
# as soon as the line is read instead of leaving it open for the kernel's
# lifetime. readline() avoids loading the whole file and yields an empty list
# (no stemming) rather than an IndexError when the file is empty.
with open('../data/ner/stemming/new_postposition.txt', 'r', encoding='utf-8') as stem_file:
    stemmers = stem_file.readline().split()

def stemmer(sentence):
    """Split trailing postpositions off the words of a tokenised sentence.

    Every word is checked against the module-level ``stemmers`` list, in list
    order. The first postposition the word ends with is cut off and emitted as
    its own token directly after the remaining stem. The search stops without
    stemming when it reaches an entry equal to the whole word. Words listed in
    ``not_to_be_lemmatized`` are never split, and empty words are dropped.

    Args:
        sentence: iterable of word strings.

    Returns:
        list of str: the words of the sentence with split-off postpositions
        inserted after their stems.
    """
    # Words that only look like "stem + postposition" and must stay intact.
    not_to_be_lemmatized = {'एमाले', 'अमेरिका', 'अधिकारी', 'शङ्का', 'मात्रिका'}
    stemmed_sentence = []
    for word in sentence:
        saved_pp = ''
        if word not in not_to_be_lemmatized:
            for pp in stemmers:
                if word == pp:
                    # The search ends at an entry equal to the whole word.
                    break
                if word.endswith(pp):
                    word = word[:-len(pp)]
                    saved_pp = pp
                    break
        if word:
            stemmed_sentence.append(word)
            if saved_pp:
                stemmed_sentence.append(saved_pp)
    return stemmed_sentence


# Deletion table for str.translate(): every code point used as a key maps to
# None and is therefore removed from the text.
# Removed categories: Lu/Ll (cased letters), Cf (format), Cn (unassigned) and
# Cc (control). Uncased letters (Lo), which include Devanagari, are untouched.
# range() must end at sys.maxunicode + 1 so the last code point (U+10FFFF,
# category Cn) is covered as well.
# 45 (HYPHEN-MINUS), 46 (FULL STOP) and 2404 (DEVANAGARI DANDA) are never
# removed; none of them is in a removed category today, so the exclusion only
# acts as a safeguard if punctuation categories are added to the tuple.
# NOTE(review): Cf includes ZERO WIDTH JOINER / NON-JOINER -- confirm that
# stripping them from Nepali text is intended.
table = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)
    if unicodedata.category(chr(i)).startswith(('Lu', 'Ll', 'Cf', 'Cn', 'Cc'))
    and i not in (45, 46, 2404)
)

# Placeholder; replaced later by the concatenation of the corpora read below.
final_sents = []

# Convert to one-string format, remove punctuations, split to string word-wise
def read_corpus(data, stem=False, process=False):
    """Collect the sentences of a corpus reader as lists of tokens.

    Args:
        data: object whose ``sents()`` method yields sentences as token lists.
        stem: when true, each sentence is passed through ``stemmer``.
        process: when true, each sentence is joined with spaces, filtered
            through the module-level ``table`` with ``str.translate`` and
            split on whitespace again before any stemming.

    Returns:
        list: one entry per sentence, in corpus order.
    """
    def _prepare(tokens):
        # Cleaning runs first so the stemmer only sees filtered tokens.
        if process:
            tokens = ' '.join(tokens).translate(table).split()
        return stemmer(tokens) if stem else tokens

    return [_prepare(tokens) for tokens in data.sents()]

In [4]:
%%time
# Read NNC corpus only.
# process=True filters every sentence through the module-level translation
# table; stem=False skips the postposition stemmer.
'''
    STEM=True might not be necessary because NNC corpus is already lemmatized
'''
nnc_sents=read_corpus(xml, stem=False, process=True)

KeyboardInterrupt: 

In [5]:
%%time
# Read the text-only dataset file: sentences are filtered through the
# module-level translation table (process=True) and not stemmed (stem=False).
'''
    STEM=True might not be necessary if the data source is after_stemming/text_tag_only/text_only.txt
'''
dataset_sents=read_corpus(combineTxt, stem=False, process=True)

CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 55.3 ms


In [6]:
# Training corpus: dataset sentences followed by the NNC sentences.
# Both read_corpus cells must have run to completion first; nnc_sents is
# undefined (NameError) if the NNC cell was interrupted.
final_sents = dataset_sents + nnc_sents

NameError: name 'nnc_sents' is not defined

In [21]:
# For fasttext embedding: dump the corpus as plain text, one sentence per line
# with tokens separated by single spaces.
# (The redundant mid-notebook `import string` was dropped: the module is not
# used here and is already imported in the first cell.)
with open('../data/fasttext/nep2ft-corpus-bal.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(' '.join(each) + '\n' for each in final_sents)

In [22]:
# Corpus size summary: number of sentences and total number of tokens.
num_of_words = sum(map(len, final_sents))

print("Total number of sentences", len(final_sents))
print("Total number of words", num_of_words)

Total number of sentences 804002
Total number of words 14592452


In [23]:
# %%time
# # For glove, we need to make plain txt file
# final_sents = list(itertools.chain.from_iterable(final_sents))

# with open('../data/glove/nepali_raw.txt', 'w', encoding='utf-8') as f:
#     for item in final_sents:
#         f.write("%s\n" % item)

### Create Word2Vec model and Load it

In [None]:
%%time
from gensim.models import Word2Vec, KeyedVectors

# Train word2vec on the tokenised sentences: 300-dimensional vectors, context
# window of 10, words occurring fewer than 5 times dropped, sg=0 (CBOW per the
# gensim documentation), 10 worker threads.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 appears to call it
# `vector_size`) -- confirm the installed version before re-running.
# NOTE(review): no seed is passed, so repeated runs will presumably produce
# different vectors -- confirm whether reproducibility matters here.
model = Word2Vec(final_sents, size=300, min_count=5, window=10, sg=0, workers=10)
# Alternative: reuse a previously saved model instead of retraining.
# model = Word2Vec.load('./large_nep2vec.model')

In [None]:
# model.vector_size is the dimensionality of each word vector (the printed
# label calls it "Vocabulary dimension").
print("Vocabulary dimension", model.vector_size)
# Number of distinct words kept in the trained vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim 3.x attribute -- confirm.
print("Number of words in vocab", len(model.wv.vocab))

### Save Word2Vec model

In [None]:
%%time
### Save Word2Vec model

# NOTE(review): get_tmpfile is imported but not used in this cell.
from gensim.test.utils import get_tmpfile

# model_path = '../data/ner/nep2vec.model'
# Save new model: only the word vectors (model.wv) are exported, in the
# text (binary=False) word2vec format. The full trainable model is not saved
# unless the model.save() line below is re-enabled.
model.wv.save_word2vec_format('../data/nep2vec/nep2vec_stem-cbow', binary=False)
# model.save(model_path)

In [12]:
# from gensim.models import Word2Vec, KeyedVectors

# model = KeyedVectors.load_word2vec_format('../data/nep2vec/nep2vec_text_stem', binary=False)

### Check for word similarity

In [13]:
# Sanity check: the ten vocabulary words most similar to the query word,
# returned as (word, similarity score) pairs.
model.wv.most_similar('धूलो', topn=10)

[('बोक्रा', 0.8281864523887634),
 ('पात', 0.827208399772644),
 ('झोल', 0.8253916501998901),
 ('गोबर', 0.8253674507141113),
 ('धागो', 0.819855809211731),
 ('हरियो', 0.8166791796684265),
 ('दाना', 0.816335916519165),
 ('बोटबिरुवा', 0.8141928911209106),
 ('प्लास्टिक', 0.8104241490364075),
 ('लसुन', 0.8090667128562927)]