In [1]:
import re
import codecs
import json


def handle_sentends(tokens) :
    """Split sentence-terminating characters off tokens into "<char>_SENT" tokens.

    Every token that contains one of the terminators ``? ! . । |`` is replaced by
    the token with all terminators removed, followed by one ``<char>_SENT``
    marker per distinct terminator, in order of first appearance in the token.
    A token that consists only of terminators yields just the marker(s); no
    empty-string token is produced.

    Parameters
    ----------
    tokens : list of str
        Tokens of one sentence. The list is rewritten in place.

    Returns
    -------
    list of str
        The same list object that was passed in, after rewriting.
    """
    sent_ends = ["?", "!", ".", "।", "|"]
    rebuilt = []
    for token in tokens :
        # Distinct terminators present in this token, in order of appearance.
        markers = []
        for ch in token :
            if ch in sent_ends and ch not in markers :
                markers.append(ch)
        if not markers :
            rebuilt.append(token)
            continue
        # Each token is processed exactly once, so a token carrying several
        # different terminators (e.g. "?!") can no longer overwrite a marker
        # that was just emitted.
        remainder = "".join(ch for ch in token if ch not in sent_ends)
        if remainder :
            rebuilt.append(remainder)
        rebuilt.extend(ch+"_SENT" for ch in markers)
    # Keep the in-place contract of the original implementation.
    tokens[:] = rebuilt
    return tokens
def corpus_preprocess(sent) :
    """Clean one raw line of the tagged corpus and wrap it in sentence markers.

    The replacements run in a fixed order and several of them act on the output
    of earlier ones, so the order must not be changed casually.

    Parameters
    ----------
    sent : str
        One line of the corpus.

    Returns
    -------
    str
        The cleaned line. It always starts with "<s> " and ends with the
        end-of-sentence marker (less-than, backslash, "s", greater-than),
        possibly followed by one trailing character if the input had one there.
    """
    # Raw strings: the patterns and the end marker contain backslashes that are
    # not valid Python escapes, which newer interpreters warn about.
    end_marker = r"<\s>"

    # Optional ?/./, then an optional "[", a run of digits (ASCII 0 or
    # Devanagari digits) and a closing "]".
    references = r"[?.,]*\[?[0१२३४५६७८९०][१२३४५६७८९०0]*\]"
    # Runs of dots or of asterisks.
    eclipse = r"\.+|\*+"
    sent = re.sub(references, "", sent)
    sent = re.sub(eclipse, " ", sent)

    # Make sure WQ / XC are preceded by an underscore, then collapse the double
    # underscore this (or the corpus itself) may have produced for these tags.
    remove_double_underscore = ["WQ", "XC", "N_NST", "N_NNP", "RB", "N_NN", "QT_QTF"]
    sent = sent.replace("WQ", "_WQ")
    sent = sent.replace("XC", "_XC")
    for tag in remove_double_underscore :
        sent = sent.replace("__"+tag, "_"+tag)

    # Punctuation handling.
    sent = sent.replace("'_RD_PUNC", " '_RD_PUNC ")
    sent = sent.replace(",_RD_PUNC", " ,_RD_PUNC ")
    # NOTE(review): this also hits the apostrophes tagged two lines above, so an
    # already tagged apostrophe ends up followed by a bare "_RD_PUNC" token.
    # Left unchanged to keep the corpus statistics as they are - confirm intent.
    sent = sent.replace("'", " '_RD_PUNC ")
    sent = sent.replace(",_RD_PUNC", ",")
    # Every comma is dropped here.
    sent = sent.replace(",", "")
    sent = sent.replace(";", " ;_RD_PUNC ")
    sent = sent.replace("|_RD_PUNC", "|")

    # Tag quote, dash and bracket symbols as RD_SYM and pad them with spaces.
    # NOTE(review): the later single-character entries also match inside text
    # produced for earlier entries (e.g. the doubled opening quote).
    symbols = ["‘‘", "’", "‘", "“", "”", "–", ")", "("]
    for sym in symbols :
        sent = sent.replace(sym, " "+sym+"_RD_SYM ")
    sent = sent.replace("-SYM", " -_RD_SYM ")
    sent = sent.replace(")_RD_SYM", " )_RD_SYM ")
    sent = sent.replace("+SYM", " +_RD_SYM ")

    # Force a token break after these tags.
    sent = sent.replace("QC", "QC ")
    sent = sent.replace("V_VB", "V_VB ")

    # Normalise the different spellings of the sentence boundary markers.
    sent = sent.replace("<s>P", "<s>")
    sent = sent.replace(r"P>s\<", "</s>")

    sent = sent.replace("<s>", " <s> ")
    sent = sent.replace("<s/>", end_marker)
    sent = sent.replace("</s>", " "+end_marker)

    # NOTE(review): all commas were removed above, so none of these patterns can
    # match any more; the loop is kept only to leave the pipeline untouched.
    separate_comma = ["CC", "FW", "IN", "JJ", "N_NN", "N_NNP", "N_NNS", "PRP", "RB", "UH", "VB", "WDT"]
    for tag in separate_comma :
        sent = sent.replace(tag+',', tag+' '+' ,_RD_PUNCT ')

    # Guarantee a leading "<s> " (the marker may sit at offset 0 or 1).
    if(sent[0:3] == "<s>") :
        sent = "<s> "+sent[3:]
    elif(sent[1:4] == "<s>") :
        sent = "<s> "+sent[4:]
    else :
        sent = "<s> "+sent

    # Guarantee a space-separated end marker. The first branch covers a marker
    # that is followed by exactly one trailing character.
    if(sent[-5:-1] == end_marker) :
        if(sent[-6] != " ") :
            sent = sent[:-5]+" "+end_marker
    elif(sent[-4:] == end_marker) :
        if(sent[-5] != " ") :
            sent = sent[:-4]+" "+end_marker
    else :
        sent = sent+" "+end_marker
    return sent

In [2]:
def get_words_and_tags(tokens) :
    """Split preprocessed tokens into parallel per-sentence word and tag lists.

    Rules, applied in this order to every token:

    * a token containing "_" is split at the first underscore; the left part is
      the word and the upper-cased right part is the tag;
    * "<s>" gets the tag "START";
    * the end marker (less-than, backslash, "s", greater-than) gets "END";
    * a token ending in "P" is stored reversed with the tag "P";
    * anything else (including an empty token) is tagged "UN".

    Parameters
    ----------
    tokens : list of list of str
        One token list per sentence.

    Returns
    -------
    (list of list of str, list of list of str)
        ``words`` and ``tags``, aligned sentence by sentence and token by token.
    """
    end_marker = r"<\s>"  # raw string: "\s" is not a valid escape sequence
    words = []
    tags = []
    for sent_tokens in tokens :
        sent_words = []
        sent_tags = []
        for token in sent_tokens :
            pos = token.find("_")
            if pos != -1 :
                sent_words.append(token[:pos])
                sent_tags.append(token[pos+1:].upper())
            elif token == "<s>" :
                sent_words.append(token)
                sent_tags.append("START")
            elif token == end_marker :
                sent_words.append(token)
                sent_tags.append("END")
            elif token.endswith('P') :
                # endswith() instead of token[-1] so an empty token cannot raise.
                sent_words.append(token[::-1])
                sent_tags.append("P")
            else :
                sent_words.append(token)
                sent_tags.append("UN")
        words.append(sent_words)
        tags.append(sent_tags)
    return words, tags

In [3]:
import os

# Load every file of the labelled corpus, preprocess it line by line and build
# the flat word / tag lists that the later cells work with.
files = os.listdir("Labeled-Hindi-Corpus")
hindi_tokens = []
for file_name in files :
    path = "Labeled-Hindi-Corpus/"+file_name
    # The context manager closes the file even if preprocessing raises.
    with codecs.open(path, "r", encoding='utf-8') as f :
        for sent in f.readlines() :
            sent = corpus_preprocess(sent.strip())
            hindi_tokens.append(handle_sentends(sent.split()))
hindi_words, hindi_tags = get_words_and_tags(hindi_tokens)
hindi_tags_list = [item for sublist in hindi_tags for item in sublist]
hindi_words_list = [item for sublist in hindi_words for item in sublist]
print(len(hindi_tags_list))
print(len(hindi_words_list))
print(len(hindi_words))

# Count every tag in a single pass instead of calling list.count() per tag.
tag_counts = {}
for tag in hindi_tags_list :
    tag_counts[tag] = tag_counts.get(tag, 0) + 1
print("Percent of UN tags:",tag_counts.get("UN", 0)*100/len(hindi_tags_list))

for tag in sorted(tag_counts) :
    print(tag, tag_counts[tag])
print(sorted(set(hindi_words_list)))

265781
265781
14003
Percent of UN tags: 0.5677606751423164
CC 2971
CCD 1366
CCS 925
CC_CCD 1409
CC_CCS 1464
CC_CC_N_NNP 4
CD 412
DEM 381
DM 314
DMD 1135
DMI 323
DMQ 59
DMR 36
DM_DMD 1501
DM_DMI 311
DM_DMQ 42
DM_DMR 62
DT 462
ECH 77
END 14004
FW 961
FW_FW 2
IN 6340
INJ 3
INTF 154
IN_DT 1
IN_IN 2
JJ 7771
JJ_JJ 4
JJ_JJR 1
JJ_JJS 804
JJ_JJ_N_NNP 3
JJ_N_NN 3
JJ_V_VAUX_N_NNP 1
NEG 908
NN 13
NNN 1
NNP 17
NST 657
N_NEG 234
N_NN 71035
N_NNC 1
N_NNP 18287
N_NNP_N_NNP 6
N_NNP_N_NNP_N_NNP 4
N_NNS 141
N_NN_FW 2
N_NN_JJ 2
N_NN_N_NNP_N_NNP 2
N_NN_N_NN_N_NNP 8
N_NN_SYM 1
N_NST 2186
P 927
PDT 4
PRC 2
PRF 406
PRI 6
PRP 4747
PRP$ 85
PRP$_PRP$ 4
PRPS 176
PRP_RPD 1
PRQ 360
PR_PRC 1
PR_PRF 443
PR_PRFRF 1
PR_PRL 310
PR_PRP 4924
PR_PRPRF 2
PR_PRP_DM_N_NNP 1
PR_PRP_PR_PRP_N_NNP 6
PR_PRQ 210
PSP 33962
PSP_PSP_N_NNP 5
PUNC 97
QC 272
QF 254
QO 26
QT 297
QTC 1041
QTF 917
QTO 106
QT_QTC 2491
QT_QTF 1045
QT_QTO 168
Q_QT 1
RB 1820
RB_JJ 7
RB_N_NNP 2
RB_RB 1
RB_RBR 106
RB_RBS 404
RDF 15
RD_ECH 2
RD_PUNC 1139
RD_PUNCN 

In [4]:
import numpy as np
from sklearn.preprocessing import normalize
import pandas as pd

def get_ngrams(n, tokens) :
    """Return all contiguous windows of length ``n`` over ``tokens``.

    Each window is the slice ``tokens[i:i+n]``; the result is empty when
    ``tokens`` is shorter than ``n``.
    """
    window_count = len(tokens)-n+1
    return [tokens[start:start+n] for start in range(window_count)]

def get_freq_dict(ngrams) :
    """Count n-grams.

    Each n-gram is keyed by the ``str()`` of its elements joined with single
    spaces; the value is the number of times that key occurred.
    """
    counts = {}
    for gram in ngrams :
        label = ' '.join(map(str, gram))
        counts[label] = counts.get(label, 0) + 1
    return counts

def create_tag_transition_matrix(hindi_tags_list) :
    """Build the tag-to-tag transition table from a flat tag sequence.

    Returns a nested dict ``TTP[previous_tag][next_tag]``. For a bigram seen in
    the sequence the value is ``count(previous next) / count(previous)``; every
    unseen pair gets the fixed value 0.0001.
    """
    unique_tags = sorted(set(hindi_tags_list))

    # How often each tag occurs (denominator of the transition estimate).
    occurrences = {}
    for tag in hindi_tags_list :
        occurrences[tag] = occurrences.get(tag, 0) + 1

    # Bigram counts keyed as "previous next".
    bigram_counts = get_freq_dict(get_ngrams(2, hindi_tags_list))

    TTP = {}
    for prev_tag in unique_tags :
        row = {}
        for next_tag in unique_tags :
            pair_key = prev_tag+" "+next_tag
            if pair_key in bigram_counts :
                row[next_tag] = bigram_counts[pair_key]/occurrences[prev_tag]
            else :
                row[next_tag] = 0.0001
        TTP[prev_tag] = row

    return TTP

In [5]:
def create_word_emission_prob(hindi_tags_list, hindi_words_list, k=1) :
    """Estimate add-k smoothed emission values for every observed (tag, word) pair.

    ``hindi_tags_list[i]`` is the tag of ``hindi_words_list[i]``. The result is
    a nested dict ``WEP[tag][word]`` holding
    ``(count(tag, word) + k) / (count(tag) + k * V)`` where ``V`` is the number
    of distinct words. Pairs that never occur are not stored.
    """
    tag_totals = {}
    pair_counts = {}
    for position, tag in enumerate(hindi_tags_list) :
        tag_totals[tag] = tag_totals.get(tag, 0) + 1
        word = hindi_words_list[position]
        words_of_tag = pair_counts.setdefault(tag, {})
        words_of_tag[word] = words_of_tag.get(word, 0) + 1

    vocab_size = len(set(hindi_words_list))

    smoothed = {}
    for tag, words_of_tag in pair_counts.items() :
        denominator = tag_totals[tag]+k*vocab_size
        smoothed[tag] = {
            word : (count+k)/denominator
            for word, count in words_of_tag.items()
        }
    return smoothed


In [6]:
def viterbi(test_tokens, corpus_words, corpus_tags, k=1) :
    """Tag a tokenised sentence with its highest-scoring tag sequence (Viterbi).

    Parameters
    ----------
    test_tokens : list of str
        Tokens of the sentence. The first token is discarded before decoding
        and the last token is decoded but left out of the returned string; the
        caller in this notebook passes ``preprocess_test(...).split()``, whose
        first and last tokens are the sentence boundary markers.
    corpus_words, corpus_tags : list of str
        Parallel flat lists of training words and their tags. ``corpus_tags``
        must contain the tag "START".
    k : number
        Additive smoothing constant used for the emission values.

    Returns
    -------
    str
        "word_TAG" items joined by single spaces; "" when there is nothing to
        decode.
    """
    TTP = create_tag_transition_matrix(corpus_tags)
    WEP = create_word_emission_prob(corpus_tags, corpus_words, k)

    # Count tags of the corpus that was passed in. (Reading a notebook global
    # here would tie the result to whichever cells happened to run before.)
    tags_count = {}
    for tag in corpus_tags :
        tags_count[tag] = tags_count.get(tag, 0) + 1

    test_tokens = test_tokens[1:]
    T = len(test_tokens)
    if T == 0 :
        return ""

    V = len(set(corpus_words))
    tags = sorted(set(corpus_tags))
    N = len(tags)

    def emission(tag, word) :
        # Observed pairs use the stored value; unseen pairs get the add-k floor.
        if tag in WEP and word in WEP[tag] :
            return WEP[tag][word]
        return k/(tags_count[tag]+k*V)

    # SEQSCORE[t][i]: best score of any tag path ending in tags[i] at position t.
    # BACKPTR[t][i]:  index of the previous tag on that best path.
    # NOTE(review): scores are plain products of probabilities, so very long
    # sentences may underflow to 0.0; log-space scores would avoid that.
    SEQSCORE = np.zeros((T, N))
    BACKPTR = np.zeros((T, N), dtype=int)

    first_word = test_tokens[0]
    for i, tag in enumerate(tags) :
        SEQSCORE[0][i] = TTP["START"][tag]*emission(tag, first_word)

    for t in range(1, T) :
        word = test_tokens[t]
        for i, tag in enumerate(tags) :
            emit = emission(tag, word)  # does not depend on the previous tag
            options = [SEQSCORE[t-1][j]*TTP[tags[j]][tag]*emit for j in range(N)]
            best = max(options)
            SEQSCORE[t][i] = best
            BACKPTR[t][i] = options.index(best)

    # Follow the back-pointers from the best final state.
    C = [0]*T
    C[-1] = int(np.argmax(SEQSCORE[T-1]))
    for t in range(T-2, -1, -1) :
        C[t] = int(BACKPTR[t+1][C[t+1]])

    # The last decoded token is not part of the output.
    return " ".join(test_tokens[i]+"_"+tags[C[i]] for i in range(T-1))
            

In [7]:
import numpy as np
def preprocess_test(sent) :
    """Wrap a raw test sentence in the sentence boundary markers.

    The result always starts with "<s> ". An end marker (less-than, backslash,
    "s", greater-than) is appended with a separating space unless the sentence
    already ends with one, in which case only a missing space is inserted.

    Parameters
    ----------
    sent : str
        The sentence to tag, with or without boundary markers.

    Returns
    -------
    str
        The sentence with normalised boundary markers.
    """
    # Raw string: "\s" is not a valid escape sequence in a normal literal.
    end_marker = r"<\s>"

    # The start marker may sit at offset 0 or 1.
    if sent.startswith("<s>") :
        sent = "<s> "+sent[3:]
    elif sent.startswith("<s>", 1) :
        sent = "<s> "+sent[4:]
    else :
        sent = "<s> "+sent

    # First branch: end marker followed by exactly one trailing character.
    if(sent[-5:-1] == end_marker) :
        if(sent[-6] != " ") :
            sent = sent[:-5]+" "+end_marker
    elif sent.endswith(end_marker) :
        if(sent[-5] != " ") :
            sent = sent[:-4]+" "+end_marker
    else :
        sent = sent+" "+end_marker
    return sent

In [14]:
def main() :
    """Load the labelled corpus, tag one example sentence and print the result.

    Reads every file in the "Labeled-Hindi-Corpus" directory, preprocesses it
    line by line, flattens the words and tags, and runs ``viterbi`` on a fixed
    Hindi example sentence.
    """
    corpus_dir = "Labeled-Hindi-Corpus"
    hindi_tokens = []
    for file_name in os.listdir(corpus_dir) :
        path = corpus_dir+"/"+file_name
        # The context manager closes the file even if preprocessing raises.
        with codecs.open(path, "r", encoding='utf-8') as f :
            for sent in f.readlines() :
                sent = corpus_preprocess(sent.strip())
                hindi_tokens.append(handle_sentends(sent.split()))
    hindi_words, hindi_tags = get_words_and_tags(hindi_tokens)
    hindi_tags_list = [item for sublist in hindi_tags for item in sublist]
    hindi_words_list = [item for sublist in hindi_words for item in sublist]

    sent = "केजरीवाल सरकार की महत्वाकांक्षी योजना"
    sent = preprocess_test(sent)
    tagged_sent = viterbi(sent.split(), hindi_words_list, hindi_tags_list)
    print(tagged_sent)

In [15]:
# Run the demo: load the corpus and print the tagged example sentence.
main()

केजरीवाल_N_NNP सरकार_N_NN की_PSP महत्वाकांक्षी_JJ योजना_N_NN
