In [1]:
import os

files = os.listdir("Labeled-Hindi-Corpus")
print(len(files))

14


In [2]:
import re

references = "[?.,]*\[?[0१२३४५६७८९०][१२३४५६७८९०0]*\]"
extras = "[\)\(]"
eclipse = "\.+"
puncts = "[,\-']"
rep = [references, extras, puncts]
rep_regex = '|'.join('%s' % val for val in rep)
add_space = []
sent_ends = ["?", "!", ".", "।", "|"]
add_space = ["V_VB", "N_NN", "RD_PUNC", "JJ ", "NNP", "N_NNP"]
def handle_sentends(tokens) :
    n = len(tokens)
    i = 0
    while(i < n) :
        token = tokens[i]
        for end in sent_ends :
            if end in token :
                if(len(token) != 1):
                    tokens[i] = token.replace(end, "") 
                    tokens.insert(i+1, end+"_SENT")
                    n += 1
                    i += 1
                else :
                    tokens[i] = end+"_SENT"
        i += 1
    return tokens
def preprocess(sent) :
    sent = re.sub(rep_regex, "", sent)
    sent = sent.replace("<s>P", "<s>")
    sent = sent.replace("<s> P", "<s>")
    sent = sent.replace("</s>P", "</s>")
    sent = sent.replace("<\s>", "</s>")
    sent = sent.replace("</s><s>", "</s> <s>")
    sent = sent.replace("|P", "")
    sent = sent.replace("aux", "AUX")
    sent = sent.replace(" WQ", "_WQ")
    sent = sent.replace(" QW", "_QW")
    sent = sent.replace(" UT", "_UT ")
    sent = sent.replace(" XC", "_XC")
    sent = sent.replace(" QF", "_QF")
    sent = sent.replace(" DEM", "_DEM")
    sent = sent.replace(" UT", "_UT")
    sent = sent.replace(" QO", "_QO")
    sent = sent.replace(" RDP", "_RDP")
    sent = sent.replace("SP", "_SP")
    sent = sent.replace("V_VAUX", "V_VAUX ")
    sent = sent.replace("NST", "_NST")
    sent = sent.replace("fW", "FW")
    #sent = sent.replace("IN", "_IN")
    for tag in add_space :
        sent = sent.replace(tag, tag+" ")
    if(sent[0:3] == "<s>") :
        sent = "<s> "+sent[3:]
    elif(sent[1:4] == "<s>") :
        sent = "<s> "+sent[4:]
    else :
        sent = "<s> "+sent
    if(sent[-5:-1] == "</s>") :
        if(sent[-6] != " ") :
            sent = sent[:-5]+" </s>"
    elif(sent[-4:] == "</s>") :
        if(sent[-5] != " ") :
            sent = sent[:-4] +" </s>"
    else :
        sent = sent + " </s>"
    return sent

In [3]:
def get_words_and_tags(tokens) :
    words = []
    tags = []
    for sent_tokens in tokens :
        sent_words = []
        sent_tags = []
        for i in range(len(sent_tokens)) :
            token = sent_tokens[i]
            if(token.find("_") != -1) :
                pos = token.find("_")
                sent_words.append(token[:pos])
                sent_tags.append(token[pos+1:])
            elif(token == "<s>"):
                sent_words.append(token)
                sent_tags.append("START")
            elif(token == "</s>"):
                sent_words.append(token)
                sent_tags.append("END")   
            else :
                sent_words.append(token)
                sent_tags.append("UN")
        words.append(sent_words)
        tags.append(sent_tags)
    return words, tags

In [4]:
def get_ngrams(n, tokens) :
    ngrams = []
    for token in tokens :
        if(len(token) < n) :
            continue
        else :
            i = 0
            while(i < len(token)-n+1) :
                ngrams.append(token[i:i+n])
                i += 1
    return ngrams

def get_freq_dict(ngrams) :
    ngram_freq = {}
    for ngram in ngrams :
        key = ' '.join([str(elem) for elem in ngram])
        if key not in ngram_freq :
            ngram_freq[key] = 1
        else :
            ngram_freq[key] += 1
    return ngram_freq

def write_to_file(ngrams, fname) :
    f = open(fname, "w")
    to_write = ""
    for terms in ngrams :
        for term in terms :
            to_write += (term+" ")
        to_write = to_write[:-1]
        to_write += "\n"
    f.write(to_write)
    f.close()

In [5]:
hindi_tokens = []
for file in files:
    name = file
    file = "Labeled-Hindi-Corpus/"+file
    f = open(file, "r")
    sents = f.readlines()
    for sent in sents :
        sent = sent.strip()
        sent = preprocess(sent)
        hindi_tokens.append(handle_sentends(sent.split()))
    f.close()  
hindi_words, hindi_tags = get_words_and_tags(hindi_tokens)
hindi_tags_list = [item for sublist in hindi_tags for item in sublist]
print(set(hindi_tags_list))

{'PR_PRPRF', 'WDT', 'PR_PRP_PR_PRP_N_NN', 'PDT', 'QC', 'JJ_JJ', 'VBZ', 'QTC', 'FW_FW', 'NNN', 'DMI', 'WQ', 'RB_RBR', 'P', 'NN', 'V_VB', 'DM_DMQ', 'RDF', 'RD_ECH', 'UN', 'SP', 'QT_QTF', 'UH', 'RB_JJ', 'RB_N_NN', 'QTO', 'CC_CCD', 'NEG', '_RB', 'XC_XC_N_NN', 'P_SP_RD_SYM', 'QTF', 'CC_CCS', 'IN_IN', 'PR_PRP', 'END', 'QT_QTC', 'PR_PRL', '_NST', 'DM_DMI', 'QF', 'PRC', 'P_SP', 'NST', 'JJ_N_NN', 'CD', 'RD_SYM', 'SENT', 'JJ_JJ_N_NN', 'N_NEG', 'CCD', 'RB', 'V_VM_RD_PUNC', 'PR_PRQ', 'JJ_JJR', '_QT_QTF', 'QT_QTO', 'RP_INTF', 'RP_NEG', 'XC', 'V_VV', 'DM_DMR', 'JJ', 'DT', 'RP_NEG_RP_NEG_N_NN', 'RD_UNK_RD_UNK_N_NN', 'PRP', 'PR_PRP_DM_N_NN', 'Q_QT', 'PR_PRF', 'CC_CC_N_NN', 'SYM', 'N__NST', 'DMQ', 'VB', 'INJ', 'PUNC', 'PRP$_PRP$', 'PRI', 'CCS', 'WDTUNC', 'QO', 'DEM', 'P_SP_P_SP_N_NN', 'V_VM', 'NNP', 'JJ_V_VAUX', 'RB_RBS', 'WP', 'RP', 'PRF', 'FW', 'DMD', 'IN_DT', 'V_VM_SYM', 'RB_RB', 'V_VM_V_VAUX', 'N_NN', 'ECH', 'RDP', 'DMR', 'PR_PRFRF', 'IN', 'V_VMG', 'QT_QTC_RD_SYM', 'SYSM', 'PRP$', 'PRQ', 'RD_PUNC',

In [6]:
def get_perplexity_add(test_data, hindi_words, k, ngrams = 2) :
    
    test_tokens = []
    for row in test_data :
        test_tokens.extend(row)
    V = len(set(test_tokens))
    
    test_ngrams = get_ngrams(ngrams, test_data)
    N = len(test_ngrams)
    
    hindi_ngrams = get_ngrams(ngrams, hindi_words)
    prev_grams = get_ngrams(ngrams-1, hindi_words)
    
    ngram_freq = get_freq_dict(hindi_ngrams)
    prev_gram_freq = get_freq_dict(prev_grams)
    #print(ngram_freq)
    #print(prev_gram_freq)
    prob = 1
    for i in range(1, len(test_ngrams)) :
        prev = ""
        for j in range(len(test_ngrams[i])-1) :
            prev += (test_ngrams[i][j]+" ")
        prev = prev[:-1]
        ngram = ' '.join([str(elem) for elem in test_ngrams[i]])
        
        if ngram in ngram_freq :
            if prev in prev_gram_freq:
                #print(prev, ",", ngram, ",", prev_gram_freq[prev], ngram_freq[ngram])
                #print((ngram_freq[ngram]+k)/((prev_gram_freq[prev])+k*V))
                prob *= ((ngram_freq[ngram]+k)/((prev_gram_freq[prev])+k*V))
            else :
                #print(prev, ",", ngram,",", 0, ngram_freq[ngram])
                #print((ngram_freq[ngram]+k)/(k*V))
                prob *= ((ngram_freq[ngram]+k)/(k*V))
        else :
            if prev in prev_gram_freq :
                #print(prev, ",", ngram, ",", prev_gram_freq[prev], 0)
                #print(k/((prev_gram_freq[prev])+k*V))
                prob *= (k/((prev_gram_freq[prev])+k*V))
            else :
                #print(prev,",", ngram, ",", 0, 0)
                #print(1/V)
                prob *= (1/V)
        #print(prob)
    print(prob)
    print(N, V)
    perp = 1/prob
    perp = pow(perp, 1/N)
    return perp

In [7]:
#f = open("test.txt", "r")
#sents = f.readlines()
sents = ["योजना पूरे देश में होगी लागू", "देश लागू में होगी पूरे योजना"]
test_tokens = []
for sent in sents :
    sent = sent.strip()
    sent = preprocess(sent)
    test_tokens.append(sent.split())
print(test_tokens[0])
perplexity = get_perplexity_add([test_tokens[0]], hindi_words, k=1, ngrams=5)
print("Add_k: "+str(perplexity))
print(test_tokens[1])
perplexity = get_perplexity_add([test_tokens[1]], hindi_words, k=1, ngrams=3)
print("Add_k: "+str(perplexity))

['<s>', 'योजना', 'पूरे', 'देश', 'में', 'होगी', 'लागू', '</s>']
0.0054869684499314125
4 8
Add_k: 3.6742346141747673
['<s>', 'देश', 'लागू', 'में', 'होगी', 'पूरे', 'योजना', '</s>']
2.2194602272727273e-05
6 8
Add_k: 5.9652059016524355


In [8]:
f = open("test.txt", "r")
test_tokens = []
sents = f.readlines()
for sent in sents :
    sent = sent.strip()
    sent = preprocess(sent)
    test_tokens.append(sent.split())
perplexity = get_perplexity_add(test_tokens, hindi_words, k=1, ngrams=5)
print("Add_k: "+str(perplexity))
perplexity = get_perplexity_add(test_tokens, hindi_words, k=1, ngrams=2)
print("Add_k: "+str(perplexity))

3.6276622759582344e-195
105 74
Add_k: 71.09074699075714
5.587256197124121e-250
135 74
Add_k: 70.19675680052633
