In [1]:
import pandas as pd
from pprint import pprint
import numpy as np
import pycrfsuite
from sklearn.model_selection import train_test_split
import re

In [2]:
data=pd.read_csv("labeled_data.txt",encoding="utf-8",sep=" ",names=['word','label'],skip_blank_lines=False)
df=pd.DataFrame(data)
df.replace(np.NaN,"Break",inplace=True)

In [3]:
def preprocessing(sentence):
    whitespace = re.compile(u"[\ufeff\u200d\u200b\u200c\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
    fullspace = re.compile(u"[\s\u0020]+", re.UNICODE)
    bangla_fullstop = u"\u0964"
    punctSeq   = u"['\"“”‘’]+|[.?!,…]+|[:;]+"
    punc = u"[(),$%^&*+={}\[\]:\"|\'\~`<>/,¦!?½£¶¼©⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞⅟↉¤¿º;-]+"
    sentence= whitespace.sub("",sentence)
    sentence= fullspace.sub(" ",sentence)
    sentence = re.sub(r'^https?:\/\/.*[\r\n]*', "", sentence, flags=re.MULTILINE)
    sentence = re.sub(punctSeq, "", sentence)
    sentence = re.sub(bangla_fullstop, "",sentence)
    sentence = re.sub(punc, "", sentence)
    return sentence

In [4]:
def remove_empty_list(data):
    a=list(filter(lambda x: [] != x, data))
    return a

#for separating sentence
def making_list(data):
    f=[]
    t=[]
    for i in data:
        if (i=="Break"):
            f.append(t)
            t=[]
        else:
            t.append(i)
    return remove_empty_list(f)

In [5]:
def making_list_for_preprocessing(data):
    l=[]
    for i in data:
        l.append(preprocessing(i))
    return making_list(l)

In [6]:
text=making_list_for_preprocessing(df['word'])
label=making_list(df['label'])

In [7]:
def word2features(sent, i):
    word = sent[i]
    features = [
        'bias',
#         'word[-1:]=' + word[-1:],
        'size_of_word_prefix_2[-2:]=' + word[-2:],
#         'word[-3:]=' + word[-3:],
        'size_of_word_prefix_4[-4:]=' + word[-4:],
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        word1 = sent[i-1]
        features.extend([
            'current_word='+word,
            'previous_word=' + word1,
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1]
        features.extend([
            'current_word='+word,
            'next_word=' + word1,
        ])
    else:
        features.append('EOS')
                
    return features

In [8]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [9]:
def mapping(text_train,label_train):
    X_train=[sent2features(text_train[i]) for i in range(len(text_train))]

    trainer=pycrfsuite.Trainer()

    for xseq, yseq in zip(X_train, label_train):
            trainer.append(xseq, yseq)    

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50, # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    return trainer

In [10]:
def accuracy(text_test,label_test):
    result=[tagger.tag(sent2features(i)) for i in text_test]
    a=0
    c=0
    for i in range(len(result)):
        for j in range(len(result[i])):
            if result[i][j]==label_test[i][j]:
                a+=1
            c+=1
    b=(float(a) / c)*100 #Here c is total length of test part of Label
    return b

In [11]:
def tag_details():
    tag_distribution = data.groupby("label").size().reset_index(name='counts')
    print(tag_distribution)

In [12]:
def counting_label(string):
    a=0
    for i in range(len(data['label'])):
        if string==data['label'][i]:
            a+=1
    return a

In [13]:
def data_details():
    print("Total Sentence is:",counting_label("Break"))
    print("Total Word is:",len(df['word'])-counting_label("Break"))
    print("Total Unusable word is:",counting_label("0"))
    print("Total Single Word Name is:",counting_label('B-PER')-counting_label('I-PER')-counting_label('O-PER'))
    print("Total Double Word Name is:",counting_label('O-PER'))
    print("Total Triple Word Name is:",counting_label('I-PER'))
    print("Total Single Word Location is:",counting_label('B-LOC')-counting_label('I-LOC')-counting_label('O-LOC'))
    print("Total Double Word Location is:",counting_label('O-LOC'))
    print("Total Triple Word Location is:",counting_label('I-LOC'))
    print("Total Single Word Organization is:",counting_label('B-ORG')-counting_label('I-ORG')-counting_label('O-ORG'))
    print("Total Double Word Organization is:",counting_label('O-ORG'))
    print("Total Triple Word Organization is:",counting_label('I-ORG'))
    print("Total Single Word Time is:",counting_label('B-TIME')-counting_label('I-TIME')-counting_label('O-TIME'))
    print("Total Double Word Time is:",counting_label('O-TIME'))
    print("Total Triple Word Time is:",counting_label('I-TIME'))
    print("Total Single Word Unit is:",counting_label('B-UNIT')-counting_label('I-UNIT')-counting_label('O-UNIT'))
    print("Total Double Word Unit is:",counting_label('O-UNIT'))
    print("Total Triple Word Unit is:",counting_label('I-UNIT'))

In [14]:
tag_details()

     label  counts
0        0    6908
1    B-LOC     809
2    B-ORG     272
3    B-PER     860
4   B-TIME     136
5   B-UNIT     571
6    Break    1961
7    I-LOC       7
8    I-ORG      34
9    I-PER       4
10  I-TIME       3
11  I-UNIT      36
12   O-LOC      31
13   O-ORG      55
14   O-PER      95
15  O-TIME      71
16  O-UNIT     154


In [15]:
data_details()

Total Sentence is: 1961
Total Word is: 10046
Total Unusable word is: 6908
Total Single Word Name is: 761
Total Double Word Name is: 95
Total Triple Word Name is: 4
Total Single Word Location is: 771
Total Double Word Location is: 31
Total Triple Word Location is: 7
Total Single Word Organization is: 183
Total Double Word Organization is: 55
Total Triple Word Organization is: 34
Total Single Word Time is: 62
Total Double Word Time is: 71
Total Triple Word Time is: 3
Total Single Word Unit is: 381
Total Double Word Unit is: 154
Total Triple Word Unit is: 36


In [16]:
data_t=pd.read_csv("test_data.txt",encoding="utf-8",sep=" ",names=['word','label'],skip_blank_lines=False)
df_t=pd.DataFrame(data_t)
df_t.replace(np.NaN,"Break",inplace=True)

In [17]:
text_test_data=making_list_for_preprocessing(df_t['word'])
label_test_data=making_list(df_t['label'])

In [18]:
for i in range(len(text_test_data)):
    if len(text_test_data[i])!= len(label_test_data[i]):
        print(text_test_data[i],label_test_data[i],i)

In [19]:
len(text_test_data)

270

In [20]:
text_train,text_test,label_train,label_test=train_test_split(text,label,test_size=0.2)
trainer=mapping(text_train,label_train)
trainer.train('bangla-ner.crfsuite')
tagger = pycrfsuite.Tagger()
tagger.open('bangla-ner.crfsuite')
a=accuracy(text_test_data,label_test_data)
print(a)
# trainer.train('best_accuracy2/bangla-ner'+str(a)+'.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 16088
Seconds required: 0.071

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 14531.995260
Feature norm: 1.000000
Error norm: 6448.709341
Active features: 6893
Line search trials: 1
Line search step: 0.000120
Seconds required for this iteration: 0.039

***** Iteration #2 *****
Loss: 12210.888924
Feature norm: 2.878986
Error norm: 3991.382019
Active features: 6838
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.054

***** Iteration #3 *****
Loss: 10368.210364
Feature norm: 2.348081
Error norm: 1935.930744
Active features: 6780
Line search trials: 1
Line search step: 1.000000
Seconds required for this i

***** Iteration #46 *****
Loss: 3847.530930
Feature norm: 60.541760
Error norm: 55.277780
Active features: 1657
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #47 *****
Loss: 3845.314052
Feature norm: 60.633988
Error norm: 28.711532
Active features: 1645
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #48 *****
Loss: 3843.460486
Feature norm: 60.922550
Error norm: 48.337123
Active features: 1631
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #49 *****
Loss: 3841.100613
Feature norm: 61.009151
Error norm: 46.516047
Active features: 1626
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #50 *****
Loss: 3839.388469
Feature norm: 61.377005
Error norm: 61.524879
Active features: 1612
Line search trials: 1
Line search step: 1.000000
Seconds required fo

In [21]:
# trainer.train('bangla-ner.crfsuite')
# tagger = pycrfsuite.Tagger()
# tagger.open('best_accuracy2/bangla-ner82.6.crfsuite')
# print("Accuracy of this model is:",accuracy(text_test_data,label_test_data),"%")

In [22]:
example_sent = preprocessing(input().strip())

বাংলাদেশের রাজধানী ঢাকা


In [23]:
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent.split()))))

Predicted: B-LOC 0 B-LOC


In [24]:
pprint(sent2features(example_sent.split()))

[['bias',
  'size_of_word_prefix_2[-2:]=ের',
  'size_of_word_prefix_4[-4:]=েশের',
  'word.isdigit=False',
  'BOS',
  'current_word=বাংলাদেশের',
  'next_word=রাজধানী'],
 ['bias',
  'size_of_word_prefix_2[-2:]=নী',
  'size_of_word_prefix_4[-4:]=ধানী',
  'word.isdigit=False',
  'current_word=রাজধানী',
  'previous_word=বাংলাদেশের',
  'current_word=রাজধানী',
  'next_word=ঢাকা'],
 ['bias',
  'size_of_word_prefix_2[-2:]=কা',
  'size_of_word_prefix_4[-4:]=ঢাকা',
  'word.isdigit=False',
  'current_word=ঢাকা',
  'previous_word=রাজধানী',
  'EOS']]
