# Named Entity Recognition for Turkish Language

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re

!pip install sklearn-crfsuite
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

## Importing Data

In [30]:
# Read both input files fully into memory, one list element per line.
# The encoding is given explicitly so that Turkish characters are decoded the
# same way regardless of the platform's default locale.
# NOTE(review): NE.txt presumably holds the ENAMEX-tagged sentences and
# NE.ma.txt the per-token morphological analyses -- confirm against the data.
with open('/content/drive/MyDrive/NE.txt', encoding='utf-8') as f:
    ne = f.readlines()
with open('/content/drive/MyDrive/NE.ma.txt', encoding='utf-8') as f:
    ne_ma = f.readlines()

print('Size of the dataset:',len(ne))

Size of the dataset: 10000


## Split Data into 5 Folds

In [31]:
# Five independent, initially empty buckets -- one per cross-validation fold.
fold1, fold2, fold3, fold4, fold5 = ([] for _ in range(5))

In [32]:
# Deal the lines out round-robin: line 1 -> fold1, line 2 -> fold2, ...,
# line 5 -> fold5, line 6 -> fold1 again, and so on.
_fold_buckets = (fold1, fold2, fold3, fold4, fold5)
count = 1
for line in ne:
  _fold_buckets[count - 1].append(line)
  # Advance the 1..5 counter, wrapping back to 1 after the fifth fold.
  count = count + 1 if count < 5 else 1

In [33]:
# Wrap every fold's raw lines in a single-column ('Lines') DataFrame.
f1, f2, f3, f4, f5 = (
  pd.DataFrame(_fold_lines, columns=['Lines'])
  for _fold_lines in (fold1, fold2, fold3, fold4, fold5)
)

## Extracting Labels from Data

### 1- Getting Raw Labels with REGEX

In [34]:
def GetLabel(text):
  """Map every ENAMEX-tagged phrase in ``text`` to its entity TYPE.

  Only phrases made of word characters and whitespace are matched. When the
  same phrase occurs more than once, the TYPE of the last occurrence wins.
  """
  tagged = re.findall(r'<b_enamex TYPE="([A-Z]+)">([\w\s]+)<e_enamex>',text)
  return {phrase: entity_type for entity_type, phrase in tagged}

In [35]:
# Add a 'Labels' column (phrase -> entity type dict) to every fold frame.
for _fold_frame in (f1, f2, f3, f4, f5):
  _fold_frame['Labels'] = _fold_frame['Lines'].apply(GetLabel)

### 2- Getting IOB Labels

In [36]:
def GetIOBLabels(mydict):
  """Expand a phrase -> TYPE dict into a token -> IOB-tag dict.

  Each phrase is split on single spaces. Tokens equal to the phrase's first
  token are tagged 'B-<TYPE>', all other tokens 'I-<TYPE>'. A token that
  appears again later (in the same or another phrase) overwrites its
  earlier tag.
  """
  tagged_tokens = {}
  for phrase, entity_type in mydict.items():
    tokens = phrase.split(' ')
    opener = tokens[0]
    for token in tokens:
      prefix = 'B-' if token == opener else 'I-'
      tagged_tokens[token] = prefix + entity_type
  return tagged_tokens

In [37]:
# Add an 'IOBLabels' column (token -> B-/I- tag dict) to every fold frame.
for _fold_frame in (f1, f2, f3, f4, f5):
  _fold_frame['IOBLabels'] = _fold_frame['Labels'].apply(GetIOBLabels)

## Importing Location, Person, and Organization Gazetteers to Add New Features

In [38]:
# Load the three gazetteers, one entry per line.
# Only the line terminator is stripped (rather than blindly dropping the last
# character), so a final line without a trailing newline keeps its last letter.
# The encoding is explicit so Turkish characters decode the same everywhere.
with open('/content/drive/MyDrive/locations_lexicon.txt', encoding='utf-8') as f:
    locationList = [line.rstrip('\n') for line in f]
with open('/content/drive/MyDrive/person_lexicon.txt', encoding='utf-8') as f:
    personList = [line.rstrip('\n') for line in f]
with open('/content/drive/MyDrive/organization_lexicon.txt', encoding='utf-8') as f:
    organizationList = [line.rstrip('\n') for line in f]

In [39]:
def LocationLexicon(text):
  """Return 1 if ``text`` is an entry of ``locationList``, otherwise 0."""
  return int(text in locationList)

def PersonLexicon(text):
  """Return 1 if ``text`` is an entry of ``personList``, otherwise 0."""
  return int(text in personList)

def OrganizationLexicon(text):
  """Return 1 if ``text`` is an entry of ``organizationList``, otherwise 0."""
  return int(text in organizationList)

## Feature Extraction

In [40]:
# One row per line of ne_ma; the raw text ends up in the column named 0.
ne_df = pd.DataFrame(data=ne_ma)

In [41]:
def getLineNum(text):
  """Return the first space-separated field of ``text`` (as a string)."""
  first_field, _, _ = text.partition(' ')
  return first_field

def getWord(text):
  """Return the surface word from the second space-separated field.

  A field that is exactly one apostrophe yields the placeholder "' '".
  Otherwise the field is cut at apostrophes: the part before the first
  apostrophe is returned, or -- when the field starts with an apostrophe --
  the part right after it.
  """
  token = text.split(' ')[1]
  if token == "'":
    return "' '"
  pieces = token.split("'")
  # A leading apostrophe leaves an empty first piece; fall through to the next.
  return pieces[0] if pieces[0] != '' else pieces[1]

def getInflectionalPart(text):
  """Return the third space-separated field of ``text`` minus its last character."""
  return text.split(' ')[2][:-1]

In [42]:
# Columns parsed straight from the raw line text stored in column 0.
for _column, _parser in (
    ('LineNum', getLineNum),
    ('Word', getWord),
    ('Inflectional', getInflectionalPart),
):
  ne_df[_column] = ne_df[0].apply(_parser)

# Gazetteer Features
for _column, _lookup in (
    ('LocationLexicon', LocationLexicon),
    ('PersonLexicon', PersonLexicon),
    ('OrganizationLexicon', OrganizationLexicon),
):
  ne_df[_column] = ne_df['Word'].apply(_lookup)

In [43]:
def GetRoot(text):
  """Return the lower-cased part of ``text`` that precedes the first '+'."""
  root, _, _ = text.partition('+')
  return root.lower()

def GetNounCase(text):
  """Return the last '+'-separated tag when a 'Noun' tag is present, else '0'."""
  tags = text.split('+')
  return tags[-1] if 'Noun' in tags else '0'

def PropExist(text):
  """Return 1 if 'Prop' is one of the '+'-separated tags of ``text``, else 0."""
  return int('Prop' in text.split('+'))

def GetPOS(text):
  """Return the second '+'-separated tag of ``text``.

  'Verb^DB' and 'Adj^DB' are reported as 'Verb' and 'Adj'. The sentinel
  '*UNKNOWN*' is passed through unchanged.
  """
  if text == '*UNKNOWN*':
    return text
  tag = text.split('+')[1]
  # Strip the derivation-boundary marker from the two tags that may carry it.
  return {'Verb^DB': 'Verb', 'Adj^DB': 'Adj'}.get(tag, tag)

def GetAllInflectional(text):
  """Return the '+'-joined tags that follow the first part-of-speech tag.

  The tags are scanned left to right for the first POS tag. For 'Verb^DB'
  or 'Adj^DB' the tag immediately after it is skipped as well. Returns '0'
  when exactly nothing remains after the cut point's successor position,
  an empty string when the cut lies beyond that (including when no POS tag
  is found), and the sentinel '*UNKNOWN*' unchanged.
  """
  if text == '*UNKNOWN*':
    return text

  derived_pos = ('Verb^DB', 'Adj^DB')
  plain_pos = ('Noun', 'Adj', 'Conj', 'Verb', 'Num', 'Punct','Postp', 'Det', 'Adverb', 'Interj', 'Pron', 'Ques','Dup')

  tags = text.split('+')
  cut = len(tags)  # value reached when no POS tag is present
  for position, tag in enumerate(tags):
    if tag in derived_pos:
      cut = position + 1  # also skip the tag following the ^DB marker
      break
    if tag in plain_pos:
      cut = position
      break

  if cut + 1 == len(tags):
    return '0'
  return '+'.join(tags[cut + 1:])

def GetCase(text):
  """Return 'UC' if ``text`` starts with an upper-case character, else 'DC'.

  An empty string is reported as 'DC' instead of raising IndexError; the
  slice ``text[:1]`` is used so that indexing can never fail.
  """
  return 'UC' if text[:1].isupper() else 'DC'

In [44]:
# Flag the first token of every sentence: a row starts a sentence when its
# LineNum differs from the previous row's. shift() leaves the first row with
# no predecessor (NaN), so the comparison is True there and row 0 is flagged
# too. This replaces a row-by-row .loc loop with one vectorised comparison.
ne_df['StartOfSent'] = (ne_df['LineNum'] != ne_df['LineNum'].shift()).astype('int64')

# Features derived from the morphological analysis string.
ne_df['Root'] = ne_df['Inflectional'].apply(GetRoot)
ne_df['NounCase'] = ne_df['Inflectional'].apply(GetNounCase)
ne_df['PropExist'] = ne_df['Inflectional'].apply(PropExist)
ne_df['POS'] = ne_df['Inflectional'].apply(GetPOS)
ne_df['AllInflectional'] = ne_df['Inflectional'].apply(GetAllInflectional)
# Orthographic feature taken from the surface word.
ne_df['Case'] = ne_df['Word'].apply(GetCase)
# Placeholder column; the labels are filled in later.
ne_df['Label'] = ''

### Assigning Labels to Tokens in ne_df Data

In [45]:
def LabellingFolds(ne_df, fold, foldnum, step=5):
  """Write IOB labels into ``ne_df['Label']`` for every sentence of one fold.

  The i-th row of ``fold`` (0-based) belongs to sentence number
  ``foldnum + step * i``; every token row of ``ne_df`` whose 'LineNum' equals
  that number (as a string) receives the tag stored for its 'Word' in the
  row's 'IOBLabels' dict, or 'O' when the word has no entry.

  The sentence numbers are derived from the rows of ``fold`` itself rather
  than from a fixed ``range(foldnum, 10000, 5)``; that fixed range stopped
  before 10000 and therefore never labelled the last sentence of fold 5.

  Parameters:
    ne_df: token-level DataFrame with 'LineNum', 'Word' and 'Label' columns.
    fold: DataFrame with an 'IOBLabels' column holding token -> tag dicts.
    foldnum: 1-based sentence number of the fold's first sentence.
    step: distance between consecutive sentence numbers of the same fold.

  Returns:
    ``ne_df`` itself, modified in place.
  """
  # Group the token rows by sentence once instead of scanning the whole
  # frame for every sentence.
  rows_by_line = ne_df.groupby('LineNum', sort=False).groups
  for position, sentence_labels in enumerate(fold['IOBLabels']):
    token_rows = rows_by_line.get(str(foldnum + step * position))
    if token_rows is None:
      continue  # no token rows carry this sentence number
    words = ne_df.loc[token_rows, 'Word']
    ne_df.loc[token_rows, 'Label'] = [sentence_labels.get(word, 'O') for word in words]
  print('Fold',foldnum,'Ends')
  return ne_df

In [46]:
# Label the tokens of each fold in turn (fold k starts at sentence number k).
for _fold_number, _fold_frame in enumerate((f1, f2, f3, f4, f5), start=1):
  ne_df = LabellingFolds(ne_df, _fold_frame, _fold_number)
# Manual patch: force the label of row 165847 to 'O'.
# NOTE(review): confirm why this particular row needs to be set by hand.
ne_df.loc[165847,'Label']='O'

Fold 1 Ends
Fold 2 Ends
Fold 3 Ends
Fold 4 Ends
Fold 5 Ends


## Creating Train Data

In [47]:
def createTrainData(ne_df,foldnum, features, stop=10000, step=5):
  """Build the per-sentence feature and label sequences of one fold.

  For every sentence number in ``range(foldnum, stop, step)`` the token rows
  whose 'LineNum' equals that number (as a string) are collected, in frame
  order. A sentence number with no rows contributes an empty list.

  Parameters:
    ne_df: token-level DataFrame with 'LineNum', 'Label' and feature columns.
    foldnum: sentence number of the fold's first sentence.
    features: names of the columns to export as features.
    stop: exclusive upper bound of the sentence numbers.
      NOTE(review): with the default, sentence number 10000 itself is not
      included in any fold -- confirm whether that is intended.
    step: distance between consecutive sentence numbers of the same fold.

  Returns:
    ``(X, y)`` where ``X`` is a list of sentences, each a list of
    feature dicts (one per token), and ``y`` the matching list of label lists.
  """
  # Row positions per sentence, computed once instead of filtering the whole
  # frame twice for every sentence.
  positions_by_line = ne_df.groupby('LineNum', sort=False).indices
  foldList_X = []
  foldList_y = []
  for ind in range(foldnum, stop, step):
    positions = positions_by_line.get(str(ind))
    if positions is None:
      foldList_X.append([])
      foldList_y.append([])
      continue
    sentence = ne_df.iloc[positions]
    foldList_X.append(sentence.loc[:,features].to_dict('records'))
    foldList_y.append(list(sentence.loc[:,'Label']))
  print('Function Worked for Fold Number {}'.format(foldnum))
  return foldList_X, foldList_y

In [48]:
# Feature columns fed to the CRF; identical for every fold.
_crf_feature_columns = ['StartOfSent','Root','NounCase','PropExist','POS','AllInflectional','LocationLexicon','PersonLexicon','OrganizationLexicon','Case']

X_train_fold1, y_train_fold1 = createTrainData(ne_df, foldnum=1, features=_crf_feature_columns)
X_train_fold2, y_train_fold2 = createTrainData(ne_df, foldnum=2, features=_crf_feature_columns)
X_train_fold3, y_train_fold3 = createTrainData(ne_df, foldnum=3, features=_crf_feature_columns)
X_train_fold4, y_train_fold4 = createTrainData(ne_df, foldnum=4, features=_crf_feature_columns)
X_train_fold5, y_train_fold5 = createTrainData(ne_df, foldnum=5, features=_crf_feature_columns)

# Fold data collected in fold order, so that index k-1 holds fold k.
X_list = [
  X_train_fold1,
  X_train_fold2,
  X_train_fold3,
  X_train_fold4,
  X_train_fold5,
]
y_list = [
  y_train_fold1,
  y_train_fold2,
  y_train_fold3,
  y_train_fold4,
  y_train_fold5,
]

Function Worked for Fold Number 1
Function Worked for Fold Number 2
Function Worked for Fold Number 3
Function Worked for Fold Number 4
Function Worked for Fold Number 5


## Modelling

In [49]:
def calculateFold(listX, listy,foldnum,c1,c2,max_iter):
  """Train a CRF on all folds but one and evaluate it on the held-out fold.

  Parameters:
    listX: list of folds; each fold is a list of sentences, each sentence a
      list of per-token feature dicts.
    listy: list of folds of label sequences, parallel to ``listX``.
    foldnum: 1-based number of the fold to hold out for testing.
    c1, c2: regularisation coefficients passed to ``sklearn_crfsuite.CRF``.
    max_iter: passed to the CRF as ``max_iterations``.

  Returns:
    ``(f1, precision, recall)`` of the held-out fold, each a weighted average
    over every label the model knows except 'O'. A per-label classification
    report is printed as a side effect.

  Raises:
    ValueError: if ``foldnum`` is not between 1 and ``len(listX)``.
  """
  n_folds = len(listX)
  if not 1 <= foldnum <= n_folds:
    raise ValueError('foldnum must be between 1 and {}, got {}'.format(n_folds, foldnum))

  # Hold out fold `foldnum`; concatenate the remaining folds, in order.
  held_out = foldnum - 1
  X_train = [sentence for i, part in enumerate(listX) if i != held_out for sentence in part]
  y_train = [labels for i, part in enumerate(listy) if i != held_out for labels in part]
  X_test = listX[held_out]
  y_test = listy[held_out]

  crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=c1,
    c2=c2,
    max_iterations=max_iter,
    all_possible_transitions=True)
  crf.fit(X_train, y_train)

  # Score entity labels only; filtering (instead of list.remove) also copes
  # with a model that never saw an 'O' label.
  my_labels = [label for label in crf.classes_ if label != 'O']

  y_pred = crf.predict(X_test)
  fold_f1 = metrics.flat_f1_score(y_test, y_pred,average='weighted', labels=my_labels)
  precision = metrics.flat_precision_score(y_test, y_pred,average='weighted', labels=my_labels)
  recall = metrics.flat_recall_score(y_test, y_pred,average='weighted', labels=my_labels)
  # Sort by entity type first, then by the B/I prefix, so that the B- and I-
  # rows of one entity type sit next to each other in the report.
  sorted_labels = sorted(my_labels,key=lambda name: (name[1:], name[0]))
  print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
  return fold_f1, precision, recall

In [50]:
# Hyper-parameters shared by all five cross-validation runs.
_crf_settings = {'c1': 0.02, 'c2': 0.02, 'max_iter': 100}

fold1_f1, fold1_precision, fold1_recall = calculateFold(X_list, y_list, foldnum=1, **_crf_settings)
fold2_f1, fold2_precision, fold2_recall = calculateFold(X_list, y_list, foldnum=2, **_crf_settings)
fold3_f1, fold3_precision, fold3_recall = calculateFold(X_list, y_list, foldnum=3, **_crf_settings)
fold4_f1, fold4_precision, fold4_recall = calculateFold(X_list, y_list, foldnum=4, **_crf_settings)
fold5_f1, fold5_precision, fold5_recall = calculateFold(X_list, y_list, foldnum=5, **_crf_settings)

# Plain mean of the five per-fold scores.
_f1_total = fold1_f1 + fold2_f1 + fold3_f1 + fold4_f1 + fold5_f1
_precision_total = fold1_precision + fold2_precision + fold3_precision + fold4_precision + fold5_precision
_recall_total = fold1_recall + fold2_recall + fold3_recall + fold4_recall + fold5_recall
print('Average F1 Score:', _f1_total / 5)
print('Average Precision:', _precision_total / 5)
print('Average Recall:', _recall_total / 5)

                precision    recall  f1-score   support

    B-LOCATION      0.933     0.909     0.921       836
    I-LOCATION      0.658     0.575     0.613        87
B-ORGANIZATION      0.900     0.835     0.866       624
I-ORGANIZATION      0.805     0.764     0.784       351
      B-PERSON      0.920     0.889     0.904      1059
      I-PERSON      0.869     0.888     0.878       439

     micro avg      0.895     0.863     0.878      3396
     macro avg      0.847     0.810     0.828      3396
  weighted avg      0.894     0.863     0.878      3396

                precision    recall  f1-score   support

    B-LOCATION      0.937     0.914     0.925       747
    I-LOCATION      0.720     0.771     0.745        70
B-ORGANIZATION      0.885     0.828     0.856       547
I-ORGANIZATION      0.792     0.784     0.788       306
      B-PERSON      0.903     0.867     0.885      1053
      I-PERSON      0.874     0.877     0.876       457

     micro avg      0.888     0.863     0.8