#<b> Importing Dataset

In [None]:
!wget https://www.dropbox.com/s/qtbih1i6x2hrejr/entity-annotated-corpus.zip

#<b> Installing and Importing Libraries

In [None]:
! pip install eli5

In [None]:
! pip install sklearn_crfsuite

In [None]:
!unzip entity-annotated-corpus.zip

In [None]:
import pandas as pds
import eli5 as el
from sklearn_crfsuite import CRF as crf
from sklearn.model_selection import cross_val_predict as cvp
from sklearn_crfsuite.metrics import flat_classification_report as fcr


Let's consider only the first 10000 rows of this dataset: the full dataset is huge, and using all of it would be computationally expensive.

In [None]:

# Only the first N_ROWS rows are used. Passing nrows makes pandas stop parsing
# there instead of loading the whole file and discarding the rest afterwards.
N_ROWS = 10000
text_df = pds.read_csv("ner_dataset.csv", encoding="latin1", nrows=N_ROWS)

In [None]:
# Preview the first few rows of the loaded data.
text_df.head()

# <b>Data pre-processing step</b>
Forward-fill the missing values: every NA cell is replaced with the last non-missing value above it in the same column.


In [None]:
# Forward-fill missing cells with the last non-missing value above them.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument is
# deprecated in recent pandas releases.
# NOTE(review): presumably "Sentence #" is only populated on the first row of
# each sentence, which is what this fill propagates — confirm against the CSV.
text_df = text_df.ffill()

In [None]:
# Preview the first few rows again to check the result of the forward fill.
text_df.head()


# <b>Create the list of words with the column 'Word' from the dataset


In [None]:
# Array of every value in the "Word" column, duplicates included.
text_df["Word"].values

<b>Use the set function to build the collection of distinct words</b>

In [None]:
# Collapse the "Word" column to its distinct values.
set(text_df["Word"].values)

In [None]:
# Materialise the distinct words as a list. The list goes through a set, so its
# order is arbitrary and should not be relied on.
list_of_words = list(set(text_df["Word"].values))

# Echo the list as the cell output (this shows the entire vocabulary).
list_of_words

#<b> Count the number of distinct words

In [None]:
# Vocabulary size: the number of distinct values found in the "Word" column.
num_words = len(list_of_words)
print(num_words)

#**This class is defined to get sentences with POS and tags**

In [None]:
class get_tokenised_text(object):
    """Group the rows of a token-level DataFrame into per-sentence token lists.

    After construction, ``grouped_sentence`` holds the result of grouping the
    frame by its "Sentence #" column, and ``full_text`` is a plain list with one
    entry per group. Each entry is a list of ``(word, pos, tag)`` tuples built
    from the "Word", "POS" and "Tag" columns, e.g. ``('pope', 'NN', 'O')``.
    """

    def __init__(dummy, df):
        """Store ``df`` and precompute the grouped sentences.

        df: DataFrame with "Sentence #", "Word", "POS" and "Tag" columns.
        """
        dummy.index = 1
        dummy.df = df
        dummy.nulls = False

        def concat_function(text):
            # Zip the three parallel columns of one sentence into (word, pos, tag) tuples.
            words = text["Word"].values.tolist()
            pos_tags = text["POS"].values.tolist()
            tags = text["Tag"].values.tolist()
            return list(zip(words, pos_tags, tags))

        # One group per distinct "Sentence #" value keeps the sentences separate.
        dummy.grouped_sentence = dummy.df.groupby("Sentence #").apply(concat_function)
        dummy.full_text = list(dummy.grouped_sentence)
    
    

# <b>The function below returns the words, POS tags and entity tags of the next sentence as three lists</b>


In [None]:
    def get_text(dummy):
        """Return the words, POS tags and entity tags of the next sentence.

        Selects the rows whose "Sentence #" equals "Sentence: <dummy.index>".
        When such rows exist, ``dummy.index`` is advanced by one so the next
        call moves on to the following sentence.

        Returns:
            A ``(words, pos_tags, tags)`` tuple of lists. When the sentence does
            not exist or the lookup fails, ``dummy.nulls`` is set to True and
            ``(None, None, None)`` is returned instead.
        """
        try:
            text = dummy.df[dummy.df["Sentence #"] == "Sentence: {}".format(dummy.index)]
            words = text["Word"].values.tolist()
            pos_tags = text["POS"].values.tolist()
            tags = text["Tag"].values.tolist()
        except Exception:
            # Keep the best-effort contract for failed lookups, but do not use a
            # bare except: KeyboardInterrupt / SystemExit must still propagate.
            dummy.nulls = True
            return None, None, None

        if not words:
            # A boolean mask does not raise for a missing sentence, it simply
            # selects no rows, so running out of sentences is detected here.
            dummy.nulls = True
            return None, None, None

        dummy.index += 1
        return words, pos_tags, tags

In [None]:
# Group the token rows into sentences. The class defines no __repr__/__str__,
# so the print below only shows the default object representation.
text_df_interim = get_tokenised_text(text_df)
print(text_df_interim)

#<b>Converted Text to tokenized text

In [None]:
# List of sentences, each a list of (word, pos, tag) tuples.
# Note: the print below writes every sentence to the cell output.
tokenised_text = text_df_interim.full_text
print(tokenised_text)

# <b>Function to convert text into features</b>
<br>
The features extracted for each word include:

1.   Word parts (the lower-cased word and its last two and last three characters)
2.   Whether it is title-cased
3.   Whether it is a digit
4.   Whether it is upper case or lower case

We convert them into the sklearn-crfsuite format so that the package can read the data easily.
 <br>

<b>Each sentence is converted to a list of dicts</b>

In [None]:

def text2features(text, index):
    """Build the feature dict for the token at position ``index`` of a sentence.

    text: sequence of tokens; item 0 of a token is the word, item 1 its POS tag.
    index: position of the token to describe.

    The result always holds the token's own features. Features of the previous
    token are added under '-1:' keys and those of the next token under '+1:'
    keys; 'BOS' / 'EOS' mark a token with no previous / no next neighbour.
    """
    token = text[index]
    word = token[0]
    pos_tag = token[1]
    has_previous = index > 0
    has_next = index < len(text) - 1

    # Features of the token itself.
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'pos_tag': pos_tag,
        'pos_tag[:2]': pos_tag[:2],
    }

    # Left context.
    if not has_previous:
        features['BOS'] = True
    else:
        previous = text[index - 1]
        prev_word, prev_pos = previous[0], previous[1]
        features['-1:word.lower()'] = prev_word.lower()
        features['-1:word.istitle()'] = prev_word.istitle()
        features['-1:word.isupper()'] = prev_word.isupper()
        features['-1:pos_tag'] = prev_pos
        features['-1:pos_tag[:2]'] = prev_pos[:2]

    # Right context.
    if not has_next:
        features['EOS'] = True
    else:
        following = text[index + 1]
        next_word, next_pos = following[0], following[1]
        features['+1:word.lower()'] = next_word.lower()
        features['+1:word.istitle()'] = next_word.istitle()
        features['+1:word.isupper()'] = next_word.isupper()
        features['+1:pos_tag'] = next_pos
        features['+1:pos_tag[:2]'] = next_pos[:2]

    return features




In [None]:
def text_to_features_all(text):
    """Return the list of feature dicts for one sentence, one per token, in order."""
    features_per_token = []
    for position in range(len(text)):
        features_per_token.append(text2features(text, position))
    return features_per_token



In [None]:
def text_to_labels(text):
    """Return the label (third item) of every (token, postag, label) triple in ``text``."""
    labels = []
    for _token, _postag, label in text:
        labels.append(label)
    return labels


# <b>Features and labels</b>

1.   X contains all the feature data, such as the POS tag and whether the word is upper case, lower case or a digit.
2.   y is the output variable, i.e. the entity tag of each word.




In [None]:
# One list of feature dicts per sentence (one dict per token).
X = [text_to_features_all(text) for text in tokenised_text]
# Echo X as the cell output (this shows the features of every sentence).
X

In [None]:
# One list of labels per sentence, aligned token-by-token with X.
y = [text_to_labels(text) for text in tokenised_text]
# Echo y as the cell output (this shows the labels of every sentence).
y

#<b> Building CRF model

In [None]:
# Configure the baseline CRF (nothing is trained yet).
# c1 is the L1 and c2 the L2 regularization coefficient; training uses the
# 'lbfgs' algorithm capped at 100 iterations.
crf_model = crf(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#<b>Prediction

In [None]:
# Get 5-fold cross-validated predictions for every sentence; they are scored
# against y in the classification report.
prediction = cvp(estimator=crf_model, X=X, y=y, cv=5)

# <b>Print the output and check for the metrics. How the classification has been done using CRF


In [None]:
# Per-label classification report comparing the cross-validated predictions with the true labels.
classification_analysis = fcr(y_pred=prediction, y_true=y)
print(classification_analysis)

# <b>Let's check which transitions received the highest weights (most likely) and which received the lowest weights (least likely) in the generated model</b>


In [None]:
from collections import Counter
def transitions(trans_features):
    """Print one aligned line per ((label_from, label_to), weight) entry."""
    for labels, weight in trans_features:
        label_from, label_to = labels
        row = (label_from, label_to, weight)
        print("%-6s -> %-7s %0.6f" % row)
# cross_val_predict only fits clones of the estimator, so crf_model itself has
# to be fitted here before its learned transition weights can be read.
crf_model.fit(X, y)

# Rank every (label_from, label_to) transition by weight once and reuse the ranking.
ranked_transitions = Counter(crf_model.transition_features_).most_common()

print("Most occured transitions:")
transitions(ranked_transitions[:20])
print("\nMost rearely occuring transitions:")
transitions(ranked_transitions[-20:])

In [None]:
# Fit the baseline CRF on the full data set so its learned weights can be inspected.
crf_model.fit(X, y)

#<b> Weights of the model

In [None]:
# Inspect the weights learned by the fitted baseline model.
# NOTE(review): top=30 presumably caps how many weights eli5 displays — confirm against the eli5 docs.

el.show_weights(crf_model, top=30)

# The output shows that B-art is followed by I-art, B-eve is followed by I-eve etc. which is logical.



1.   Let's apply stronger regularization. c1 is the L1 regularization coefficient and c2 is the L2 regularization coefficient.
2.   We can increase the c1 parameter to enforce sparsity in the learned feature weights.



In [None]:


# Same configuration as the baseline CRF except for c1 (L1 regularization),
# which is raised from 0.1 to 10.
crf_model_regularized = crf(algorithm='lbfgs',
          c1=10,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

#<b>Prediction using CRF after regularization 

In [None]:
# 5-fold cross-validated predictions of the regularized model.
# Note: this rebinds `prediction`, replacing the baseline model's predictions.
prediction = cvp(estimator=crf_model_regularized, X=X, y=y, cv=5)

In [None]:
# Classification report for the regularized model's cross-validated predictions.
classification_analysis_reg = fcr(y_pred=prediction, y_true=y)
print(classification_analysis_reg)

In [None]:
# Fit the regularized CRF on the full data set so its learned weights can be inspected.
crf_model_regularized.fit(X, y)

In [None]:
# Inspect the weights of the regularized model, for comparison with the baseline.
el.show_weights(crf_model_regularized, top=30)