In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk import pos_tag
import sklearn_crfsuite

In [None]:
# Load the three pre-split token tables (one row per token).
df_train = pd.read_csv("training_set.csv")
df_valid = pd.read_csv("validation_set.csv")
df_test = pd.read_csv("testing_set.csv")

# pandas' default NA markers include the literal string "NA", so a token
# spelled "NA" comes back from read_csv as NaN -- presumably why it is restored
# here (verify against the raw files).
# Assign the filled column back explicitly: `df.Token.fillna(..., inplace=True)`
# is a chained in-place call on a column accessor, which emits a warning on
# recent pandas and does not update the frame under Copy-on-Write.
df_train["Token"] = df_train["Token"].fillna("NA")
df_valid["Token"] = df_valid["Token"].fillna("NA")
df_test["Token"] = df_test["Token"].fillna("NA")

In [98]:
# Preview the first rows of the training frame to check the column layout.
df_train.head()

Unnamed: 0.1,Unnamed: 0,PMID_Type,Sentence_Index,Token,Tag,pos
0,0,21826085_A,0,We,O,PRP
1,1,21826085_A,0,implemented,O,VBD
2,2,21826085_A,0,a,O,DT
3,3,21826085_A,0,two,O,CD
4,4,21826085_A,0,-,O,:


In [99]:
class GetSen(object):
    """Group a token-level frame into sequences of ``(token, pos, tag)`` triples.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with at least the columns ``Token``, ``pos`` and ``Tag`` plus
        the column(s) named by ``group_cols``.
    group_cols : str or list of str, default "PMID_Type"
        Column(s) whose distinct values define one sequence each. The default
        reproduces the original behaviour (one sequence per ``PMID_Type``).
        Pass e.g. ``["PMID_Type", "Sentence_Index"]`` to split further.

    Attributes
    ----------
    grouped : pandas.Series
        One list of triples per group, indexed by the group key(s).
    sentences : list
        The same lists in ``grouped`` order.
    n_sent : int
        1-based position of the sequence the next ``get_next`` call returns.
    """

    def __init__(self, df_train, group_cols="PMID_Type"):
        self.n_sent = 1
        self.df_train = df_train
        # Kept for backward compatibility; this class never updates it.
        self.empty = False

        def agg_func(s):
            # Zip the three parallel columns of one group into row triples.
            return list(zip(s["Token"].values.tolist(),
                            s["pos"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Restrict the groups to the columns agg_func reads so the grouping
        # column(s) are never handed to ``apply``.
        self.grouped = (
            self.df_train
            .groupby(group_cols)[["Token", "pos", "Tag"]]
            .apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sequence in ``sentences`` order, or ``None`` when exhausted.

        The previous implementation looked up the label ``"Sentence: N"`` in
        ``grouped``; the groups are keyed by ``group_cols`` values instead, so
        the lookup failed and a bare ``except`` turned every call into ``None``.
        Iteration is now positional.
        """
        position = self.n_sent - 1
        if position < 0 or position >= len(self.sentences):
            return None
        s = self.sentences[position]
        self.n_sent += 1
        return s

In [100]:
# Group the training tokens into per-PMID_Type sequences.
getter = GetSen(df_train)

In [101]:
# List of training sequences; each one is a list of (token, pos, tag) triples.
sentences=getter.sentences

### Getting a set of features and preparing the dataset

In [111]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items expose the token at index 0 and its
    POS tag at index 1. The result describes the token itself (lower-cased
    form, 3- and 2-character suffixes, casing/digit flags, POS tag) and the
    same kind of information for the previous and next token when present.
    ``'BOS'`` is set when there is no previous token (``i`` not greater than
    0), ``'EOS'`` when there is no next token.
    """
    token, pos = sent[i][0], sent[i][1]

    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = pos

    # Left context, or the beginning-of-sequence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos

    # Right context, or the end-of-sequence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos

    return feats


def sent2features(sent):
    """Return the ``word2features`` dict for every position of ``sent``, in order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the third element (the label) of every 3-item entry in ``sent``."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the first element (the token) of every 3-item entry in ``sent``."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

In [112]:
# One list of feature dicts and one list of tags per training sequence.
X_train = list(map(sent2features, sentences))
y_train = list(map(sent2labels, sentences))

In [113]:
from sklearn_crfsuite import CRF

# Configure an L-BFGS-trained CRF; c1 and c2 are passed as 0.1 each.
# NOTE(review): max_iterations=10 is a very low cap. The classification report
# printed later in this notebook shows 0.00 for most entity classes, and
# stopping this early may be one cause -- confirm by trying a larger value.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=10,
          all_possible_transitions=False)
# Fit on the per-sequence feature dicts / tag lists built from the training set.
crf.fit(X_train,y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=10,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [114]:
class GetSen(object):
    """Group a token-level frame into sequences of ``(token, pos, tag)`` triples.

    NOTE: this redefines the ``GetSen`` class from an earlier cell with the
    same logic (only the parameter/attribute name differs) and shadows it from
    here on.

    Parameters
    ----------
    df_valid : pandas.DataFrame
        Frame with at least the columns ``Token``, ``pos`` and ``Tag`` plus
        the column(s) named by ``group_cols``.
    group_cols : str or list of str, default "PMID_Type"
        Column(s) whose distinct values define one sequence each. The default
        reproduces the original behaviour (one sequence per ``PMID_Type``).

    Attributes
    ----------
    grouped : pandas.Series
        One list of triples per group, indexed by the group key(s).
    sentences : list
        The same lists in ``grouped`` order.
    n_sent : int
        1-based position of the sequence the next ``get_next`` call returns.
    """

    def __init__(self, df_valid, group_cols="PMID_Type"):
        self.n_sent = 1
        self.df_valid = df_valid
        # Kept for backward compatibility; this class never updates it.
        self.empty = False

        def agg_func(s):
            # Zip the three parallel columns of one group into row triples.
            return list(zip(s["Token"].values.tolist(),
                            s["pos"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Restrict the groups to the columns agg_func reads so the grouping
        # column(s) are never handed to ``apply``.
        self.grouped = (
            self.df_valid
            .groupby(group_cols)[["Token", "pos", "Tag"]]
            .apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sequence in ``sentences`` order, or ``None`` when exhausted.

        The previous implementation looked up the label ``"Sentence: N"`` in
        ``grouped``; the groups are keyed by ``group_cols`` values instead, so
        the lookup failed and a bare ``except`` turned every call into ``None``.
        Iteration is now positional.
        """
        position = self.n_sent - 1
        if position < 0 or position >= len(self.sentences):
            return None
        s = self.sentences[position]
        self.n_sent += 1
        return s

In [115]:
# Group the validation tokens the same way; this rebinds `getter`.
getter = GetSen(df_valid)

In [116]:
# Rebinds `sentences`: from here on it holds the validation sequences, not the training ones.
sentences=getter.sentences

In [117]:
# Features and gold tags for evaluation, built from the current `sentences`.
# NOTE(review): when the cells run in order, `sentences` comes from df_valid
# here, so these *_test names hold validation data -- confirm that is intended.
X_test = list(map(sent2features, sentences))
y_test = list(map(sent2labels, sentences))

In [118]:
# Predict a tag sequence for every evaluation sequence with the fitted CRF.
y_predict=crf.predict(X_test)

In [119]:
from sklearn_crfsuite.metrics import flat_classification_report
# Per-tag precision / recall / F1 computed over the predicted and gold tag sequences.
report = flat_classification_report(y_pred=y_predict, y_true=y_test)
print(report)

                precision    recall  f1-score   support

B-ABBREVIATION       0.00      0.00      0.00      4521
      B-FAMILY       0.00      0.00      0.00      4223
     B-FORMULA       0.00      0.00      0.00      4135
  B-IDENTIFIER       0.00      0.00      0.00       636
    B-MULTIPLE       0.00      0.00      0.00       188
    B-NO CLASS       0.00      0.00      0.00        32
  B-SYSTEMATIC       0.00      0.00      0.00      6816
     B-TRIVIAL       0.61      0.15      0.24      8963
I-ABBREVIATION       0.00      0.00      0.00        70
      I-FAMILY       0.00      0.00      0.00      1579
     I-FORMULA       0.00      0.00      0.00       430
  I-IDENTIFIER       0.00      0.00      0.00        84
    I-MULTIPLE       0.00      0.00      0.00       650
    I-NO CLASS       0.00      0.00      0.00         2
  I-SYSTEMATIC       0.00      0.00      0.00      2119
     I-TRIVIAL       0.00      0.00      0.00      1076
             O       0.95      1.00      0.97  

  'precision', 'predicted', average, warn_for)
