### Colab Notebook

In [None]:
# Mount Google Drive at /content/drive so that files under 'My Drive' can be read and written
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install simpletransformers, which provides the NERModel class imported below.
# NOTE(review): the version is not pinned, so a fresh runtime may pull a different release.
!pip3 install simpletransformers

In [None]:
import pandas as pd
from simpletransformers.ner import NERModel
import numpy as np
from scipy.special import softmax
from sklearn.metrics import classification_report
PATH = 'drive/My Drive/Seq_Classification/'

In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc4jupyter extension so that the C++ metrics cell (run through
# the %%cu cell magic further down) can be compiled and executed.
%load_ext nvcc_plugin

In [None]:
# Read the token-level annotations, discard the saved index column, expose the
# sentence identifier under the name 'sentence_id', and make sure both the
# identifier and the word column are plain strings.
data = (
    pd.read_csv('drive/My Drive/Seq_Classification/Task1.csv')
    .drop(columns='Unnamed: 0')
    .rename(columns={'id': 'sentence_id'})
    .astype({'sentence_id': 'str', 'words': 'str'})
)

In [None]:
# Fraction of rows (in file order) that goes to training; the rest is held out.
split = 0.8
n_train_rows = int(split * len(data))
train_df = data.iloc[:n_train_rows]
eval_df = data.iloc[n_train_rows:]

In [None]:
# Options handed to NERModel; one key per line, grouped by purpose.
args = {
    # mixed precision
    'fp16': False,
    'fp16_opt_level': 'O1',

    # input size and training schedule
    'max_seq_length': 128,
    'train_batch_size': 8,
    'eval_batch_size': 8,
    'gradient_accumulation_steps': 1,
    'num_train_epochs': 2,

    # optimisation
    'weight_decay': 0,
    'learning_rate': 7e-5,
    'adam_epsilon': 1e-9,
    'warmup_ratio': 0.06,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    # logging, checkpointing and caching
    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,
    'use_tensorboard': True,
    'overwrite_output_dir': True,
    'reprocess_input_data': True,

    # size of the tag set
    'num_labels': 5,
}

In [None]:
# Tag inventory: outside, plus begin/inside tags for the INC and EXC span types.
labels = ['O','B_INC','INC','B_EXC','EXC']

# Create a NERModel
model = NERModel("bert", "bert-base-cased", labels=labels, use_cuda=True, args=args)

# Train on the training split. The frames built in the split cell are reused
# here so that training/evaluation and the later `eval_df`-based grouping can
# never drift apart.
model.train_model(train_df)

# Evaluate on the held-out split; `predictions` holds one list of predicted
# tags per evaluated sentence and is consumed by the cells below.
result, model_outputs, predictions = model.eval_model(eval_df)

print(result)

In [None]:
# Save the fine-tuned model into the 'task1-bert/' folder under PATH
model.save_model(output_dir=PATH+'task1-bert/',model=model.model)

In [None]:
def agg_func(s):
    """Return the 'labels' column of one sentence group as a plain list."""
    return s['labels'].tolist()


# One list of gold tags per evaluation sentence, keyed by sentence_id.
grouped = eval_df.groupby("sentence_id").apply(agg_func)

# Tag name -> position of that tag in `labels`.
tag2idx = dict(zip(labels, range(len(labels))))
def write_results(result,file,ign):
    """Write tag sequences to `file` as single-letter B/I/O codes, one sentence per line.

    Every tag is written as one letter followed by one space, so a line looks
    like ``"O B I \\n"``; the metrics cell relies on this fixed two-character
    stride when it reads the file back.

    Args:
        result: iterable of sentences, each an iterable of tag strings.
        file: path of the text file to (over)write.
        ign: span type to ignore. Tags equal to 'O', `ign` or 'B_' + `ign`
            are written as 'O'.

    Of the remaining tags, those starting with 'B' are written as 'B' and all
    others as 'I'.
    """
    collapsed_to_o = {'O', ign, 'B_' + ign}
    with open(file, 'w') as f:
        for lis in result:
            line = ""
            for tag in lis:
                if tag in collapsed_to_o:
                    line += "O "
                elif tag.startswith('B'):
                    line += "B "
                else:
                    line += "I "
            f.write(line + '\n')

In [None]:
# Flattened, token-level tags: input for the classification report.
result = []
target = []
# The same tags kept per sentence: input for the span-level evaluation files.
result1 = []
target1 = []

skipped = 0
for i, pred_tags in enumerate(predictions):
    # `grouped` is indexed by sentence_id strings, so the i-th sentence has to
    # be fetched by position explicitly; a bare integer key relies on a
    # deprecated positional fallback.
    gold_tags = grouped.iloc[i]
    if len(pred_tags) != len(gold_tags):
        # Without equal lengths the tags cannot be aligned token by token.
        skipped += 1
        continue
    result1.append(pred_tags)
    target1.append(gold_tags)
    result.extend(pred_tags)
    target.extend(gold_tags)

if skipped:
    print("Skipped", skipped, "of", len(predictions),
          "sentences whose predicted and gold tag counts differ")

report = classification_report(target,result,output_dict=True)
df = (pd.DataFrame(report)).transpose()
display(df)

In [None]:
# `token` is forwarded as the `ign` argument of write_results: tags of this
# type are written as O, so the files describe the spans of the other type.
token = 'INC'
# token = 'EXC'
# The file names must stay in sync with the paths hard-coded in the metrics
# (%%cu) cell, which reads pred.txt and labels.txt from this same folder.
write_results(result1, PATH + 'pred.txt', token)
write_results(target1, PATH + 'labels.txt', token)

In [None]:
## Save the results for one of the token types (INC/EXC), then run the next cell to compute the metrics

In [None]:
#@title Calculate Metrics
#cpp code
%%cu

#include<bits/stdc++.h>

using namespace std;

// Metrics returned by testSequential.
// vec[0] holds precision, vec[1] recall and vec[2] F1; each vector contains
// two entries in the order {proportional, binary}.
struct Res
{
    vector<double> vec[3];  
};

// Harmonic mean of precision and recall; defined as 0 when both are 0 so the
// result is never NaN.
static double f1Score(double precision, double recall) {
  const double denom = precision + recall;
  return (denom > 0) ? (2*precision*recall)/denom : 0;
}

// Scores predicted B/I/O tag sequences against gold sequences at the level of
// expressions (maximal tagged spans).
//
//   sents  - predicted tags, one vector of "B"/"I"/"O" strings per sentence
//   labels - gold tags in the same layout
//
// Two overlap measures are accumulated over every (gold span, predicted span)
// pair of a sentence:
//   proportional - the overlap length divided by the span length
//   binary       - a span counts once as soon as it overlaps anything
// Precision divides by the number of predicted spans, recall by the number of
// gold spans. When the respective denominator is 0 the score is reported as 1.
//
// A predicted "I" that follows "O" (or starts the sentence) opens a new
// predicted span; a gold "I" without an open gold span is ignored.
// Any tag other than "B", "I" or "O" is printed and aborts via assert.
Res testSequential(vector<vector<string> > &sents,
                   vector<vector<string> > &labels) {
  uint nExprPredicted = 0;
  uint nExprTrue = 0;
  double precNumerProp = 0, precNumerBin = 0;
  double recallNumerProp = 0, recallNumerBin = 0;

  // Only sentences present in both inputs can be compared.
  const size_t nSents = std::min(sents.size(), labels.size());
  if (sents.size() != labels.size())
    cout << "sentence count mismatch: " << labels.size() << " "
         << sents.size() << endl;

  for (size_t i=0; i<nSents; i++) { // per sentence
    const vector<string> &labelsPredicted = sents[i];
    const vector<string> &labelsTrue = labels[i];

    if (labelsTrue.size() != labelsPredicted.size())
      cout << labelsTrue.size() << " " << labelsPredicted.size() << endl;
    // Compare only the common prefix so a short sequence is never read past
    // its end.
    const uint nTokens =
        (uint)std::min(labelsTrue.size(), labelsPredicted.size());

    string py = "";                   // previous predicted tag
    vector<pair<uint,uint> > pred, tru; // half-open [begin, end) spans
    int l1=-1, l2=-1;                 // start of the open gold / predicted span, -1 if none

    for (uint j=0; j<nTokens; j++) { // per token in a sentence
      const string &t = labelsTrue[j];
      const string &y = labelsPredicted[j];

      if (t == "B") {
        if (l1 != -1)
          tru.push_back(make_pair((uint)l1, j));
        l1 = j;
      } else if (t == "I") {
        // continues the open gold span, if any
      } else if (t == "O") {
        if (l1 != -1)
          tru.push_back(make_pair((uint)l1, j));
        l1 = -1;
      } else {
        cout << t << endl;
        assert(false);
      }

      if ((y == "B") || ((y == "I") && ((py == "") || (py == "O")))) {
        nExprPredicted++;
        if (l2 != -1)
          pred.push_back(make_pair((uint)l2, j));
        l2 = j;
      } else if (y == "I") {
        assert(l2 != -1);
      } else if (y == "O") {
        if (l2 != -1)
          pred.push_back(make_pair((uint)l2, j));
        l2 = -1;
      } else {
        cout << y << endl;
        assert(false);
      }

      py = y;
    }
    // Close spans that run up to the end of the sentence.
    if (l1 != -1)
      tru.push_back(make_pair((uint)l1, nTokens));
    if (l2 != -1)
      pred.push_back(make_pair((uint)l2, nTokens));

    vector<bool> trum(tru.size(), false);   // gold span already counted (binary)
    vector<bool> predm(pred.size(), false); // predicted span already counted (binary)
    for (uint a=0; a<tru.size(); a++) {
      const pair<uint,uint> &truSpan = tru[a];
      nExprTrue++;
      for (uint b=0; b<pred.size(); b++) {
        const pair<uint,uint> &predSpan = pred[b];

        const uint lmax = std::max(truSpan.first, predSpan.first);
        const uint rmin = std::min(truSpan.second, predSpan.second);
        const uint overlap = (rmin > lmax) ? rmin-lmax : 0;

        if (predSpan.second == predSpan.first) cout << predSpan.first << endl;
        assert(predSpan.second != predSpan.first);
        precNumerProp += (double)overlap/(predSpan.second-predSpan.first);
        recallNumerProp += (double)overlap/(truSpan.second-truSpan.first);
        if (!predm[b] && overlap > 0) {
          precNumerBin += 1;
          predm[b] = true;
        }
        if (!trum[a] && overlap > 0) {
          recallNumerBin += 1;
          trum[a] = true;
        }
      }
    }
  }

  const double precisionProp = (nExprPredicted==0) ? 1 : precNumerProp/nExprPredicted;
  const double recallProp = (nExprTrue==0) ? 1 : recallNumerProp/nExprTrue;
  const double precisionBin = (nExprPredicted==0) ? 1 : precNumerBin/nExprPredicted;
  const double recallBin = (nExprTrue==0) ? 1 : recallNumerBin/nExprTrue;

  Res results;
  results.vec[0].push_back(precisionProp); results.vec[0].push_back(precisionBin);
  results.vec[1].push_back(recallProp); results.vec[1].push_back(recallBin);
  results.vec[2].push_back(f1Score(precisionProp, recallProp));
  results.vec[2].push_back(f1Score(precisionBin, recallBin));
  return results;
}


int main()
{
    vector<vector<string> > pred;
    vector<vector<string> > labels;

    std::ifstream file("drive/My Drive/Seq_Classification/pred.txt");
    if (file.is_open()) {
        std::string line;
        while (std::getline(file, line)) 
        {
            // using printf() in all tests for consistency
            vector<string> temp;
            for(int i=0;i<line.length();i+=2)
            {
                string tag(1,line[i]);
                temp.push_back(tag);
            }
            pred.push_back(temp);
        }
        file.close();
    }


    std::ifstream file1("drive/My Drive/Seq_Classification/labels.txt");
    if (file1.is_open()) {
        std::string line;
        while (std::getline(file1, line)) 
        {
            // using printf() in all tests for consistency
            vector<string> temp;
            for(int i=0;i<line.length();i+=2)
            {
                string tag(1,line[i]);
                temp.push_back(tag);
            }
            labels.push_back(temp);
        }
        file1.close();
    }

    // for(int i=0;i<pred.size();i++)
    // {
    //     for(int j=0;j<pred[i].size();j++)
    //         cout<<pred[i][j];
    //     cout<<endl;
    // }
    // for(int i=0;i<labels.size();i++)
    // {
    //     for(int j=0;j<labels[i].size();j++)
    //         cout<<labels[i][j];
    //     cout<<endl;
    // }

    Res result = testSequential(pred,labels);

    cout<<result.vec[0][0]<<" "<<result.vec[0][1]<<endl;
    cout<<result.vec[1][0]<<" "<<result.vec[1][1]<<endl;
    cout<<result.vec[2][0]<<" "<<result.vec[2][1]<<endl;
    return 0;
}

## Extract Predicted Phrases

In [None]:
# NOTE(review): `agg_typ`, `agg_sent` and `categories` are not defined in the
# visible part of this notebook; they must already exist in the kernel for
# this cell to run - confirm where they come from.
eval_by_sentence = eval_df.groupby("sentence_id")
grp_typ = eval_by_sentence.apply(agg_typ)
grp_sent = eval_by_sentence.apply(agg_sent)
# Drop entries whose string form is 'nan' as well as the '-' placeholder.
categories = [c for c in categories if not (str(c) == 'nan' or c == '-')]

def get_typcnt(arr, cats=None):
    """Return the category that occurs most often in `arr`.

    Args:
        arr: list of category values collected for one phrase.
        cats: candidate categories to look for. Defaults to the notebook-level
            `categories` list when omitted.

    Returns:
        The candidate with the highest count in `arr`. On a tie the candidate
        that comes first in `cats` wins. If no candidate occurs at all, the
        placeholder '-' is returned.
    """
    if cats is None:
        cats = categories

    ret_typ = '-'
    cnt = 0
    for cat in cats:
        occurrences = arr.count(cat)
        # strict '>' keeps the earliest candidate on ties
        if occurrences > cnt:
            ret_typ = cat
            cnt = occurrences
    return ret_typ

def _make_phrase(words, last_tag, types):
    """Build the output record for one finished phrase.

    Args:
        words: the words of the phrase, in order.
        last_tag: predicted tag of the phrase's last word.
        types: the non-'-' type values collected for the phrase's words.
    """
    return {
        'sentence': " ".join(words),
        # third character from the end of the tag: 'I' for INC/B_INC,
        # 'E' for EXC/B_EXC
        'type1': last_tag[-3],
        'type2': get_typcnt(types),
    }


# Collect every maximal run of non-'O' predicted tags as one phrase.
phrases = []
for i, sent_tags in enumerate(predictions):
    # The grouped Series are keyed by sentence_id, so the i-th sentence is
    # fetched by position explicitly.
    sent_words = grp_sent.iloc[i]
    sent_types = grp_typ.iloc[i]

    cur = []
    typcnt = []
    for j, tag in enumerate(sent_tags):
        if tag != 'O':
            cur.append(sent_words[j])
            if sent_types[j] != '-':
                typcnt.append(sent_types[j])
        elif cur:
            # an 'O' tag closes the phrase that ended on the previous token
            phrases.append(_make_phrase(cur, sent_tags[j-1], typcnt))
            cur = []
            typcnt = []
    if cur:
        # phrase that runs up to the end of the sentence
        phrases.append(_make_phrase(cur, sent_tags[-1], typcnt))

task2_pred = pd.DataFrame(phrases)
task2_pred.to_csv(PATH+'extracted_phrases.csv')