## Colab Notebook

In [None]:
# Mount Google Drive at /content/drive so the cells below can read and write files on it.
# NOTE(review): later cells use the relative path 'drive/My Drive/...', which presumably
# relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
PATH = 'drive/My Drive/Seq_Classification/'

!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin


In [None]:
# Load the token-level table and normalise it in one chain:
#   * rename the 'id' column to 'Sentence #' (the key SentenceGetter groups by),
#   * drop the 'Unnamed: 0' column,
#   * forward-fill missing values from the previous row.
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = (
    pd.read_csv(PATH + 'Task1.csv')
    .rename(columns={'id': 'Sentence #'})
    .drop(columns='Unnamed: 0')
    .ffill()
)

In [None]:
from keras.utils import to_categorical 
from sklearn.model_selection import train_test_split

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, label) pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Sentence #", "words" and "labels".

    Attributes
    ----------
    grouped : pandas.Series
        One entry per distinct "Sentence #" value; each entry is a list of
        (word, label) tuples in row order.
    sentences : list
        The entries of ``grouped`` as a plain Python list.
    n_sent : int
        Key of the sentence that ``get_next`` will return next (starts at 1).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def pair_words_with_labels(group):
            # Zip the two columns of one sentence group into (word, label) tuples.
            return list(zip(group["words"].values.tolist(),
                            group["labels"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(pair_words_with_labels)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence stored under key ``n_sent`` and advance the counter.

        Returns ``None`` (without advancing) when no sentence exists for that key.
        """
        try:
            sentence = self.grouped[self.n_sent]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more sentences"; anything else
            # (e.g. KeyboardInterrupt) should propagate instead of being hidden.
            return None
        self.n_sent += 1
        return sentence

        
# Vocabulary and the fixed tag inventory (list position == class index).
words = list(set(data["words"].values))
tags = ['O','B_INC','INC','B_EXC','EXC']
n_words = len(words)
n_tags = len(tags)

getter = SentenceGetter(data)
sentences = getter.sentences

word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
max_len = 170

# Inputs stay as raw token strings (the ELMo layer consumes strings): keep the first
# `max_len` tokens of every sentence and right-pad shorter ones with "__PAD__".
X = [[w[0] for w in s] for s in sentences]
X = [seq[:max_len] + ["__PAD__"] * (max_len - len(seq)) for seq in X]

# Labels as tag indices, right-padded with the index of "O".
# truncating="post" is required: pad_sequences defaults to truncating="pre", which
# would keep the LAST `max_len` labels of a long sentence while X keeps the FIRST
# `max_len` tokens, leaving tokens and labels misaligned.
y = [np.array([tag2idx[w[1]] for w in s]) for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                  truncating="post", value=tag2idx["O"])

# Fixed seed so the train/test split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# The model is compiled with sparse_categorical_crossentropy, so add a trailing axis
# of size 1 to the training targets.
y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

In [None]:
# Turn off eager execution so the cells below run in TF1-style graph mode.
tf.compat.v1.disable_eager_execution()

# Create a TF1-compat session and register it as the session the Keras backend uses.
sess = tf.compat.v1.Session()
tf.compat.v1.keras.backend.set_session(sess)

In [None]:
# Load the ELMo module from TF Hub (requested with trainable=True).
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Initialise variables and lookup tables in the session created above.
# global_variables_initializer() replaces the deprecated initialize_all_variables().
sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.compat.v1.tables_initializer())

In [None]:
# Install keras-contrib from its GitHub repository.
# NOTE(review): no visible cell imports keras_contrib — confirm this install is still needed.
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

# Tag inventory and tag -> index map; write_results reads `tag2idx` as a global.
# NOTE(review): these two lines repeat the identical definitions made in the
# data-preparation cell, so this cell can be run without it.
tags = ['O','B_INC','INC','B_EXC','EXC']
tag2idx = {t: i for i, t in enumerate(tags)}

def write_results(result,file,ign):
    """Write tag-index sequences to `file` as space-separated B/I/O lines.

    Parameters
    ----------
    result : iterable of iterables of int
        One inner sequence per sentence; every element is a key of the inverted
        global `tag2idx` mapping.
    file : str
        Output path; the file is truncated and rewritten.
    ign : str
        Tag family to suppress: the tags `ign` and 'B_' + `ign` are written as
        "O", exactly like the 'O' tag itself.

    Every remaining tag is written as "B" when its name starts with 'B' and as
    "I" otherwise. Each symbol is followed by one space (so lines end with a
    trailing space) and each sentence occupies one line.
    """
    print("Writing the results for {} token".format(ign))
    idx2tag = {index: name for name, index in tag2idx.items()}
    suppressed = ('O', ign, 'B_' + ign)

    def to_symbol(tag):
        # Collapse a full tag name to its one-letter symbol plus separator.
        if tag in suppressed:
            return "O "
        return 'B ' if tag[0] == 'B' else 'I '

    with open(file,'w+') as f:
        for row in result:
            f.write("".join(to_symbol(idx2tag[el]) for el in row) + '\n')
            

# Number of sentences per batch. ElmoEmbedding builds its sequence-length tensor
# from this value, so arrays fed to the model are cut to whole batches.
batch_size = 32


def ElmoEmbedding(x):
    """Run the global `elmo_model` on a batch of token strings.

    `x` is cast to tf.string and squeezed, then passed to the module's "tokens"
    signature together with a constant length vector holding `max_len` repeated
    `batch_size` times. The "elmo" entry of the module's output dict is returned.

    NOTE(review): every sequence is declared to be `max_len` long, so padding
    tokens are presumably embedded like real tokens — confirm this is intended.
    """
    tokens = tf.squeeze(tf.cast(x, tf.string))
    sequence_lengths = tf.constant([max_len] * batch_size)
    module_outputs = elmo_model(
        inputs={"tokens": tokens, "sequence_len": sequence_lengths},
        signature="tokens",
        as_dict=True,
    )
    return module_outputs["elmo"]
def lstm_elmo():
  """Build, compile and return the ELMo + residual BiLSTM tagging model.

  Layers, in order: string input of length `max_len` -> ElmoEmbedding (Lambda) ->
  two stacked bidirectional LSTMs whose outputs are summed -> per-timestep
  softmax over `n_tags` classes. The model is compiled with Adam and
  sparse categorical cross-entropy, and its summary is printed.
  """
  def make_bilstm():
    # Both recurrent layers share this configuration.
    return Bidirectional(LSTM(units=300, return_sequences=True,
                              recurrent_dropout=0.2, dropout=0.2))

  tokens_in = Input(shape=(max_len,), dtype=tf.string)
  elmo_features = Lambda(ElmoEmbedding, output_shape=(None, 1024))(tokens_in)

  first_pass = make_bilstm()(elmo_features)
  second_pass = make_bilstm()(first_pass)
  # Residual connection: add the first BiLSTM's output to the second's.
  merged = add([first_pass, second_pass])

  tag_probs = TimeDistributed(Dense(n_tags, activation="softmax"))(merged)

  model = Model(tokens_in, tag_probs)
  model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
  model.summary()
  return model
  

In [None]:
# ElmoEmbedding hard-codes `batch_size` sequence lengths, so every array fed to the
# model must hold a whole number of batches.
total = len(X_train) // batch_size          # full batches available
train_split = int(total*1)                  # batches used for training (factor 1 => all)
val_split = total - train_split             # batches left for validation
print(train_split)

n_train = train_split * batch_size
n_val = val_split * batch_size
X_tr, y_tr = X_train[:n_train], y_train[:n_train]
# Take the tail via an explicit start index: with n_val == 0 a `[-n_val:]` slice
# is `[0:]` and would return the whole training set instead of nothing.
X_val = X_train[len(X_train) - n_val:]
y_val = y_train[len(y_train) - n_val:]

lstm = lstm_elmo()
lstm.fit(np.array(X_tr),y_tr,epochs=10,verbose=1,batch_size=batch_size)

# Evaluate on at most 320 test sentences, rounded down to whole batches so the
# last prediction batch is never partial.
n_eval = min(320, len(X_test)) // batch_size * batch_size
out = lstm.predict(np.array(X_test[:n_eval]), batch_size=batch_size)
pred = np.argmax(out,2)      # most probable tag index per token
labels = y_test[:n_eval]

In [None]:
# Write predictions and gold labels as B/I/O lines for the metrics cell.
# Passing 'INC' as `ign` makes write_results emit the INC and B_INC tags as "O",
# so only the remaining (EXC / B_EXC) tags appear as B / I in these files.
write_results(pred,PATH+'pred.txt','INC')
write_results(labels,PATH+'labels.txt','INC')

In [None]:
## Save the results for one token and run the next cell to get the metrics

In [None]:
#@title Calculate Metrics
#cpp code
%%cu

#include<bits/stdc++.h>

using namespace std;

// Metrics returned by testSequential.
struct Res
{
    // vec[0] = precision, vec[1] = recall, vec[2] = F1.
    // Each vector holds two entries: [0] proportional overlap, [1] binary overlap.
    std::vector<double> vec[3];
};

// F1 from precision and recall; defined as 0 when both are 0 (avoids 0/0 = NaN).
static double f1Score(double precision, double recall) {
  const double denom = precision + recall;
  return (denom > 0) ? (2 * precision * recall) / denom : 0.0;
}

// Span-level evaluation of predicted B/I/O tags against gold B/I/O tags.
//
//   sents  - predicted tags, one vector of "B"/"I"/"O" strings per sentence
//   labels - gold tags in the same layout
//
// Spans are half-open token ranges [start, end). A gold span starts at "B" and
// ends at the next "B"/"O" or at the end of the sentence; a gold "I" with no
// open span is ignored. A predicted span additionally starts at an "I" that
// follows "O" or begins the sentence. Each (gold, predicted) pair contributes
// overlap/|predicted| to proportional precision and overlap/|gold| to
// proportional recall; the binary variants count each span at most once when
// it overlaps anything.
//
// Only the sentences and tokens present in BOTH inputs are scored: if the
// lengths differ, the two sizes are printed and the extra items are skipped.
// Precision is 1 when nothing was predicted and recall is 1 when there are no
// gold spans. Any other tag string is printed and trips an assert.
Res testSequential(std::vector<std::vector<std::string> > &sents,
                   std::vector<std::vector<std::string> > &labels) {
  unsigned nExprPredicted = 0;
  unsigned nExprTrue = 0;
  double precNumerProp = 0, precNumerBin = 0;
  double recallNumerProp = 0, recallNumerBin = 0;

  // Never index past the shorter of the two sentence lists.
  const std::size_t nSents = std::min(sents.size(), labels.size());
  for (std::size_t i = 0; i < nSents; i++) { // per sentence
    const std::vector<std::string> &labelsPredicted = sents[i];
    const std::vector<std::string> &labelsTrue = labels[i];

    if (labelsTrue.size() != labelsPredicted.size())
      std::cout << labelsTrue.size() << " " << labelsPredicted.size() << std::endl;
    // Never index past the shorter of the two tag sequences.
    const unsigned nTok = static_cast<unsigned>(
        std::min(labelsTrue.size(), labelsPredicted.size()));

    std::string py = "";                                 // previous predicted tag
    std::vector<std::pair<unsigned, unsigned> > pred, tru;
    int l1 = -1, l2 = -1;                                // open gold / predicted span start, -1 = none

    for (unsigned j = 0; j < nTok; j++) { // per token in a sentence
      const std::string &t = labelsTrue[j];
      const std::string &y = labelsPredicted[j];

      // Gold side.
      if (t == "B") {
        if (l1 != -1)
          tru.push_back(std::make_pair(static_cast<unsigned>(l1), j));
        l1 = j;
      } else if (t == "I") {
        // Continues the open span, if any; nothing to record.
      } else if (t == "O") {
        if (l1 != -1)
          tru.push_back(std::make_pair(static_cast<unsigned>(l1), j));
        l1 = -1;
      } else {
        std::cout << t << std::endl;
        assert(false);
      }

      // Predicted side: "I" after "O" / at sentence start also opens a span.
      if ((y == "B") || ((y == "I") && ((py == "") || (py == "O")))) {
        nExprPredicted++;
        if (l2 != -1)
          pred.push_back(std::make_pair(static_cast<unsigned>(l2), j));
        l2 = j;
      } else if (y == "I") {
        assert(l2 != -1);
      } else if (y == "O") {
        if (l2 != -1)
          pred.push_back(std::make_pair(static_cast<unsigned>(l2), j));
        l2 = -1;
      } else {
        std::cout << y << std::endl;
        assert(false);
      }

      py = y;
    }
    // Close spans still open at the end of the sentence.
    if (l1 != -1)
      tru.push_back(std::make_pair(static_cast<unsigned>(l1), nTok));
    if (l2 != -1)
      pred.push_back(std::make_pair(static_cast<unsigned>(l2), nTok));

    std::vector<bool> trum(tru.size(), false);   // gold span already counted (binary recall)
    std::vector<bool> predm(pred.size(), false); // predicted span already counted (binary precision)
    for (std::size_t a = 0; a < tru.size(); a++) {
      const std::pair<unsigned, unsigned> truSpan = tru[a];
      nExprTrue++;
      for (std::size_t b = 0; b < pred.size(); b++) {
        const std::pair<unsigned, unsigned> predSpan = pred[b];

        const unsigned lmax = std::max(truSpan.first, predSpan.first);
        const unsigned rmin = std::min(truSpan.second, predSpan.second);
        const unsigned overlap = (rmin > lmax) ? rmin - lmax : 0;

        if (predSpan.second == predSpan.first) std::cout << predSpan.first << std::endl;
        assert(predSpan.second != predSpan.first);
        precNumerProp += (double)overlap/(predSpan.second-predSpan.first);
        recallNumerProp += (double)overlap/(truSpan.second-truSpan.first);
        if (!predm[b] && overlap > 0) {
          precNumerBin += 1;
          predm[b] = true;
        }
        if (!trum[a] && overlap > 0) {
          recallNumerBin += 1;
          trum[a] = true;
        }
      }
    }
  }

  // Guard every ratio against an empty denominator so no NaN is reported.
  const double precisionProp = (nExprPredicted==0) ? 1 : precNumerProp/nExprPredicted;
  const double recallProp = (nExprTrue==0) ? 1 : recallNumerProp/nExprTrue;
  const double precisionBin = (nExprPredicted==0) ? 1 : precNumerBin/nExprPredicted;
  const double recallBin = (nExprTrue==0) ? 1 : recallNumerBin/nExprTrue;

  Res results;
  results.vec[0].push_back(precisionProp); results.vec[0].push_back(precisionBin);
  results.vec[1].push_back(recallProp); results.vec[1].push_back(recallBin);
  results.vec[2].push_back(f1Score(precisionProp, recallProp));
  results.vec[2].push_back(f1Score(precisionBin, recallBin));
  return results;
}


// Reads a tag file into `out`, one vector of tags per line.
// Each line is expected to hold single-character tags separated by single
// spaces (the layout the notebook's write_results produces), so the tags sit at
// the even character offsets. Returns false when the file cannot be opened.
static bool readTagFile(const std::string &path,
                        std::vector<std::vector<std::string> > &out) {
    std::ifstream file(path.c_str());
    if (!file.is_open())
        return false;

    std::string line;
    while (std::getline(file, line)) {
        std::vector<std::string> tags;
        for (std::size_t i = 0; i < line.length(); i += 2)
            tags.push_back(std::string(1, line[i]));
        out.push_back(tags);
    }
    return true; // the stream closes itself when it goes out of scope
}

// Loads the prediction and label files, scores them with testSequential and
// prints precision, recall and F1 on separate lines, each as
// "<proportional> <binary>". Returns 1 if a file is missing or the two files
// hold a different number of sentences.
int main()
{
    const std::string predPath = "drive/My Drive/Seq_Classification/pred.txt";
    const std::string labelsPath = "drive/My Drive/Seq_Classification/labels.txt";

    std::vector<std::vector<std::string> > pred;
    std::vector<std::vector<std::string> > labels;

    if (!readTagFile(predPath, pred)) {
        std::cerr << "Could not open " << predPath << std::endl;
        return 1;
    }
    if (!readTagFile(labelsPath, labels)) {
        std::cerr << "Could not open " << labelsPath << std::endl;
        return 1;
    }
    // Sentences are compared position by position, so the counts must agree.
    if (pred.size() != labels.size()) {
        std::cerr << "Sentence count mismatch: " << pred.size()
                  << " predicted vs " << labels.size() << " labelled" << std::endl;
        return 1;
    }

    Res result = testSequential(pred,labels);

    std::cout<<result.vec[0][0]<<" "<<result.vec[0][1]<<std::endl;
    std::cout<<result.vec[1][0]<<" "<<result.vec[1][1]<<std::endl;
    std::cout<<result.vec[2][0]<<" "<<result.vec[2][1]<<std::endl;
    return 0;
}