In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas chained-assignment
# warnings and library deprecation notices — consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import nltk
import torch
from transformers import BertTokenizer, BertModel 
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier , VotingClassifier
from sklearn.metrics import accuracy_score , f1_score

# Data Preprocessing

In [None]:
# Load the train / test splits. Later cells read the columns `context`,
# `instance_id`, `doc_src`, `sense_id`, `word` and `pos` from these frames.
train_csv_path = '/content/drive/MyDrive/nlp/HW3/train1.csv'
test_csv_path = '/content/drive/MyDrive/nlp/HW3/test.csv'

train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
def remove_tag(contex):
  """Return ``contex`` with every ``<head>`` and ``</head>`` marker removed.

  The text between the markers is kept; only the markers themselves are
  dropped.
  """
  for marker in ("<head>", "</head>"):
    contex = contex.replace(marker, "")
  return contex

import re
def find_head(contex):
  """Return the word enclosed by the first ``<head>word</head>`` marker.

  Args:
    contex: context string in which the target word is wrapped in
      ``<head>``/``</head>`` tags.

  Returns:
    The text between the tags of the first marker (word characters only).

  Raises:
    ValueError: if ``contex`` contains no ``<head>word</head>`` marker.
      (Previously this surfaced as an uninformative ``IndexError``.)
  """
  # A capture group replaces the former fixed-offset slice `[6:-7]`, which
  # silently depended on the exact length of the tag strings.
  match = re.search(r'<head>(\w+)</head>', contex)
  if match is None:
    raise ValueError(f'no <head>word</head> marker found in context: {contex[:80]!r}')
  return match.group(1)

In [None]:
# For both splits: first record the tagged target word in `head`, then strip
# the tags from `context`. The order matters — find_head needs the tags.
for frame in (train_data, test_data):
  frame['head'] = frame['context'].apply(find_head)
  frame['context'] = frame['context'].apply(remove_tag)

In [None]:
# Drop the identifier columns that are not used afterwards and expose the
# sense id under the name `label`.
unused_columns = ['instance_id' , 'doc_src']
label_column = {'sense_id': 'label'}

train_data = train_data.drop(columns=unused_columns).rename(columns=label_column)
test_data = test_data.drop(columns=unused_columns).rename(columns=label_column)

In [None]:
# Naive sentence segmentation: split every context on '.' and keep the pieces
# as a list in `sent`.
for frame in (train_data, test_data):
  frame['sent'] = frame['context'].map(lambda context: context.split('.'))

In [None]:
# Reduce `sent` (a list of sentence pieces) to the single piece that contains
# the head word as a whole token.
tokenizer = RegexpTokenizer(r'\w+')
main_sents = []
for sentences, head in zip(train_data['sent'], train_data['head']):
  main_sent = ''  # stays empty when no piece contains the head as a token
  for s in sentences:
    if head in tokenizer.tokenize(s):
      main_sent = s  # no break: the last matching piece wins, as before
  main_sents.append(main_sent)
# Assign the whole column at once. The former `train_data['sent'][i] = ...`
# is chained assignment, which pandas does not guarantee to write through
# (and which never does under Copy-on-Write).
train_data['sent'] = main_sents

In [None]:
# Reduce `sent` (a list of sentence pieces) to the single piece that contains
# the head word as a whole token.
tokenizer = RegexpTokenizer(r'\w+')
main_sents = []
for sentences, head in zip(test_data['sent'], test_data['head']):
  main_sent = ''  # stays empty when no piece contains the head as a token
  for s in sentences:
    if head in tokenizer.tokenize(s):
      main_sent = s  # no break: the last matching piece wins, as before
  main_sents.append(main_sent)
# Assign the whole column at once. The former `test_data['sent'][i] = ...`
# is chained assignment, which pandas does not guarantee to write through
# (and which never does under Copy-on-Write).
test_data['sent'] = main_sents

Tokenize with the `transformers` BERT tokenizer

In [None]:
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint.
# This rebinds `tokenizer`, which until now referred to the NLTK RegexpTokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

add [CLS] and [SEP] tokens to the main sentence

In [None]:
# Wrap the main sentence in BERT's special tokens, then derive the token
# strings, their vocabulary ids and an all-ones id list of the same length.
train_data['sent'] = "[CLS] " + train_data['sent'] + " [SEP]"
train_data['tokens'] = train_data['sent'].map(tokenizer.tokenize)
train_data['indx'] = train_data['tokens'].map(tokenizer.convert_tokens_to_ids)
train_data['segments_ids'] = train_data['tokens'].map(lambda toks : [1 for _ in toks])

In [None]:
# Wrap the main sentence in BERT's special tokens, then derive the token
# strings, their vocabulary ids and an all-ones id list of the same length.
test_data['sent'] = "[CLS] " + test_data['sent'] + " [SEP]"
test_data['tokens'] = test_data['sent'].map(tokenizer.tokenize)
test_data['indx'] = test_data['tokens'].map(tokenizer.convert_tokens_to_ids)
test_data['segments_ids'] = test_data['tokens'].map(lambda toks : [1 for _ in toks])

# Bert Representation

Build the input tensors that the BERT model expects

In [None]:
# Turn each id list into a tensor with a leading batch axis of size one
# (`torch.tensor([ids])`), for both splits.
for frame in (train_data, test_data):
  frame['tokens_tensor'] = frame['indx'].map(lambda ids : torch.tensor([ids]))
  frame['segments_tensors'] = frame['segments_ids'].map(lambda seg : torch.tensor([seg]))

Embed the tokens with the pretrained `transformers` BERT model

In [None]:
# Load the pretrained BERT encoder; output_hidden_states=True makes the forward
# pass also return the hidden states of every layer (read later as outputs[2]).
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True )
# Switch to evaluation mode: the model is only used for inference here.
model.eval()

In [None]:
# Smoke test: run BERT on one training instance (row 10) and keep its
# per-layer hidden states in `x` for inspection.
with torch.no_grad():
  # The second positional parameter of transformers' BertModel.forward is
  # `attention_mask`, so the all-ones "segments" tensor has always acted as an
  # attention mask. Passing it by keyword makes that explicit (same numerics).
  outputs = model(input_ids=train_data['tokens_tensor'][10],
                  attention_mask=train_data['segments_tensors'][10])
  x = outputs[2]

In [None]:
# Run BERT once per instance (batch of one, no gradients) and store the tuple
# of per-layer hidden states (`outputs[2]`) in the `embedding` column.
for frame in (train_data, test_data):
  frame['embedding'] = None  # object column, filled row by row below
  for i in range(len(frame)):
    with torch.no_grad():
      # The second positional parameter of transformers' BertModel.forward is
      # `attention_mask`; the all-ones tensor is passed by keyword so the call
      # says what it actually does (numerically identical to the former
      # positional call).
      outputs = model(input_ids=frame.at[i, 'tokens_tensor'],
                      attention_mask=frame.at[i, 'segments_tensors'])
    # `.at` writes into the frame itself. The former `frame['embedding'][i] = ...`
    # is chained assignment, which pandas does not guarantee to write through.
    frame.at[i, 'embedding'] = outputs[2]

In [None]:
# Convert each row's tuple of per-layer tensors into one NumPy array:
# stack the layers on a new leading axis, then drop the size-one batch axis
# (axis 1), giving an array indexed as [layer][token][feature].
for frame in (train_data, test_data):
  for i in range(len(frame)):
    a = torch.stack(frame.at[i, 'embedding'], dim=0)
    a = torch.squeeze(a, dim=1)
    # `.at` replaces the cell in place, row by row. The former
    # `frame['embedding'][i] = a` is chained assignment, which pandas does not
    # guarantee to write through.
    frame.at[i, 'embedding'] = a.numpy()

# MLP Model

extract [CLS] embedding

In [None]:
# Take the vector of the first token ([CLS]) from hidden-state index 12 of
# every instance. Built as a whole column with `.apply` instead of the former
# chained `frame['cls'][i] = ...`, which pandas does not guarantee to write
# through.
train_data['cls'] = train_data['embedding'].apply(lambda layers : layers[12][0])
test_data['cls'] = test_data['embedding'].apply(lambda layers : layers[12][0])

In [None]:
# Distinct target words, in order of first appearance in the training set;
# the cells below train one classifier per word.
word_list = train_data['word'].unique().tolist()

In [None]:
# Per-word snapshots of both splits. `.copy()` makes each entry an independent
# frame, because the following cells overwrite its `label` column. The
# comprehension also avoids the former loop variable `id`, which shadowed the
# builtin, and the redundant `word_list[id]` lookup.
dataset = {word: train_data.loc[train_data['word'] == word].copy() for word in word_list}

test_dataset = {word: test_data.loc[test_data['word'] == word].copy() for word in word_list}

In [None]:
# Encode the sense labels of every word as integers, with one LabelEncoder per word.
label_tansformers = {}
for word in word_list:
  # Fit on the training senses of this word only, then encode the training labels.
  label_tansformers[word] = preprocessing.LabelEncoder().fit(dataset[word]['label'].values)
  dataset[word]['label'] = label_tansformers[word].transform(dataset[word]['label'].values)
  # Test senses that never occurred in training are replaced by the placeholder '<unknown>'.
  test_dataset[word]['label'] = test_dataset[word]['label'].map(lambda s: '<unknown>' if s not in label_tansformers[word].classes_ else s)
  # Register the placeholder as an extra, last class so that transform() accepts it.
  # NOTE(review): appending leaves classes_ unsorted unless '<unknown>' sorts after
  # every real label — confirm transform() still maps labels correctly.
  label_tansformers[word].classes_ = np.append(label_tansformers[word].classes_, '<unknown>')
  test_dataset[word]['label'] = label_tansformers[word].transform(test_dataset[word]['label'].values)

train MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Train one MLP per target word on the [CLS] vectors and score it on that
# word's test instances. `acc` and `f1` hold one entry per word, aligned with
# `word_list`; the printed figures are unweighted means over words.
acc = []
f1 = []
for word in word_list:
  x_train = np.vstack(dataset[word]['cls'].values)
  y_train = dataset[word]['label'].values
  # Fixed seed: weight initialisation and batch shuffling are otherwise random,
  # so the reported scores would change on every run.
  clf = MLPClassifier(hidden_layer_sizes=(256,) ,activation='relu' , solver='adam' , random_state=42)
  clf.fit(x_train, y_train)
  x_test = np.vstack(test_dataset[word]['cls'].values)
  y_test = test_dataset[word]['label'].values
  y_pred = clf.predict(x_test)
  acc.append(accuracy_score(y_test, y_pred))
  f1.append(f1_score(y_test, y_pred , average='weighted'))

print (f'Accuracy => {100 * np.mean(acc):0.2f} %' )
print (f'F1-Measure => {np.mean(f1):0.2f}' )

Accuracy => 56.67 %
F1-Measure => 0.53


In [None]:
# Lists of target words per part of speech. These do not depend on any single
# word, so they are computed once; the former `for word in word_list:` wrapper
# recomputed the same three lists on every iteration.
# NOTE(review): assumes the first three distinct `pos` values appear in the
# order verb, noun, adjective — confirm against the data.
pos_tags = train_data['pos'].unique().tolist()
verb = train_data[train_data['pos'] == pos_tags[0]]['word'].unique().tolist()
noun = train_data[train_data['pos'] == pos_tags[1]]['word'].unique().tolist()
adj = train_data[train_data['pos'] == pos_tags[2]]['word'].unique().tolist()

In [None]:
# Break the per-word scores in `acc` / `f1` (aligned with `word_list`) down by
# part of speech and print the mean of each group.
verb_index = list(map(word_list.index, verb))
noun_index = list(map(word_list.index, noun))
adj_index = list(map(word_list.index, adj))

verb_acc, verb_f1 = [acc[k] for k in verb_index], [f1[k] for k in verb_index]
noun_acc, noun_f1 = [acc[k] for k in noun_index], [f1[k] for k in noun_index]
adj_acc, adj_f1 = [acc[k] for k in adj_index], [f1[k] for k in adj_index]

print (f'Accuracy => verb: {100 * np.mean(verb_acc):0.2f} %  noun: {100*np.mean(noun_acc):0.2f} %  adjective: {100*np.mean(adj_acc):0.2f} %')
print (f'F1-Measure => verb: {np.mean(verb_f1):0.2f}  noun: {np.mean(noun_f1):0.2f}   adjective: {np.mean(adj_f1):0.2f} ')

Accuracy => verb: 57.65 %  noun: 60.83 %  adjective: 33.71 %
F1-Measure => verb: 0.53  noun: 0.57   adjective: 0.29 


# Bert-LSTM Model

find head index and extract [CLS] [HEAD] [SEP] embeddings

In [None]:
# First sub-token that the BERT tokenizer produces for each head word; it is
# used below to locate the head inside the tokenized sentence.
for frame in (train_data, test_data):
  frame['head_token'] = frame['head'].map(lambda head : tokenizer.tokenize(head)[0] )

In [None]:
# For every instance build an array of three vectors from hidden-state index
# 12: the first token ([CLS]), the first occurrence of the head's first
# sub-token, and the last token ([SEP]).
for frame in (train_data, test_data):
  frame['cls_head_sep'] = None  # object column, filled row by row below
  for i in range(len(frame)):
    last_layer = frame.at[i, 'embedding'][12]
    # list.index raises ValueError if the head token is absent from the sentence.
    head_idx = frame.at[i, 'tokens'].index(frame.at[i, 'head_token'])
    # `.at` writes into the frame itself. The former
    # `frame['cls_head_sep'][i] = ...` is chained assignment, which pandas
    # does not guarantee to write through.
    frame.at[i, 'cls_head_sep'] = np.array([last_layer[0], last_layer[head_idx], last_layer[-1]])

Prepare the data as input for the RNN

In [None]:
# Example for the word 'activate': stack its per-instance [CLS]/[HEAD]/[SEP]
# arrays into one array per split.
# The rows are selected from train_data / test_data directly. The per-word
# `dataset` / `test_dataset` snapshots available at this point were taken
# before the `cls_head_sep` column existed, so indexing them here raised a
# KeyError when the notebook was run top to bottom.
data1 = train_data.loc[train_data['word'] == 'activate', 'cls_head_sep'].values
data1 = np.array(list(data1))

data2 = test_data.loc[test_data['word'] == 'activate', 'cls_head_sep'].values
data2 = np.array(list(data2))

In [None]:
# Rebuild the per-word snapshots so that they include the `cls_head_sep`
# column added above. `.copy()` makes each entry an independent frame, because
# the next cell overwrites its `label` column. The comprehension also avoids
# the former loop variable `id`, which shadowed the builtin.
dataset = {word: train_data.loc[train_data['word'] == word].copy() for word in word_list}

test_dataset = {word: test_data.loc[test_data['word'] == word].copy() for word in word_list}

In [None]:
# Encode the sense labels of every word as integers, with one LabelEncoder per word.
label_tansformers = {}
for word in word_list:
  # Fit on the training senses of this word only, then encode the training labels.
  label_tansformers[word] = preprocessing.LabelEncoder().fit(dataset[word]['label'].values)
  dataset[word]['label'] = label_tansformers[word].transform(dataset[word]['label'].values)
  # Test senses that never occurred in training are replaced by the placeholder '<unknown>'.
  test_dataset[word]['label'] = test_dataset[word]['label'].map(lambda s: '<unknown>' if s not in label_tansformers[word].classes_ else s)
  # Register the placeholder as an extra, last class so that transform() accepts it.
  # NOTE(review): appending leaves classes_ unsorted unless '<unknown>' sorts after
  # every real label — confirm transform() still maps labels correctly.
  label_tansformers[word].classes_ = np.append(label_tansformers[word].classes_, '<unknown>')
  test_dataset[word]['label'] = label_tansformers[word].transform(test_dataset[word]['label'].values)

In [None]:
from keras.layers import LSTM ,Dense , Dropout
from keras.models import Sequential

use keras to build the LSTM model

In [None]:
def creat_model(classes):
  """Build and compile the stacked-LSTM sense classifier.

  The network reads sequences of shape (3, 768), passes them through two
  30-unit LSTM layers and a 32-unit dense layer (dropout 0.2 after each), and
  ends in a softmax over ``classes`` outputs.

  Args:
    classes: number of output classes of the final softmax layer.

  Returns:
    The compiled Keras ``Sequential`` model (Adam optimizer, sparse
    categorical cross-entropy loss, accuracy metric).
  """
  layers = [
    LSTM(30, input_shape=(3,768), activation='relu', return_sequences=True, recurrent_dropout=0.2),
    Dropout(0.2),
    LSTM(30, activation='relu', recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(classes, activation='softmax'),
  ]
  net = Sequential(layers)
  net.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return net

In [None]:
# Show the architecture of the LSTM classifier for an example word with
# 4 senses. The global `model` is the torch BertModel, which has no
# `.summary()`, so a Keras model is built here. `summary()` prints by itself.
creat_model(4).summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_23 (LSTM)               (None, 3, 128)            459264    
_________________________________________________________________
dropout_30 (Dropout)         (None, 3, 128)            0         
_________________________________________________________________
lstm_24 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_31 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 32)                4128      
_________________________________________________________________
dropout_32 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 4)               

train LSTM model

In [None]:
# Train one LSTM classifier per target word on its [CLS]/[HEAD]/[SEP] vectors
# and score it on that word's test instances. `acc` and `f1` hold one entry
# per word, aligned with `word_list`.
acc = []
f1 = []
for word in word_list:
  x = np.array(list(dataset[word]['cls_head_sep'].values))
  y = dataset[word]['label'].values
  x_t = np.array(list(test_dataset[word]['cls_head_sep'].values))
  y_t = test_dataset[word]['label'].values
  # classes_ carries the extra '<unknown>' placeholder appended during label
  # encoding; the network only needs outputs for the real training senses.
  classes = len(label_tansformers[word].classes_)-1
  lstm_model = creat_model(classes)
  lstm_model.fit(x , y ,epochs=120,batch_size=64)
  # `Sequential.predict_classes` no longer exists in current Keras; the argmax
  # over the softmax outputs yields the same class indices.
  yp = np.argmax(lstm_model.predict(x_t), axis=-1)
  acc.append(accuracy_score(y_t,yp))
  f1.append(f1_score(y_t,yp , average='weighted'))

In [None]:
# Unweighted mean over words of the per-word accuracy / weighted-F1 scores.
print (f'Accuracy => {100 * np.mean(acc):0.2f} %' )
print (f'F1-Measure => {np.mean(f1) :0.2f}' )

Accuracy => 70.51 %
F1-Measure => 0.67


In [None]:
# Break the per-word scores in `acc` / `f1` (aligned with `word_list`) down by
# part of speech and print the mean of each group.
verb_index = list(map(word_list.index, verb))
noun_index = list(map(word_list.index, noun))
adj_index = list(map(word_list.index, adj))

verb_acc, verb_f1 = [acc[k] for k in verb_index], [f1[k] for k in verb_index]
noun_acc, noun_f1 = [acc[k] for k in noun_index], [f1[k] for k in noun_index]
adj_acc, adj_f1 = [acc[k] for k in adj_index], [f1[k] for k in adj_index]

print (f'Accuracy => verb: {100 * np.mean(verb_acc):0.2f} %  noun: {100*np.mean(noun_acc):0.2f} %  adjective: {100*np.mean(adj_acc):0.2f} %')
print (f'F1-Measure => verb: {np.mean(verb_f1):0.2f}  noun: {np.mean(noun_f1):0.2f}   adjective: {np.mean(adj_f1):0.2f} ')

Accuracy => verb: 71.99 %  noun: 72.94 %  adjective: 51.91 %
F1-Measure => verb: 0.69  noun: 0.72   adjective: 0.49 
