In [None]:
import warnings
# Blanket filter: every warning category is ignored from here on, so library
# deprecation notices and pandas chained-assignment warnings are not shown.
warnings.filterwarnings('ignore')

In [None]:
!pip install bert-embedding

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from bert_embedding import BertEmbedding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier , VotingClassifier
from sklearn.metrics import accuracy_score , f1_score


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Preprocessing

loading data from google drive

In [None]:
# Both CSVs are read from hard-coded Google Drive paths, so the Drive has to be
# mounted at /content/drive before this cell runs.
# Later cells read the columns 'context', 'word', 'pos', 'sense_id',
# 'instance_id' and 'doc_src' from these frames.
train_data = pd.read_csv('/content/drive/MyDrive/nlp/HW3/train1.csv')
test_data = pd.read_csv('/content/drive/MyDrive/nlp/HW3/test.csv')

remove_tag => a function that removes the `<head>` and `</head>` tags from the given context

find_head => a function that finds the head word in the given context

In [None]:
def remove_tag(contex):
  """Return `contex` with every literal "<head>" and "</head>" marker removed.

  Only the markers themselves are dropped; the text they enclose is kept.
  """
  for marker in ("<head>" , "</head>"):
    contex = contex.replace(marker , "")
  return contex

import re
def find_head(contex):
  """Return the word enclosed by the first <head>...</head> pair in `contex`.

  The enclosed text must be one unbroken run of word characters.
  Raises IndexError when `contex` contains no such pair.
  """
  first_tagged = re.findall(r'<head>\w+</head>' , contex)[0]
  # Strip the opening and closing markers from the matched text.
  return first_tagged[len("<head>"):-len("</head>")]

In [None]:
# For each split: record the tagged head word first, then strip the markers
# from the context (find_head needs the markers to still be present).
for frame in (train_data , test_data):
  frame['head'] = frame['context'].apply(find_head)
  frame['context'] = frame['context'].apply(remove_tag)

tokenizing context with a simple word tokenizer

In [None]:
# Word-level tokenizer: the pattern matches runs of word characters, so
# anything else (punctuation, whitespace) never becomes part of a token.
tokenizer = RegexpTokenizer(r'\w+')
for frame in (train_data , test_data):
  frame['tokens'] = frame['context'].apply(tokenizer.tokenize)

separating out the part of the context centered on the head word, using a specific window size

In [None]:
def limiting_contex(data , window_size = 9):
  """Keep only the tokens within `window_size` positions of the head word.

  For every row, the first occurrence of ``data['head']`` in ``data['tokens']``
  is located, and the slice holding the head plus up to ``window_size`` tokens
  on each side of it is stored in a new ``'limited_context'`` column.

  Args:
    data: DataFrame with a ``'tokens'`` column (list of str per row) and a
      ``'head'`` column (str per row).
    window_size: maximum number of tokens kept on each side of the head.

  Returns:
    The same DataFrame object, with ``'limited_context'`` added (one list of
    tokens per row).

  Raises:
    ValueError: if a row's head word is not among its tokens.
  """
  limited = []
  for tokens , head in zip(data['tokens'] , data['head']):
    # NOTE: index() returns the first occurrence, which is not necessarily the
    # tagged one when the head word appears more than once in the context.
    idx = tokens.index(head)
    # Clamp the left edge by hand, because a negative slice start would count
    # from the end of the list. The right edge needs no clamp: slicing stops
    # at the end of this row's token list.
    start = max(0 , idx - window_size)
    stop = idx + window_size + 1
    limited.append(tokens[start:stop])
  # Assign the whole column at once instead of writing cell by cell through
  # chained indexing, which can end up writing to a temporary copy.
  data['limited_context'] = pd.Series(limited , index=data.index , dtype=object)
  return data

In [None]:
# Adds the 'limited_context' column to both splits, using the default window size of 9.
train_data = limiting_contex(train_data)
test_data = limiting_contex(test_data)

detokenize the tokens to make a new, smaller context

In [None]:
# Join each token window back into one string per row.
detokenize = TreebankWordDetokenizer().detokenize
for frame in (train_data , test_data):
  frame['limited_context'] = frame['limited_context'].apply(detokenize)

get rid of useless columns in the data

In [None]:
# Drop the columns that no later cell reads and expose the sense id as 'label'.
unused_columns = ['instance_id' , 'doc_src' , 'tokens']
train_data = train_data.drop(columns=unused_columns).rename(columns={'sense_id': 'label'})
test_data = test_data.drop(columns=unused_columns).rename(columns={'sense_id': 'label'})

# Bert Embedding and make Representations

embedding context's tokens with a Bert model

In [None]:
# One plain-text context per training row.
sentences = train_data['limited_context'].values
# Default BertEmbedding(); the vocabulary and model weights are downloaded when
# they are not cached locally (see the log below).
bert_embedding = BertEmbedding()
# Later cells read result[i][0] as the token list of sentence i and
# result[i][1] as the matching list of vectors.
result = bert_embedding(sentences)

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...


In [None]:
# Same embedding model, applied to the test contexts; test_result has the
# same per-sentence layout as result.
test_sentences = test_data['limited_context'].values
test_result = bert_embedding(test_sentences)

represent each instance with only the representation of the head word

In [None]:
def representation1(data , embedding):
  """Attach a 'rep1' column holding the embedding vector of each head word.

  `embedding[i]` is read as a (tokens, vectors) pair for row i. The vector
  picked is the one at the first position where the lower-cased head appears
  among the tokens. The frame is modified in place and returned.
  """
  def head_vector(row):
    tokens , vectors = embedding[row][0] , embedding[row][1]
    return vectors[tokens.index(data['head'][row].lower())]

  data['rep1'] = [head_vector(row) for row in range(len(data))]
  return data

In [None]:
# rep1: the BERT vector of the head word, for both splits.
train_data = representation1(train_data , result)
test_data = representation1(test_data , test_result)

calculate tf-idf and make a list that contains the tf-idf weight of each token in each context

In [None]:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf = True , use_idf = True)
tfidf = tfidf_vectorizer.fit_transform(train_data['limited_context'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old name on releases that predate it.
_feature_names = getattr(tfidf_vectorizer , 'get_feature_names_out' , None) or tfidf_vectorizer.get_feature_names
f_names = np.array(_feature_names())

# tfidf_doc_word[i] maps every vocabulary term present in training context i
# to its tf-idf weight.
tfidf_doc_word = []
for i in range(len(train_data)):
   _ , index = tfidf[i].nonzero()
   my_dict = dict(zip(f_names[index].tolist(), tfidf[i , index].toarray().reshape(-1)))
   tfidf_doc_word.append(my_dict)

In [None]:
# NOTE(review): this fits a second vectorizer on the test contexts, so the test
# weights come from test-set document frequencies and not from the vocabulary
# and IDF learned on the training data. Confirm that this is intended.
test_tfidf_vectorizer = TfidfVectorizer(sublinear_tf = True , use_idf = True)
test_tfidf = test_tfidf_vectorizer.fit_transform(test_data['limited_context'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old name on releases that predate it.
_feature_names = getattr(test_tfidf_vectorizer , 'get_feature_names_out' , None) or test_tfidf_vectorizer.get_feature_names
f_names = np.array(_feature_names())

# test_tfidf_doc_word[i] maps every vocabulary term present in test context i
# to its tf-idf weight.
test_tfidf_doc_word = []
for i in range(len(test_data)):
   _ , index = test_tfidf[i].nonzero()
   my_dict = dict(zip(f_names[index].tolist(), test_tfidf[i , index].toarray().reshape(-1)))
   test_tfidf_doc_word.append(my_dict)

represent with a weighted average of the head's neighbors

In [None]:
def representation2(data , embedding , tf_idf):
  """Add a 'rep2' column: tf-idf weighted mean of the vectors around the head.

  For row i, `embedding[i]` is a (tokens, vectors) pair and `tf_idf[i]` maps a
  token to its weight. The head and up to three tokens on each side of it are
  averaged, each vector weighted by the tf-idf weight of its token. Tokens
  without a tf-idf entry are left out.

  Args:
    data: DataFrame with a 'head' column; modified in place.
    embedding: sequence of (tokens, vectors) pairs, one per row of `data`.
    tf_idf: sequence of {token: weight} dicts, one per row of `data`.

  Returns:
    `data`, with the 'rep2' column added.

  Raises:
    ValueError: if the head is not among a row's tokens, or if no token in a
      row's window has a tf-idf weight.
  """
  rep2 = []
  for i in range(len(data)):
    tokens , vectors = embedding[i][0] , embedding[i][1]
    idx = tokens.index(data['head'][i].lower())
    # Clamp the window at both ends. In particular a negative index must not
    # be used: it would silently wrap around and pull in tokens from the end
    # of the context.
    first = max(0 , idx - 3)
    last = min(len(tokens) , idx + 4)
    vec = []
    weight = []
    for pos in range(first , last):
      try:
        weight.append(tf_idf[i][tokens[pos]])
      except KeyError:
        # The tf-idf vocabulary has no entry for this token.
        continue
      # Take the vector at this exact position. Looking the token up again
      # with index() would return the first occurrence's vector instead.
      vec.append(vectors[pos])
    if not vec:
      raise ValueError(f'row {i}: no token around the head has a tf-idf weight')
    rep2.append(np.average(vec , weights=weight , axis=0))
  data['rep2'] = rep2
  return data

In [None]:
# rep2: each split uses its own BERT output and its own per-context tf-idf dictionaries.
train_data = representation2(train_data , result , tfidf_doc_word)
test_data = representation2(test_data , test_result , test_tfidf_doc_word)

represent with a weighted average of all tokens in the limited context

In [None]:
def representation3(data , embedding , tf_idf):
  """Add a 'rep3' column: tf-idf weighted mean of all token vectors in a row.

  For row i, `embedding[i]` is a (tokens, vectors) pair and `tf_idf[i]` maps a
  token to its weight. Every token that has a tf-idf entry contributes the
  vector at its own position, weighted by that entry.

  Args:
    data: DataFrame whose rows line up with `embedding` and `tf_idf`;
      modified in place.
    embedding: sequence of (tokens, vectors) pairs, one per row of `data`.
    tf_idf: sequence of {token: weight} dicts, one per row of `data`.

  Returns:
    `data`, with the 'rep3' column added.

  Raises:
    ValueError: if no token of a row has a tf-idf weight.
  """
  rep3 = []
  for i in range(len(data)):
    tokens , vectors = embedding[i][0] , embedding[i][1]
    vec = []
    weight = []
    for pos , token in enumerate(tokens):
      try:
        weight.append(tf_idf[i][token])
      except KeyError:
        # The tf-idf vocabulary has no entry for this token.
        continue
      # Positional lookup: a repeated token keeps the vector of each of its
      # occurrences and does not reuse the first one.
      vec.append(vectors[pos])
    if not vec:
      raise ValueError(f'row {i}: no token has a tf-idf weight')
    rep3.append(np.average(vec , weights=weight , axis=0))
  data['rep3'] = rep3
  return data

In [None]:
# rep3: each split uses its own BERT output and its own per-context tf-idf dictionaries.
train_data = representation3(train_data , result , tfidf_doc_word)
test_data = representation3(test_data , test_result , test_tfidf_doc_word)

dimensionality reduction with PCA

In [None]:
# One PCA model per BERT-based representation, fitted on the training vectors
# and kept in `pca` (in rep1, rep2, rep3 order) for reuse on the test split.
# NOTE: re-running this cell reduces the already reduced columns again.
pca = []
for rep_name in ('rep1' , 'rep2' , 'rep3'):
  reducer = PCA(n_components=300)
  pca.append(reducer)
  reduced = reducer.fit_transform(np.vstack(train_data[rep_name]))
  train_data[rep_name] = [np.array(row) for row in reduced.tolist()]

# Project the test vectors with the PCA models stored in `pca`
# (transform only: nothing is refitted on the test split).
for rep_name , reducer in zip(('rep1' , 'rep2' , 'rep3') , pca):
  projected = reducer.transform(np.vstack(test_data[rep_name]))
  test_data[rep_name] = [np.array(row) for row in projected.tolist()]

load a pretrained word2vec representation

In [None]:
import gensim
# Pretrained vectors are read from a hard-coded Google Drive path.
pretrained_embeddings_path = "/content/drive/MyDrive/nlp/HW3/GoogleNews-vectors-negative300.bin.gz"
# binary=True: the file is read as the binary word2vec format.
word2vec =  gensim.models.KeyedVectors.load_word2vec_format(pretrained_embeddings_path,binary=True)

represent with a weighted average of the neighbors' word2vec representations

In [None]:
def representation4(data , embedding , tf_idf):
  """Add a 'rep4' column: tf-idf weighted mean of static vectors around the head.

  Each row's 'limited_context' is tokenized with the module-level `tokenizer`
  and lower-cased. The head and up to three tokens on each side of it are
  looked up in `embedding` and averaged, weighted by `tf_idf[i][token]`.
  Tokens missing from either lookup are left out.

  Args:
    data: DataFrame with 'limited_context' and 'head' columns; modified in
      place.
    embedding: mapping from token to vector that raises KeyError for unknown
      tokens.
    tf_idf: sequence of {token: weight} dicts, one per row of `data`.

  Returns:
    `data`, with the 'rep4' column added.

  Raises:
    ValueError: if the head is not among a row's tokens, or if no token in a
      row's window has both a tf-idf weight and a vector.
  """
  rep4 = []
  for i in range(len(data)):
    tokens = [t.lower() for t in tokenizer.tokenize(data['limited_context'][i])]
    idx = tokens.index(data['head'][i].lower())
    vec = []
    weight = []
    # Slice with a clamped start: indexing tokens[idx + n] with a negative
    # value would wrap around to the end of the context.
    for nei in tokens[max(0 , idx - 3) : idx + 4]:
      try:
        (w , v) = (tf_idf[i][nei] , embedding[nei])
      except KeyError:
        # No tf-idf weight or no vector for this token.
        continue
      weight.append(w)
      vec.append(v)
    if not vec:
      raise ValueError(f'row {i}: no token around the head has both a tf-idf weight and a vector')
    rep4.append(np.average(vec , weights=weight , axis=0))
  data['rep4'] = rep4
  return data

In [None]:
# rep4: both splits share the word2vec model; each uses its own tf-idf dictionaries.
train_data = representation4(train_data , word2vec , tfidf_doc_word)
test_data = representation4(test_data , word2vec , test_tfidf_doc_word)

represent with a weighted average of the word2vec representations of all tokens in the limited context

In [None]:
def representation5(data , embedding , tf_idf):
  """Add a 'rep5' column: tf-idf weighted mean of static vectors of all tokens.

  Each row's 'limited_context' is tokenized with the module-level `tokenizer`
  and lower-cased. Every token is looked up in `embedding` and the vectors
  are averaged, weighted by `tf_idf[i][token]`. Tokens missing from either
  lookup are left out.

  Args:
    data: DataFrame with a 'limited_context' column; modified in place.
    embedding: mapping from token to vector that raises KeyError for unknown
      tokens.
    tf_idf: sequence of {token: weight} dicts, one per row of `data`.

  Returns:
    `data`, with the 'rep5' column added.

  Raises:
    ValueError: if no token of a row has both a tf-idf weight and a vector.
  """
  rep5 = []
  for i in range(len(data)):
    vec = []
    weight = []
    for t in tokenizer.tokenize(data['limited_context'][i]):
      t = t.lower()
      try:
        (w , v) = (tf_idf[i][t] , embedding[t])
      except KeyError:
        # Only a missing weight or vector is skipped; any other error is
        # allowed to surface.
        continue
      weight.append(w)
      vec.append(v)
    if not vec:
      raise ValueError(f'row {i}: no token has both a tf-idf weight and a vector')
    rep5.append(np.average(vec , weights=weight , axis=0))
  data['rep5'] = rep5
  return data

In [None]:
# rep5: both splits share the word2vec model; each uses its own tf-idf dictionaries.
train_data = representation5(train_data , word2vec , tfidf_doc_word)
test_data = representation5(test_data , word2vec , test_tfidf_doc_word)

# Context Classify

## Part A

list all the ambiguous words

In [None]:
# Distinct target words of the training data. Later cells build their
# per-word score lists by looping over this list and look words up by position.
word_list = train_data['word'].unique().tolist()

make dictionary that contains train data for each ambiguous word

In [None]:
# One sub-frame per ambiguous word. .copy() makes each of them an independent
# frame, so the per-word 'label' assignment done later does not write into a
# slice of train_data / test_data. The dict comprehensions also avoid
# rebinding the builtin name `id` as a notebook global.
dataset = {word: train_data.loc[train_data['word'] == word].copy() for word in word_list}

test_dataset = {word: test_data.loc[test_data['word'] == word].copy() for word in word_list}

encode sense_ids with appropriate labels

In [None]:
# One LabelEncoder per word, fitted on that word's training senses only.
# NOTE(review): re-running this cell encodes the already encoded labels again.
label_tansformers = {}
for word in word_list:
  label_tansformers[word] = preprocessing.LabelEncoder().fit(dataset[word]['label'].values)
  dataset[word]['label'] = label_tansformers[word].transform(dataset[word]['label'].values)
  # Test senses that never occur in this word's training data are replaced by
  # the placeholder '<unknown>' ...
  test_dataset[word]['label'] = test_dataset[word]['label'].map(lambda s: '<unknown>' if s not in label_tansformers[word].classes_ else s)
  # ... which is appended to classes_ after fitting so that transform()
  # accepts it. It receives the last integer code.
  label_tansformers[word].classes_ = np.append(label_tansformers[word].classes_, '<unknown>')
  test_dataset[word]['label'] = label_tansformers[word].transform(test_dataset[word]['label'].values)

a function that takes the data and classifies it based on the chosen representation

In [None]:
def classify (dataset , test_dataset , rep = 'rep1' , classifier = LogisticRegression(random_state=0) , verbose = False):
  """Train and score one classifier per word of the global `word_list`.

  Args:
    dataset: dict mapping a word to its training DataFrame (columns `rep`
      and 'label').
    test_dataset: dict mapping a word to its test DataFrame (same columns).
    rep: name of the column holding the feature vectors.
    classifier: scikit-learn estimator used as a template; it is cloned for
      every word and never fitted itself.
    verbose: when true, print the mean accuracy and mean weighted F1 over
      all words.

  Returns:
    (accuracy_list, f1_list, model): two lists of per-word scores in
    `word_list` order, and a dict mapping each word to its own fitted
    estimator.
  """
  from sklearn.base import clone

  accuracy_list = []
  f1_list = []
  model = {}
  for word in word_list:
    x = np.vstack(dataset[word][rep].values)
    y = dataset[word]['label'].values
    # Fit a fresh copy for each word. Fitting `classifier` itself would leave
    # every entry of `model` pointing at one estimator that only remembers
    # the last word, and would also mutate the shared default argument.
    model[word] = clone(classifier).fit(x , y)
    x_t = np.vstack(test_dataset[word][rep].values)
    y_t = test_dataset[word]['label'].values
    yy = model[word].predict(x_t)
    accuracy_list.append(accuracy_score(y_t , yy))
    f1_list.append(f1_score(y_t , yy , average='weighted'))
  if verbose:
    # Unweighted mean over words: every word counts the same regardless of
    # how many test instances it has.
    print (f'Accuracy => {100 * np.mean(accuracy_list):0.2f} %' )
    print (f'F1-Measure => {np.mean(f1_list):0.2f}' )
  return accuracy_list , f1_list , model

In [None]:
# Baseline: logistic regression with default solver on rep1. classifire1 holds
# the whole (accuracy_list, f1_list, model) tuple returned by classify().
classifire1 = classify(dataset,test_dataset,rep = 'rep1' , classifier = LogisticRegression(random_state=0),verbose = True)

Accuracy => 68.76 %
F1-Measure => 0.65


split words based on their pos tags

In [None]:
# The three lists do not depend on any loop variable, so they are built once
# (they used to be recomputed for every word in word_list).
# NOTE(review): the verb / noun / adjective assignment is positional. It
# relies on the order in which the distinct 'pos' values appear in train_data.
# Confirm that the first three distinct tags really are verb, noun, adjective.
pos_tags = train_data['pos'].unique().tolist()
verb = train_data[train_data['pos'] == pos_tags[0]]['word'].unique().tolist()
noun = train_data[train_data['pos'] == pos_tags[1]]['word'].unique().tolist()
adj = train_data[train_data['pos'] == pos_tags[2]]['word'].unique().tolist()

classify data and show accuracy and f1-measure for each pos tag

In [None]:
def classify_tags(dataset , test_dataset , rep = 'rep1' , classifier = LogisticRegression(random_state=0)):
  """Run classify() and report its per-word scores grouped by part of speech.

  The grouping uses the global lists `verb`, `noun` and `adj`; each word's
  scores are found through its position in the global `word_list`.
  Prints the mean accuracy and mean F1 of every group and returns
  {'verb': [accuracies, f1s], 'noun': [...], 'adj': [...]}.
  """
  accuracy , f1 , _ = classify(dataset , test_dataset , rep = rep , classifier=classifier)

  def scores_for(words):
    # classify() returns its scores in word_list order.
    positions = [word_list.index(w) for w in words]
    return [[accuracy[p] for p in positions] , [f1[p] for p in positions]]

  by_tag = {'verb': scores_for(verb) , 'noun': scores_for(noun) , 'adj': scores_for(adj)}
  v_acc , v_f1 = by_tag['verb']
  n_acc , n_f1 = by_tag['noun']
  a_acc , a_f1 = by_tag['adj']
  print (f'Accuracy => verb: {100 * np.mean(v_acc):0.2f} %  noun: {100*np.mean(n_acc):0.2f} %  adjective: {100*np.mean(a_acc):0.2f} %')
  print (f'F1-Measure => verb: {np.mean(v_f1):0.2f}  noun: {np.mean(n_f1):0.2f}   adjective: {np.mean(a_f1):0.2f} ')
  return by_tag


In [None]:
# Logistic regression (SAG solver) on rep1, the PCA-reduced BERT vector of the head word.
acc,f1,model1= classify(dataset,test_dataset,rep = 'rep1' , classifier = LogisticRegression(random_state=10,solver='sag'),verbose = True)

Accuracy => 69.40 %
F1-Measure => 0.66


In [None]:
# Per-POS breakdown for logistic regression (SAG solver) on rep1; the models are trained again inside classify_tags.
dic = classify_tags(dataset,test_dataset,rep = 'rep1', classifier = LogisticRegression(random_state=10,solver='sag'))

Accuracy => verb: 70.67 %  noun: 71.46 %  adjective: 53.00 %
F1-Measure => verb: 0.67  noun: 0.69   adjective: 0.42 


In [None]:
# Logistic regression (SAG solver) on rep2, the PCA-reduced tf-idf weighted mean of the BERT vectors around the head.
acc,f1,model2= classify(dataset,test_dataset,rep = 'rep2' ,classifier = LogisticRegression(random_state=10,solver='sag'),verbose = True)

Accuracy => 63.79 %
F1-Measure => 0.59


In [None]:
# Per-POS breakdown for logistic regression (SAG solver) on rep2.
dic = classify_tags(dataset,test_dataset,rep = 'rep2',classifier = LogisticRegression(random_state=10,solver='sag'))

Accuracy => verb: 64.09 %  noun: 66.81 %  adjective: 49.86 %
F1-Measure => verb: 0.59  noun: 0.64   adjective: 0.43 


In [None]:
# Logistic regression (SAG solver) on rep3, the PCA-reduced tf-idf weighted mean of all BERT vectors in the limited context.
acc,f1,model3= classify(dataset,test_dataset,rep = 'rep3' , classifier = LogisticRegression(random_state=10,solver='sag'),verbose = True)

Accuracy => 59.82 %
F1-Measure => 0.55


In [None]:
# Per-POS breakdown for logistic regression (SAG solver) on rep3.
dic = classify_tags(dataset,test_dataset,rep = 'rep3', classifier = LogisticRegression(random_state=10,solver='sag'))

Accuracy => verb: 59.87 %  noun: 62.96 %  adjective: 47.00 %
F1-Measure => verb: 0.55  noun: 0.60   adjective: 0.39 


In [None]:
# Logistic regression (SAG solver) on rep4, the tf-idf weighted word2vec mean around the head (no PCA is applied to rep4).
acc,f1,model4= classify(dataset,test_dataset,rep = 'rep4' ,classifier = LogisticRegression(random_state=10,solver='sag'),verbose = True)

Accuracy => 54.77 %
F1-Measure => 0.47


In [None]:
# Per-POS breakdown for logistic regression (SAG solver) on rep4.
dic = classify_tags(dataset,test_dataset,rep = 'rep4', classifier = LogisticRegression(random_state=10,solver='sag'))

Accuracy => verb: 54.89 %  noun: 58.57 %  adjective: 38.86 %
F1-Measure => verb: 0.47  noun: 0.52   adjective: 0.29 


In [None]:
# Logistic regression (SAG solver) on rep5, the tf-idf weighted word2vec mean over the whole limited context (no PCA is applied to rep5).
acc,f1,model5= classify(dataset,test_dataset,rep = 'rep5' , classifier = LogisticRegression(random_state=10,solver='sag'),verbose = True)

Accuracy => 53.50 %
F1-Measure => 0.46


In [None]:
# Per-POS breakdown for logistic regression (SAG solver) on rep5.
dic = classify_tags(dataset,test_dataset,rep = 'rep5', classifier = LogisticRegression(random_state=10,solver='sag'))

Accuracy => verb: 54.43 %  noun: 55.67 %  adjective: 38.86 %
F1-Measure => verb: 0.47  noun: 0.48   adjective: 0.28 


In [None]:
# Random forest (max_depth=4) on rep1.
acc,f1,model6 = classify(dataset,test_dataset,rep = 'rep1' , classifier = RandomForestClassifier(max_depth=4, random_state=0) ,verbose = True)

Accuracy => 64.26 %
F1-Measure => 0.58


In [None]:
# Per-POS breakdown for the random forest (max_depth=4) on rep1.
dic = classify_tags(dataset,test_dataset,rep = 'rep1', classifier = RandomForestClassifier(max_depth=4, random_state=0))

Accuracy => verb: 65.32 %  noun: 66.94 %  adjective: 46.71 %
F1-Measure => verb: 0.58  noun: 0.62   adjective: 0.38 


In [None]:
# Random forest (max_depth=4) on rep2.
acc,f1,model7 = classify(dataset,test_dataset,rep = 'rep2' , classifier = RandomForestClassifier(max_depth=4, random_state=0) ,verbose = True)

Accuracy => 54.08 %
F1-Measure => 0.46


In [None]:
# Per-POS breakdown for the random forest (max_depth=4) on rep2.
dic = classify_tags(dataset,test_dataset,rep = 'rep2', classifier = RandomForestClassifier(max_depth=4, random_state=0))

Accuracy => verb: 53.62 %  noun: 58.63 %  adjective: 38.86 %
F1-Measure => verb: 0.45  noun: 0.51   adjective: 0.28 


In [None]:
# Random forest (max_depth=4) on rep3.
acc,f1,model8 = classify(dataset,test_dataset,rep = 'rep3' , classifier = RandomForestClassifier(max_depth=4, random_state=0) ,verbose = True)

Accuracy => 51.15 %
F1-Measure => 0.42


In [None]:
# Per-POS breakdown for the random forest (max_depth=4) on rep3.
dic = classify_tags(dataset,test_dataset,rep = 'rep3', classifier = RandomForestClassifier(max_depth=4, random_state=0))

Accuracy => verb: 52.03 %  noun: 52.81 %  adjective: 38.86 %
F1-Measure => verb: 0.43  noun: 0.44   adjective: 0.28 


In [None]:
# Random forest (max_depth=4) on rep4.
acc,f1,model9 = classify(dataset,test_dataset,rep = 'rep4' , classifier = RandomForestClassifier(max_depth=4, random_state=0) ,verbose = True)

Accuracy => 51.67 %
F1-Measure => 0.43


In [None]:
# Per-POS breakdown for the random forest (max_depth=4) on rep4.
dic = classify_tags(dataset,test_dataset,rep = 'rep4', classifier = RandomForestClassifier(max_depth=4, random_state=0))

Accuracy => verb: 51.77 %  noun: 54.71 %  adjective: 38.86 %
F1-Measure => verb: 0.44  noun: 0.46   adjective: 0.29 


In [None]:
# Random forest (max_depth=4) on rep5.
acc,f1,model10 = classify(dataset,test_dataset,rep = 'rep5' , classifier = RandomForestClassifier(max_depth=4, random_state=0) ,verbose = True)

Accuracy => 52.32 %
F1-Measure => 0.44


In [None]:
# Per-POS breakdown for the random forest (max_depth=4) on rep5.
dic = classify_tags(dataset,test_dataset,rep = 'rep5', classifier = RandomForestClassifier(max_depth=4, random_state=0))

Accuracy => verb: 52.92 %  noun: 53.78 %  adjective: 42.71 %
F1-Measure => verb: 0.45  noun: 0.45   adjective: 0.32 


## Part B

a function to find the most frequent item in a list

In [None]:
def most_frequent(List):
    """Return the element that occurs most often in `List`.

    Ties are resolved in favour of the element that appears first in `List`,
    so the result does not depend on set iteration order.

    Raises:
        ValueError: if `List` is empty.
    """
    from collections import Counter

    if len(List) == 0:
        raise ValueError("most_frequent() arg is an empty sequence")
    # Counter counts in one pass; most_common() keeps first-seen order among
    # elements with equal counts.
    return Counter(List).most_common(1)[0][0]

train the 3 best models on their own data and vote between them to find the ensemble label

In [None]:
# Hard-voting ensemble, trained and scored separately for every word.
# NOTE: model1 / model2 / model3 are rebound here to single estimators; the
# same names were bound to classify() results in Part A.
accuracy= []
f1= []
for word in word_list:
  # Base model 1: logistic regression (SAG solver) on rep1.
  x = np.vstack(dataset[word]['rep1'].values)
  y = dataset[word]['label'].values
  model1 = LogisticRegression(random_state=10,solver='sag').fit(x,y)
  x_t = np.vstack(test_dataset[word]['rep1'].values)
  y_t = test_dataset[word]['label'].values
  yy1 = model1.predict(x_t)
  # Base model 2: logistic regression (SAG solver) on rep2, same labels y.
  x = np.vstack(dataset[word]['rep2'].values)
  model2 = LogisticRegression(random_state=10,solver='sag').fit(x,y)
  x_t = np.vstack(test_dataset[word]['rep2'].values)
  yy2 = model2.predict(x_t)
  # Base model 3: random forest (max_depth=4) on rep1.
  x = np.vstack(dataset[word]['rep1'].values)
  model3 = RandomForestClassifier(max_depth=4, random_state=0).fit(x,y)
  x_t = np.vstack(test_dataset[word]['rep1'].values)
  yy3 = model3.predict(x_t)
  # yy is bound to the same object as yy1 (no copy), so the votes are written
  # into yy1 as well. Position i is read into `a` before it is overwritten.
  yy = yy1
  for i in range(len(yy1)):
    a = [yy1[i] , yy2[i] , yy3[i]]
    yy[i] = most_frequent(a)
  accuracy .append(accuracy_score(y_t, yy))
  f1.append(f1_score(y_t , yy , average='weighted'))
# Unweighted means over the per-word scores.
print (f'Accuracy => {100 * np.mean(accuracy):0.2f} %' )
print (f'F1-Measure => {np.mean(f1):0.2f}' )

Accuracy => 68.34 %
F1-Measure => 0.63


calculate the accuracy and f1-measure for each pos tag on ensemble model

In [None]:
# `accuracy` and `f1` hold one score per word in word_list order, so a word's
# scores are found through its position in word_list.
# This repeats the aggregation done inside classify_tags() for the lists
# currently bound to `accuracy` and `f1`.
verb_index = [word_list.index(i) for i in verb]
verb_acc = [accuracy[i] for i in verb_index]
verb_f1 = [f1[i] for i in verb_index]
noun_index = [word_list.index(i) for i in noun]
noun_acc = [accuracy[i] for i in noun_index]
noun_f1 = [f1[i] for i in noun_index]
adj_index = [word_list.index(i) for i in adj]
adj_acc = [accuracy[i] for i in adj_index]
adj_f1 = [f1[i] for i in adj_index]
print (f'Accuracy => verb: {100 * np.mean(verb_acc):0.2f} %  noun: {100*np.mean(noun_acc):0.2f} %  adjective: {100*np.mean(adj_acc):0.2f} %')
print (f'F1-Measure => verb: {np.mean(verb_f1):0.2f}  noun: {np.mean(noun_f1):0.2f}   adjective: {np.mean(adj_f1):0.2f} ')

Accuracy => verb: 69.95 %  noun: 70.08 %  adjective: 51.00 %
F1-Measure => verb: 0.65  noun: 0.66   adjective: 0.39 


## Part C

a function that returns the predicted labels for the dataset

In [None]:
def predict(classifier , rep = 'rep1'):
  """Return the test-set predictions of one classifier per word.

  Reads the global `word_list`, `dataset` and `test_dataset`.

  Args:
    classifier: scikit-learn estimator used as a template; it is cloned for
      every word, so the object passed in is left unfitted.
    rep: name of the column holding the feature vectors.

  Returns:
    dict mapping each word to the array of labels predicted for its test
    instances.
  """
  from sklearn.base import clone

  pred = {}
  for word in word_list:
    x = np.vstack(dataset[word][rep].values)
    y = dataset[word]['label'].values
    x_t = np.vstack(test_dataset[word][rep].values)
    pred[word] = clone(classifier).fit(x , y).predict(x_t)
  return pred

get labels from each of the three models

In [None]:
# Test-set predictions of the three base models, as {word: label array} dicts.
# These logistic regressions use the default solver and random_state=0,
# unlike the SAG / random_state=10 models of the voting cell in Part B.
pred1 = predict(LogisticRegression(random_state=0) , rep = 'rep1')
pred2 = predict(LogisticRegression(random_state=0) , rep = 'rep2')
pred3 = predict(RandomForestClassifier(max_depth=4, random_state=0) , rep = 'rep1')

make the input vector from labels for the classifier

In [None]:
# Meta-features: one row per test instance, one column per base model
# (pred1, pred2, pred3 in that order).
pred = {
  word: np.array([pred1[word] , pred2[word] , pred3[word]]).transpose()
  for word in word_list
}

# Encoded gold labels of the same test instances.
y_true = {word: test_dataset[word]['label'].values for word in word_list}

train a decision tree classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): each tree is fitted on the base models' test-set predictions
# together with the test-set labels, and then scored on those same rows. The
# figures printed here are therefore resubstitution scores, not a held-out
# estimate, and are not comparable with the scores of Part A and Part B.
accuracy , f1 = [] , []
for word in word_list:
  DT = DecisionTreeClassifier(random_state=0, max_depth=2).fit(pred[word],y_true[word])
  stacked = DT.predict(pred[word])
  accuracy.append(accuracy_score(y_true[word] , stacked))
  # DT.score() is mean accuracy, so the F1 list has to come from f1_score,
  # with the same weighted averaging used in the other parts.
  f1.append(f1_score(y_true[word] , stacked , average='weighted'))

print (f'Accuracy => {100 * np.mean(accuracy):0.2f} %' )
print (f'F1-Measure => {np.mean(f1):0.2f}' )

Accuracy => 77.34 %
F1-Measure => 0.77


In [None]:
# `accuracy` and `f1` hold one score per word in word_list order, so a word's
# scores are found through its position in word_list.
# This repeats the aggregation done inside classify_tags() for the lists
# currently bound to `accuracy` and `f1`.
verb_index = [word_list.index(i) for i in verb]
verb_acc = [accuracy[i] for i in verb_index]
verb_f1 = [f1[i] for i in verb_index]
noun_index = [word_list.index(i) for i in noun]
noun_acc = [accuracy[i] for i in noun_index]
noun_f1 = [f1[i] for i in noun_index]
adj_index = [word_list.index(i) for i in adj]
adj_acc = [accuracy[i] for i in adj_index]
adj_f1 = [f1[i] for i in adj_index]
print (f'Accuracy => verb: {100 * np.mean(verb_acc):0.2f} %  noun: {100*np.mean(noun_acc):0.2f} %  adjective: {100*np.mean(adj_acc):0.2f} %')
print (f'F1-Measure => verb: {np.mean(verb_f1):0.2f}  noun: {np.mean(noun_f1):0.2f}   adjective: {np.mean(adj_f1):0.2f} ')

Accuracy => verb: 77.48 %  noun: 79.28 %  adjective: 68.71 %
F1-Measure => verb: 0.77  noun: 0.79   adjective: 0.69 
