<div align="center"><h1>HW3, Part1</h1></div>
<div align="center"><h2>Mohammadreza Ghofrani, 400131076</h2></div>

In [1]:
!pip install --no-deps bert-embedding mxnet gluonnlp mxnet-cu92

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-embedding
  Downloading bert_embedding-1.0.1-py3-none-any.whl (13 kB)
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[K     |████████████████████████████████| 49.1 MB 1.9 MB/s 
[?25hCollecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 55.7 MB/s 
[?25hCollecting mxnet-cu92
  Downloading mxnet_cu92-1.7.0-py2.py3-none-manylinux2014_x86_64.whl (789.8 MB)
[K     |████████████████████████████████| 789.8 MB 15 kB/s 
[?25hBuilding wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37m-linux_x86_64.whl size=595732 sha256=6c258d15c02c23fb8683c0c69faa4aca662c36395a039aa554337f51e3d3e9b6
  Stored in directory: /root/.cache/pip/wheels/be/b4/06/7f3fdfaf707e6b5e98b79c041

In [2]:
import os
import re
import numpy as np
import mxnet as mx
import pandas as pd
from tqdm import tqdm
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from bert_embedding import BertEmbedding

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset

In [None]:
!gdown 1oin_Sw1Gk_WLS9zpDrap5FdlRcUrTP_D
!gdown 1EvtGQ8-sYXXQ3VA9ByjD4OmFH13WxvAs
!gdown 154f-z0PsPAp0yvOLdNXP8NXYjPljpgCZ
!gdown 1EDG_j6F5ohIjpkihRQd0-i9daHWV7StN
!mkdir data
!mv *.txt data/

Downloading...
From: https://drive.google.com/uc?id=1oin_Sw1Gk_WLS9zpDrap5FdlRcUrTP_D
To: /content/Sentenses_train.txt
100% 1.23M/1.23M [00:00<00:00, 73.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EvtGQ8-sYXXQ3VA9ByjD4OmFH13WxvAs
To: /content/Senses_train.txt
100% 49.7k/49.7k [00:00<00:00, 51.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=154f-z0PsPAp0yvOLdNXP8NXYjPljpgCZ
To: /content/Sentenses_test.txt
100% 213k/213k [00:00<00:00, 14.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EDG_j6F5ohIjpkihRQd0-i9daHWV7StN
To: /content/Senses_test.txt
100% 8.76k/8.76k [00:00<00:00, 7.97MB/s]


## Read files and embeddings

In [None]:
def ds_reader(ds_name, context_radius=10):
    dataset = {'word':[], 'embedding':[], 'sense':[]}
    bert_embedding = BertEmbedding()
    with open(f'data/Sentenses_{ds_name}.txt', 'r', encoding='utf-8') as fsentence, \
         open(f'data/Senses_{ds_name}.txt', 'r', encoding='utf-8') as fsense:
        senses = fsense.read().split('\n')
        sentences = fsentence.read().split('\n')
        for sentence, sense in tqdm(zip(sentences, senses), total=len(sentences)):
            match = re.search('<head>(.*?)</head>', sentence)
            if match:
                target = match.group(0) \
                    .replace('<head>', '') \
                    .replace('</head>', '') \
                    .strip() \
                    .lower()
                mytokens = sentence.replace('<head>', '') \
                                .replace('</head>', '') \
                                .lower() \
                                .split(' ')
                context = ' '.join(mytokens[max(0, mytokens.index(target) - context_radius): \
                                            min(mytokens.index(target) + context_radius, len(mytokens))])
                berttokens, embeds = bert_embedding([context,])[0]
                target_index = berttokens.index(target)
                dataset['word'].append(target)
                dataset['embedding'].append(embeds[target_index])
                dataset['sense'].append(sense)
    return pd.DataFrame(dataset)

In [None]:
train_dataframe = ds_reader('train')
test_dataframe = ds_reader('test')

100%|██████████| 6325/6325 [43:31<00:00,  2.42it/s]
100%|██████████| 1117/1117 [07:26<00:00,  2.50it/s]


## Reducing Embedding Dimension


In [7]:
pca = PCA(n_components=300)
train_dataframe['reduced_embedding'] = pca.fit_transform(np.array([vec for vec in train_dataframe['embedding'].values])).tolist()
test_dataframe['reduced_embedding'] = pca.transform(np.array([vec for vec in test_dataframe['embedding'].values])).tolist()

## Final Processes (grouping, categorizing, ...)

In [8]:
def final_dataset_generator(dataset):
    wordshape = {'hard':["hard", "harder", "hardest"],
                 'interest': ["interest", "interests", "interested", "interesting"],
                 'line': ["line", "lines"],
                 'serve': ["serve", "served", "serves"]}
    word_sense = {'hard': ['HARD1', 'HARD2', 'HARD3'],
                  'interest': ['interest1', 'interest2', 'interest3', 'interest4', 'interest5', 'interest6'],
                  'line': ['division', 'cord', 'phone', 'formation', 'product', 'text'],
                  'serve': ['SERVE10', 'SERVE12']}
    
    output = {ws:[] for ws in wordshape}

    le = LabelEncoder()
    for ws in wordshape:
        frames = []
        for word in wordshape[ws]:
            for sense in word_sense[ws]:
                try:
                    frames.append(dataset.groupby(['word', 'sense']).get_group((word,sense)))
                except KeyError as e:
                    pass
        df = pd.concat(frames)
        le.fit(df.sense)
        x = np.array([np.array(vec) for vec in df.reduced_embedding.values])
        y = le.transform(df.sense)
        output[ws] = [x, y]
    return output

In [9]:
train_dataset = final_dataset_generator(train_dataframe)
test_dataset = final_dataset_generator(test_dataframe)

# Results

In [16]:
for word in train_dataset:
    xtest, ytest = test_dataset[word]
    xtrain, ytrain = train_dataset[word]
    try:
        cls = svm.SVC(kernel='linear').fit(xtrain, ytrain)

        print()
        print(f"========== {word} ===========")
        print('On Train dataset')
        ypred = cls.predict(xtrain)
        acc = accuracy_score(ytrain, ypred)
        f1 = f1_score(ytrain, ypred, average='macro')
        print(f'Acc: {acc:.3f}')
        print(f'F1: {f1:.3f}')

        print('On Test dataset')
        ypred = cls.predict(xtest)
        acc = accuracy_score(ytest, ypred)
        f1 = f1_score(ytest, ypred, average='macro')
        print(f'Acc: {acc:.3f}')
        print(f'F1: {f1:.3f}')

    except ValueError:
        print("SVM classification can't be done for word:", word)

SVM classification can't be done for word: hard

On Train dataset
Acc: 1.000
F1: 1.000
On Test dataset
Acc: 0.940
F1: 0.822

On Train dataset
Acc: 1.000
F1: 1.000
On Test dataset
Acc: 0.948
F1: 0.947

On Train dataset
Acc: 1.000
F1: 1.000
On Test dataset
Acc: 1.000
F1: 1.000
