***Word Sense Induction:***

***Methodology***

1) Lemmatizing and PoS-tagging the text based on context. I used the online UDPipe service (http://lindat.mff.cuni.cz/services/udpipe/) for this task and copied the output back into CSV format\
2) Word2vec vector representation of the words using a pre-trained model with vector length = 300. For this project I used ruscorpora_upos_skipgram_300_5_2018 from: https://rusvectores.org/ru/models/ \
3) The function "fingerprint" takes the lemmatized and tagged context words and the pre-trained model mentioned above, looks up the 300-dimensional vector of every context word that is present in the model's vocabulary, and averages these vectors to represent the context of that particular use of the target word \
4) Taking just the plain average of the vectors gave the warning "All samples have mutually equal similarities.", so I took a weighted average instead (enabled with the `--weights` flag), where the weight of a particular vector depends on the frequency of occurrence of its word\
5) Affinity Propagation clusters the contexts without a predefined number of clusters, and the result can be used
immediately as the desired sense-specific grouping. It takes two input parameters, damping and preference, which can be iteratively optimised, but I just took a safe value of 0.5 for each for the sake of trying 


**Running the program:**

Please save the code blocks below in separate files named wsi.py, helpers.py and evaluate.py respectively, and use the following command:  
!python3 [wsi path] --input [test_tagged csv path] --model [zipped model path from the above-mentioned RusVectores page] --test

Since I used the online service for tagging and lemmatization, I am attaching the tagged CSV files in the zipped folder





In [9]:
#wsi.py

from os import path
from pandas import read_csv
from evaluate import evaluate
import argparse
import sys
import numpy as np
import gensim
import logging
from sklearn.cluster import AffinityPropagation, SpectralClustering
from helpers import fingerprint, save

# Configure root logging at INFO level with a "time : level : message" layout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def main():
    """Induce word senses for every target word in the input file.

    Command-line arguments:
        --input       tab-separated file with ``word``, ``context`` and
                      ``context_id`` columns (plus ``gold_sense_id`` unless
                      ``--test`` is given).
        --model       word2vec model in text format.
        --test        predict only; skip evaluation against gold labels.
        --weights     weight word vectors by word frequency.
        --damping     Affinity Propagation damping (default 0.5).
        --preference  Affinity Propagation preference (default 0.5).

    For each unique target word the contexts are turned into averaged word
    vectors, clustered with Affinity Propagation, and the cluster labels are
    stored in the ``predict_sense_id`` column. The frame is then saved and,
    unless ``--test`` is given, evaluated with ARI.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--input', help='Path to input file with contexts', required=True)
    arg('--model', help='Path to word2vec model', required=True)
    arg('--test', dest='testing', action='store_true', help='Make predictions for test file with no gold labels?')
    arg('--weights', dest='weights', action='store_true', help='Use word weights?')
    # Clustering hyper-parameters; the defaults keep the previously
    # hard-coded values so existing invocations behave the same.
    arg('--damping', type=float, default=0.5, help='Affinity Propagation damping (default: 0.5)')
    arg('--preference', type=float, default=0.5, help='Affinity Propagation preference (default: 0.5)')

    parser.set_defaults(testing=False)
    args = parser.parse_args()

    model = gensim.models.KeyedVectors.load_word2vec_format(args.model, binary=False)
    model.init_sims(replace=True)

    df = read_csv(args.input, sep="\t", encoding="utf-8")

    # Row label -> predicted cluster id. Keyed by the frame's index so that
    # predictions stay attached to the right row even when the rows of one
    # word are not contiguous in the input file.
    predicted = {}
    goldsenses = []
    for query in df.word.unique():
        # Analyse the contexts of one target word at a time.
        subset = df[df.word == query]
        if not args.testing:
            goldsenses.append(len(subset.gold_sense_id.unique()))

        matrix = np.empty((subset.shape[0], model.vector_size))
        for position, (_, row) in enumerate(subset.iterrows()):
            con = row.context
            if isinstance(con, float):
                # A non-string (float) context is treated as an empty context.
                label = query + str(row.context_id)
                print('Empty context at', label, file=sys.stderr)
                fp = np.zeros(model.vector_size)
            else:
                # The target word itself is excluded from its own context.
                bow = [b for b in con.split() if b != query]
                fp = fingerprint(bow, model, weights=args.weights)
            matrix[position, :] = fp

        clustering = AffinityPropagation(
            preference=args.preference, damping=args.damping, random_state=None
        ).fit(matrix)

        cur_predicted = clustering.labels_.tolist()
        predicted.update(zip(subset.index, cur_predicted))
        if not args.testing:
            gold = subset.gold_sense_id
            print('Gold clusters:', len(set(gold)), file=sys.stderr)
        print('Predicted clusters:', len(set(cur_predicted)), file=sys.stderr)

    # Item assignment creates the column when the input file lacks it;
    # attribute assignment would only set a Python attribute in that case.
    df['predict_sense_id'] = [predicted[row_label] for row_label in df.index]

    fname = path.splitext(path.basename(args.input))[0]
    if args.testing:
        save(df, fname)
    else:
        res = evaluate(save(df, fname))
        print('ARI:', res)
        print('Average number of senses:', np.average(goldsenses))
        print('Variation of the number of senses:', np.std(goldsenses))
        print('Minimum number of senses:', np.min(goldsenses))
        print('Maximum number of senses:', np.max(goldsenses))


# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()


usage: ipykernel_launcher.py [-h] --input INPUT --model MODEL [--test]
                             [--weights]
ipykernel_launcher.py: error: the following arguments are required: --input, --model


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# New Section

In [None]:
#helpers.py

import sys
import numpy as np
from sklearn.manifold import TSNE
import pylab as plot

"""
   Words list
   Word2vec model in Gensim format -- Parameter Model   
    function returns average vector of words in text
  """
def fingerprint(text, model, weights=False):
    """Return the average vector of the distinct words of *text* known to *model*.

    Args:
        text: iterable of word tokens.
        model: object supporting ``word in model``, ``model[word]`` and
            ``model.vector_size``.
        weights: if true, each word vector is scaled by ``wordweight`` before
            averaging.

    Returns:
        A 1-D array of length ``model.vector_size``. It is all zeros (and a
        message is written to stderr) when no word of *text* is in the model.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # row order -- and therefore the floating-point sum below -- is the same on
    # every run. Iterating a set of strings would depend on hash randomisation.
    lexicon = list(dict.fromkeys(w for w in text if w in model))
    if not lexicon:
        print('Empty lexicon in', text, file=sys.stderr)
        return np.zeros(model.vector_size)

    vectors = np.zeros((len(lexicon), model.vector_size))  # one row per distinct word
    for row, word in enumerate(lexicon):
        weight = wordweight(word, model) if weights else 1.0
        vectors[row, :] = model[word] * weight

    # Sum of the (optionally weighted) vectors divided by the number of distinct words.
    return np.divide(np.sum(vectors, axis=0), len(lexicon))


def wordweight(word, model, a=10 ** -3, w_count=30000000):
    """Return a frequency-based weight ``a / (a + p)`` for *word*.

    ``p`` is the word's count stored in *model* divided by *w_count*, so
    frequent words receive smaller weights.

    Args:
        word: vocabulary key to look up.
        model: word-vector model that stores a per-word ``count``.
        a: smoothing constant of the weighting formula.
        w_count: corpus size used to turn the count into a probability.
    """
    # gensim >= 4 removed ``KeyedVectors.vocab`` in favour of ``get_vecattr``;
    # use it when present and fall back to the legacy gensim 3.x layout.
    get_vecattr = getattr(model, 'get_vecattr', None)
    if callable(get_vecattr):
        count = get_vecattr(word, 'count')
    else:
        count = model.wv.vocab[word].count
    prob = count / w_count
    weight = a / (a + prob)
    return weight


def save(df, corpus):
    """Write *df* as a tab-separated UTF-8 file and return the file's path.

    The file is named ``<corpus>_predictions.csv``, is written without the
    index column, and its path is also reported on stdout.
    """
    destination = "".join([corpus, "_predictions.csv"])
    df.to_csv(destination, index=False, encoding="utf-8", sep="\t")
    report = "Generated dataset: {}".format(destination)
    print(report)
    return destination


In [None]:
#evaluate.py
from __future__ import print_function
import argparse
from pandas import read_csv
from sklearn.metrics import adjusted_rand_score

def gold_predict(df):
    """Return a copy of *df* with word-qualified ``predict`` and ``gold`` columns.

    Each new value is ``<word>_<sense id>``, built from the ``word`` column and
    the ``predict_sense_id`` / ``gold_sense_id`` columns. The input frame is
    left unmodified.
    """
    word = df['word']
    return df.assign(
        predict=word + '_' + df['predict_sense_id'],
        gold=word + '_' + df['gold_sense_id'],
    )


def ari_per_word_weighted(df):
    """Compute the adjusted Rand index per word and its size-weighted mean.

    Returns:
        A pair ``(weighted_ari, per_word)``. ``per_word`` maps each unique
        value of the ``word`` column to ``(ari, number_of_rows)``, and
        ``weighted_ari`` is the mean of the ARIs weighted by those row counts.
    """
    labelled = gold_predict(df)

    per_word = {}
    for word in labelled.word.unique():
        rows = labelled.loc[labelled['word'] == word]
        score = adjusted_rand_score(rows.gold, rows.predict)
        per_word[word] = (score, len(rows))

    weighted_sum = sum(score * size for score, size in per_word.values())
    row_total = sum(size for _, size in per_word.values())

    return weighted_sum / row_total, per_word


def evaluate(dataset_fpath):
    """Print a per-word ARI table for a predictions file and return the overall ARI.

    *dataset_fpath* is read as a tab-separated file whose ``gold_sense_id``
    and ``predict_sense_id`` columns are loaded as strings. One line is
    printed per word (in sorted order), followed by a summary line with the
    weighted ARI and the total number of rows.
    """
    frame = read_csv(
        dataset_fpath,
        sep='\t',
        dtype={'gold_sense_id': str, 'predict_sense_id': str},
    )
    overall, per_word = ari_per_word_weighted(frame)

    header = '{}\t{}\t{}'.format('word', 'ari', 'count')
    print(header)

    for word in sorted(per_word):
        score, size = per_word[word]
        print('{}\t{:.6f}\t{:d}'.format(word, score, size))

    print('\t{:.6f}\t{:d}'.format(overall, len(frame)))
    return overall


def main():
    """Command-line entry point: evaluate the predictions file given as ``dataset``."""
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument('dataset', type=argparse.FileType('r'))
    args = parser.parse_args()
    # argparse.FileType opens the file while parsing and never closes it, so
    # close the handle explicitly once the evaluation is done.
    with args.dataset:
        evaluate(args.dataset)


# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
    main()

In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [2]:
# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so that `drive` can access Google Drive files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# Handle to the Drive file with this ID.
# NOTE(review): presumably the zipped RusVectores model -- confirm.
downloaded = drive.CreateFile({'id':"1kQMG2P-qCK45ev1URrnMTD5nQSZxxfR1"})

In [4]:
# Save the Drive file's content under this local file name.
downloaded.GetContentFile('ruscorpora_upos_skipgram_300_5_2018.vec.gz')

In [20]:
!python3 /content/wsi.py --input /content/test_tagged.csv --model /content/ruscorpora_upos_skipgram_300_5_2018.vec.gz --test

2021-11-28 08:48:02,982 : INFO : loading projection weights from /content/ruscorpora_upos_skipgram_300_5_2018.vec.gz
2021-11-28 08:49:08,647 : INFO : loaded (195071, 300) matrix from /content/ruscorpora_upos_skipgram_300_5_2018.vec.gz
2021-11-28 08:49:08,647 : INFO : precomputing L2-norms of word weight vectors
Predicted clusters: 112
Predicted clusters: 135
Predicted clusters: 84
Predicted clusters: 60
Predicted clusters: 20
Predicted clusters: 148
Predicted clusters: 79
Generated dataset: test_tagged_predictions.csv


**Results and Discussion**

