In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger

In [2]:
import numpy as np
import pandas as pd
import re

Step 1 - Find all the possible translations from Google

WordNet part-of-speech keys that the senses are classified into:

n = Noun
v = Verb
a = Adjective
s = Adjective Satellite
r = Adverb

In [3]:
#translation_list = ['disabled', 'handicapped', 'anamorphous', 'maimed', 'cripple']

In [4]:
def add_to_candidate_set(candidate_dict_synsets, synset, pos):
    """Record ``synset`` under its part-of-speech key, skipping duplicates.

    Args:
        candidate_dict_synsets: dict mapping a POS key to a list of synset
            names. It is mutated in place.
        synset: synset name to add (e.g. ``'disabled.n.01'``).
        pos: POS key the synset is filed under.

    Returns:
        The same ``candidate_dict_synsets`` object that was passed in.
    """
    # setdefault handles both the "new key" and the "existing key" case, so
    # no third branch is needed (the old "pos error" branch was unreachable).
    senses = candidate_dict_synsets.setdefault(pos, [])
    if synset not in senses:
        senses.append(synset)

    return candidate_dict_synsets

In [5]:
def get_candidate_set(translation_list):
    """Collect the WordNet senses of every translation, grouped by POS.

    Args:
        translation_list: iterable of words to look up in WordNet.

    Returns:
        dict mapping a POS key to the list of unique synset names found for
        any of the translations, in first-seen order.
    """
    candidate_dict_synsets = {}
    for translation in translation_list:
        for synset in wordnet.synsets(translation):
            # Ask the synset for its name and POS instead of parsing its
            # repr: the old regex stopped at the first apostrophe and the
            # ``split(".")[1]`` assumed the lemma itself contains no period.
            candidate_dict_synsets = add_to_candidate_set(
                candidate_dict_synsets=candidate_dict_synsets,
                synset=synset.name(),
                pos=synset.pos())

    return candidate_dict_synsets

In [6]:
#candidate_set = get_candidate_set(translation_list)
#print(candidate_set)

## Syntactic Vector

In [7]:
class SyntacticVector:
    """Computes the syntactic features (sync, synn, synr) of a word/sense pair."""

    def __init__(self, model_path='english.all.3class.distsim.crf.ser.gz',
                 jar_path='stanford-ner-4.2.0.jar'):
        """Create the Stanford NER tagger used by ``get_synn``.

        Args:
            model_path: path of the Stanford NER classifier model file.
            jar_path: path of the Stanford NER jar file.
        """
        self.ner_tagger = StanfordNERTagger(model_path, jar_path)

    def calculate_rank(self, possible_sense, sense):
        """Return the 1-based position of ``sense`` among a word's synsets.

        The word looked up is the text before the first ``.`` of
        ``possible_sense``. When ``sense`` is not one of that word's synsets
        the number of synsets is returned instead (0 if the word has none).
        """
        word = possible_sense.split('.')[0]
        all_synsets = wordnet.synsets(word)
        for position, candidate in enumerate(all_synsets, start=1):
            # Compare against the synset's own name rather than parsing
            # its repr with a regex.
            if candidate.name() == sense:
                return position

        return len(all_synsets)

    def get_sync(self, word_pos, sense, candidate_set):
        """Return 1 if ``sense`` is listed under ``word_pos`` in ``candidate_set``, else 0."""
        if sense in candidate_set.get(word_pos, ()):
            return 1
        return 0

    def get_synn(self, word):
        """Return 1 if the NER tagger gives any token of ``word`` a tag other than 'O', else 0."""
        tokens = word_tokenize(word)
        tags = self.ner_tagger.tag(tokens)

        if any(tag != 'O' for _, tag in tags):
            return 1
        return 0

    def get_synr(self, word, sense, candidate_set, word_pos, sync):
        """Return ``sync`` times the sum of reciprocal ranks of ``sense``.

        One rank is computed (via ``calculate_rank``) for every candidate
        sense filed under ``word_pos``; ranks of 0 are reported and skipped.

        Args:
            word: unused; kept so existing callers keep working.
            sense: synset name whose rank is looked up.
            candidate_set: dict mapping POS keys to lists of synset names.
            word_pos: POS key selecting which candidates are considered.
            sync: multiplier applied to the accumulated score.
        """
        # A POS with no candidates contributes nothing instead of raising
        # KeyError; get_sync already treats that case as 0.
        pos_matched_candidate = candidate_set.get(word_pos, [])

        synr = 0
        for possible_sense in pos_matched_candidate:
            rank = self.calculate_rank(possible_sense, sense)
            if rank == 0:
                # The candidate's word has no synsets, so there is no rank
                # to take the reciprocal of.
                print("Zero Division")
                continue
            synr += 1 / rank

        return sync * synr

In [8]:
def generate_syntactic_vector(word, word_pos, sense, candidate_set):
    """Build the syntactic feature array ``[sync, synn, synr]``.

    A fresh ``SyntacticVector`` is created on every call; the ``sync`` value
    is computed first because ``get_synr`` takes it as its multiplier.
    """
    extractor = SyntacticVector()

    sync = extractor.get_sync(word_pos, sense, candidate_set)
    synn = extractor.get_synn(word)
    synr = extractor.get_synr(word, sense, candidate_set, word_pos, sync)

    return np.array([sync, synn, synr])

In [9]:
#word = 'handicap'
#word_pos = 'v'
#possible_sense = candidate_set[word_pos][2]

#generate_syntactic_vector(word, word_pos, possible_sense, candidate_set)

## Semantic Vector

In [10]:
class SemanticVector():
    """Computes the semantic features (semc, semr, semd, sems) of a sense pair.

    Senses are passed as WordNet synset names such as ``'disabled.n.01'``.
    """

    def get_semc(self, sense_1, sense_2):
        """Return 1 if both names have the same second dot-separated field (the POS), else 0."""
        semc1 = sense_1.split(".")[1]
        semc2 = sense_2.split(".")[1]

        return 1 if semc1 == semc2 else 0

    def get_semr(self, sense_1, sense_2):
        """Score the WordNet relation between two senses.

        Returns:
            1 when both names resolve to the same synset, 0.8 when one is a
            direct hypernym or hyponym of the other, 0.7 when both have the
            same non-empty hypernym list, and 0 otherwise.
        """
        synset_1 = wordnet.synset(sense_1)
        synset_2 = wordnet.synset(sense_2)

        if synset_1 == synset_2:
            return 1

        hypernyms_1 = synset_1.hypernyms()
        hypernyms_2 = synset_2.hypernyms()

        if (synset_1 in hypernyms_2 or synset_1 in synset_2.hyponyms()
                or synset_2 in hypernyms_1 or synset_2 in synset_1.hyponyms()):
            return 0.8

        # Two empty hypernym lists compare equal, which used to score any
        # pair of synsets without hypernyms as siblings. Require an actual
        # shared parent list.
        if hypernyms_1 and hypernyms_1 == hypernyms_2:
            return 0.7

        return 0

    def get_semd(self, sense_1, sense_2):
        """Return the shortest path distance between two senses, or 0 if there is none.

        NOTE(review): identical synsets also yield 0, so "no path" and
        "same synset" are indistinguishable in this feature.
        """
        synset_1 = wordnet.synset(sense_1)
        synset_2 = wordnet.synset(sense_2)

        semd = synset_1.shortest_path_distance(synset_2)

        if semd is None:
            return 0

        return float(semd)

    def get_sems(self, sense_1, sense_2):
        """Return the Jaccard overlap of the tokenized definitions of two senses.

        The score is ``|shared tokens| / |all distinct tokens|``; it is 0.0
        when neither definition produces any token.
        """
        s1 = wordnet.synset(sense_1)
        s2 = wordnet.synset(sense_2)

        tokens1 = set(word_tokenize(s1.definition()))
        tokens2 = set(word_tokenize(s2.definition()))

        union_size = len(tokens1 | tokens2)
        if union_size == 0:
            # Guard against dividing by zero when both glosses are empty.
            return 0.0

        return float(len(tokens1 & tokens2) / union_size)

In [11]:
def generate_semantic_vector(sense_1, sense_2):
    """Build the semantic feature array ``[semc, semr, semd, sems]`` for a sense pair."""
    extractor = SemanticVector()

    # The order of this tuple fixes the order of the features in the array.
    feature_methods = (
        extractor.get_semc,
        extractor.get_semr,
        extractor.get_semd,
        extractor.get_sems,
    )

    return np.array([method(sense_1, sense_2) for method in feature_methods])

In [12]:
#sense_1 = 'disabled.n.01'
#sense_2 = 'disabled.s.01'

#generate_semantic_vector(sense_1, sense_2)

In [13]:
#value_list = []

#for value in values:
    #value_list += value
    
#print(value_list)

#sense_1 = 'disabled.n.01'
#for sense_2 in value_list:
    #if sense_2 != sense_1:
        #print(sense_2)
        #print(generate_semantic_vector(sense_1, sense_2))
    #else:
        #continue

In [14]:
def get_feature_vector(word, word_pos, sense_1, candidate_set, flag):
    """Pair ``sense_1`` with every candidate sense and build one feature vector per pair.

    Each feature vector is the syntactic vector of ``sense_1`` followed by
    the semantic vector of the ``(sense_1, other sense)`` pair.

    Returns:
        ``(feature_vectors, y_label)``: a list of arrays and a list of the
        same length holding 1 for every entry when ``flag`` is truthy,
        otherwise 0.
    """
    # The syntactic part depends only on sense_1, so it is computed once.
    syntactic_part = generate_syntactic_vector(word=word, word_pos=word_pos,
                                               sense=sense_1, candidate_set=candidate_set)

    # Flatten the per-POS lists into one list of all candidate senses.
    all_senses = [sense for group in candidate_set.values() for sense in group]

    label = 1 if flag else 0

    feature_vectors = [
        np.concatenate((syntactic_part,
                        generate_semantic_vector(sense_1=sense_1, sense_2=other_sense)))
        for other_sense in all_senses
    ]
    y_label = [label] * len(feature_vectors)

    return feature_vectors, y_label

In [15]:
#word = 'handicap'
#word_pos = 'n'
#possible_sense = candidate_set[word_pos][2]
#get_feature_vector(word, word_pos, possible_sense, candidate_set)

In [16]:
def get_correct_sense(translation_list):
    """List every candidate sense and ask the user which one is correct.

    Args:
        translation_list: words passed on to ``get_candidate_set``.

    Returns:
        ``(chosen_sense, candidate_set, values)`` where ``values`` is the
        flat list of candidate senses that was shown to the user.

    Raises:
        ValueError: if there are no candidate senses to choose from.
    """
    candidate_set = get_candidate_set(translation_list)
    values = [sense for senses in candidate_set.values() for sense in senses]

    if not values:
        # Nothing can be selected, so prompting would only end in an error.
        raise ValueError("No candidate senses found for the given translations")

    for i, sense in enumerate(values):
        print(str(i) + " = " + str(sense))

    # Re-prompt on bad input instead of crashing the labelling session, and
    # reject negative numbers so they cannot silently index from the end.
    while True:
        answer = input("Enter the index corresponding to correct sense")
        try:
            index = int(answer)
        except ValueError:
            print("Please enter a whole number")
            continue
        if 0 <= index < len(values):
            return values[index], candidate_set, values
        print("Index must be between 0 and " + str(len(values) - 1))
    

In [17]:
#get_correct_sense(translation_list)

In [18]:
def get_translation():
    """Interactively read a word, its POS and a list of translations.

    Translations are read one per prompt until the user enters ``!``.

    Returns:
        ``(translation_list, word, word_pos)``.
    """
    print("ENTER ! TO STOP TAKING TRANSLATIONS")
    word = input("Enter Word ")
    word_pos = input("Enter word's pos ")

    # iter(callable, sentinel) keeps prompting until the sentinel "!" is typed.
    translation_list = list(iter(lambda: input("Enter translation "), "!"))

    return translation_list, word, word_pos

In [19]:
#get_translation()

In [20]:
def generate_datapoint():
    """Interactively label one word and return its rows as a DataFrame.

    The user supplies the word, its POS, its translations and the correct
    sense. Every candidate sense is then expanded into feature vectors by
    ``get_feature_vector``; rows belonging to the correct sense are
    requested with ``flag=True``, all others with ``flag=False``.

    Returns:
        DataFrame with the columns "Word", "Word POS", "Sense",
        "Feature Vector" and "Label", one row per feature vector.
    """
    columns = ["Word", "Word POS", "Sense", "Feature Vector", "Label"]
    translation_list, word, word_pos = get_translation()
    correct_sense, candidate_set, candidate_list = get_correct_sense(translation_list)

    # Collect plain records and build the frame once; concatenating onto a
    # DataFrame inside the loop copies the whole frame on every row.
    records = []
    for possible_sense in candidate_list:
        feature_vectors, y_label = get_feature_vector(
            word=word, word_pos=word_pos,
            sense_1=possible_sense,
            candidate_set=candidate_set,
            flag=(possible_sense == correct_sense))

        for feature_vector, label in zip(feature_vectors, y_label):
            records.append({'Word': word,
                            'Word POS': word_pos,
                            'Sense': possible_sense,
                            'Feature Vector': feature_vector,
                            'Label': label})

    # Passing columns keeps the expected header even when there are no rows.
    return pd.DataFrame(records, columns=columns)

In [21]:
#generate_datapoint()

In [22]:
#generate_datapoint()

In [29]:
frames = []
start = True

# Keep labelling words until the user answers the prompt with exactly "STOP";
# every labelled word contributes one DataFrame to `frames`.
while start:
    df = generate_datapoint()
    frames.append(df)
    check = input("Type STOP to generate dataframe else type anything")
    start = check != 'STOP'

# Show the individual frames, then stack them into the final dataset.
print(frames)
result = pd.concat(frames)
print(result)

ENTER ! TO STOP TAKING TRANSLATIONS
Enter Word ज्योतिषी
Enter word's pos n
Enter translation astrologer
Enter translation !
0 = astrologer.n.01
Enter the index corresponding to correct sense0
Type STOP to generate dataframe else type anythingn
ENTER ! TO STOP TAKING TRANSLATIONS
Enter Word फीका
Enter word's pos a
Enter translation tasteless
Enter translation faded
Enter translation !
0 = tasteless.a.01
1 = tasteless.a.02
2 = fade.v.01
3 = fade.v.02
4 = evanesce.v.01
5 = languish.v.03
6 = bleached.s.01
7 = attenuate.s.01
Enter the index corresponding to correct sense0
Type STOP to generate dataframe else type anythingSTOP
[       Word Word POS            Sense                       Feature Vector  \
0  ज्योतिषी        n  astrologer.n.01  [1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0]   

  Label  
0     1  ,     Word Word POS           Sense  \
0   फीका        a  tasteless.a.01   
1   फीका        a  tasteless.a.01   
2   फीका        a  tasteless.a.01   
3   फीका        a  tasteless.a.01   
4   फीक

In [31]:
# Write the combined dataset to Data_Set.csv (path is relative to the working directory).
result.to_csv("Data_Set.csv")