In [3]:
import os, logging, gensim
import numpy as np
import pandas as pd
from sklearn import linear_model
import unicodedata as ud
import copy

In [1]:
# Global configuration
lang = 'French_POS'   # must be one of the keys of _model_files below
include_pos = True    # NOTE(review): not referenced in the cells visible here
root_path = './' + lang + '/'

# Embedding file names per language, in the order (sim, mix, asn, sig).
_model_files = {
    'French': ('wolf_15k_850d.txt', 'depglove_200d_eric.txt',
               'asn_embedding.txt', 'sig_embedding.txt'),
    'French_POS': ('wolf_pos_850d.txt', 'depglove_200d_eric.txt',
                   'asn_embedding.txt', 'sig_embedding.txt'),
    'English': ('sim_embedding.txt', 'glove.840B.300d.txt',
                'asn_embedding.txt', 'sig_embedding.txt'),
}

if lang not in _model_files:
    # Fail here rather than with a NameError in whichever later cell first uses a path.
    raise ValueError('Unsupported lang %r; expected one of %s' % (lang, sorted(_model_files)))

# All model files live directly under root_path.
sim_model_path, mix_model_path, asn_model_path, sig_model_path = [
    root_path + file_name for file_name in _model_files[lang]
]


## Define key vocabulary items

In [4]:
# Show the working directory: root_path and the '../Similarity-Association-Benchmarks/'
# paths used in this notebook are relative to it.
os.getcwd()

'/Users/soshy/code/Micipsa-Decorrelation'

In [6]:
# Load the lpp vocabulary file: one entry per line, de-duplicated into a set.
lpp_path = root_path + 'lpp_wolf_tuned_vocab.txt'
# `with` closes the handle as soon as the content is read. utf8 is given explicitly
# (as for the DepGloVe vocabulary) so the result does not depend on the platform default.
with open(lpp_path, mode='r', encoding='utf8') as file:
    file_data = file.read()
# strip() removes surrounding whitespace, so a trailing newline does not add an empty entry.
lpp_voc = set(file_data.strip().split('\n'))

In [42]:
# Vocabulary of the wsrel benchmark: every value found in the first two ';'-separated columns.
# NOTE(review): lpp_path is reused here for a benchmark file, not the lpp vocabulary.
lpp_path = '../Similarity-Association-Benchmarks/fr-pos-wsrel.dataset'
wsrel_columns = pd.read_csv(lpp_path, delimiter=';', header=None).to_dict('list')
wsrel_voc = set(wsrel_columns[0]) | set(wsrel_columns[1])

In [44]:
# Vocabulary of the simlex benchmark: every value found in the first two ';'-separated columns.
# NOTE(review): lpp_path is reused here for a benchmark file, not the lpp vocabulary.
lpp_path = '../Similarity-Association-Benchmarks/fr-pos-simlex.dataset'
simlex_columns = pd.read_csv(lpp_path, delimiter=';', header=None).to_dict('list')
simlex_voc = set(simlex_columns[0]) | set(simlex_columns[1])

In [45]:
# Everything that appears in the lpp vocabulary or in either benchmark.
relevant_voc = lpp_voc | wsrel_voc | simlex_voc

In [46]:
# Number of distinct entries across the three vocabularies.
len(relevant_voc)

2402

# Load Vocabularies

In [68]:
# Read the WOLF vocabulary, one entry per line.
# `with` closes the handle; utf8 is explicit because the entries contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open(root_path+'vocabulary_pos_wolf.txt', mode='r', encoding='utf8') as wolf_voc_file:
    wolf_voc = wolf_voc_file.read().split('\n')

In [81]:
# Read the unfiltered DepGloVe vocabulary, one entry per line; `with` closes the handle.
with open(root_path+'vocabulary_depglove.txt', mode='r', encoding='utf8') as glove_voc_file:
    glove_voc_raw = glove_voc_file.read().split('\n')

## Clean WOLF vocabulary

In [67]:
# Cache: character -> whether its Unicode name contains 'LATIN'.
latin_letters = {}

def is_latin(uchr):
    """Return True if the Unicode name of the single character `uchr` contains 'LATIN'.

    Results are memoised in `latin_letters`. A character that has no Unicode
    name (e.g. a control character) is reported as non-Latin instead of
    raising ValueError.
    """
    if uchr not in latin_letters:
        # The '' default makes ud.name return an empty name rather than raise.
        latin_letters[uchr] = 'LATIN' in ud.name(uchr, '')
    return latin_letters[uchr]

def only_roman_chars(unistr):
    """Return True when every alphabetic character of `unistr` is Latin.

    Non-alphabetic characters (digits, '_', '|', ...) are ignored, so a string
    without any letter yields True. The isalpha filter was suggested by
    John Machin.
    """
    for uchr in unistr:
        if uchr.isalpha() and not is_latin(uchr):
            return False
    return True

In [112]:
# Keep entries written only with Latin letters; for the others, keep the hand-picked
# replacement from sim_voc_roman_dict when there is one and drop the entry otherwise.
# NOTE(review): sim_voc_list is not defined in any cell shown here, and
# sim_voc_roman_dict is defined in a cell placed below this one; both must already
# exist in the kernel for this cell to run.
wolf_voc_set = set()
for entry in sim_voc_list:
    if only_roman_chars(entry):
        wolf_voc_set.add(entry)
        continue
    print(entry, 'not a regular form')
    if entry not in sim_voc_roman_dict:
        continue
    replacement = sim_voc_roman_dict[entry]
    print('\tconverted to', replacement)
    wolf_voc_set.add(replacement)

ابوظبي_n not a regular form
radioactivité_α_n not a regular form
	converted to radioactivité_α_n
grèbe_jougris|ko=큰논병아리|nl=roodhalsfuut_n not a regular form
	converted to grèbe_jougris_n
गुरु_n not a regular form
Nay_نای_n not a regular form
capucin|ja=オマキザル属|lt=kapucinas|nl=kapucijnapen_n not a regular form
	converted to capucin_n
α_Virginis_n not a regular form
	converted to α_Virginis_n
particule_α_n not a regular form
	converted to particule_α_n
دبي_n not a regular form
acide_α-linolénique_n not a regular form
	converted to acide_α-linolénique_n
poirier_commun|pl=grusza_pospolita|uk=груша_звичайна_n not a regular form
	converted to poirier_commun_n
β-bloquant_n not a regular form
	converted to β-bloquant_n
radioactivité_β_n not a regular form
	converted to radioactivité_β_n
gerbille|ja=スナネズミ|nl=gerbils|pl=myszoskoczki_n not a regular form
	converted to gerbille_n
в_n not a regular form
حماس_n not a regular form
nèfle|lt=šliandra|os=мугæ_n not a regular form
	converted to nèfle_n
к_

In [88]:
# WOLF entries containing non-Latin characters -> the form to keep.
# Entries carrying '|xx=...' translations are cut down to their first part;
# the Greek-letter entries map to themselves.
sim_voc_roman_dict = {"grèbe_jougris|ko=큰논병아리|nl=roodhalsfuut_n": "grèbe_jougris_n",
                      "capucin|ja=オマキザル属|lt=kapucinas|nl=kapucijnapen_n": "capucin_n",
                      "poirier_commun|pl=grusza_pospolita|uk=груша_звичайна_n": "poirier_commun_n",
                      "gerbille|ja=スナネズミ|nl=gerbils|pl=myszoskoczki_n": "gerbille_n",
                      "nèfle|lt=šliandra|os=мугæ_n": "nèfle_n",
                      "squamates|ja=トカゲ目_n": "squamates_n",
                      "néflier_du_japon|ja=ビワ|nl=loquat|pt=nêspera_n": "néflier_du_japon_n",
                      "radioactivité_α_n": "radioactivité_α_n",
                      "α_Virginis_n": "α_Virginis_n",
                      "particule_α_n": "particule_α_n",
                      "acide_α-linolénique_n": "acide_α-linolénique_n",
                      "β-bloquant_n": "β-bloquant_n",
                      "radioactivité_β_n": "radioactivité_β_n",
                      "particule_β_n": "particule_β_n"
                     }

# Replace each listed entry of wolf_voc (in place) by its cleaned form, appended at the end.
for key, cleaned in sim_voc_roman_dict.items():
    try:
        wolf_voc.remove(key)
    except ValueError:
        # Entry absent: already replaced by an earlier run of this cell, or never in
        # the vocabulary. Skipping keeps the cell safe to re-run.
        continue
    wolf_voc.append(cleaned)

## Reject function words

In [82]:
# POS tags of DepGloVe entries that are kept.
glove_acc_pos = {'adj', 'adv', 'advneg', 'nc', 'np', 'number', 'title', 'v'}

# POS tags listed as rejected (the empty string is one of them).
# NOTE(review): 'advneg' appears in both sets; the filtering cell only consults
# glove_acc_pos, so 'advneg' entries are kept. Confirm which set is intended.
glove_rej_pos = {
    '',
    'N2',
    'adjPref', 'advPref', 'advneg', 'aux',
    'ce', 'cla', 'cld', 'clg', 'cll', 'cln', 'clneg', 'clr', 'coo', 'csu',
    'det',
    'ilimp',
    'ncpred',
    'poncts', 'ponctw', 'predet', 'prel', 'prep', 'pres', 'pri', 'pro',
    'que',
    'restr',
    'suffAdj',
    'xpro',
}

In [84]:
# Keep the raw GloVe entries whose POS tag (the text after the last '_') is accepted.
glove_voc = list()
for raw in glove_voc_raw:
    stem, sep, tag = raw.rpartition('_')
    # `sep` is empty when the entry has no '_' at all (for instance the empty string
    # that split('\n') yields for a trailing newline); such entries are skipped
    # instead of raising IndexError.
    if sep and tag in glove_acc_pos:
        glove_voc.append(raw)

In [89]:
# Sizes of the WOLF vocabulary, the POS-filtered GloVe vocabulary, and the unfiltered GloVe vocabulary.
len(wolf_voc), len(glove_voc), len(glove_voc_raw)

(56665, 160879, 163003)

# Word Alignment between the two embeddings

## POS translator

In [90]:
# GloVe POS tag -> single-letter tag compared against the WOLF vocabulary.
# Covers exactly the tags in glove_acc_pos.
pos_dict = dict(
    np='n',
    nc='n',
    adj='a',
    v='v',
    adv='b',
    title='n',
    advneg='b',
    number='a',
)

## Word Mapping from GloVe to WOLF

### Transform WOLF vocabulary for lookup

In [144]:
# Index the WOLF vocabulary: word -> set of POS tags it occurs with.
# Each entry is split at its last '_'; an entry without '_' raises ValueError.
wolf_voc_dict = dict()
for word_pos in wolf_voc:
    word, pos = word_pos.rsplit('_', 1)
    # add() stores the tag as one element; set(pos) / update(pos) iterate over the
    # string and would store a multi-character tag one character at a time.
    wolf_voc_dict.setdefault(word, set()).add(pos)

In [145]:
# Make cased WOLF words reachable through their lower-case form.
# After this cell wolf_voc_dict holds two kinds of values: sets of POS tags (real
# entries) and str aliases (lower-case form -> the cased key it stands for).
# Cased words whose lower-case form is already taken are collected in
# wolf_case_fusion as (word, POS set) and get no alias.
wolf_case_fusion = list()
for word in list(wolf_voc_dict.keys()):   # snapshot: the dict grows inside the loop
    lowered = word.lower()
    if word == lowered:
        continue
    if lowered not in wolf_voc_dict:
        wolf_voc_dict[lowered] = word
    elif wolf_voc_dict[lowered] != word:
        # The lower-case key belongs to a real entry or to another cased word.
        # When it is already the alias of this very word (the cell was run before),
        # it is not a collision, so re-running the cell gives the same list.
        wolf_case_fusion.append((word, wolf_voc_dict[word]))

In [147]:
# Number of cased WOLF words whose lower-case form was already a key.
len(wolf_case_fusion)

4000

### Map GloVe to WOLF one by one

In [148]:
# Align every kept GloVe entry with a WOLF entry.
#   glove2wolf          : GloVe entry -> '<WOLF word>_<tag>' when word and tag both match
#   glove_pos_no_match  : (GloVe entry, WOLF word, its POS set) when only the word matches
#   glove_word_no_match : GloVe entries whose word is not a key of wolf_voc_dict
glove2wolf = {}
glove_pos_no_match = []
glove_word_no_match = []

for glove_entry in glove_voc:
    lemma, glove_tag = glove_entry.rsplit('_', 1)
    wolf_tag = pos_dict[glove_tag]

    # Text after the last ':' is split off the word; it is not used further in this cell.
    label = ''
    if ':' in lemma:
        lemma, label = lemma.rsplit(':', 1)

    if lemma not in wolf_voc_dict:
        glove_word_no_match.append(glove_entry)
        continue

    wolf_tags = wolf_voc_dict[lemma]
    if isinstance(wolf_tags, str):
        # A str value is a lower-case alias: follow it to the cased WOLF key.
        lemma = wolf_tags
        wolf_tags = wolf_voc_dict[lemma]

    if wolf_tag in wolf_tags:
        glove2wolf[glove_entry] = lemma + '_' + wolf_tag
    else:
        glove_pos_no_match.append((glove_entry, lemma, wolf_tags))

In [110]:
# Counts of POS mismatches, unmatched words, mapped entries, and the WOLF vocabulary size.
# NOTE(review): same expression as the following cell; the output shown here comes
# from an earlier execution (In [110]).
len(glove_pos_no_match), len(glove_word_no_match), len(glove2wolf), len(wolf_voc)

(3112, 128305, 29462, 56665)

In [149]:
# Counts of POS mismatches, unmatched words, mapped entries, and the WOLF vocabulary size.
len(glove_pos_no_match), len(glove_word_no_match), len(glove2wolf), len(wolf_voc)

(3068, 128305, 29506, 56665)

### Manual Resolution of Unmatched Terms

In [155]:
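# One-off export of the lpp entries that are in wolf_voc but not yet a mapping target.
# Left commented out: mode='w+' truncates g2w_manual.csv, which is read back further down.
#file = open(root_path + 'g2w_manual.csv', mode='w+')
#file.write('\n'.join(lpp_voc.difference(glove2wolf.values()).difference(lpp_voc.difference(wolf_voc))))
#file.close()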

In [154]:
# Entries that no GloVe entry maps to: lpp_voc first, then wsrel_voc and simlex_voc combined.
unmapped_lpp = lpp_voc.difference(glove2wolf.values())
unmapped_benchmarks = set(wsrel_voc).union(simlex_voc).difference(glove2wolf.values())
len(unmapped_lpp), len(unmapped_benchmarks)

(191, 70)

In [161]:
# Load the manual corrections (no header row); rows with a missing cell are dropped.
manual_correction = (
    pd.read_csv(root_path + 'g2w_manual.csv', header=None)
    .dropna()
)

In [166]:
# Turn the loaded table into a dict: column 1 supplies the keys, column 0 the values.
# The name is rebound from DataFrame to dict, so the conversion is applied only while
# it is still a DataFrame; re-running the cell then leaves the dict untouched instead
# of failing on dict.set_index.
if isinstance(manual_correction, pd.DataFrame):
    manual_correction = manual_correction.set_index(1).to_dict()[0]

In [167]:
# Manually corrected keys that the automatic mapping already covers.
manual_correction.keys() & glove2wolf.keys()

{'accorder_v',
 'coucher_v',
 'demain_adv',
 'huit_nc',
 'ici_adv',
 'loin_adv',
 'lune_nc',
 'occuper_v',
 'oiseau_nc',
 'orgueilleux_adj',
 'petit-déjeuner_nc',
 'plonger_v',
 'public_adj',
 'quarante_nc',
 'reposer_v',
 'souvenir_v',
 'surprise_adj',
 'taire_v',
 'tenir_v',
 'terre_nc',
 'trouver_v',
 'vider_v',
 'vingt_nc'}