In [1]:
from gensim import utils
import tempfile
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.utils import tokenize
from gensim.test.utils import datapath

In [2]:
# loading medical corpus
class MyCorpus_Medical:
    """Restartable iterable over the medical corpus.

    Each iteration re-reads the corpus file from the start and yields one
    sentence per line as a list of tokens produced by
    ``utils.simple_preprocess``. Because the file is re-opened on every
    ``__iter__`` call, the same instance can be iterated several times.

    Parameters
    ----------
    path : str
        Location of the one-sentence-per-line corpus file. Defaults to the
        file name originally hard-coded in this notebook.
    encoding : str
        Text encoding used to decode the file. Defaults to UTF-8 instead of
        the platform locale so accented French tokens are decoded the same
        way on every machine; pass another value if the file is stored
        differently.
    """

    def __init__(self, path='QUAERO_FrenchMed_traindev.ospl', encoding='utf-8'):
        self.path = path
        self.encoding = encoding

    def __iter__(self):
        # The context manager closes the handle when the generator is
        # exhausted or discarded, instead of leaking one handle per pass.
        with open(self.path, encoding=self.encoding) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

In [3]:
# loading press corpus
class MyCorpus_Press:
    """Restartable iterable over the press corpus.

    Each iteration re-reads the corpus file from the start and yields one
    sentence per line as a list of tokens produced by
    ``utils.simple_preprocess``. Because the file is re-opened on every
    ``__iter__`` call, the same instance can be iterated several times.

    Parameters
    ----------
    path : str
        Location of the one-sentence-per-line corpus file. Defaults to the
        file name originally hard-coded in this notebook.
    encoding : str
        Text encoding used to decode the file. Defaults to UTF-8 instead of
        the platform locale so accented French tokens are decoded the same
        way on every machine; pass another value if the file is stored
        differently.
    """

    def __init__(self, path='QUAERO_FrenchPress_traindev.ospl', encoding='utf-8'):
        self.path = path
        self.encoding = encoding

    def __iter__(self):
        # The context manager closes the handle when the generator is
        # exhausted or discarded, instead of leaking one handle per pass.
        with open(self.path, encoding=self.encoding) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

# Models

In [4]:
# Medical corpus

# Sentence source shared by every medical-corpus model trained below.
# MyCorpus_Medical re-opens its file on each __iter__ call, so this single
# object can be iterated more than once (build_vocab, then train).
sentences_medical = MyCorpus_Medical()
    

In [5]:
# Word2Vec embeddings (CBOW variant) trained on the medical corpus.

# sg=0 -> CBOW architecture
model_word2vec_cbow_med = Word2Vec(
    sg=0,
    window=3,
    min_count=1,
    vector_size=100,
)

# Vocabulary pass first; corpus_count is read from the model afterwards.
model_word2vec_cbow_med.build_vocab(sentences_medical)

# 50 training epochs over the same sentence iterable.
model_word2vec_cbow_med.train(
    sentences_medical,
    epochs=50,
    total_examples=model_word2vec_cbow_med.corpus_count,
)

# Persist the trained model to disk.
model_word2vec_cbow_med.save('model_word2vec_cbow_med')



In [6]:
# Word2Vec embeddings (skip-gram variant) trained on the medical corpus.

# sg=1 -> skip-gram architecture
model_word2vec_skipgram_med = Word2Vec(
    sg=1,
    window=3,
    min_count=1,
    vector_size=100,
)

# Vocabulary pass first; corpus_count is read from the model afterwards.
model_word2vec_skipgram_med.build_vocab(sentences_medical)

# 50 training epochs over the same sentence iterable.
model_word2vec_skipgram_med.train(
    sentences_medical,
    epochs=50,
    total_examples=model_word2vec_skipgram_med.corpus_count,
)

# Persist the trained model to disk.
model_word2vec_skipgram_med.save('model_word2vec_skipgram_med')


In [7]:
# FastText embeddings (CBOW variant) trained on the medical corpus.

# sg=0 -> CBOW architecture
model_fasttext_cbow_med = FastText(
    sg=0,
    window=3,
    min_count=1,
    vector_size=100,
)

# Vocabulary pass first; the sentence count it recorded feeds the training call.
model_fasttext_cbow_med.build_vocab(sentences_medical)
total_examples = model_fasttext_cbow_med.corpus_count

# 50 training epochs over the same sentence iterable.
model_fasttext_cbow_med.train(
    sentences_medical,
    epochs=50,
    total_examples=total_examples,
)

# Persist the trained model to disk.
model_fasttext_cbow_med.save('model_fasttext_cbow_med')

In [8]:
# Press corpus

# Sentence source shared by every press-corpus model trained below.
# MyCorpus_Press re-opens its file on each __iter__ call, so this single
# object can be iterated more than once (build_vocab, then train).
sentences_press = MyCorpus_Press()

In [9]:
# Word2Vec embeddings (CBOW variant) trained on the press corpus.

# sg=0 -> CBOW architecture
model_word2vec_cbow_press = Word2Vec(
    sg=0,
    window=3,
    min_count=1,
    vector_size=100,
)

# Vocabulary pass first; corpus_count is read from the model afterwards.
model_word2vec_cbow_press.build_vocab(sentences_press)

# 50 training epochs over the same sentence iterable.
model_word2vec_cbow_press.train(
    sentences_press,
    epochs=50,
    total_examples=model_word2vec_cbow_press.corpus_count,
)

# Persist the trained model to disk.
model_word2vec_cbow_press.save('model_word2vec_cbow_press')

In [10]:
# Word2Vec embeddings (skip-gram variant) trained on the press corpus.

# sg=1 -> skip-gram architecture
model_word2vec_skipgram_press = Word2Vec(
    sg=1,
    window=3,
    min_count=1,
    vector_size=100,
)

# Vocabulary pass first; corpus_count is read from the model afterwards.
model_word2vec_skipgram_press.build_vocab(sentences_press)

# 50 training epochs over the same sentence iterable.
model_word2vec_skipgram_press.train(
    sentences_press,
    epochs=50,
    total_examples=model_word2vec_skipgram_press.corpus_count,
)

# Persist the trained model to disk.
model_word2vec_skipgram_press.save('model_word2vec_skipgram_press')

In [11]:
# FastText embeddings (CBOW variant) trained on the press corpus.

# sg=0 -> CBOW architecture
model_fasttext_cbow_press = FastText(
    sg=0,
    window=3,
    min_count=1,
    vector_size=100,
)

# Vocabulary pass first; the sentence count it recorded feeds the training call.
model_fasttext_cbow_press.build_vocab(sentences_press)
total_examples = model_fasttext_cbow_press.corpus_count

# 50 training epochs over the same sentence iterable.
model_fasttext_cbow_press.train(
    sentences_press,
    epochs=50,
    total_examples=total_examples,
)

# Persist the trained model to disk.
model_fasttext_cbow_press.save('model_fasttext_cbow_press')

# Similarity

In [12]:
# Same corpus, different models: for each probe word, list the 10 nearest
# neighbours according to every model trained on the medical corpus.

for word in ['patient', 'traitement', 'maladie', 'solution', 'jaune']:

    # Word2Vec CBOW, medical corpus (similarity scores are dropped, only words are shown)
    print(
        'word2vec_cbow_medical_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_cbow_med.wv.most_similar(word, topn=10)],
    )

    # Word2Vec skip-gram, medical corpus
    print(
        'word2vec_skipgram_medical_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_skipgram_med.wv.most_similar(word, topn=10)],
    )

    # FastText CBOW, medical corpus
    print(
        'fasttext_cbow_medical_corpus \n',
        word,
        [neighbour for neighbour, _ in model_fasttext_cbow_med.wv.most_similar(word, topn=10)],
    )

word2vec_cbow_medical_corpus 
 patient ['aptitude', 'délai', 'carte', 'pas', 'montrer', 'avoir', 'souffre', 'recommandé', 'arrêté', 'prescripteur']
word2vec_skipgram_medical_corpus 
 patient ['carte', 'montrez', 'alerte', 'montrer', 'spéciale', 'aptitude', 'souffre', 'existante', 'fiable', 'impliquant']
fasttext_cbow_medical_corpus 
 patient ['patiente', 'parvient', 'maintient', 'appartient', 'obtiennent', 'gradient', 'recevaient', 'patients', 'excipient', 'conscient']
word2vec_cbow_medical_corpus 
 traitement ['vih', 'débuter', 'infection', 'instauration', 'pentoses', 'début', 'diagnostic', 'concomitant', 'rôle', 'poussées']
word2vec_skipgram_medical_corpus 
 traitement ['instauration', 'contrôlée', 'chirurgical', 'concomitant', 'traités', 'réalisé', 'cassation', 'opportuniste', 'débuter', 'arrêt']
fasttext_cbow_medical_corpus 
 traitement ['traitment', 'taaitement', 'allaitement', 'évitement', 'étroitement', 'entrainement', 'département', 'directement', 'correctement', 'recrutement']

In [13]:
# Same corpus, different models: for each probe word, list the 10 nearest
# neighbours according to every model trained on the press corpus.

for word in ['patient', 'traitement', 'maladie', 'solution', 'jaune']:

    # Word2Vec CBOW, press corpus (similarity scores are dropped, only words are shown)
    print(
        'word2vec_cbow_press_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_cbow_press.wv.most_similar(word, topn=10)],
    )

    # Word2Vec skip-gram, press corpus
    print(
        'word2vec_skipgram_press_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_skipgram_press.wv.most_similar(word, topn=10)],
    )

    # FastText CBOW, press corpus
    print(
        'fasttext_cbow_press_corpus \n',
        word,
        [neighbour for neighbour, _ in model_fasttext_cbow_press.wv.most_similar(word, topn=10)],
    )

word2vec_cbow_press_corpus 
 patient ['concessionnaire', 'cancéreux', 'coq', 'touriste', 'instrument', 'sachet', 'lac', 'carton', 'escroc', 'messie']
word2vec_skipgram_press_corpus 
 patient ['cancéreux', 'hospitalisé', 'humble', 'transmissible', 'manquement', 'éduqué', 'derridéenne', 'coco', 'missionnaire', 'flagrant']
fasttext_cbow_press_corpus 
 patient ['patientent', 'impatient', 'détient', 'renient', 'impatientent', 'initient', 'abstient', 'remanient', 'essuient', 'retient']
word2vec_cbow_press_corpus 
 traitement ['sida', 'cancer', 'collectif', 'survivant', 'logement', 'coût', 'système', 'égal', 'calcul', 'cucs']
word2vec_skipgram_press_corpus 
 traitement ['préjudice', 'médicamenteux', 'antidouleur', 'allégations', 'infligés', 'compétences', 'générateurs', 'acquisition', 'outil', 'pourcentage']
fasttext_cbow_press_corpus 
 traitement ['retraitement', 'subitement', 'bêtement', 'recrutement', 'doctement', 'vêtement', 'gratuitement', 'dépècement', 'abruptement', 'modestement']
word

In [14]:
# Same model family, different corpora: Word2Vec CBOW trained on the medical
# corpus versus the press corpus, 10 nearest neighbours per probe word.

for word in ['patient', 'traitement', 'maladie', 'solution', 'jaune']:

    # medical corpus (similarity scores are dropped, only words are shown)
    print(
        'word2vec_cbow_medical_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_cbow_med.wv.most_similar(word, topn=10)],
    )

    # press corpus
    print(
        'word2vec_cbow_press_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_cbow_press.wv.most_similar(word, topn=10)],
    )


word2vec_cbow_medical_corpus 
 patient ['aptitude', 'délai', 'carte', 'pas', 'montrer', 'avoir', 'souffre', 'recommandé', 'arrêté', 'prescripteur']
word2vec_cbow_press_corpus 
 patient ['concessionnaire', 'cancéreux', 'coq', 'touriste', 'instrument', 'sachet', 'lac', 'carton', 'escroc', 'messie']
word2vec_cbow_medical_corpus 
 traitement ['vih', 'débuter', 'infection', 'instauration', 'pentoses', 'début', 'diagnostic', 'concomitant', 'rôle', 'poussées']
word2vec_cbow_press_corpus 
 traitement ['sida', 'cancer', 'collectif', 'survivant', 'logement', 'coût', 'système', 'égal', 'calcul', 'cucs']
word2vec_cbow_medical_corpus 
 maladie ['parkinson', 'basedow', 'recklinghausen', 'hodgkin', 'crohn', 'marfan', 'liée', 'juger', 'hirschsprung', 'lévothyroxine']
word2vec_cbow_press_corpus 
 maladie ['prévention', 'épidémie', 'mondialisation', 'pneumopathie', 'prostitution', 'publicité', 'famine', 'software', 'contamination', 'psychose']
word2vec_cbow_medical_corpus 
 solution ['diluer', 'buvable'

In [15]:
# Same model family, different corpora: Word2Vec skip-gram trained on the
# medical corpus versus the press corpus, 10 nearest neighbours per probe word.

for word in ['patient', 'traitement', 'maladie', 'solution', 'jaune']:

    # medical corpus (similarity scores are dropped, only words are shown)
    print(
        'word2vec_skipgram_medical_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_skipgram_med.wv.most_similar(word, topn=10)],
    )

    # press corpus
    print(
        'word2vec_skipgram_press_corpus \n',
        word,
        [neighbour for neighbour, _ in model_word2vec_skipgram_press.wv.most_similar(word, topn=10)],
    )


word2vec_skipgram_medical_corpus 
 patient ['carte', 'montrez', 'alerte', 'montrer', 'spéciale', 'aptitude', 'souffre', 'existante', 'fiable', 'impliquant']
word2vec_skipgram_press_corpus 
 patient ['cancéreux', 'hospitalisé', 'humble', 'transmissible', 'manquement', 'éduqué', 'derridéenne', 'coco', 'missionnaire', 'flagrant']
word2vec_skipgram_medical_corpus 
 traitement ['instauration', 'contrôlée', 'chirurgical', 'concomitant', 'traités', 'réalisé', 'cassation', 'opportuniste', 'débuter', 'arrêt']
word2vec_skipgram_press_corpus 
 traitement ['préjudice', 'médicamenteux', 'antidouleur', 'allégations', 'infligés', 'compétences', 'générateurs', 'acquisition', 'outil', 'pourcentage']
word2vec_skipgram_medical_corpus 
 maladie ['parkinson', 'recklinghausen', 'avancé', 'crohn', 'liée', 'bourneville', 'basedow', 'constituée', 'thrombo', 'coronarienne']
word2vec_skipgram_press_corpus 
 maladie ['pulmonaire', 'pneumopathie', 'neurologique', 'alzheimer', 'virale', 'orpheline', 'contaminants',

In [16]:
# Same model family, different corpora: FastText CBOW trained on the medical
# corpus versus the press corpus, 10 nearest neighbours per probe word.

for word in ['patient', 'traitement', 'maladie', 'solution', 'jaune']:

    # medical corpus (similarity scores are dropped, only words are shown)
    print(
        'fasttext_cbow_medical_corpus \n',
        word,
        [neighbour for neighbour, _ in model_fasttext_cbow_med.wv.most_similar(word, topn=10)],
    )

    # press corpus
    print(
        'fasttext_cbow_press_corpus \n',
        word,
        [neighbour for neighbour, _ in model_fasttext_cbow_press.wv.most_similar(word, topn=10)],
    )



fasttext_cbow_medical_corpus 
 patient ['patiente', 'parvient', 'maintient', 'appartient', 'obtiennent', 'gradient', 'recevaient', 'patients', 'excipient', 'conscient']
fasttext_cbow_press_corpus 
 patient ['patientent', 'impatient', 'détient', 'renient', 'impatientent', 'initient', 'abstient', 'remanient', 'essuient', 'retient']
fasttext_cbow_medical_corpus 
 traitement ['traitment', 'taaitement', 'allaitement', 'évitement', 'étroitement', 'entrainement', 'département', 'directement', 'correctement', 'recrutement']
fasttext_cbow_press_corpus 
 traitement ['retraitement', 'subitement', 'bêtement', 'recrutement', 'doctement', 'vêtement', 'gratuitement', 'dépècement', 'abruptement', 'modestement']
fasttext_cbow_medical_corpus 
 maladie ['malade', 'maldi', 'malaise', 'malgré', 'amantadie', 'parkinson', 'maligne', 'malherbe', 'maltraitance', 'malignité']
fasttext_cbow_press_corpus 
 maladie ['malade', 'mala', 'malawi', 'maladies', 'malawite', 'malabo', 'malaga', 'malnutrie', 'maladresse', 