# Build a Word2vec language models with the Gensim library

Word2Vec is an algorithm that learns relationships between words using large amounts of text. The Word2Vec algorithm produces a language model where words with similar meanings based on context are close together and words with different meanings based on context are far apart. For example, Copenhagen and Denmark would be close together, while Copenhagen and cheese would be relatively far apart.

Word2vec models can thus be used to find words that are similar in meaning and syntactic position and that have a common relationship.

To build the Word2vec models, you can the library Gensim, which stands for generating similarities. Building the models is a labor-intensive process that takes many hours, so once the model is built, it is a good idea to save it for later use. Documentation for and description of how models are built and saved can be found on this page: https://radimrehurek.com/gensim/auto_examples/index.html

Below we'll build a model of the corpus for French literature. One can debate whether it is responsible to build a model from novels from a wide year span, or whether a division into subsets would make more sense. For example would this model not take into account the historical and cultural changes of terms through the period.


In [None]:
os.chdir('C:\\Users\\lakj\\Love lighting in French Literature\\data')

df = pd.read_csv('text_data230722.csv', sep='|')

In [None]:
# Build a GenSim language model

#Sources:
## https://radimrehurek.com/gensim/auto_examples/index.html#documentation

## https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/

## https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.most_similar.html

import nltk
from gensim.models import Word2Vec
import time

def build_w2v_model(clean_text):
    
    # data preperation
    
    ## split the text into sentences using the sent_tokenizer of the nltk library
    sent_list = nltk.sent_tokenize(clean_text) 
    
    ## split each sentence into list of words using the nltk word_tokenizer
    tok_lists = [nltk.word_tokenize(sent) for sent in sent_list]


    ### filter for the shortest words. Here this is words equal to a lenght of 1
    tok_lists = [w for w in tok_lists if len(w) >=1]


    # Build the W2V model
    ## The value of min_count does so that only words that appears at least the value are included
    
    # w2v model basered on word lists
    word2vec_tokens = Word2Vec(tok_lists, min_count=4)
    
    return word2vec_tokens


# build w2v model
startTime = time.time()

word2vec_tokens = build_w2v_model(' '.join(df['Clean_text']).lower())

executionTime = (time.time() - startTime)
print('Building time in sec.: ' + str(executionTime))

# Save the model for later use
word2vec_tokens.save('french_literature_model.model')



# Reduce the language model for the purpose of visualisation.

In [None]:

# Load the model
french_literature_model = Word2Vec.load('french_literature_model.model')

from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

x_vals, y_vals, labels = reduce_dimensions(french_literature_model)

# Send the x, y, and label values to a dataframe and save it for later use
df = pd.DataFrame({'x_vals':x_vals, 'y_vals':y_vals, 'labels':labels})
df.to_csv('french_literature_model_values_labels_2D.csv', index=False)



NameError: name 'os' is not defined

# Load the Gensim model and make an analyse of word similarity

In [3]:
# load the Libraries 
import os
os.chdir('C:\\Users\\lakj\\Love lighting in French Literature\\data')

import nltk
from gensim.models import Word2Vec

# load the model
french_literature_model = Word2Vec.load('french_literature_model.model')

In [4]:
# Open four word lists
# 1
of = open('emotion_list.txt', 'r', encoding='utf-8-sig')
emotion_word_list = of.read().lower().split('\n')
emotion_word_list = [i.strip() for i in emotion_word_list]
emotion_word_list.remove('[name of love interest]')
emotion_word_list = list(set(emotion_word_list))
emotion_word_list.sort()
of.close()
# 2
of = open('lightning_list.txt', 'r', encoding='utf-8-sig')
lightning_word_list = of.read().lower().split('\n')
lightning_word_list = [i.strip() for i in lightning_word_list]
lightning_word_list = list(set(lightning_word_list))
lightning_word_list.sort()
of.close()
# 3
of = open('natural_light_list.txt', 'r', encoding='utf-8-sig')
natural_word_list = of.read().lower().split('\n')
natural_word_list = [i.strip() for i in natural_word_list]
natural_word_list = list(set(natural_word_list))
natural_word_list.sort()
of.close()
# 4
of = open('technology_list.txt', 'r', encoding='utf-8-sig')
technology_list = of.read().lower().split('\n')
technology_list = [i.strip() for i in technology_list]
technology_list = list(set(technology_list))
technology_list.sort()
of.close()

# Function that can handle a word list and return five similar word for each word in the word list

def get_similar_words(word_list, no = 5):
    
    for term in word_list:
        try:

            print ('french_literature_model:', term, '\n')
            print (french_literature_model.wv.most_similar(term, topn=no))
            print ('\n')

        except KeyError:

            print ("Term or frase is not present in the model")
            print ('\n')
            

#############            
# put a wordlist into the function and get words similar to the words in the word list
############
get_similar_words(emotion_word_list[0:15], no = 5)



##############
# If you will look up a single word then 
# replace the text string after the variable 'term'
# and adjust the integer after the variable no to get x amount of simiar words 
#############
print (10 * '*-*')
term = 'lumière'
no = 5
try:
    print ('french_literature_model:', term, '\n')
    print (french_literature_model.wv.most_similar(term, topn=no))
    print ('\n')

except KeyError:

    print ("Term or frase is not present in the model")
    print ('\n')

french_literature_model: abattu 

[('mis', 0.9754560589790344), ('beaucoup', 0.975274384021759), ('payer', 0.9752501845359802), ('somme', 0.9750092625617981), ('but', 0.9748916029930115)]


french_literature_model: abruti 

[('exproprié', 0.39207157492637634), ('verriez', 0.3917907774448395), ('contenterai', 0.3851637542247772), ('avénement', 0.3734688460826874), ('ligure', 0.3702930808067322)]


french_literature_model: agonie 

[('autant', 0.981724202632904), ('elle', 0.9807077050209045), ('effet', 0.9805507063865662), ('eut', 0.980294942855835), ('aucune', 0.9802603125572205)]


french_literature_model: aimer 

[('urbain', 0.9957081079483032), ('tiens', 0.9950606226921082), ('savoir', 0.9946962594985962), ('vérité', 0.9939529895782471), ('ait', 0.993726909160614)]


french_literature_model: aimé 

[('répondit', 0.9970437288284302), ('serait', 0.9968804121017456), ('ah', 0.9967793822288513), ("qu'il", 0.9967734217643738), ('comment', 0.9967658519744873)]


french_literature_model: am