Based on the gensim tutorial at https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html

#### Modules

In [19]:
from gensim import corpora
from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors
import gensim.downloader as api
import inspect
import os
from smart_open import open

### Prepare own corpus

Possible fairy-tale corpora and related studies:

    * Maarouf, Ismaïl El and Jeanne Villaneau. “A French Fairy Tale Corpus syntactically and semantically annotated.” LREC (2012).
    
    * Vicente, Marta & Miró, María & Lloret, Elena & Suárez Cueto, Armando. (2021). Leveraging Machine Learning to Explain the Nature of Written Genres. IEEE Access. PP. 1-1. 10.1109/ACCESS.2021.3056927. 
    
    * Ragan, Kathleen. (2009). What Happened to the Heroines in Folktales?: An Analysis by Gender of a Multicultural Sample of Published Folktales Collected from Storytellers. Marvels & Tales. 23. 227-247. 10.1353/mat.0.0128. 
    
    * Jeana Jorgensen. “Quantifying the Grimm Corpus: Transgressive and Transformative Bodies in the Grimms’ Fairy Tales.” Marvels & Tales 28, no. 1 (2014): 127–41. https://doi.org/10.13110/marvelstales.28.1.0127.
    
    * Silva, R. S. (2012). Fairy tales and moral values: a corpus-based approach. BELT - Brazilian English Language Teaching Journal, 3(1). Retrieved from https://revistaseletronicas.pucrs.br/ojs/index.php/belt/article/view/10326
    
    * Walter Maik (2022). “Märchenkorpus Version 1.0 (1.0).” Humboldt-Universität zu Berlin, 2022. Homepage: http://www.textbewegung.de/. DOI: https://doi.org/10.34644/laudatio-dev-UyRUCnMB7CArCQ9C63ji.

### Paths and Files

In [20]:
# Resolve the notebook's absolute path; its directory serves as the project root.
dn = os.path.abspath('gensim1.ipynb')
print(dn)

project_dir = os.path.dirname(dn)

# Build every path component by component instead of embedding backslashes in
# a string literal: sequences such as '\L' and '\g' are invalid escapes in a
# normal string and the resulting path would only work on Windows.
corpora_dir = os.path.join(project_dir, 'corpora', 'Laudatio')
td = os.path.join(corpora_dir, 'grimm_aschenputtel_119-126.txt')
temp_dir = os.path.join(project_dir, 'tmp')
print(corpora_dir)
print(td)
print(temp_dir)

# Earlier experiment, kept for reference: word frequencies of the single tale.
#tokens = [simple_preprocess(sentence, deacc=True) for sentence in open(td)]
#gensim_dic = corpora.Dictionary()
#gc = [gensim_dic.doc2bow(token, allow_update=True) for token in tokens]
#word_freq = [[(gensim_dic[id], frequence) for id, frequence in couple] for couple in gc]
#print(word_freq)

# Multiple documents: collect the full path of every regular file in the
# corpus directory. The context manager closes the directory handle even if
# the listing fails half-way; sorting makes the file order (and therefore the
# order in which documents are streamed later) reproducible, because
# os.scandir yields entries in arbitrary order.
with os.scandir(corpora_dir) as mult_data:
    laudatio_files = sorted(
        os.path.join(corpora_dir, entry.name)
        for entry in mult_data
        if entry.is_file()
    )
#print(laudatio_files)


D:\Develop\Python\Gensim_tutorial\gensim1.ipynb
D:\Develop\Python\Gensim_tutorial\corpora\Laudatio
D:\Develop\Python\Gensim_tutorial\corpora\Laudatio\grimm_aschenputtel_119-126.txt
D:\Develop\Python\Gensim_tutorial\tmp


### Create Gensim Corpus from Files

In [87]:
# Single document

# Read the tale once; every line of the file is treated as one document and
# split on whitespace after lower-casing. The context manager closes the file
# as soon as the token lists have been built.
with open(td, encoding="utf-8") as data:
    txt = [document.lower().split() for document in data]
#print(txt)
dictionary1 = corpora.Dictionary(txt)
#print(dictionary1.token2id) #prints words and their ids


class MyCorpus:
    """Stream the single tale at ``td`` as bag-of-words vectors.

    Each iteration re-opens the file and yields one ``dictionary1.doc2bow``
    result per line, so the whole text never has to be held in memory.
    """

    def __iter__(self):
        # Same encoding as the read that built ``dictionary1`` so both passes
        # see identical tokens; ``with`` closes the handle when iteration ends.
        with open(td, encoding="utf-8") as lines:
            for line in lines:
                yield dictionary1.doc2bow(line.lower().split())


nc_b = MyCorpus()
for vector in nc_b:
    print(vector)

In [66]:
# Construct a dictionary straight from the file, one line per document.
# The file is opened as UTF-8 (matching the other reads of ``td``) and closed
# as soon as the dictionary has consumed the generator.
with open(td, encoding="utf-8") as lines:
    dictionary2 = corpora.Dictionary(line.lower().split() for line in lines)

# NOTE(review): this stoplist is English while the tokens printed for this
# dictionary are German, so most entries presumably never match - confirm
# whether a German stoplist was intended.
stoplist = set('for a of the and to in'.split())

# Ids of the stop words that actually occur in the dictionary.
stop_ids = [
    dictionary2.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary2.token2id
]
# Ids of tokens whose document frequency is exactly one.
once_ids = [tokenid for tokenid, docfreq in dictionary2.dfs.items() if docfreq == 1]

# Drop both groups, then close the gaps left in the id sequence.
dictionary2.filter_tokens(stop_ids + once_ids)
dictionary2.compactify()
print(dictionary2)

Dictionary<297 unique tokens: ['als', 'andere', 'auf', 'darauf', 'das']...>


In [21]:
# Multiple documents
class MyCorpora:
    """Stream every line of every file in ``laudatio_files`` as a token list.

    Lines are tokenised with ``simple_preprocess`` (accents stripped, tokens
    of 4 to 20 characters kept). The object can be iterated repeatedly; each
    pass re-opens the files.
    """

    def __iter__(self):
        for entry in laudatio_files:
            # ``with`` closes each file once its lines are exhausted (or the
            # generator is discarded) instead of leaving the handle open.
            with open(entry, encoding="utf-8") as lines:
                for line in lines:
                    yield simple_preprocess(line, deacc=True, min_len=4, max_len=20)


nc = MyCorpora()
mult_dict = corpora.Dictionary(nc)
#print(len(mult_dict))
#for vector in nc:
 #   print(vector)

# Further processing.
# Open question: min_len / stop words may not make sense here, since even
# these words might be meant to have an influence; possibly check how the
# results change with and without them.
# Most of the stop words below are shorter than min_len=4 (or become shorter
# once accents are stripped) and are therefore already removed by
# simple_preprocess; 'weil' has four letters and can still be present.
stoplist = set('in für auf der die das mit weil'.split())
stop_ids = [  # map the stop words that occur in the dictionary to their ids
    mult_dict.token2id[stopword]
    for stopword in stoplist
    if stopword in mult_dict.token2id
]
# Ids of tokens with a document frequency of exactly one (or use < 2?).
once_ids = [tokenid for tokenid, docfreq in mult_dict.dfs.items() if docfreq == 1]

# Remove the once-only tokens and the stop words, then close the id gaps.
# This only changes ``mult_dict``; the token stream produced by ``nc`` is
# not filtered.
mult_dict.filter_tokens(once_ids + stop_ids)
mult_dict.compactify()
#print(len(mult_dict))

### Word2Vec

#### train & save Vectors

In [28]:
# Train word embeddings on the streamed multi-document corpus `nc`, then
# persist the complete model to disk.
model = Word2Vec(
    sentences=nc,
    vector_size=100,
    window=7,
    min_count=2,
    epochs=100,
)
model.save("frst_word2vec.model")

In [29]:
# Keep a handle on the trained model's keyed vectors and store them in a
# separate file from the full model.
word_vectors = model.wv
word_vectors.save("frst_word2vec.wordvectors")


#### load & use Vectors

In [31]:
# Load the stored vectors (opened with mmap='r') and query them.
wv = KeyedVectors.load("frst_word2vec.wordvectors", mmap='r')

# Embedding of a single word; uncomment the print to inspect it.
vector = wv['frau']
#print(vector)

# Words most similar to 'tochter', printed with their similarity scores.
print(wv.most_similar('tochter'))

[('gemahlin', 0.5540225505828857), ('ankunft', 0.5036059617996216), ('schwester', 0.4301741123199463), ('sohne', 0.4143478274345398), ('bedingung', 0.4048870801925659), ('liebste', 0.3980526626110077), ('frau', 0.3977101147174835), ('holzhauer', 0.3972511291503906), ('mienen', 0.39202576875686646), ('braut', 0.39059847593307495)]


Next steps: build a diachronic corpus (maybe three points in time?), check the preprocessing, and check how to create a classifier.


https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#corpus-formats

In [102]:
# Toy corpus of two documents: the first holds a single (id, weight) pair,
# the second is empty.
corpus = [[(1, 0.5)], []]

# Make sure the target directory exists before writing; otherwise the
# serialisation fails when `tmp` has not been created yet.
os.makedirs(temp_dir, exist_ok=True)
corpora.MmCorpus.serialize(os.path.join(temp_dir, 'corpus1.mm'), corpus)

In [None]:
# Rebinds `model`, replacing the Word2Vec model trained on `nc` earlier in
# this notebook.
# NOTE(review): `corpus` is the two-document list of (id, weight) pairs built
# for the MmCorpus example, not tokenised sentences. Word2Vec presumably
# expects an iterable of token lists, so this call likely fails or yields an
# empty vocabulary - confirm what input was intended here.
model = Word2Vec(corpus)

In [None]:
# Print the words most similar to 'book' according to the current `model`.
# NOTE(review): the corpora used in this notebook are German fairy tales, so
# 'book' is presumably not in the vocabulary and this lookup would raise -
# confirm which model/word was meant.
print(model.wv.most_similar('book'))