In [1]:
#Data Wrangling
import pandas as pd
from scipy import stats
import numpy as np
import os
import nltk
from nltk import Text
from nltk.tokenize import word_tokenize, sent_tokenize, PunktSentenceTokenizer
from nltk.tokenize.punkt import PunktLanguageVars
from random import choices
import string
import matplotlib.pyplot as plt
from collections import Counter
import json
import re
import time
import gensim #library needed for word2vec
from gensim.models import Word2Vec
import openpyxl

#Multiprocessing
from joblib import Parallel, delayed

#for visualization
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise
from sklearn.manifold import MDS, TSNE

In [46]:
# Load the saved general Word2Vec model from disk; later cells query it via `model`.
general_model_path = "../models/W10_GENERALMODEL.model"
model = Word2Vec.load(general_model_path)

In [59]:
# Look up the 20 vocabulary entries with the highest similarity score to the
# query word in whatever model is currently bound to `model`. The bare
# expression on the last line is what the cell displays.
query_word = "sittlich"
model.wv.most_similar(query_word, topn=20)

[('geistig', 0.5954787731170654),
 ('moralisch', 0.478613942861557),
 ('körperlich', 0.44639357924461365),
 ('religiös', 0.44052037596702576),
 ('verwahrlost', 0.43907859921455383),
 ('Erziehung', 0.4248078167438507),
 ('leiblich', 0.41837388277053833),
 ('bürgerlich', 0.40770503878593445),
 ('sittlichen', 0.40554845333099365),
 ('verdorbener', 0.4007958471775055),
 ('religiöse', 0.398758202791214),
 ('verwahrloste', 0.37804728746414185),
 ('verwahrlosten', 0.3779931664466858),
 ('religiösen', 0.37464532256126404),
 ('Versorgung', 0.37319809198379517),
 ('sittliche', 0.3725346326828003),
 ('gefährdete', 0.37130969762802124),
 ('sittlicher', 0.36926865577697754),
 ('Erziehungsanstalten', 0.368554949760437),
 ('verkommen', 0.36803123354911804)]

In [3]:
# Collect the nearest Word2Vec neighbours of each keyword in every
# per-period "W2" model and export them as one long-format table.
model_paths = [
    "../models/W2_model01_1830.model",
    "../models/W2_model02_1860.model",
    "../models/W2_model03_1890.model",
    "../models/W2_model04_1922.model",
]  # one saved model per time period; replace with your model paths
keywords = ["Armut"]
time_periods = [
    "1811-1830",
    "1831-1860",
    "1861-1890",
    "1891-1922",
]  # time period labels, in the same order as model_paths
n_neighbors = 10  # number of nearest neighbours exported per keyword and period
excel_filename = "../tables/W2_word2vec_neighbors.xlsx"

# zip() silently drops surplus entries when its inputs differ in length, so a
# forgotten label or path would quietly lose a whole period. Fail loudly instead.
if len(model_paths) != len(time_periods):
    raise ValueError(
        f"Got {len(model_paths)} model paths but {len(time_periods)} time period labels."
    )

# Rows of [time period, keyword, neighbour, similarity]; converted to a DataFrame below.
data = []

for model_path, time_period in zip(model_paths, time_periods):
    # Load the model for this period.
    # NOTE: this rebinds the notebook-level name `model` on every iteration, so
    # after the loop `model` refers to the last period's model.
    model = Word2Vec.load(model_path)

    for keyword in keywords:
        # A keyword missing from this period's vocabulary is reported and skipped.
        if keyword not in model.wv.key_to_index:
            print(f"'{keyword}' not found in the vocabulary for time period {time_period}.")
            continue

        # Find the n_neighbors nearest neighbours of the keyword.
        neighbors = model.wv.most_similar(keyword, topn=n_neighbors)

        # Add each neighbour and its similarity score to the data list.
        for neighbor, similarity in neighbors:
            data.append([time_period, keyword, neighbor, similarity])

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['Time Period', 'Keyword', 'Neighbor', 'Similarity'])

# Display the DataFrame
print(df)

# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(excel_filename), exist_ok=True)
df.to_excel(excel_filename, index=False)

print(f"Data saved to {excel_filename}")

   Time Period Keyword            Neighbor  Similarity
0    1811-1830   Armut                Noth    0.404500
1    1811-1830   Armut          verstopfen    0.390986
2    1811-1830   Armut       verschuldeter    0.359114
3    1811-1830   Armut            Bettelei    0.357232
4    1811-1830   Armut           Verarmung    0.351039
5    1811-1830   Armut           verstopft    0.350332
6    1811-1830   Armut          Trunksucht    0.346195
7    1811-1830   Armut      verdienstlosen    0.345576
8    1811-1830   Armut     Landstreicherei    0.344344
9    1811-1830   Armut         Verstopfung    0.340305
10   1831-1860   Armut           Notstände    0.369092
11   1831-1860   Armut         Pauperismus    0.357197
12   1831-1860   Armut   überhandnehmenden    0.351932
13   1831-1860   Armut     unverschuldeten    0.345183
14   1831-1860   Armut       Verlassenheit    0.341381
15   1831-1860   Armut           erblichen    0.340415
16   1831-1860   Armut                Noth    0.338645
17   1831-

In [4]:
# Collect the nearest Word2Vec neighbours of each keyword in every
# per-period "W5" model and export them as one long-format table.
model_paths = [
    "../models/W5_model01_1830.model",
    "../models/W5_model02_1860.model",
    "../models/W5_model03_1890.model",
    "../models/W5_model04_1922.model",
]  # one saved model per time period; replace with your model paths
keywords = ["Armut"]
time_periods = [
    "1811-1830",
    "1831-1860",
    "1861-1890",
    "1891-1922",
]  # time period labels, in the same order as model_paths
n_neighbors = 10  # number of nearest neighbours exported per keyword and period
excel_filename = "../tables/W5_word2vec_neighbors.xlsx"

# zip() silently drops surplus entries when its inputs differ in length, so a
# forgotten label or path would quietly lose a whole period. Fail loudly instead.
if len(model_paths) != len(time_periods):
    raise ValueError(
        f"Got {len(model_paths)} model paths but {len(time_periods)} time period labels."
    )

# Rows of [time period, keyword, neighbour, similarity]; converted to a DataFrame below.
data = []

for model_path, time_period in zip(model_paths, time_periods):
    # Load the model for this period.
    # NOTE: this rebinds the notebook-level name `model` on every iteration, so
    # after the loop `model` refers to the last period's model.
    model = Word2Vec.load(model_path)

    for keyword in keywords:
        # A keyword missing from this period's vocabulary is reported and skipped.
        if keyword not in model.wv.key_to_index:
            print(f"'{keyword}' not found in the vocabulary for time period {time_period}.")
            continue

        # Find the n_neighbors nearest neighbours of the keyword.
        neighbors = model.wv.most_similar(keyword, topn=n_neighbors)

        # Add each neighbour and its similarity score to the data list.
        for neighbor, similarity in neighbors:
            data.append([time_period, keyword, neighbor, similarity])

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['Time Period', 'Keyword', 'Neighbor', 'Similarity'])

# Display the DataFrame
print(df)

# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(excel_filename), exist_ok=True)
df.to_excel(excel_filename, index=False)

print(f"Data saved to {excel_filename}")

   Time Period Keyword            Neighbor  Similarity
0    1811-1830   Armut        Verminderung    0.430583
1    1811-1830   Armut             Quellen    0.372884
2    1811-1830   Armut             Verordn    0.354517
3    1811-1830   Armut            Ursachen    0.349026
4    1811-1830   Armut     unverschuldeter    0.332813
5    1811-1830   Armut           wendungen    0.332427
6    1811-1830   Armut               Bürde    0.330979
7    1811-1830   Armut          vermindern    0.330646
8    1811-1830   Armut           versiegen    0.330622
9    1811-1830   Armut          Verhüthung    0.326574
10   1831-1860   Armut           erblichen    0.491133
11   1831-1860   Armut             erblich    0.421050
12   1831-1860   Armut            erbliche    0.384066
13   1831-1860   Armut         Pauperismus    0.380732
14   1831-1860   Armut  entgegenzuarbeiten    0.370878
15   1831-1860   Armut          verschämte    0.370696
16   1831-1860   Armut         verkommenen    0.365314
17   1831-

In [60]:
# Collect the nearest Word2Vec neighbours of each keyword in every
# per-period "W10" model and export them as one long-format table.
model_paths = [
    "../models/W10_model01_1830.model",
    "../models/W10_model02_1860.model",
    "../models/W10_model03_1890.model",
    "../models/W10_model04_1922.model",
]  # one saved model per time period; replace with your model paths
keywords = ["sittlich"]
time_periods = [
    "1811-1830",
    "1831-1860",
    "1861-1890",
    "1891-1922",
]  # time period labels, in the same order as model_paths
n_neighbors = 20  # number of nearest neighbours exported per keyword and period
excel_filename = "../tables/sittlich_W10_word2vec_neighbors.xlsx"

# zip() silently drops surplus entries when its inputs differ in length, so a
# forgotten label or path would quietly lose a whole period. Fail loudly instead.
if len(model_paths) != len(time_periods):
    raise ValueError(
        f"Got {len(model_paths)} model paths but {len(time_periods)} time period labels."
    )

# Rows of [time period, keyword, neighbour, similarity]; converted to a DataFrame below.
data = []

for model_path, time_period in zip(model_paths, time_periods):
    # Load the model for this period.
    # NOTE: this rebinds the notebook-level name `model` on every iteration, so
    # after the loop `model` refers to the last period's model.
    model = Word2Vec.load(model_path)

    for keyword in keywords:
        # A keyword missing from this period's vocabulary is reported and skipped.
        if keyword not in model.wv.key_to_index:
            print(f"'{keyword}' not found in the vocabulary for time period {time_period}.")
            continue

        # Find the n_neighbors nearest neighbours of the keyword.
        neighbors = model.wv.most_similar(keyword, topn=n_neighbors)

        # Add each neighbour and its similarity score to the data list.
        for neighbor, similarity in neighbors:
            data.append([time_period, keyword, neighbor, similarity])

# Create a DataFrame from the data list
df = pd.DataFrame(data, columns=['Time Period', 'Keyword', 'Neighbor', 'Similarity'])

# Display the DataFrame
print(df)

# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(excel_filename), exist_ok=True)
df.to_excel(excel_filename, index=False)

print(f"Data saved to {excel_filename}")

   Time Period   Keyword          Neighbor  Similarity
0    1811-1830  sittlich       verdorbener    0.402819
1    1811-1830  sittlich      freundliches    0.362954
2    1811-1830  sittlich         Umbildung    0.359484
3    1811-1830  sittlich       wohlwollend    0.354104
4    1811-1830  sittlich          gehorsam    0.352599
..         ...       ...               ...         ...
75   1891-1922  sittlich        sittlichen    0.326674
76   1891-1922  sittlich  verbrecherischen    0.326209
77   1891-1922  sittlich         sittliche    0.321896
78   1891-1922  sittlich           Auswege    0.321004
79   1891-1922  sittlich         wahrloste    0.319207

[80 rows x 4 columns]
Data saved to ../tables/sittlich_W10_word2vec_neighbors.xlsx
