In [1]:
import sys
from pathlib import Path

# Locate the directory two levels above the current working directory and
# put it on the import path (only once, so re-running the cell is harmless).
current_path = Path.cwd().resolve()
src_path = current_path.parents[1]  # two levels up

if not any(entry == str(src_path) for entry in sys.path):
    sys.path.append(str(src_path))

src_path


WindowsPath('C:/Users/Nasiba/Documents/1 Master Data Science/Projektpraktikum/WebScience24/webapp/backend/app')

In [2]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import os
from tensorflow.keras.models import load_model


In [3]:
# Hand-written vectorization helper, because it is tailored to a single input string
def vectorize_glove_test_data_predict(text, glove_path, vector_size=200, max_seq_len=50, tokenizer=None):
    """
    Vectorize a single text string with pretrained GloVe embeddings.

    The text is turned into token ids by the fitted tokenizer, cut to its last
    `max_seq_len` tokens, mapped word by word to GloVe vectors and zero-padded
    at the end. Words without a GloVe entry become all-zero rows.

    Args:
        text (str): The input text to vectorize.
        glove_path (str): Path to the GloVe embeddings text file
            (one `word v1 v2 ...` entry per line, UTF-8).
        vector_size (int, optional): Dimension of the GloVe vectors (default: 200).
        max_seq_len (int, optional): Number of time steps in the output (default: 50).
        tokenizer (Tokenizer, optional): The fitted tokenizer. It is required;
            the default of None only exists to keep the keyword-style call.

    Returns:
        np.ndarray: float32 array of shape `(1, max_seq_len, vector_size)`.

    Raises:
        ValueError: If no tokenizer is given, if `max_seq_len` or `vector_size`
            is smaller than 1, or if a needed GloVe entry does not have
            `vector_size` components.
    """
    # Validate cheap arguments first so a bad call fails before the large
    # embeddings file is touched.
    if tokenizer is None:
        raise ValueError("Ein trainierter Tokenizer muss übergeben werden!")
    if max_seq_len < 1 or vector_size < 1:
        raise ValueError("max_seq_len und vector_size müssen mindestens 1 sein.")

    # Text -> token ids (texts_to_sequences works on a list, we pass one text).
    token_ids = tokenizer.texts_to_sequences([text])[0]

    # Keep the trailing tokens when the text is too long. This mirrors what
    # pad_sequences(..., maxlen=max_seq_len, padding='post') did here with its
    # default truncating='pre'.
    token_ids = token_ids[-max_seq_len:]

    # Ids unknown to the tokenizer map to '' and therefore to a zero row.
    words = [tokenizer.index_word.get(idx, '') for idx in token_ids]
    needed_words = set(words)
    needed_words.discard('')

    # Scan the embeddings file but only parse the vectors that are needed,
    # instead of materializing every entry of the file in memory.
    found_vectors = {}
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            parts = line.split(None, 1)
            if not parts:
                continue  # blank line
            word = parts[0]
            if word not in needed_words:
                continue
            components = parts[1].split() if len(parts) > 1 else []
            vector = np.asarray(components, dtype='float32')
            if vector.shape[0] != vector_size:
                raise ValueError(
                    f"GloVe-Vektor für '{word}' hat Dimension {vector.shape[0]}, "
                    f"erwartet wurde {vector_size}."
                )
            # A later duplicate entry overrides an earlier one.
            found_vectors[word] = vector

    # Rows beyond the text length stay zero, which is the post-padding.
    X_vectorized = np.zeros((max_seq_len, vector_size), dtype=np.float32)
    for position, word in enumerate(words):
        vector = found_vectors.get(word)
        if vector is not None:
            X_vectorized[position] = vector

    # Add the batch axis expected by the model: (1, max_seq_len, vector_size).
    return np.expand_dims(X_vectorized, axis=0)

In [4]:


# Restore the tokenizer that was saved as JSON next to the model.
tokenizer_path = os.path.abspath(os.path.join(src_path, 'models/rrn_lstm/tokenizer.json'))
save_path_model = tokenizer_path  # second name for the same path, kept from the original cell

with open(tokenizer_path, "r", encoding="utf-8") as f:
    tokenizer_data = f.read()

# Rebuild the Tokenizer object from its JSON string.
tokenizer_loaded = tokenizer_from_json(tokenizer_data)


In [14]:
# Text to classify. The commented-out line is an alternative, hostile example.
# sample_input_text = "Just your opinion stupid white trash"
sample_input_text = (
    "Eid Mubarak from your Atheist Seattle friends and happy Ramadan!"
)

In [15]:

# Embed the sample text with GloVe.
# Note: despite the variable name, the file referenced here is glove.6B.200d.
twitter_glove_path = os.path.abspath(
    os.path.join(src_path, 'models/rrn_lstm/glove.6B.200d.txt')
)

X_test_data_vectors = vectorize_glove_test_data_predict(
    sample_input_text,
    twitter_glove_path,
    tokenizer=tokenizer_loaded,
)

X_test_data_vectors

array([[[-0.075139 ,  0.82504  ,  0.35902  , ..., -0.71914  ,
         -0.098335 ,  0.0048383],
        [-0.023242 ,  0.47272  ,  0.24489  , ..., -0.45059  ,
         -0.054999 , -0.6276   ],
        [-0.16928  , -0.035512 ,  0.015382 , ..., -0.088303 ,
          0.22052  ,  0.74344  ],
        ...,
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ],
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ],
        [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          0.       ,  0.       ]]], dtype=float32)

In [7]:
# Load the trained LSTM model from the models folder.
lstm_model_path = os.path.abspath(
    os.path.join(src_path, 'models/rrn_lstm/model_lstm_17.keras')
)
model_lstm = load_model(lstm_model_path)


In [16]:
# Decision threshold for the positive class (0.4 or 0.6 are other values to try).
threshold = 0.35

# Model output for the single sample, then the thresholded 0/1 class.
predictions_lstm = model_lstm.predict(X_test_data_vectors)
predicted_classes_lstm = np.greater(predictions_lstm, threshold).astype(int)

print("Klasse:", predicted_classes_lstm, "Wahrscheinlichkeit:", predictions_lstm )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Klasse: [[0]] Wahrscheinlichkeit: [[0.02178442]]
