## **Imports**

In [33]:
#python -m spacy download de_core_news_sm
import spacy
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, similarities
from gensim.matutils import sparse2full
from gensim.matutils import sparse2full
from scipy.spatial.distance import euclidean
from IPython.display import display

## **Information**

We prefer spaCy over NLTK because it is faster and we want to process large texts.
Lemmatization and stemming are two different approaches to the same goal (reducing words to a base form), so applying both is redundant and they are normally not used together. We decided on lemmatization because it is integrated into the spaCy module.


Sources:
- Text Similarity Measures in News Articles by Vector Space Model Using NLP (https://link.springer.com/article/10.1007/s40031-020-00501-5)
- Compare documents similarity using Python | NLP (https://dev.to/thedevtimeline/compare-documents-similarity-using-python-nlp-4odp)
- What is gensim.similarities.MatrixSimilarity() function? (https://www.educative.io/answers/what-is-gensimsimilaritiesmatrixsimilarity-function)

In [2]:
# Small German toy corpus used to walk through the preprocessing steps below.
documents = [
    "Das ist ein Text.",
    "Ich habe einen Text geschrieben.",
    "Ich habe mehrere Texte geschrieben!",
    "Das sind viele texte. Insgesamt sind es 4."
]

# Load the small German spaCy pipeline (it must be downloaded once, see the
# commented-out command in the imports cell).
sp = spacy.load("de_core_news_sm")

# **Preprocessing**

## **1. Tokenization**

In [3]:
# Run every document through the spaCy pipeline and keep only the text of
# each resulting token.
tokenized_docs = [[token.text for token in sp(text)] for text in documents]

print("Tokenized Documents:")
for token_list in tokenized_docs:
    print(token_list)

Tokenized Documents:
['Das', 'ist', 'ein', 'Text', '.']
['Ich', 'habe', 'einen', 'Text', 'geschrieben', '.']
['Ich', 'habe', 'mehrere', 'Texte', 'geschrieben', '!']
['Das', 'sind', 'viele', 'texte', '.', 'Insgesamt', 'sind', 'es', '4.']


## **2. Lemmatize**

In [4]:
# Re-join each token list with single spaces, parse that string again with
# spaCy and collect the lemma of every token it yields.
lemmatized_docs = [
    [token.lemma_ for token in sp(' '.join(token_list))]
    for token_list in tokenized_docs
]

print("\nLemmatized Documents:")
for lemma_list in lemmatized_docs:
    print(lemma_list)


Lemmatized Documents:
['der', 'sein', 'ein', 'Text', '--']
['ich', 'haben', 'ein', 'Text', 'schreiben', '--']
['ich', 'haben', 'mehrere', 'Text', 'schreiben', '--']
['der', 'sein', 'vieler', 'Text', '--', 'insgesamt', 'sein', 'es', '4.']


## **3. Remove Stop Words**

In [5]:
# Keep only the lemmas whose vocabulary entry is not flagged as a stop word.
lemmatized_docs_no_stopwords = [
    [lemma for lemma in lemma_list if not sp.vocab[lemma].is_stop]
    for lemma_list in lemmatized_docs
]

print("\nLemmatized Documents without Stop Words:")
for remaining_lemmas in lemmatized_docs_no_stopwords:
    print(remaining_lemmas)


Lemmatized Documents without Stop Words:
['Text', '--']
['Text', 'schreiben', '--']
['mehrere', 'Text', 'schreiben', '--']
['vieler', 'Text', '--', 'insgesamt', '4.']


## **4. Eliminate Punctuation Marks & Numbers**

In [6]:
lemmatized_docs_no_stopwords_punct_nums = []

for lemma_list in lemmatized_docs:
    kept_lemmas = []
    for lemma in lemma_list:
        lexeme = sp.vocab[lemma]
        # Drop the lemma as soon as one filter matches; the checks run in the
        # order stop word -> punctuation -> number-like.
        if lexeme.is_stop or lexeme.is_punct or lexeme.like_num:
            continue
        kept_lemmas.append(lemma)
    lemmatized_docs_no_stopwords_punct_nums.append(kept_lemmas)

print("\nLemmatized Documents without Stop Words, Punctuation, and Numbers:")
for kept_lemmas in lemmatized_docs_no_stopwords_punct_nums:
    print(kept_lemmas)



Lemmatized Documents without Stop Words, Punctuation, and Numbers:
['Text']
['Text', 'schreiben']
['mehrere', 'Text', 'schreiben']
['vieler', 'Text', 'insgesamt']


## **Preprocessing Class**

In [7]:
import spacy

class TextPreprocessor:
    """Text preprocessing pipeline built on a spaCy language model.

    The three public methods are meant to be chained:
    ``tokenize_documents`` -> ``lemmatize_documents`` ->
    ``remove_stopwords_punctuations_numbers``.
    """

    def __init__(self, model_name="de_core_news_sm"):
        """Load the spaCy pipeline that every processing step uses.

        Args:
            model_name: Name of an installed spaCy model. Defaults to the
                small German model, so ``TextPreprocessor()`` keeps its
                previous behaviour.
        """
        self.nlp = spacy.load(model_name)

    def tokenize_documents(self, documents):
        """Split every raw document into the texts of its tokens.

        Args:
            documents: Iterable of raw text strings.

        Returns:
            A list containing one list of token strings per document.
        """
        return [[token.text for token in self.nlp(text)] for text in documents]

    def lemmatize_documents(self, tokenized_docs):
        """Replace the tokens of every document by their lemmas.

        Args:
            tokenized_docs: Iterable of token-string lists, e.g. the result
                of ``tokenize_documents``.

        Returns:
            A list containing one list of lemma strings per document.
        """
        # NOTE(review): each token list is re-joined with single spaces and
        # parsed again, so the lemma list is not guaranteed to line up
        # one-to-one with the input tokens - confirm this is acceptable.
        return [
            [token.lemma_ for token in self.nlp(' '.join(doc_tokens))]
            for doc_tokens in tokenized_docs
        ]

    def remove_stopwords_punctuations_numbers(self, lemmatized_docs):
        """Filter stop words, punctuation and number-like entries.

        Each string is looked up in the model vocabulary and dropped when
        the resulting lexeme is flagged ``is_stop``, ``is_punct`` or
        ``like_num``.

        Args:
            lemmatized_docs: Iterable of lemma-string lists.

        Returns:
            A list containing one filtered list of strings per document.
        """
        clean_docs = []
        for lemmas in lemmatized_docs:
            kept = []
            for lemma in lemmas:
                # One vocabulary lookup per lemma serves all three checks.
                lexeme = self.nlp.vocab[lemma]
                if not (lexeme.is_stop or lexeme.is_punct or lexeme.like_num):
                    kept.append(lemma)
            clean_docs.append(kept)
        return clean_docs

# Toy corpus for the class-based pipeline.
documents = [
    "Das ist ein spannender Text.",
    "Ich habe einen Text geschrieben.",
    "Ich habe mehrere Texte geschrieben!",
    "Das sind viele texte. Insgesamt sind es 4.",
]

preprocessor = TextPreprocessor()

# Run the three stages in order: tokenize -> lemmatize -> filter.
tokenized = preprocessor.tokenize_documents(documents)
lemmatized = preprocessor.lemmatize_documents(tokenized)
cleaned = preprocessor.remove_stopwords_punctuations_numbers(lemmatized)

print("\nCleaned Documents:")
for cleaned_tokens in cleaned:
    print(cleaned_tokens)



Cleaned Documents:
['spannend', 'Text']
['Text', 'schreiben']
['mehrere', 'Text', 'schreiben']
['vieler', 'Text', 'insgesamt']


# **Representation Scheme**

## **Bag of Words**

In [8]:
def create_bow_representation(preprocessed_docs):
    """Build a gensim dictionary and the matching bag-of-words corpus.

    Args:
        preprocessed_docs: Iterable of documents, each given as a list of
            token strings.

    Returns:
        Tuple ``(dictionary, bow_corpus)``: the ``corpora.Dictionary`` built
        from the documents and a list holding ``dictionary.doc2bow(doc)``
        for every document, in input order.
    """
    # The documents are traversed twice (once to build the dictionary, once
    # to vectorise), so a one-shot iterable such as a generator is
    # materialised first; otherwise the second pass would see nothing.
    docs = list(preprocessed_docs)
    dictionary = corpora.Dictionary(docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, bow_corpus

bow_dictionary, bow_corpus = create_bow_representation(cleaned)

print("\nBag-of-Words Representation:")

# Expand every sparse document into a dense vector with one slot per
# dictionary entry and label the columns with the dictionary's tokens.
num_terms = len(bow_dictionary)
bow_columns = [bow_dictionary[term_id] for term_id in range(num_terms)]
bow_matrix = [sparse2full(sparse_doc, num_terms) for sparse_doc in bow_corpus]
bow_df = pd.DataFrame(bow_matrix, columns=bow_columns)

display(bow_df)

# Raw gensim objects for reference
print(bow_dictionary)
print(bow_corpus)


Bag-of-Words Representation:


Unnamed: 0,Text,spannend,schreiben,mehrere,insgesamt,vieler
0,1.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,1.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,1.0


Dictionary<6 unique tokens: ['Text', 'spannend', 'schreiben', 'mehrere', 'insgesamt']...>
[[(0, 1), (1, 1)], [(0, 1), (2, 1)], [(0, 1), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1)]]


## **TF-IDF**

In [14]:
def create_tfidf_representation(preprocessed_docs):
    """Compute a TF-IDF representation of pre-tokenised documents.

    Every token list is joined back into one space-separated string and the
    strings are passed to a default ``TfidfVectorizer``. The vectorizer
    applies its own default text analysis on top of the preprocessing (in
    this notebook ``Text`` shows up as ``text`` in the feature names).

    Args:
        preprocessed_docs: Iterable of documents, each a list of token
            strings.

    Returns:
        Tuple ``(tfidf_representation, feature_names)``: the result of
        ``fit_transform`` on the joined documents and the vectorizer's
        ``get_feature_names_out()``.
    """
    joined_docs = []
    for doc_tokens in preprocessed_docs:
        joined_docs.append(' '.join(doc_tokens))

    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(joined_docs)

    return weights, vectorizer.get_feature_names_out()

tfidf_corpus, tfidf_dictionary = create_tfidf_representation(cleaned)

# Convert the TF-IDF result to a dense array and label its columns with the
# feature names returned by the vectorizer.
dense_weights = tfidf_corpus.toarray()
tfidf_df = pd.DataFrame(data=dense_weights, columns=tfidf_dictionary)

print("\nTF-IDF Representation:")
display(tfidf_df)


TF-IDF Representation:


Unnamed: 0,insgesamt,mehrere,schreiben,spannend,text,vieler
0,0.0,0.0,0.0,0.886548,0.462637,0.0
1,0.0,0.0,0.833884,0.0,0.551939,0.0
2,0.0,0.726641,0.572892,0.0,0.379192,0.0
3,0.663385,0.0,0.0,0.0,0.346182,0.663385


# **Similarity Measures**

## **Cosine**

In [19]:
def calculate_cosine_similarity(corpus, similarity_type='bow'):
    """Build a gensim similarity index for the given corpus.

    Args:
        corpus: For ``'bow'`` a gensim-style corpus (list of documents, each
            a list of ``(token_id, weight)`` pairs). For ``'tfidf'`` either
            such a corpus or a 2-D matrix of shape ``(n_docs, n_features)``,
            e.g. the sparse matrix returned by ``create_tfidf_representation``.
        similarity_type: ``'bow'`` or ``'tfidf'``.

    Returns:
        A gensim similarity index; iterating over it yields one row of
        similarities per document.

    Raises:
        ValueError: If ``similarity_type`` is neither ``'bow'`` nor ``'tfidf'``.
    """
    if similarity_type == 'bow':
        # MatrixSimilarity computes cosine similarity automatically.
        return similarities.MatrixSimilarity(corpus)

    if similarity_type == 'tfidf':
        shape = getattr(corpus, 'shape', None)
        if shape is not None and len(shape) == 2:
            # Matrix input: the feature count is the number of columns, not
            # the number of documents (and len() is not even defined for
            # scipy sparse matrices).
            return similarities.SparseMatrixSimilarity(corpus, num_features=shape[1])
        # gensim-style corpus: let gensim derive the feature count from the
        # largest token id instead of wrongly using the document count.
        return similarities.MatrixSimilarity(corpus)

    raise ValueError("Invalid similarity_type. Choose 'bow' or 'tfidf'.")

# Cosine similarity for the bag-of-words representation
bow_similarity_index = calculate_cosine_similarity(bow_corpus, similarity_type='bow')
print("\nCosine Similarity (BoW Representation):")
print(bow_similarity_index)
for doc_number, scores in enumerate(bow_similarity_index, start=1):
    print(f"Document {doc_number}: {scores}")


# Cosine similarity for the TF-IDF representation; the feature count is the
# number of feature names returned by the vectorizer.
tfidf_feature_count = len(tfidf_dictionary)
tfidf_similarity_index = similarities.SparseMatrixSimilarity(
    tfidf_corpus,
    num_features=tfidf_feature_count,
)

print("\nCosine Similarity (TF-IDF Representation):")
for doc_number, scores in enumerate(tfidf_similarity_index, start=1):
    print(f"Document {doc_number}: {scores}")


Cosine Similarity (BoW Representation):
MatrixSimilarity<4 docs, 6 features>
Document 1: [0.99999994 0.49999997 0.40824828 0.40824828]
Document 2: [0.49999997 0.99999994 0.81649655 0.40824828]
Document 3: [0.40824828 0.81649655 0.99999994 0.3333333 ]
Document 4: [0.40824828 0.40824828 0.3333333  0.99999994]

Cosine Similarity (TF-IDF Representation):
Document 1: [1.         0.2553478  0.17542823 0.16015653]
Document 2: [0.2553478  1.         0.68701684 0.19107127]
Document 3: [0.17542823 0.68701684 0.9999999  0.13126917]
Document 4: [0.16015653 0.19107127 0.13126917 1.        ]


## **Euclidean for BoW**

In [32]:
def calculate_euclidean_similarity(corpus):
    """Compute pairwise Euclidean-distance-based similarities.

    Every sparse document is expanded to a dense vector and the distance
    ``d`` between two documents is mapped to ``1 / (1 + d)``, so identical
    documents score 1.0 and the score shrinks as the distance grows.

    Args:
        corpus: Iterable of sparse documents, each a list of
            ``(token_id, weight)`` pairs (e.g. a gensim BoW corpus).

    Returns:
        ``numpy.ndarray`` of shape ``(n_docs, n_docs)``; an empty corpus
        yields an empty ``(0, 0)`` matrix.
    """
    # The corpus is traversed twice, so materialise one-shot iterables.
    corpus = list(corpus)

    # Vector length = highest token id + 1; default=-1 keeps max() from
    # raising when there are no tokens at all.
    num_terms = max((token_id for doc in corpus for token_id, _ in doc), default=-1) + 1
    matrix = [sparse2full(doc, num_terms) for doc in corpus]

    num_docs = len(matrix)
    similarity_matrix = np.zeros((num_docs, num_docs))
    for i in range(num_docs):
        # The distance is symmetric: compute one triangle and mirror it.
        for j in range(i, num_docs):
            score = 1 / (1 + euclidean(matrix[i], matrix[j]))
            similarity_matrix[i, j] = score
            similarity_matrix[j, i] = score

    return similarity_matrix

# Euclidean-based similarity between all pairs of bag-of-words documents
euclidean_similarity_bow = calculate_euclidean_similarity(bow_corpus)
print("Euclidean Similarity Matrix (bow representation):", euclidean_similarity_bow, sep="\n")

Euclidean Similarity Matrix (bow representation):
[[1.         0.41421357 0.36602541 0.36602541]
 [0.41421357 1.         0.5        0.36602541]
 [0.36602541 0.5        1.         0.33333333]
 [0.36602541 0.36602541 0.33333333 1.        ]]


## **Jaccard for TF-IDF**