## **Imports**

In [96]:
#python -m spacy download de_core_news_sm
import spacy
import pandas as pd

from gensim import corpora, models, similarities
from gensim.matutils import sparse2full
from IPython.display import display

## **Information**

Prefer spacy instead of nltk because it's faster and we want to compute large texts.
Lemmatization and Stemming are redundand and different aproaches. They normaly are not used together. Decided for Lemmatizing because its integrated into spacy module:


Sources:
- Text Similarity Measures in News Articles by Vector Space Model Using NLP (https://link.springer.com/article/10.1007/s40031-020-00501-5)
- Compare documents similarity using Python | NLP (https://dev.to/thedevtimeline/compare-documents-similarity-using-python-nlp-4odp)

In [45]:
documents = [
    "Das ist ein Text.",
    "Ich habe einen Text geschrieben.",
    "Ich habe mehrere Texte geschrieben!",
    "Das sind viele texte. Insgesamt sind es 4."
]

sp = spacy.load("de_core_news_sm")

# **Preprocessing**

## **1. Tokenization**

In [46]:
tokenized_docs = []
for doc in documents:
    doc_tokens = sp(doc)
    tokenized_tokens = [token.text for token in doc_tokens]
    tokenized_docs.append(tokenized_tokens)

print("Tokenized Documents:")
for doc_tokens in tokenized_docs:
    print(doc_tokens)

Tokenized Documents:
['Das', 'ist', 'ein', 'Text', '.']
['Ich', 'habe', 'einen', 'Text', 'geschrieben', '.']
['Ich', 'habe', 'mehrere', 'Texte', 'geschrieben', '!']
['Das', 'sind', 'viele', 'texte', '.', 'Insgesamt', 'sind', 'es', '4.']


## **2. Lammatize**

In [47]:
lemmatized_docs = []
for doc_tokens in tokenized_docs:
    lemmatized_tokens = [token.lemma_ for token in sp(' '.join(doc_tokens))]
    lemmatized_docs.append(lemmatized_tokens)

print("\nLemmatized Documents:")
for lemmatized_tokens in lemmatized_docs:
    print(lemmatized_tokens)


Lemmatized Documents:
['der', 'sein', 'ein', 'Text', '--']
['ich', 'haben', 'ein', 'Text', 'schreiben', '--']
['ich', 'haben', 'mehrere', 'Text', 'schreiben', '--']
['der', 'sein', 'vieler', 'Text', '--', 'insgesamt', 'sein', 'es', '4.']


## **3. Remove Stop Words**

In [48]:
lemmatized_docs_no_stopwords = []

for lemmatized_tokens in lemmatized_docs:
    lemmatized_tokens_no_stopwords = [token for token in lemmatized_tokens if not sp.vocab[token].is_stop]
    lemmatized_docs_no_stopwords.append(lemmatized_tokens_no_stopwords)

print("\nLemmatized Documents without Stop Words:")
for lemmatized_tokens in lemmatized_docs_no_stopwords:
    print(lemmatized_tokens)


Lemmatized Documents without Stop Words:
['Text', '--']
['Text', 'schreiben', '--']
['mehrere', 'Text', 'schreiben', '--']
['vieler', 'Text', '--', 'insgesamt', '4.']


## **4.Eleminate Puctation Marks & Numbers**

In [52]:
lemmatized_docs_no_stopwords_punct_nums = []

for lemmatized_tokens in lemmatized_docs:
    lemmatized_tokens_no_stopwords = [token for token in lemmatized_tokens if not sp.vocab[token].is_stop]
    
    # Remove punctuation tokens
    lemmatized_tokens_no_punct = [token for token in lemmatized_tokens_no_stopwords if not sp.vocab[token].is_punct]
    
    # Remove number tokens
    lemmatized_tokens_no_nums = [token for token in lemmatized_tokens_no_punct if not sp.vocab[token].like_num]
    
    lemmatized_docs_no_stopwords_punct_nums.append(lemmatized_tokens_no_nums)

print("\nLemmatized Documents without Stop Words, Punctuation, and Numbers:")
for tokens in lemmatized_docs_no_stopwords_punct_nums:
    print(tokens)



Lemmatized Documents without Stop Words, Punctuation, and Numbers:
['Text']
['Text', 'schreiben']
['mehrere', 'Text', 'schreiben']
['vieler', 'Text', 'insgesamt']


## **Preprocessing Class**

In [58]:
import spacy

class TextPreprocessor:
    def __init__(self):
        self.nlp = spacy.load("de_core_news_sm")
    
    def tokenize_documents(self, documents):
        tokenized_docs = []
        for doc in documents:
            doc_tokens = self.nlp(doc)
            tokenized_tokens = [token.text for token in doc_tokens]
            tokenized_docs.append(tokenized_tokens)
        return tokenized_docs
    
    def lemmatize_documents(self, tokenized_docs):
        lemmatized_docs = []
        for doc_tokens in tokenized_docs:
            lemmatized_tokens = [token.lemma_ for token in self.nlp(' '.join(doc_tokens))]
            lemmatized_docs.append(lemmatized_tokens)
        return lemmatized_docs
    
    def remove_stopwords_punctuations_numbers(self, lemmatized_docs):
        clean_docs = []
        for lemmatized_tokens in lemmatized_docs:
            lemmatized_tokens_no_stopwords = [token for token in lemmatized_tokens if not self.nlp.vocab[token].is_stop]
            lemmatized_tokens_no_punct = [token for token in lemmatized_tokens_no_stopwords if not self.nlp.vocab[token].is_punct]
            lemmatized_tokens_no_nums = [token for token in lemmatized_tokens_no_punct if not self.nlp.vocab[token].like_num]
            clean_docs.append(lemmatized_tokens_no_nums)
        return clean_docs

documents = [
    "Das ist ein Text.",
    "Ich habe einen Text geschrieben.",
    "Ich habe mehrere Texte geschrieben!",
    "Das sind viele texte. Insgesamt sind es 4."
]

preprocessor = TextPreprocessor()

# Tokenization
tokenized = preprocessor.tokenize_documents(documents)

# Lemmatization
lemmatized = preprocessor.lemmatize_documents(tokenized)

# Removing Stopwords, Punctuation, and Numbers
cleaned = preprocessor.remove_stopwords_punctuations_numbers(lemmatized)

# Print cleaned documents
print("\nCleaned Documents:")
for tokens in cleaned:
    print(tokens)



Cleaned Documents:
['Text']
['Text', 'schreiben']
['mehrere', 'Text', 'schreiben']
['vieler', 'Text', 'insgesamt']


# **Representation Scheme**

## **Bag of Words**

In [98]:
def create_bow_representation(preprocessed_docs):
    dictionary = corpora.Dictionary(preprocessed_docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
    return dictionary, bow_corpus

bow_dictionary, bow_corpus = create_bow_representation(cleaned)

# Display bow representation
print("\nBag-of-Words Representation:")
num_terms = len(bow_dictionary)
bow_matrix = [sparse2full(doc, num_terms) for doc in bow_corpus]
bow_df = pd.DataFrame(bow_matrix, columns=[bow_dictionary[i] for i in range(num_terms)])

display(bow_df)

print(bow_dictionary)
print(bow_corpus)


Bag-of-Words Representation:


Unnamed: 0,Text,schreiben,mehrere,insgesamt,vieler
0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.0
3,1.0,0.0,0.0,1.0,1.0


Dictionary<5 unique tokens: ['Text', 'schreiben', 'mehrere', 'insgesamt', 'vieler']>
[[(0, 1)], [(0, 1), (1, 1)], [(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1)]]


## **TF-IDF**

In [82]:
def create_tfidf_representation(preprocessed_docs):
    dictionary = corpora.Dictionary(preprocessed_docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
    
    # Generate TF-IDF model based on the bag-of-words corpus
    tfidf_model = models.TfidfModel(bow_corpus)
    
    # Transform the bag-of-words corpus to TF-IDF corpus
    tfidf_corpus = tfidf_model[bow_corpus]
    
    return tfidf_model, tfidf_corpus, dictionary

# Create TF-IDF representation
tfidf_model, tfidf_corpus, tfidf_dictionary = create_tfidf_representation(cleaned)

# Display TF-IDF representation
print("\nTF-IDF Representation:")
num_terms = len(tfidf_dictionary)
tfidf_matrix = [sparse2full(doc, num_terms) for doc in tfidf_corpus]
tfidf_df = pd.DataFrame(tfidf_matrix, columns=[tfidf_dictionary[i] for i in range(num_terms)])

display(tfidf_df)

# Display TF-IDF dictionary and corpus
print("\nTF-IDF Dictionary:")
print(tfidf_dictionary)

print("\nTF-IDF Corpus:")
print(tfidf_corpus)
"""

NameError: name 'dictionary' is not defined

# **Similarity Measures**

## **Cosine**

In [91]:
def calculate_cosine_similarity(corpus, similarity_type='bow'):
    if similarity_type == 'bow':
        index = similarities.MatrixSimilarity(corpus)
    elif similarity_type == 'tfidf':
        index = similarities.MatrixSimilarity(corpus, num_features=len(corpus))
    else:
        raise ValueError("Invalid similarity_type. Choose 'bow' or 'tfidf'.")
    
    return index

bow_similarity_index = calculate_cosine_similarity(bow_corpus, similarity_type='bow')
print("\nCosine Similarity (BoW Representation):")
print(bow_similarity_index)
for i, sims in enumerate(bow_similarity_index):
    print(f"Document {i + 1}: {sims}")

"""
# Calculate cosine similarity for TF-IDF representation
tfidf_similarity_index = calculate_cosine_similarity(tfidf_corpus, similarity_type='tfidf')
print("\nCosine Similarity (TF-IDF Representation):")
print(tfidf_similarity_index)
"""


Cosine Similarity (BoW Representation):
MatrixSimilarity<4 docs, 5 features>
Document 1: [1.         0.70710677 0.57735026 0.57735026]
Document 2: [0.70710677 0.99999994 0.81649655 0.40824828]
Document 3: [0.57735026 0.81649655 0.99999994 0.3333333 ]
Document 4: [0.57735026 0.40824828 0.3333333  0.99999994]


'\n# Calculate cosine similarity for TF-IDF representation\ntfidf_similarity_index = calculate_cosine_similarity(tfidf_corpus, similarity_type=\'tfidf\')\nprint("\nCosine Similarity (TF-IDF Representation):")\nprint(tfidf_similarity_index)\n'