## Test Corpus

In [36]:
from nltk.tokenize import word_tokenize

# Three tiny documents keyed by id, used as fixtures for every step below.
# NOTE(review): "Softwar" in doc_1 looks like a deliberate misspelling to
# exercise the spell checker -- confirm before "fixing" it.
corpus = dict(
    doc_1="Softwar engineering at Damascus university Software",
    doc_2="Information retrieval at Damascus university",
    doc_3="Indexing Information retrieval",
)

# Tokens of the first document; reused by the per-step demo calls.
test_tokens = word_tokenize(corpus["doc_1"])

## Remove Punctuation

In [37]:
import string

def remove_punctuation(tokens):
    """Strip ASCII punctuation characters from every token.

    Every character listed in ``string.punctuation`` is deleted from each
    token. A token made up solely of punctuation (for example ``","``)
    would be reduced to an empty string, so such tokens are dropped from
    the result instead of being returned as ``''``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: The cleaned, non-empty tokens in their original order.
    """
    # The third argument of maketrans lists the characters to delete.
    translator = str.maketrans('', '', string.punctuation)
    stripped_tokens = (token.translate(translator) for token in tokens)
    # Discard tokens that were nothing but punctuation.
    return [token for token in stripped_tokens if token]

# Demo: apply punctuation removal to the tokens of doc_1.
remove_punctuation(test_tokens)

['Softwar', 'engineering', 'at', 'Damascus', 'university', 'Software']

## Remove Stopwords

In [38]:
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    """Drop English stopwords from a list of tokens.

    The NLTK English stopword list is lowercase, so each token is
    lowercased for the membership test only. Capitalised stopwords such as
    ``"The"`` are therefore removed too, while the tokens that survive keep
    their original casing.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: Tokens that are not English stopwords, in input order.
    """
    # A set gives constant-time membership checks.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

# Demo: apply stopword removal to the tokens of doc_1.
remove_stopwords(test_tokens)

['Softwar', 'engineering', 'Damascus', 'university', 'Software']

## Spell Checker

In [39]:

from autocorrect import Speller

# One shared English spell checker, created once and reused by every call.
spell = Speller(lang='en')


def correct_sentence_spelling(tokens):
    """Run the module-level ``spell`` checker over each token.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: ``spell(token)`` for every token, in input order.
    """
    return [spell(token) for token in tokens]

# Demo: apply spelling correction to the tokens of doc_1.
correct_sentence_spelling(test_tokens)

['Software', 'engineering', 'at', 'Damascus', 'university', 'Software']

## Stemmer

In [40]:
from nltk.stem import PorterStemmer

def stem(tokens):
    """Reduce every token to its Porter stem.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: The result of ``PorterStemmer().stem`` for each token,
        in input order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

# Demo: apply Porter stemming to the tokens of doc_1.
stem(test_tokens)

['softwar', 'engin', 'at', 'damascu', 'univers', 'softwar']

## Lemmatizer

In [41]:
from nltk.stem import WordNetLemmatizer

def lemmatize(tokens):
    """Lemmatize every token with NLTK's WordNet lemmatizer.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with no
    part-of-speech argument, so the library's default is used.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: The lemma of each token, in input order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

# Demo: apply WordNet lemmatization to the tokens of doc_1.
lemmatize(test_tokens)

['Softwar', 'engineering', 'at', 'Damascus', 'university', 'Software']

## Preprocessor

In [42]:
from nltk.tokenize import word_tokenize

def preprocessor(text):
    """Normalise raw text into a space-separated string of stems.

    Pipeline, in order: lowercase, tokenize, strip punctuation, remove
    stopwords, correct spelling, lemmatize, stem.

    Lemmatization runs before stemming because the WordNet lemmatizer
    looks up dictionary words; once a token has been cut down to a stem
    such as ``"univers"`` it is usually no longer a dictionary word and
    the lookup has nothing to match.

    Args:
        text: Raw document text.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())

    # Guard against empty strings (e.g. left behind by punctuation-only
    # tokens) so they do not travel through the remaining steps.
    unpunctuated_tokens = [token for token in remove_punctuation(tokens) if token]
    no_stop_words_tokens = remove_stopwords(unpunctuated_tokens)
    spell_checked_tokens = correct_sentence_spelling(no_stop_words_tokens)

    # Lemmatize while tokens are still full words, then stem the lemmas.
    lemmatized_tokens = lemmatize(spell_checked_tokens)
    stemmed_tokens = stem(lemmatized_tokens)

    return ' '.join(stemmed_tokens)


## TF-IDF Vectorizer

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Plug the custom pipeline in as the vectorizer's preprocessing step.
vectorizer = TfidfVectorizer(preprocessor=preprocessor)

# Fit on the raw document texts (in dict insertion order) and build the matrix.
documents = [text for text in corpus.values()]
tfidf_matrix = vectorizer.fit_transform(documents)

# Dense table: one row per document id, one column per learned term.
df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=list(corpus),
    columns=vectorizer.get_feature_names_out(),
)

df

Unnamed: 0,damascu,engin,index,inform,retriev,softwar,univers
doc_1,0.306504,0.403016,0.0,0.0,0.0,0.806032,0.306504
doc_2,0.5,0.0,0.0,0.5,0.5,0.0,0.5
doc_3,0.0,0.0,0.680919,0.517856,0.517856,0.0,0.0
