## Tokenizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords
import pandas as pd

# Shared NLTK instances, created once at module level and reused by
# stem_and_lemmatize() for every token it processes.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Matches any single character that is neither a word character nor whitespace.
_NON_WORD_NON_SPACE = re.compile(r'[^\w\s]')


def remove_punctuation(text):
    """Return *text* with every non-word, non-whitespace character deleted.

    Characters are removed rather than replaced by a space, so tokens joined
    by punctuation are fused (e.g. ``"a-b"`` becomes ``"ab"``). Underscores
    count as word characters and are kept.
    """
    return _NON_WORD_NON_SPACE.sub('', text)

# Lazily-built cache of the English stop-word set. It is filled on the first
# call to remove_stopwords() rather than at import time, so the NLTK corpus is
# only read when the function is actually used.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def remove_stopwords(text):
    """Return *text* with English stop words dropped.

    The text is split on whitespace; each token is compared
    case-insensitively against NLTK's English stop-word list, and the
    surviving tokens (in their original casing) are re-joined with single
    spaces.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; empty if every token was a stop word.
    """
    # The stop-word set is identical on every call, so reuse the cached copy
    # instead of rebuilding it from the corpus for each document.
    stop_words = _english_stop_words()
    kept = [word for word in text.split() if word.lower() not in stop_words]
    return ' '.join(kept)

def stem_and_lemmatize(text):
    """Stem, then lemmatize, each whitespace-separated token of *text*.

    Every token is first passed through the module-level ``stemmer`` and the
    resulting stem is then passed to the module-level ``lemmatizer``. The
    processed tokens are returned joined by single spaces.
    """
    processed = []
    for token in text.split():
        stem = stemmer.stem(token)
        processed.append(lemmatizer.lemmatize(stem))
    return ' '.join(processed)

def preprocessor(text):
    """Normalize a raw document string.

    The steps run in this order: lowercase, strip punctuation, drop English
    stop words, then stem and lemmatize each remaining token.

    Returns:
        The normalized text as a single space-separated string.
    """
    lowered = text.lower()
    return stem_and_lemmatize(remove_stopwords(remove_punctuation(lowered)))

# Toy corpus: document id -> raw text.
corpus = {
    "doc_1": "Software engineering at Damascus university Software",
    "doc_2": "Information retrieval at Damascus university",
    "doc_3": "Indexing Information retrieval",
}

documents = list(corpus.values())

# Unigrams and bigrams, keeping only terms found in at least two documents.
# Text normalization is delegated to the custom preprocessor defined above.
vectorizer = TfidfVectorizer(
    preprocessor=preprocessor,
    ngram_range=(1, 2),
    min_df=2,
)
tfidf_matrix = vectorizer.fit_transform(documents)

# Dense view for display: one row per document, one column per term.
df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=corpus.keys(),
)

df
