## Test Corpus

In [4]:
from nltk.tokenize import word_tokenize

# Toy corpus for exercising the pipeline: document id -> raw text.
# NOTE(review): "Softwar" in doc_1 looks like a deliberate misspelling used to
# exercise the spell-checker step — confirm before "fixing" it.
corpus = {
    "doc_1": "Softwar engineering at Damascus university Software",
    "doc_2": "Information retrieval at Damascus university",
    "doc_3": "Indexing Information retrieval"
}

# Tokens of doc_1 (original casing kept), reused by the per-step demo calls.
test_tokens = word_tokenize(corpus['doc_1']);

## Remove Punctuation

In [5]:
import string

def remove_punctuation(tokens):
    """Strip ASCII punctuation from each token and drop tokens left empty.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: Tokens with every character in ``string.punctuation``
        removed. Tokens that consisted solely of punctuation (e.g. ``","``)
        are omitted rather than returned as empty strings.
    """
    translator = str.maketrans('', '', string.punctuation)
    stripped_tokens = (token.translate(translator) for token in tokens)
    # A pure-punctuation token translates to '', which is not a usable token.
    return [token for token in stripped_tokens if token]

# Demo: apply the step to the doc_1 tokens.
remove_punctuation(test_tokens)

['Softwar', 'engineering', 'at', 'Damascus', 'university', 'Software']

## Remove Stopwords

In [6]:
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    """Drop English stop words from a token list, ignoring letter case.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list[str]: The tokens, in their original order and casing, that are
        not in NLTK's English stop-word list.
    """
    stop_words = set(stopwords.words('english'))
    # NLTK's English list is lowercase, so compare on the lowercased token;
    # otherwise capitalised stop words such as "The" or "At" slip through.
    return [word for word in tokens if word.lower() not in stop_words]

# Demo: apply the step to the doc_1 tokens.
remove_stopwords(test_tokens)

['Softwar', 'engineering', 'Damascus', 'university', 'Software']

## Unify Abbreviation

In [34]:
import spacy

# Load spaCy's small English pipeline; its entity annotations (doc.ents) are
# what the abbreviation helpers consume. The "en_core_web_sm" model package
# must already be installed for this call to succeed.
nlp = spacy.load("en_core_web_sm")

def generate_entity_mappings(doc):
    """Build a full-name -> initials lookup for multi-word ORG entities.

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each have
            ``text`` and ``label_`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        dict[str, str]: Maps each ORG entity text of two or more
        whitespace-separated words to the concatenated first characters of
        those words. Single-word and non-ORG entities are not included.
    """
    initials_by_name = {}
    for entity in doc.ents:
        if entity.label_ != "ORG":
            continue
        words = entity.text.split()
        if len(words) <= 1:
            continue
        # e.g. "World Health Organisation" -> "WHO"
        initials_by_name[entity.text] = ''.join(w[0] for w in words)
    return initials_by_name

def map_entity_to_abbreviation(entity_text, entity_mappings):
    """Return the abbreviation for an entity, or the entity text itself.

    Args:
        entity_text: Entity string to look up.
        entity_mappings: Mapping from entity text to abbreviation.

    Returns:
        The mapped abbreviation when ``entity_text`` is a key of
        ``entity_mappings``; otherwise ``entity_text`` unchanged.
    """
    if entity_text in entity_mappings:
        return entity_mappings[entity_text]
    return entity_text

# Sample sentence to run through the spaCy pipeline.
text = "Apple is looking at buying World Health Organisation startup for $1 billion"
doc = nlp(text)

# Build the abbreviation lookup from the entities found in this document.
entity_mappings = generate_entity_mappings(doc)

# Print every entity with its label, abbreviated where a mapping exists.
for ent in doc.ents:
    # Falls back to the original entity text when no abbreviation was generated.
    abbreviated_entity = map_entity_to_abbreviation(ent.text, entity_mappings)
    print(abbreviated_entity, ent.label_)


Apple ORG
WHO ORG
$1 billion MONEY


## Spell Checker

In [8]:

from autocorrect import Speller

# Shared English speller, built once and reused by correct_sentence_spelling.
spell = Speller(lang='en')

def correct_sentence_spelling(tokens):
    """Run every token through the module-level ``spell`` corrector.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: One ``spell(token)`` result per input token, in input order.
    """
    return [spell(word) for word in tokens]

# Demo: apply the step to the doc_1 tokens.
correct_sentence_spelling(test_tokens)

['Software', 'engineering', 'at', 'Damascus', 'university', 'Software']

## Stemmer

In [9]:
from nltk.stem import PorterStemmer

def stem(tokens):
    """Reduce each token to its Porter stem.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: ``PorterStemmer().stem(token)`` for each token, in input order.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

# Demo: apply the step to the doc_1 tokens.
stem(test_tokens)

['softwar', 'engin', 'at', 'damascu', 'univers', 'softwar']

## Lemmatizer

In [10]:
from nltk.stem import WordNetLemmatizer

def lemmatize(tokens):
    """Lemmatize each token with NLTK's WordNet lemmatizer.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: ``WordNetLemmatizer().lemmatize(token)`` for each token, in
        input order. No part-of-speech argument is passed, so the
        lemmatizer's default is used.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas

# Demo: apply the step to the doc_1 tokens.
lemmatize(test_tokens)

['Softwar', 'engineering', 'at', 'Damascus', 'university', 'Software']

## Preprocessor

In [11]:
from nltk.tokenize import word_tokenize

def preprocessor(text):
    """Normalize raw text into a space-separated string of processed tokens.

    The text is lowercased and tokenized, then passed in order through
    punctuation removal, stop-word removal, spell correction, stemming and
    lemmatization.

    Args:
        text: Raw document string.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())

    # Each step takes a token list and returns a new token list.
    # NOTE(review): lemmatize runs on already-stemmed tokens — confirm this
    # ordering is intended.
    pipeline = (
        remove_punctuation,
        remove_stopwords,
        correct_sentence_spelling,
        stem,
        lemmatize,
    )
    for step in pipeline:
        tokens = step(tokens)

    return ' '.join(tokens)


## TFIDFVectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Use the custom text pipeline as the vectorizer's preprocessing step.
vectorizer = TfidfVectorizer(preprocessor=preprocessor)

# Fit on the corpus texts; values() and keys() iterate in the same order, so
# matrix rows line up with the document ids used as the index.
documents = list(corpus.values())
tfidf_matrix = vectorizer.fit_transform(documents)

# Dense view for display: one row per document, one column per vocabulary term.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=corpus.keys())

df

Unnamed: 0,damascu,engin,index,inform,retriev,softwar,univers
doc_1,0.306504,0.403016,0.0,0.0,0.0,0.806032,0.306504
doc_2,0.5,0.0,0.0,0.5,0.5,0.0,0.5
doc_3,0.0,0.0,0.680919,0.517856,0.517856,0.0,0.0
