In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK data packages one at a time, in a fixed order.
for _nltk_package in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'omw-1.4',
):
    nltk.download(_nltk_package)

def _preprocess(text, lemmatizer, stop_words):
    """Return the lowercase, lemmatized, stopword-free alphabetic tokens of *text*."""
    lemmas = []
    for token in word_tokenize(text):
        # Drop punctuation, numbers and mixed tokens such as "it's" pieces.
        if not token.isalpha():
            continue
        word = token.lower()
        # Filter stopwords BEFORE lemmatizing: the lemmatizer (default noun
        # POS) can mangle a stopword into a non-stopword (e.g. "was" -> "wa"),
        # which would then slip past a filter applied only afterwards.
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Also drop words whose lemma turns out to be a stopword.
        if lemma not in stop_words:
            lemmas.append(lemma)
    return lemmas


def text_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two English texts.

    Each text is tokenized, reduced to lowercase alphabetic tokens, stripped
    of English stopwords and lemmatized; the two resulting documents are then
    vectorized with TF-IDF and compared with cosine similarity.

    This is a purely lexical measure: texts that express the same idea with
    no words in common score 0.0.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity as a float. 0.0 is returned when the texts
        share no terms, including the case where neither text contains any
        term the vectorizer can index.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # Join tokens back into strings, the input format TfidfVectorizer expects.
    text1_processed = " ".join(_preprocess(text1, lemmatizer, stop_words))
    text2_processed = " ".join(_preprocess(text2, lemmatizer, stop_words))

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([text1_processed, text2_processed])
    except ValueError:
        # Raised for an empty vocabulary (both documents have no indexable
        # term). Nothing is shared, so report zero similarity instead of
        # crashing -- the same result already produced when only one of the
        # two documents is empty.
        return 0.0

    # Slice with ranges (not plain indices) so both operands stay 2-D row
    # matrices, as cosine_similarity requires; [0][0] extracts the scalar.
    return float(cosine_similarity(vectors[0:1], vectors[1:2])[0][0])


# Example usage: compare two sentences and print their similarity score.
text1 = "Natural Language Processing makes computers understand human language."
text2 = "NLP enables machines to comprehend what people say or write."
similarity_score = text_similarity(text1, text2)
print("Similarity Score:", similarity_score)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Similarity Score: 0.0
