In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load and preprocess the data
metadata = pd.read_csv('metadata_1.csv')  # Assuming metadata is stored in a CSV file


In [6]:
metadata

Unnamed: 0,title,author,description
0,Book Title 1,Author 1,This is a book about programming and algorithms.
1,Book Title 2,Author 2,Explore the world of art with this captivating...
2,Book Title 3,Author 3,A comprehensive guide to mastering the stock m...
3,Book Title 4,Author 4,Discover the wonders of nature through stunnin...
4,Book Title 5,Author 5,A fascinating account of historical events tha...


In [7]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Tokenize text into words
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the words back into a single string
    processed_text = ' '.join(words)

    return processed_text

In [8]:
# Step 2: Prepare the data
metadata['preprocessed_text'] = metadata['description'].apply(preprocess_text)


In [9]:
metadata['description']

0     This is a book about programming and algorithms.
1    Explore the world of art with this captivating...
2    A comprehensive guide to mastering the stock m...
3    Discover the wonders of nature through stunnin...
4    A fascinating account of historical events tha...
Name: description, dtype: object

In [10]:
metadata['preprocessed_text']

0                           book programming algorithm
1                   explore world art captivating book
2           comprehensive guide mastering stock market
3           discover wonder nature stunning photograph
4    fascinating account historical event shaped world
Name: preprocessed_text, dtype: object

In [11]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tannu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tannu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tannu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Step 3: Define the search query
query = "stock market"  # Replace with your actual search query

# Step 4: Vectorize the metadata
vectorizer = CountVectorizer()
metadata_vectorized = vectorizer.fit_transform(metadata['preprocessed_text'])

# Step 5: Vectorize the search query
query_vectorized = vectorizer.transform([preprocess_text(query)])

In [13]:
query_vectorized

<1x22 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [14]:
# Step 6: Compute cosine similarity
similarity_scores = cosine_similarity(query_vectorized, metadata_vectorized)
similarity_scores

array([[0.        , 0.        , 0.63245553, 0.        , 0.        ]])

In [15]:
# Step 7: Get the most relevant metadata
most_similar_index = similarity_scores.argmax()
most_similar_metadata = metadata.iloc[most_similar_index]

# Step 8: Print the most relevant metadata
print("Most relevant metadata:")
print("Title:", most_similar_metadata['title'])
print("Author:", most_similar_metadata['author'])
print("Description:", most_similar_metadata['description'])


Most relevant metadata:
Title: Book Title 3
Author: Author 3
Description: A comprehensive guide to mastering the stock market.


In [16]:
query = "historical"
query_vectorized = vectorizer.transform([preprocess_text(query)])
similarity_scores = cosine_similarity(query_vectorized, metadata_vectorized)
most_similar_index = similarity_scores.argmax()
most_similar_metadata = metadata.iloc[most_similar_index]

# Step 8: Print the most relevant metadata
print("Most relevant metadata:")
print("Title:", most_similar_metadata['title'])
print("Author:", most_similar_metadata['author'])
print("Description:", most_similar_metadata['description'])

Most relevant metadata:
Title: Book Title 5
Author: Author 5
Description: A fascinating account of historical events that shaped our world.


In [18]:
from nltk.corpus import wordnet

# Define the query term
query_term = "analyze"


In [19]:
# Find synonyms of the query term using WordNet
query_term = "position"
synonyms = []
for syn in wordnet.synsets(query_term):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())

# Print the synonyms
print("Synonyms of", query_term, ":")
print(synonyms)

Synonyms of position :
['position', 'place', 'military_position', 'position', 'position', 'view', 'perspective', 'position', 'posture', 'attitude', 'status', 'position', 'position', 'post', 'berth', 'office', 'spot', 'billet', 'place', 'situation', 'position', 'spatial_relation', 'position', 'position', 'placement', 'location', 'locating', 'position', 'positioning', 'emplacement', 'situation', 'position', 'position', 'stance', 'posture', 'side', 'position', 'place', 'position', 'stead', 'position', 'place', 'lieu', 'position', 'position', 'put', 'set', 'place', 'pose', 'position', 'lay']
