**Requirements for Installing Gensim**

In [34]:
!pip uninstall -y gensim scipy numpy
!pip install numpy==1.24.4 scipy==1.10.1 gensim==4.3.1

Found existing installation: gensim 4.3.1
Uninstalling gensim-4.3.1:
  Successfully uninstalled gensim-4.3.1
Found existing installation: scipy 1.10.1
Uninstalling scipy-1.10.1:
  Successfully uninstalled scipy-1.10.1
Found existing installation: numpy 1.24.4
Uninstalling numpy-1.24.4:
  Successfully uninstalled numpy-1.24.4
Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy==1.10.1
  Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting gensim==4.3.1
  Using cached gensim-4.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Using cached numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
Using cached scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)
Using cached gensim-4.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)

**Importing Important Libraries like Genism, Pandas...**

In [3]:
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors

import pandas as pd

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

**Importing NLTK and downloading all necessary packages**

In [5]:
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger_eng')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Dell\A

**Reading the Json File and Converting it to CSV format**

In [6]:
df = pd.read_json("/content/News_Category_Dataset_v3.json", lines=True)
df.to_csv("/content/News_Category_Dataset_v3.csv", index=False)
data = pd.read_csv("/content/News_Category_Dataset_v3.csv")

FileNotFoundError: File /content/News_Category_Dataset_v3.json does not exist

**Extracting all the Headlines, Removing NaNs and Tokenizing**

In [4]:
sentences = data['headline']

clean_sentences = sentences.dropna().reset_index(drop=True)

tokenized_sentences = [word_tokenize(str(sentence)) for sentence in clean_sentences]

**Lemmatization**

In [5]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

lemmatized_sentences = []
for sentence in tokenized_sentences:
  lowered = [word.lower() for word in sentence]
  tagged = pos_tag(lowered)
  lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]
  lemmatized_sentences.append(lemmatized)


**Applying StopWords**

In [6]:
stop_words = set(stopwords.words('english'))

filtered_sentences = []
for sentence in lemmatized_sentences:
    filtered = [word for word in sentence if word not in stop_words]
    filtered_sentences.append(filtered)


**Downloading Word2Vec**

In [8]:
wv = api.load('word2vec-google-news-300')

**Keeping only the Sentences which are not Empty and applying Average Word2Vec**

In [9]:
filtered_sentences_cleaned = []
vectorized_sentences = []

for sentence in filtered_sentences:
    sentence_vectors = [wv[word] for word in sentence if word in wv]
    if sentence_vectors:  # for only sentences that dont have NA
        avg_vector = np.mean(sentence_vectors, axis=0)
        vectorized_sentences.append(avg_vector)
        filtered_sentences_cleaned.append(sentence)


**Creating a Function to convert Input Sentence List into a Vector**

In [10]:
def get_sentence_vector(sentence, wv):

    sentence_vectors = [wv[word] for word in sentence if word in wv]
    if sentence_vectors:
        return np.mean(sentence_vectors, axis=0).reshape(1, -1)
    else:
        return None


**Creating a Function to Find the Top 5 most Similar Sentences from the Dataset based on the Input Sentence**

In [11]:
def find_top_k_similar(input_sentence, wv, vectorized_sentences, filtered_sentences, k=5):
    vec = get_sentence_vector(input_sentence, wv)
    if vec is None:
        return

    similarities = cosine_similarity(vec, np.array(vectorized_sentences))[0]
    top_k_idx = similarities.argsort()[::-1][:k]

    return [(filtered_sentences[i], similarities[i]) for i in top_k_idx]


**Example Input Sentence**

In [12]:
input_sentence = ["president", "got", "no", "money"]
top_matches = find_top_k_similar(input_sentence, wv, vectorized_sentences, filtered_sentences)

for i, (sentence, score) in enumerate(top_matches, 1):
    print(f"{i}. {' '.join(sentence)} (Similarity: {score:.4f})")


1. guy 's running president want give 'free ' money (Similarity: 0.7809)
2. copycat chick-fil-a sandwich recipe ( hungry sunday ) (Similarity: 0.6981)
3. clothe organization : family 's closet say ( photo ) (Similarity: 0.6963)
4. 'la la land ' win bafta 's top prize , continue hot streak road oscar (Similarity: 0.6809)
5. guy 's get 2 word president , 's put d.c . (Similarity: 0.6600)
