In [91]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import Levenshtein

In [125]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.21.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.21.0
  Downloading Levenshtein-0.21.0-cp37-cp37m-win_amd64.whl (101 kB)
     -------------------------------------- 101.5/101.5 kB 1.2 MB/s eta 0:00:00
Collecting rapidfuzz<4.0.0,>=2.3.0
  Downloading rapidfuzz-3.0.0-cp37-cp37m-win_amd64.whl (1.8 MB)
     ---------------------------------------- 1.8/1.8 MB 4.4 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.21.0 python-Levenshtein-0.21.0 rapidfuzz-3.0.0


# 4 different similarity algorithms used 
1. cosine
2. euclidean
3. levenshtein
4. jaccard

# cosine_similarity based on short_discription

In [2]:
data = pd.read_json("News_Category_Dataset_v3.json", lines=True)
data.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [3]:
new_data = data[['headline','category','short_description','authors']]
new_data.head()

Unnamed: 0,headline,category,short_description,authors
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP"
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski


In [34]:
# based on short_description
def Cosine_similarity(query:str):
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(new_data['short_description'])
    query_vector = vectorizer.transform([query])
    similarity = cosine_similarity(X=vectors, Y=query_vector)
    similarity=np.argsort(similarity, axis=0)[::-1][:5]
    for i in similarity:
        print(f" {i}: {new_data['short_description'].tolist()[i[0]]}")
        print('\n')

In [35]:
query = new_data['short_description'][5]
Cosine_similarity(query)

 [5]: The 63-year-old woman was seen working at the South Carolina store on Thursday. She was found dead Monday after her family reported her missing, authorities said.


 [75308]: This time they're in South Carolina.


 [90589]: The 35-year-old was reported missing on Aug. 18.


 [7297]: A 63-year-old woman was charged with second-degree assault over the 2018 incident.


 [55462]: Twenty-seven people have been reported missing.




# euclidean_distances based on short_description

In [32]:
# based on short_description
def euclidean_similarity(query:str):
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(new_data['short_description'])
    query_vector = vectorizer.transform([query])
    similarity = euclidean_distances(X=vectors, Y=query_vector)
    similarity=np.argsort(similarity, axis=0)[:5]
    for i in similarity:
        print(f" {i}: {new_data['short_description'].tolist()[i[0]]}")
        print('\n')

In [33]:
query = new_data['short_description'][5]
euclidean_similarity(query)

 [5]: The 63-year-old woman was seen working at the South Carolina store on Thursday. She was found dead Monday after her family reported her missing, authorities said.


 [128724]: 


 [109646]: 


 [109647]: 


 [109648]: 




# Levenshtein based on short_description

In [66]:
def levenshtein_similarity(query:str):
    for i, j in enumerate(new_data['short_description']):
        dis = Levenshtein.distance(j, query)
        if dis <= 100:
            print(f"{i} : {j}")

In [67]:
query = new_data['short_description'][5]
levenshtein_similarity(query)

5 : The 63-year-old woman was seen working at the South Carolina store on Thursday. She was found dead Monday after her family reported her missing, authorities said.
5873 : The 16-year-old activist spoke at a youth climate rally in Los Angeles during a week of wildfires ravaging California.
7511 : Jazmine Barnes was sitting in her mother's car when a man drove up and fired into the vehicle before fleeing, authorities said.
13448 : The suspect, wanted for murder in Russia, targeted a woman who looked like her and spoke Russian, authorities said.
36562 : The 40-year-old was preparing to scale the world's highest mountain when he slipped, officials there said.
71893 : Three other guards helped conceal the crime by filing inaccurate reports stating that the inmate attacked first, authorities said.
73432 : The 26-year-old was visiting friends in North Carolina last month when he was taken hostage, according to a police report.
87624 : The 25-year-old Wap was cited for driving without a lic

# jaccard based on short_description

In [88]:
class jaccard:

    def jaccard_score(doc1, doc2):
        
        words_doc1 = set(doc1.lower().split()) 
        words_doc2 = set(doc2.lower().split())
        
        intersection = words_doc1.intersection(words_doc2)
        union = words_doc1.union(words_doc2)
            
        return float(len(intersection)) / len(union)

    def jaccard_similarty(query:str):
        for i, j in enumerate(new_data['short_description']):
            score = jaccard.jaccard_score(query, j)
            if score >= 0.2:
                print(f"{i}: {j}")



In [90]:
query = new_data['short_description'][5]

jaccard.jaccard_similarty(query)

5: The 63-year-old woman was seen working at the South Carolina store on Thursday. She was found dead Monday after her family reported her missing, authorities said.
49510: Lee Manuel Viloria-Paulino, who was last seen Nov. 18, was found dead on Thursday.
51955: Todd Kohlepp, a registered sex offender, reportedly admitted to the killings after the woman was found on his South Carolina property.


# build model using cosine_similarity

In [142]:
# based on short_description
def Cosine_similarity(query:str):
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(new_data['short_description'])
    query_vector = vectorizer.transform([query])
    similarity = cosine_similarity(X=vectors, Y=query_vector)
    similarity=np.argsort(similarity, axis=0)[::-1][:10]
    for i in similarity:
        return new_data.iloc[i]

In [143]:
query = "Weather"
Cosine_similarity(query)

Unnamed: 0,headline,category,short_description,authors
18715,Chance The Rapper Is Now Chance The Weatherman,ENTERTAINMENT,Weather has never been so cool.,Jenna Amatulli
