In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import spacy
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import silhouette_score
import cloudpickle
from sklearn.decomposition import PCA, TruncatedSVD
from numpy.linalg import norm
import warnings
warnings.filterwarnings("ignore")

<h3>SINCE FINDING COSINE SIMILARITY BETWEEN TEXT EMBEDDINGS HAVING 768 DIMENSIONS AND 50,000 SAMPLES IS REALLY RESOURCE INTESIVE. I CAME UP WITH A TECHINIQUE THAT IS LESS RESOURCE INTENSIVE. THIS IS ACHIEVED BY REDUCING THE DIMENSIONS FROM 768 TO 150. THEN THE SAMPLES ARE ASSIGNED TO CLUSTERS. DURING INFERENCE THE NEWS SAMPLES ARE FILTERED BASED ON THE CLUSTER Of INPUT NEWS. THIS HEAVILY REDUCES THE NUMBER OF SAMPLES FOR WHICH COSINE SIMILARITY MUST BE COMPUTED. 

In [2]:
with open(os.path.join("..", "models", "preprocessor.bin"), "rb") as file:
    preprocessor = cloudpickle.load(file)

with open(os.path.join("..", "models", "vectorizer.bin"), "rb") as file:
    vectorizer = cloudpickle.load(file)

with open(os.path.join("..", "models", "pca.bin"), "rb") as file:
    pca = cloudpickle.load(file)

with open(os.path.join("..", "models", "clustering.bin"), "rb") as file:
    clustering = cloudpickle.load(file)
    
pca_text = pd.read_parquet(os.path.join("..", "data", "pca_text.parquet"))
text_data = pd.read_parquet(os.path.join("..", "data", "text_data.parquet"))

In [3]:
pca_text.shape

(50000, 150)

In [4]:
def cosine_similarity(a, b):
    cos_sim = np.dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [36]:
def inference(text: str):
    text = pd.Series(text)
    text = preprocessor.preprocess(text, dataset='test')
    text = vectorizer.vectorize(text, dataset='test')
    text = pca.reduce_dimensions(pd.DataFrame(text), dataset='test')
    cluster = clustering.predict(pd.DataFrame(text))
    
    match_idx = text_data.loc[text_data['cluster'] == cluster[0]].index
    # match_idx = range(len(pca_text))
    
    match_pca_txt = pca_text.loc[match_idx, :].reset_index(drop=True).copy()
    match_txt = text_data.loc[match_idx, :].reset_index(drop=True).copy()
    similarities = []
    for i in range(len(match_pca_txt)):
        similarities.append(cosine_similarity(match_pca_txt.iloc[i, :].values, text.ravel()))
    
    match_txt['similarity_score'] = similarities
    match_txt.sort_values(by='similarity_score', ascending=False)
    
    return match_txt

In [37]:
def display_similar_news(input_news: str, top_n: int=10):
    similar_news = inference(input_news)
    similar_news = similar_news.iloc[:top_n, :].reset_index(drop=True)
    print(f'Input: {input_news}{"-"*50}\nSIMILAR NEWS\n{"-"*50}\n')
    for i in range(len(similar_news)):
        print(f'Category: {similar_news.loc[i, "category"]}\nHeadline: {similar_news.loc[i, "headline"]}\nDescription: {similar_news.loc[i, "short_description"]}\n')

In [41]:
txt = '''
Killers of the Flower Moon trailer: Martin Scorsese, Leonardo DiCaprio promise a chilling Western
Martin Scorsese directorial Killers of the Flower Moon stars Leonardo DiCaprio, Robert De Niro, Lily Gladstone, Jesse Plemons, Tantoo Cardinal, Cara Jade Myers, JaNae Collins, and Jillian Dion.
'''
display_similar_news(txt, top_n=5)

Input: 
Killers of the Flower Moon trailer: Martin Scorsese, Leonardo DiCaprio promise a chilling Western
Martin Scorsese directorial Killers of the Flower Moon stars Leonardo DiCaprio, Robert De Niro, Lily Gladstone, Jesse Plemons, Tantoo Cardinal, Cara Jade Myers, JaNae Collins, and Jillian Dion.
--------------------------------------------------
SIMILAR NEWS
--------------------------------------------------

Category: ENTERTAINMENT
Headline: James Cameron Says He 'Clashed' With Studio Before 'Avatar' Release
Description: The "Avatar" director said aspects of his 2009 movie are "still competitive with everything that’s out there these days."

Category: ENTERTAINMENT
Headline: Amazon Greenlights 'Blade Runner 2099' Limited Series Produced By Ridley Scott
Description: The director of the original 1982 film joins a writer of the 2017 sequel for the newest installment in the sci-fi franchise.

Category: ENTERTAINMENT
Headline: Jean-Luc Godard, Pioneering French Filmmaker, Dies
Descripti

In [44]:
txt = '''
‘Impossible to play Roland Garros’: Rafael Nadal to miss French Open for the first time since 2004
Nadal has been on the sidelines since picking a left hip flexor injury during a second-round loss at the Australian Open in January.
'''
display_similar_news(txt, top_n=5)

Input: 
‘Impossible to play Roland Garros’: Rafael Nadal to miss French Open for the first time since 2004
Nadal has been on the sidelines since picking a left hip flexor injury during a second-round loss at the Australian Open in January.
--------------------------------------------------
SIMILAR NEWS
--------------------------------------------------

Category: WORLD NEWS
Headline: World Cup Captains Want To Wear Rainbow Armbands In Qatar
Description: FIFA has come under pressure from several European soccer federations who want to support a human rights campaign against discrimination at the World Cup.

Category: SPORTS
Headline: Las Vegas Aces Win First WNBA Title, Chelsea Gray Named MVP
Description: Las Vegas never had a professional sports champion — until Sunday.

Category: SPORTS
Headline: Boston Marathon To Make Race More Inclusive For Nonbinary Runners
Description: The race's organizers say nonbinary athletes won't have to register with the men's or women's divisions and prov

In [50]:
txt = '''
More than 20 rivers have burst their banks in Italy, leaving 13 people dead and forcing thousands from their homes after six months' rainfall fell in a day and a half.
More bodies were found on Thursday after almost every river flooded between Bologna and the north-east coast 115km (70 miles) away.
Some 280 landslides have taken place.
'''
display_similar_news(txt, top_n=5)

Input: 
More than 20 rivers have burst their banks in Italy, leaving 13 people dead and forcing thousands from their homes after six months' rainfall fell in a day and a half.
More bodies were found on Thursday after almost every river flooded between Bologna and the north-east coast 115km (70 miles) away.
Some 280 landslides have taken place.
--------------------------------------------------
SIMILAR NEWS
--------------------------------------------------

Category: WORLD NEWS
Headline: Fiona Threatens To Become Category 4 Storm Headed To Bermuda
Description: Hurricane Fiona lashed the Turks and Caicos Islands and was forecast to squeeze past Bermuda later this week.

Category: WORLD NEWS
Headline: Fiona Barrels Toward Turks And Caicos Islands As Category 3 Hurricane
Description: The Turks and Caicos Islands government imposed a curfew as the intensifying storm kept dropping copious rain over the Dominican Republic and Puerto Rico.

Category: WORLD NEWS
Headline: Hurricane Fiona Bears

In [51]:
txt = '''
Thousands of Israeli nationalists have been marching into the Muslim quarter of Jerusalem's Old City, with violence directed at media covering the event.
The flag parade is part of Israel's Jerusalem Day, marking its capture of the east of the city in the 1967 war.
A group of marchers threw stones, sticks and bottles at Palestinian and foreign journalists at the Damascus Gate entrance.
'''
display_similar_news(txt, top_n=5)

Input: 
Thousands of Israeli nationalists have been marching into the Muslim quarter of Jerusalem's Old City, with violence directed at media covering the event.
The flag parade is part of Israel's Jerusalem Day, marking its capture of the east of the city in the 1967 war.
A group of marchers threw stones, sticks and bottles at Palestinian and foreign journalists at the Damascus Gate entrance.
--------------------------------------------------
SIMILAR NEWS
--------------------------------------------------

Category: U.S. NEWS
Headline: 9/11 Attacks Still Reverberate As U.S. Marks 21st Anniversary
Description: September 11th remains a point for reflection on the 2001 attacks that reconfigured national security policy and spurred a U.S. “war on terror” worldwide.

Category: ENTERTAINMENT
Headline: Muslims Only Make Up 1% Of Characters On TV, Study Finds
Description: In most of the 200 scripted shows that researchers analyzed, Muslim characters were typically violent, dehumanized or disp