In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import spacy
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import silhouette_score
import cloudpickle
from sklearn.decomposition import PCA, TruncatedSVD
from numpy.linalg import norm
import warnings
warnings.filterwarnings("ignore")

In [2]:
with open(os.path.join("..", "models", "preprocessor.bin"), "rb") as file:
    preprocessor = cloudpickle.load(file)

with open(os.path.join("..", "models", "vectorizer.bin"), "rb") as file:
    vectorizer = cloudpickle.load(file)

with open(os.path.join("..", "models", "pca.bin"), "rb") as file:
    pca = cloudpickle.load(file)

with open(os.path.join("..", "models", "clustering.bin"), "rb") as file:
    clustering = cloudpickle.load(file)
    
pca_text = pd.read_parquet(os.path.join("..", "data", "pca_text.parquet"))
text_data = pd.read_parquet(os.path.join("..", "data", "text_data.parquet"))

In [3]:
def cosine_similarity(a, b):
    cos_sim = np.dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [4]:
def inference(text: str):
    text = pd.Series(text)
    text = preprocessor.preprocess(text, dataset='test')
    text = vectorizer.vectorize(text, dataset='test')
    text = pca.reduce_dimensions(pd.DataFrame(text), dataset='test')
    cluster = clustering.predict(pd.DataFrame(text))
    
    match_idx = text_data.loc[text_data['cluster'] == cluster[0]].index
    # match_idx = range(len(pca_text))
    
    match_pca_txt = pca_text.loc[match_idx, :].reset_index(drop=True).copy()
    match_txt = text_data.loc[match_idx, :].reset_index(drop=True).copy()
    similarities = []
    for i in range(len(match_pca_txt)):
        similarities.append(cosine_similarity(match_pca_txt.iloc[i, :].values, text.ravel()))
    
    match_txt['similarity_score'] = similarities
    match_txt.sort_values(by='similarity_score', ascending=False)
    
    return match_txt.iloc[:10, :]

In [5]:
txt = '''
Coronavirus cases in India: India witnessed a single-day rise of 906 new COVID-19 cases as the active cases declined to 10,179, according to Union Health Ministry data updated on Thursday. The toll has risen to 5,31,814 (5.31 lakh) with 20 deaths, including seven reconciled by Kerala, the data updated at 8 am stated. The daily positivity rate has been pegged at 0.70 per cent and the weekly rate at 0.90 per cent.

The total tally of Covid cases was recorded at 4.49 crore (4,49,84,058). The active cases now comprise 0.02 per cent of the total infections while the national recovery rate has been recorded at 98.79 per cent, according to the website. The number of people who have recuperated rose to 4.44 crore (4,44,42,065) while the case fatality rate was recorded at 1.18 per cent.
'''

similar_news = inference(txt)
(similar_news.loc[:, 'headline'] + " " + similar_news.loc[:, 'short_description']).tolist()

['Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.',
 'Emmy Awards Viewership Dips To A Record-Low As Its Audience Continues To Drop The television awards ceremony lost roughly 1.5 million viewers compared to its 2021 program.',
 'U.S. Inflation Falls For 2nd Straight Month On Lower Gas Costs On a monthly basis, prices rose 0.1%, after a flat reading in July.',
 "Thousands Of Minnesota Nurses Launch 3-day Strike Over Pay They're pressing for salary increases they say will help improve patient care by resolving understaffing stresses that have worsened in the pandemic.",
 "The Unemployment Insurance System Is Not Ready For The Next Recession States are cutting back, and Congress is unlikely to step in even after the pandemic exposed the unemployment system's shortcomings.",
 'U.S. Hiring Slowed In August As