In [41]:
import pandas as pd
import string
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import os

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = str(text).lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    return ' '.join([word for word in text.split() if word not in stop_words])

df = pd.read_csv('./data/news_crawler.csv')

df['title'] = df['title'].fillna('')
df['content'] = df['content'].fillna('')
df['text'] = df['content']
df['text'] = df['text'].apply(preprocess)

os.makedirs('output_model', exist_ok=True)

# Save raw text to re-use in recommendations
df[['title', 'text']].to_csv('output_model/news_texts.csv', index=False)

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Save vectorizer and matrix
joblib.dump(vectorizer, 'output_model/tfidf_vectorizer.pkl')
joblib.dump(tfidf_matrix, 'output_model/tfidf_matrix.pkl')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['output_model/tfidf_matrix.pkl']

In [42]:
new_news = pd.read_csv('./output_model/news_texts.csv')
new_news.head(2)

Unnamed: 0,title,text
0,News News Israel-Gaza war,benjamin netanyahu says hamas seeking changes ...
1,News News War in Ukraine,fire 100 drones night theres little talk cease...


In [43]:
from sklearn.metrics.pairwise import cosine_similarity
news_df = pd.read_csv('./output_model/news_texts.csv')


def get_similar_news(input_text, top_n=3):
    cleaned = preprocess(input_text)
    vector = vectorizer.transform([cleaned])
    sim_scores = cosine_similarity(vector, tfidf_matrix).flatten()
   # print(sim_scores)

    top_indices = sim_scores.argsort()[::-1][:top_n]
    print(top_indices)
    similar_news = news_df.iloc[top_indices]
    
    return similar_news[['title']].to_dict(orient='records')

In [45]:
text = """ A second-year student of the Indian Institute of Management-Calcutta has been arrested after a woman alleged she was raped inside the hostel at the premier institute.

The police have charged the accused under various sections of the Bharatiya Nyaya Sanhita, including rape, and seized his clothes and phone for forensic examination. Four others have also been detained and are being questioned.

The woman has alleged that Parmanand Toppaunwar, who hails from Karnataka, called her to the campus in Joka on Friday. She told police that she is a psychologist and the accused her client, and he had asked her to come to the campus for a counselling session. The two had met on social media, she wrote in her complaint
"""

print(get_similar_news(text))

[ 518 2756 1659]
[{'title': 'Man arrested after teenager raped in public toilet'}, {'title': 'Two arrested after school girls in India allegedly made to strip for period check'}, {'title': 'USC campus intersections are dangerous, data shows. Then they became deadly'}]
