# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import math
import time
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\venil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\venil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading Dataset

In [2]:
# Load the news articles; lines=True parses the file as one JSON record per line.
news = pd.read_json("latest_dataset.json", lines = True)

In [3]:
news.shape

(5518, 6)

In [4]:
# Give the rows consecutive integer labels 0..n-1 so each row's label equals its position.
news.index = range(news.shape[0])

In [5]:
news.index.shape[0]

5518

In [6]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5518 entries, 0 to 5517
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   link               5518 non-null   object        
 1   headline           5518 non-null   object        
 2   category           5518 non-null   object        
 3   short_description  5518 non-null   object        
 4   authors            5518 non-null   object        
 5   date               5518 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 258.8+ KB


# Data Preprocessing

In [7]:
# Preprocess a copy so the original headlines in `news` stay intact for display.
news_temp = news.copy()

### Removing Stopwords

In [8]:
# NLTK's English stop-word list, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

In [9]:
def _strip_stopwords(headline):
    """Return `headline` lower-cased, without punctuation or English stop words.

    Each whitespace-separated word is reduced to its alphanumeric characters
    and lower-cased. Words found in `stop_words`, and words with no
    alphanumeric characters left (e.g. a lone "-"), are dropped; the remaining
    words are joined with single spaces.
    """
    kept = []
    for raw_word in headline.split():
        word = "".join(ch for ch in raw_word if ch.isalnum()).lower()
        # Skip words that vanished during punctuation stripping so they do not
        # leave doubled spaces in the cleaned headline.
        if word and word not in stop_words:
            kept.append(word)
    return " ".join(kept)


# Transform the whole column at once rather than reading and writing one row
# at a time through integer labels, so this step does not depend on the
# DataFrame's index labels.
news_temp["headline"] = news_temp["headline"].apply(_strip_stopwords)

### Lemmatization

In [10]:
# WordNet lemmatizer instance used to lemmatize the headline tokens below.
lemmatizer = WordNetLemmatizer()

In [11]:
# Lemmatize every token of each cleaned headline as a verb (pos="v") and
# rebuild the headline from the resulting lemmas, separated by single spaces.
# NOTE(review): word_tokenize relies on NLTK's Punkt tokenizer data, which the
# import cell does not download (only 'stopwords' and 'wordnet') - confirm it
# is available on a fresh environment.
news_temp["headline"] = [
    " ".join(
        lemmatizer.lemmatize(token, pos="v") for token in word_tokenize(text)
    ).strip()
    for text in news_temp["headline"]
]

## TF-IDF Method for News Recommender System

In [12]:
# Fit a TF-IDF vocabulary on the preprocessed headlines and encode each one as
# a weighted term vector. min_df=0.0 is a document-frequency proportion of
# zero, so no term is discarded for being rare.
tfidf_headline_vectorizer = TfidfVectorizer(min_df=0.0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(
    news_temp["headline"]
)

In [13]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend the articles whose headlines are closest to a given article.

    Distances are computed between the TF-IDF vector of the query headline and
    every row of `tfidf_headline_features`, using the default metric of
    `pairwise_distances`. The query's original headline is printed, and the
    nearest other articles are returned.

    Parameters
    ----------
    row_index : int
        Position of the query article in `news` / `tfidf_headline_features`.
        Negative positions count from the end.
    num_similar_items : int
        Number of nearest rows to consider, counting the query itself, so the
        result holds at most ``num_similar_items - 1`` recommendations.

    Returns
    -------
    pandas.DataFrame
        Columns ``publish_date``, ``headline`` and ``link`` for the
        recommended articles, nearest first, indexed from 1.

    Raises
    ------
    IndexError
        If `row_index` is outside the range of available articles.
    """
    n_rows = tfidf_headline_features.shape[0]
    if not -n_rows <= row_index < n_rows:
        raise IndexError(
            f"row_index {row_index} is out of range for {n_rows} headlines"
        )
    query_pos = row_index % n_rows  # normalise negative positions

    couple_dist = pairwise_distances(
        tfidf_headline_features, tfidf_headline_features[query_pos]
    ).ravel()

    # Stable sort keeps tied distances in row order, so results are reproducible.
    ranked = np.argsort(couple_dist, kind="stable")
    # Exclude the query by position instead of assuming it sorts first: a
    # duplicate headline also has distance 0 and may be ranked ahead of it.
    ranked = ranked[ranked != query_pos]
    nearest = ranked[:max(num_similar_items - 1, 0)]

    print('headline : ', news['headline'].iloc[query_pos])

    # `nearest` holds row positions, so look the rows up positionally.
    recommended = news.iloc[nearest]
    df = pd.DataFrame(
        {'publish_date': recommended['date'].values,
         'headline': recommended['headline'].values,
         'link': recommended['link'].values},
        index=range(1, len(nearest) + 1),
    )
    return df
tfidf_based_model(5513, 11)

headline :  Trump Deploys More Troops To Middle East After U.S. Embassy Attack


Unnamed: 0,publish_date,headline,link
1,2020-01-01,U.S. Troops Fire Tear Gas At Protesters Outsid...,https://www.huffpost.com/entry/baghdad-embassy...
2,2021-12-14,Meadows’ ‘Protect Pro Trump People’ Email May ...,https://www.huffpost.com/entry/meadows-nationa...
3,2020-01-04,Rockets Fall Near U.S. Embassy In Baghdad,https://www.huffpost.com/entry/rockets-green-z...
4,2020-11-13,Trevor Noah Hits Donald Trump With A Taunting ...,https://www.huffpost.com/entry/trevor-noah-don...
5,2020-01-17,11 American Troops Injured In Iran Attack On I...,https://www.huffpost.com/entry/eleven-american...
6,2022-04-30,Ukraine Fights To Hold Off Russian Advances In...,https://www.huffpost.com/entry/russia-offensiv...
7,2022-02-22,U.S. Relocates Ukraine Embassy Staff To Poland...,https://www.huffpost.com/entry/ukraine-embassy...
8,2020-01-05,Iraqi Parliament Votes To Remove U.S. Troops,https://www.huffpost.com/entry/iraqi-parliamen...
9,2020-01-26,Multiple Rockets Crash Near U.S. Embassy In Ba...,https://www.huffpost.com/entry/rockets-us-emba...
10,2022-02-17,National Guard Deploys For New Emergency: Teac...,https://www.huffpost.com/entry/bc-us-virus-out...
