### 0. Import Library

In [1]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

### 1. Preprocess Text

In [3]:
stop_words = stopwords.words('english')
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

def preprocess_text(text):
  cleaned = re.sub(r'\W+', ' ', text).lower()
  tokenized = word_tokenize(cleaned)
  normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized])
  return normalized

### 2. Loading the Data

In [4]:
news_df = pd.read_csv('news_set1.csv',encoding='latin-1')
user_df = pd.read_csv('user_read_set1.csv',encoding='latin-1')

In [5]:
news_df['Date'] =  pd.to_datetime(news_df['Date'].str.strip(), format='%m/%d/%Y')
user_df['Date'] =  pd.to_datetime(user_df['Date'].str.strip(), format='%m/%d/%Y')

In [7]:
news_df_temp = news_df.Heading.apply(preprocess_text)
news_df['Article'] = news_df.Article.apply(preprocess_text)

In [8]:
user_df_temp = news_df.Heading.apply(preprocess_text)
user_df['Article'] = news_df.Article.apply(preprocess_text)

### 3. Tf-idf model for news headline and category

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.metrics import pairwise_distances

In [15]:
category_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_df["NewsType"]).reshape(-1,1))

In [16]:
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_df_temp)

In [21]:
def tfidf_based_model(row_index, num_similar_items):
    couple_dist = cosine_distances(tfidf_headline_features,tfidf_headline_features[row_index])
    category_dist = cosine_similarity(category_onehot_encoded, category_onehot_encoded[row_index])
    indices = np.argsort(couple_dist.ravel())[0:num_similar_items]
    df = pd.DataFrame({
               'headline':news_df['Heading'][indices].values,
                'Cosine Distance with the queried article': couple_dist[indices].ravel(),
                'Category based Cosine Distance': category_dist[indices].ravel(), 
                'Categoty': news_df['NewsType'][indices].values,
                'Date': news_df['Date'][indices].values}).sort_values('Category',ascending=False)

    print("="*30,"Queried article details","="*30)
    print('headline : ',user_df['Heading'][indices[6]])
    print('Category : ', user_df['NewsType'][indices[6]])
    print('Day and month : ', user_df['Date'][indices[6]])
    print("\n","="*25,"Recommended articles : ","="*23)
    
    #return df.iloc[1:,1]
    return df.iloc[1:,].sort_values('Date',ascending=False)
tfidf_based_model(56,11)

headline :  Stokes batters South Africa in blistering double 
Categoty :  sports
Day and month :  2016-01-03 00:00:00



Unnamed: 0,headline,Cosine Distance with the queried article,Category based Cosine Distance,Categoty,Date
5,Kiwis beat Pakistan to bag series 2 0,0.933562,1.0,sports,2016-01-31
9,Watson to lead Australia for final India T20,0.944474,1.0,sports,2016-01-30
10,NZ Cricket apologises to Amir for a tau,0.944571,1.0,sports,2016-01-27
2,Federer through to semis with efficient win ov...,0.826507,1.0,sports,2016-01-26
7,Serena beats Sharapova to reach semi fi,0.941752,1.0,sports,2016-01-26
4,Rabada at the double as South Africa scent vic...,0.887556,1.0,sports,2016-01-25
8,Keys falls to Zhang Shuai at Australian O,0.944163,1.0,sports,2016-01-25
1,Rabada grabs six as England falter,0.732259,1.0,sports,2016-01-24
3,Hales falls cheaply in Englands reply,0.871302,1.0,sports,2016-01-23
6,pol prices expected to rise in november,0.940285,0.0,business,2015-10-19
