### 0. Import Library

In [1]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

### 1. Preprocess Text

In [3]:
stop_words = stopwords.words('english')
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

def preprocess_text(text):
  cleaned = re.sub(r'\W+', ' ', text).lower()
  tokenized = word_tokenize(cleaned)
  normalized = " ".join([normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized])
  return normalized

### 2. Loading the Data

In [4]:
news_df = pd.read_csv('Articles.csv',encoding='latin-1')
user_df = pd.read_csv('user_read_set1.csv',encoding='latin-1')

In [5]:
news_df['Date'] =  pd.to_datetime(news_df['Date'].str.strip(), format='%m/%d/%Y')
user_df['Date'] =  pd.to_datetime(user_df['Date'].str.strip(), format='%m/%d/%Y')

In [6]:
user_df

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,2015-01-01,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,2015-01-02,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,2015-01-05,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,2015-01-06,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,2015-01-06,us oil prices slip below 50 a barr,business
5,"LAHORE: Left arm fast bowler Mohammad Amir, wh...",2016-01-01,Amir returns to Pakistan squad after 5 year b,sports
6,SYDNEY: Australia look set to field two specia...,2016-01-01,Australia set to test twin spin attack against...,sports
7,SYDNEY: After picking up pretty much every ind...,2016-01-02,Australia skipper Smith plots return to test s...,sports
8,WELLINGTON: New Zealand captain Brendon McCull...,2016-01-03,McCullum says Amir should get benefit of doub,sports
9,CAPE TOWN: Ben Stokes and Jonny Bairstow tore ...,2016-01-03,England declare on 629 6 on record breaking day,sports


In [7]:
user_df = user_df[user_df['Date'] >= pd.Timestamp(2016,1,1)] #get user recent news

In [8]:
user_df

Unnamed: 0,Article,Date,Heading,NewsType
5,"LAHORE: Left arm fast bowler Mohammad Amir, wh...",2016-01-01,Amir returns to Pakistan squad after 5 year b,sports
6,SYDNEY: Australia look set to field two specia...,2016-01-01,Australia set to test twin spin attack against...,sports
7,SYDNEY: After picking up pretty much every ind...,2016-01-02,Australia skipper Smith plots return to test s...,sports
8,WELLINGTON: New Zealand captain Brendon McCull...,2016-01-03,McCullum says Amir should get benefit of doub,sports
9,CAPE TOWN: Ben Stokes and Jonny Bairstow tore ...,2016-01-03,England declare on 629 6 on record breaking day,sports
10,CAPE TOWN: Ben Stokes scored the second fastes...,2016-01-03,Stokes batters South Africa in blistering double,sports
11,ISLAMABAD: As Senate Chairman Raza Rabbani rul...,2016-11-23,Most loans written off in Musharrafs er,business
12,SINGAPORE: Oil prices edged up on Wednesday in...,2016-11-23,Oil prices edge up on anticipation of OPEC led...,business
13,TOKYO: Pakistan LNG Ltd has received strong in...,2016-11-24,New LNG buyer Pakistan sees strong interest in...,business
14,ISLAMABAD: Pakistan Electronic Media Regulator...,2016-11-24,Pakistans first DTH licences auctioned for Rs ...,business


In [9]:
news_df.sort_values('Heading',inplace=True, ascending=False)
duplicated_articles_series = news_df.duplicated('Heading', keep = False)
news_df = news_df[~duplicated_articles_series]
print("Total number of articles after removing duplicates:", news_df.shape[0])

Total number of articles after removing duplicates: 2516


In [10]:
news_df = news_df[news_df['Date'] >= pd.Timestamp(2016,1,1)] #get recent news

In [11]:
news_df

Unnamed: 0,Article,Date,Heading,NewsType
731,KARACHI/ISLAMABAD: Pakistani stocks are soarin...,2016-06-20,rising economy Pakistan hampered image prob,business
1200,SHARJAH: England all-rounder Ravi Boparas rem...,2016-02-11,psl Zalmi win nailbiter against Kings by 3 ru,sports
637,DUBLIN: Finance ministers from the world's lar...,2016-05-16,problems G7 finance ministers world,business
691,NEW YORK: A major conference aimed at building...,2016-06-01,moot forge closer Pak US businesses bonds begi...,business
686,NEW DELHI/MUMBAI: Anil Ambani's Reliance Group...,2016-05-30,missiles subs Anil Ambani bets big def,business
...,...,...,...,...
951,ISLAMABAD: The mobile phone companies have add...,2016-08-25,3G 4G users touch 32 mln mark Paki,business
2455,ISLAMABAD: Countrys fourth nuclear power plan...,2016-10-16,340MW nuclear power plant starts operati,business
1354,LONDON: FIFA must make the 2026 World Cup bidd...,2016-03-05,2026 World Cup bid must be bullet proof says I...,sports
1727,PARIS: While hosts France will be bidding to r...,2016-05-11,19 nations to compete in Womens World Team Squ...,sports


In [12]:
news_df_temp = news_df.Heading.apply(preprocess_text)

In [13]:
user_df_temp = news_df.Heading.apply(preprocess_text)

### 3. Tf-idf model for news headline and category

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.metrics import pairwise_distances

In [15]:
category_onehot_encoded = OneHotEncoder().fit_transform(np.array(news_df["NewsType"]).reshape(-1,1))
category_user_onehot_encoded = OneHotEncoder().fit_transform(np.array(user_df["NewsType"]).reshape(-1,1))

In [16]:
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 0)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_df_temp)

In [17]:
tfidf_user_headline_features = tfidf_headline_vectorizer.transform(user_df_temp)

In [18]:
def tfidf_based_model():
    for i in range (user_df.shape[0]):
        couple_dist = cosine_distances(tfidf_headline_features,tfidf_user_headline_features[i])
        category_dist = cosine_similarity(category_onehot_encoded, category_user_onehot_encoded[i])
        indices = np.argsort(couple_dist.ravel())[0:user_df.shape[0]]
        df = pd.DataFrame({
               'headline':news_df['Heading'][indices].values,
                'Cosine Distance with the queried article': couple_dist[indices].ravel(),
                'Category based Cosine Distance': category_dist[indices].ravel(), 
                'Category': news_df['NewsType'][indices].values,
                'Date': news_df['Date'][indices].values
        }).sort_values("Date",ascending=False).dropna()
        
        return df.iloc[1:,].head(10).style.hide_index()

In [19]:
tfidf_based_model()



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike



headline,Cosine Distance with the queried article,Category based Cosine Distance,Category,Date
Sweden miss rested Ibrahimovic in 0 0 draw with Sloveni,0.833209,1,sports,2016-05-31 00:00:00
NBP ICBC signed MoU bilateral cooperation Pakistan Chi,0.846955,0,business,2016-05-25 00:00:00
Abu Dhabi lays off staff Gulf austerity tig,0.812897,0,business,2016-05-22 00:00:00
Pakistan shares higher rupee stronger,0.828564,0,business,2016-05-17 00:00:00
Apple invests 1 bln in Chinese ride hailing service Didi Chuxing,0.822595,0,business,2016-05-13 00:00:00
K Electric wins Safety Award,0.82256,1,business,2016-05-05 00:00:00
Were not just drummers in the Windies band says Si,0.848381,1,sports,2016-04-01 00:00:00
England win World T20 warm up against Mumbai CA,0.85789,0,sports,2016-03-14 00:00:00
Warner Marsh tons lift Australia to 330 7,0.802382,1,sports,2016-01-23 00:00:00


### KNN model for news headline

In [20]:
from sklearn.neighbors import NearestNeighbors

In [21]:
n_neighbors = 20
KNN = NearestNeighbors(n_neighbors, p=2, metric='cosine')
KNN.fit(tfidf_headline_features)
NNs = KNN.kneighbors(tfidf_user_headline_features,return_distance=True)

In [22]:
def get_recommendation(top,news_df,scores):
    recommendation = pd.DataFrame(columns=['Heading','NewsType','Date','score'])
    count = 0
    for i in top:
        try:
            recommendation.at[count, 'Heading'] = news_df['Heading'][i]
            recommendation.at[count, 'NewsType'] = news_df['NewsType'][i]
            recommendation.at[count, 'Date'] = news_df['Date'][i]
            recommendation.at[count, 'score'] = scores[count]
            count += 1
        except KeyError:
            continue
    return recommendation 

In [23]:
top = NNs[1][0][1:]
index_scores = NNs[0][0][1:]

get_recommendation(top,news_df,index_scores)

Unnamed: 0,Heading,NewsType,Date,score
0,Warner Marsh tons lift Australia to 330 7,sports,2016-01-23 00:00:00,0.787202
1,Abu Dhabi lays off staff Gulf austerity tig,business,2016-05-22 00:00:00,0.802382
2,K Electric wins Safety Award,business,2016-05-05 00:00:00,0.80673
3,Apple invests 1 bln in Chinese ride hailing se...,business,2016-05-13 00:00:00,0.812897
4,Pakistan shares higher rupee stronger,business,2016-05-17 00:00:00,0.82256
5,Sweden miss rested Ibrahimovic in 0 0 draw wit...,sports,2016-05-31 00:00:00,0.822595
6,NBP ICBC signed MoU bilateral cooperation Paki...,business,2016-05-25 00:00:00,0.828564
7,Were not just drummers in the Windies band say...,sports,2016-04-01 00:00:00,0.833209
8,England win World T20 warm up against Mumbai CA,sports,2016-03-14 00:00:00,0.845733
9,Pakistan importing RON 92 standard petrol October,business,2016-07-15 00:00:00,0.846247
