In [1]:
import numpy as np
import pandas as pd

import os
import math
import time
import json
import pickle


import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
import scipy.sparse as sp



# Below libraries are for text processing using NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

#install  the Kaggle library
! pip install Kaggle

#connect between Drive and Kaggle
from google.colab import drive
drive.mount('/content/drive')
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
!cp /content/drive/MyDrive/ColabNotebooks/kaggle.json ~/.kaggle/kaggle.json


#Change the file permissions to read/write to the owner only
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d rmisra/news-category-dataset
!kaggle datasets download -d umbertogriffo/googles-trained-word2vec-model-in-python
!unzip  news-category-dataset.zip

Mounted at /content/drive
cp: cannot stat 'kaggle.json': No such file or directory
Downloading news-category-dataset.zip to /content
 38% 10.0M/26.5M [00:00<00:00, 36.2MB/s]
100% 26.5M/26.5M [00:00<00:00, 78.8MB/s]
Downloading googles-trained-word2vec-model-in-python.zip to /content
100% 3.17G/3.17G [00:36<00:00, 87.6MB/s]
100% 3.17G/3.17G [00:36<00:00, 92.5MB/s]
Archive:  news-category-dataset.zip
  inflating: News_Category_Dataset_v3.json  


In [2]:
# Load the dataset: the file is JSON Lines (one JSON record per line)
news_articles = pd.read_json(
    path_or_buf="/content/News_Category_Dataset_v3.json",
    lines=True,
)
news_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [3]:
# Preview the first rows of the freshly loaded frame
news_articles.head()


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
#1-Data Preprocessing:
#<Section A> Keep only the articles published on or after 1 January 2018:
cutoff_date = pd.Timestamp(2018,1,1)
published_since_cutoff = news_articles['date'] >= cutoff_date
news_articles = news_articles[published_since_cutoff]
news_articles.shape

(17257, 6)

In [5]:
# <Section B> Keep only headlines that contain more than five whitespace-separated words:
headline_word_counts = news_articles['headline'].apply(lambda text: len(text.split()))
news_articles = news_articles[headline_word_counts > 5]
print("Total number of articles after removal of headlines with short title:", news_articles.shape[0])

Total number of articles after removal of headlines with short title: 17183


In [6]:
#<Section C> Checking and removing all the duplicates:
# Rebind instead of sorting in place: news_articles is a filtered slice at this
# point, and in-place mutation of a slice triggers SettingWithCopy warnings.
news_articles = news_articles.sort_values('headline', ascending=False)
# keep=False flags every member of a duplicated group, so no copy of a repeated
# headline survives (not even the first occurrence).
duplicated_articles_series = news_articles.duplicated('headline', keep = False)
# .copy() gives later cells an independent frame they can safely add columns to.
news_articles = news_articles[~duplicated_articles_series].copy()
print("Total number of articles after removing duplicates:", news_articles.shape[0])

Total number of articles after removing duplicates: 17113


In [7]:
#<Section D> Checking for missing values: count the NaN entries in each column
news_articles.isna().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [8]:
#2.Basic Data Exploration
#<Section A> Basic statistics: number of articles, distinct authors and distinct categories
article_count = news_articles.shape[0]
author_count = news_articles["authors"].nunique()
category_count = news_articles["category"].nunique()
print("Total number of articles : ", article_count)
print("Total number of authors : ", author_count)
print("Total number of unqiue categories : ", category_count)

Total number of articles :  17113
Total number of authors :  2256
Total number of unqiue categories :  36


In [9]:
#<Section B> Distribution of articles category-wise:
# Count once and reuse the result for both axes of the bar chart.
category_counts = news_articles["category"].value_counts()
fig = go.Figure([go.Bar(x=category_counts.index, y=category_counts.values)])
fig.update_layout(
    title={"text" : 'Distribution of articles category-wise','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'},
    xaxis_title="Category name",
    yaxis_title="Number of articles",
)
fig.update_layout(width=800,height=700)
fig

In [10]:
#<Section C> Number of articles per month:
# Index the headlines by publication date, then count them in month-end bins.
headlines_by_date = news_articles.set_index('date')['headline']
news_articles_per_month = headlines_by_date.resample('m').count()
news_articles_per_month

date
2018-01-31    2065
2018-02-28    1694
2018-03-31    1778
2018-04-30    1580
2018-05-31    1391
2018-06-30     136
2018-07-31     155
2018-08-31     123
2018-09-30     156
2018-10-31     182
2018-11-30     175
2018-12-31     180
2019-01-31     181
2019-02-28     168
2019-03-31     179
2019-04-30     147
2019-05-31     152
2019-06-30     153
2019-07-31     157
2019-08-31     161
2019-09-30     163
2019-10-31     183
2019-11-30     174
2019-12-31     181
2020-01-31     154
2020-02-29     132
2020-03-31     161
2020-04-30     170
2020-05-31     179
2020-06-30     176
2020-07-31     185
2020-08-31     180
2020-09-30     172
2020-10-31     175
2020-11-30     178
2020-12-31     181
2021-01-31     183
2021-02-28     140
2021-03-31     164
2021-04-30     176
2021-05-31     184
2021-06-30     176
2021-07-31     180
2021-08-31     175
2021-09-30     168
2021-10-31     174
2021-11-30     170
2021-12-31     170
2022-01-31     149
2022-02-28     152
2022-03-31     174
2022-04-30     162
2022-05

In [11]:
# Bar chart of the monthly counts. The x axis uses the abbreviated month name
# only, so the same month of different years shares one label.
month_labels = news_articles_per_month.index.strftime("%b")
fig = go.Figure([go.Bar(x=month_labels, y=news_articles_per_month)])
fig.update_layout(
    title={"text" : 'Distribution of articles month-wise','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'},
    xaxis_title="Month",
    yaxis_title="Number of articles",
)
fig.update_layout(width=500,height=500)
fig

In [12]:
#<Section D> PDF for the length of headlines:
# Headline length is measured in characters.
headline_lengths = news_articles['headline'].str.len()
fig = ff.create_distplot([headline_lengths], ["ht"], show_hist=False, show_rug=False)
fig.update_layout(
    title={'text':'PDF','y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'},
    xaxis_title="Length of a headline",
    yaxis_title="probability",
)
fig.update_layout(showlegend = False,width=500,height=500)
fig

In [13]:
# Renumber the rows 0..n-1 so that row labels coincide with row positions.
news_articles.index = range(len(news_articles))
# New column combining weekday and month (e.g. "Wed_Feb"); it is used later when
# recommending on the publishing day and month.
news_articles["day and month"] = news_articles["date"].dt.strftime("%a_%b")

In [14]:
# Mount Google Drive. The first cell of the notebook already performs this
# import and mount, so on a top-to-bottom run this cell only reports that the
# drive is already mounted.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:

#2. Text Preprocessing:
#<Section A> Stopwords removal:
       # install NLTK library
!pip install nltk
import nltk
nltk.download('brown') # Download brown corpus
nltk.download('punkt') # Download punkt corpus
nltk.download('stopwords')  # Download stopwords corpus
nltk.download('wordnet')
from nltk.corpus import stopwords





[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [16]:
# Create a set of stopwords
stop_words = set(stopwords.words('english'))


def _strip_stopwords(headline, stop_words):
    """Return `headline` lower-cased, without punctuation and without stopwords.

    Each whitespace-separated word is reduced to its alphanumeric characters
    and lower-cased; words found in `stop_words` are dropped and the survivors
    are joined with single spaces.

    NOTE(review): the stopword test runs after punctuation is stripped, so a
    contraction such as "isn't" is compared as "isnt" and is kept (see "isnt"
    in the recommendation outputs further down) -- confirm this is intended.
    """
    kept_words = []
    for raw_word in headline.split():
        word = "".join(ch for ch in raw_word if ch.isalnum()).lower()
        # A punctuation-only token (e.g. "-" or "&") collapses to "". Skipping it
        # avoids the double spaces it used to leave inside the cleaned headline.
        if word and word not in stop_words:
            kept_words.append(word)
    return " ".join(kept_words)


# Process the headlines
for i in range(len(news_articles["headline"])):
    cleaned_headline = _strip_stopwords(news_articles["headline"][i], stop_words)
    if(i%1000==0):
      print(i)           # To track number of records processed
    news_articles.at[i, "headline"] = cleaned_headline


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


In [17]:
#<Section B> Lemmatization:
# Every token of every headline is lemmatized as a verb (pos="v") and the
# lemmas are joined back with single spaces.
lemmatizer = WordNetLemmatizer()
headline_total = len(news_articles["headline"])
for row in range(headline_total):
    tokens = word_tokenize(news_articles["headline"][row])
    lemmas = [lemmatizer.lemmatize(token, pos = "v") for token in tokens]
    news_articles.at[row, "headline"] = " ".join(lemmas).strip()
    if row % 1000 == 0:
        print(row)  # To track number of records processed

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


In [18]:
#6.a Using Bag of Words method
# Build the vocabulary from the cleaned headlines and count each term per headline.
headline_vectorizer = CountVectorizer()
headline_features = headline_vectorizer.fit_transform(news_articles['headline'])
# Output: (number of headlines, number of unique words in them)
headline_features.shape

(17113, 16481)

In [19]:
# None removes the column-width limit so long headlines display completely
# (the former value -1 is deprecated, as the warning in the output states).
pd.set_option('display.max_colwidth', None)
def bag_of_words_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to a queried one in bag-of-words space.

    Args:
        row_index: Position of the queried article in `news_articles` and
            `headline_features`.
        num_similar_items: Size of the neighbourhood including the queried
            article itself, so `num_similar_items - 1` recommendations are
            returned.

    Returns:
        DataFrame labelled from 1, nearest first, with the publish date, the
        headline and the distance to the query. The distance column keeps its
        historical "similarity" label; a smaller value means a closer headline.
    """
    couple_dist = pairwise_distances(headline_features,headline_features[row_index]).ravel()
    query_pos = row_index % couple_dist.size  # normalise a negative row_index
    ranked = np.argsort(couple_dist)
    # Drop the query explicitly rather than assuming it is ranked first: another
    # headline at distance 0 (identical after preprocessing) can tie with it.
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # .iloc: `indices` are row positions, independent of the frame's labels.
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices]},
                index=pd.RangeIndex(1, len(indices) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df
bag_of_words_based_model(155,15)


headline :  youtube suspend ads logan paul videos another scandal




Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2019-06-07,year week youtube,3.0
2,2020-04-13,pandemic war,3.162278
3,2018-03-22,youtube quietly escalate crackdown firearm videos,3.162278
4,2018-02-21,call deportees,3.162278
5,2018-03-13,another gun go school,3.162278
6,2022-07-27,take cat heat,3.316625
7,2022-09-01,freezedry breast milk,3.316625
8,2018-01-13,republicans moral compass,3.316625
9,2022-01-30,7 happiness hack commute,3.316625
10,2018-03-16,fog war america,3.316625


In [20]:
#6.b Using TF-IDF method
# min_df=1 keeps every term that occurs in at least one headline, i.e. no term
# is filtered out -- the same effect the former integer 0 had. The integer 0 is
# rejected by the parameter validation of newer scikit-learn releases, which
# accept an int >= 1 or a float in [0, 1].
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_headline_features = tfidf_headline_vectorizer.fit_transform(news_articles['headline'])

In [21]:
def tfidf_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to a queried one in TF-IDF space.

    Args:
        row_index: Position of the queried article in `news_articles` and
            `tfidf_headline_features`.
        num_similar_items: Size of the neighbourhood including the queried
            article itself, so `num_similar_items - 1` recommendations are
            returned.

    Returns:
        DataFrame labelled from 1, nearest first, with the publish date, the
        headline and the distance to the query. The distance column keeps its
        historical "similarity" label; a smaller value means a closer headline.
    """
    couple_dist = pairwise_distances(tfidf_headline_features,tfidf_headline_features[row_index]).ravel()
    query_pos = row_index % couple_dist.size  # normalise a negative row_index
    ranked = np.argsort(couple_dist)
    # Drop the query explicitly rather than assuming it is ranked first: another
    # headline at distance 0 (identical after preprocessing) can tie with it.
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # .iloc: `indices` are row positions, independent of the frame's labels.
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices]},
                index=pd.RangeIndex(1, len(indices) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)


    return df
tfidf_based_model(155, 15)

headline :  youtube suspend ads logan paul videos another scandal



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2018-01-05,logan paul dead body video spur thousands petition get youtube,1.121326
2,2018-01-02,youtube star logan paul spark outrage disturb dead body video,1.123215
3,2018-01-11,youtube finally punish logan paul wildly insensitive suicide video,1.142127
4,2018-04-26,woman deliver baby alone hotel use youtube videos,1.19217
5,2018-03-22,youtube quietly escalate crackdown firearm videos,1.199217
6,2018-02-14,russia threaten youtube instagram videos kremlin official oligarch,1.206333
7,2021-06-12,youtube suspend gop sen ron johnson covid19 misinformation,1.211939
8,2018-01-04,logan pauls dead body video explain chain reaction economy youtube,1.216592
9,2018-01-02,6 ways logan paul could actually raise suicide awareness,1.219316
10,2018-01-10,youtubes open letter logan paul isnt open,1.21962


In [22]:
#6.c Using Word2Vec embedding
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
!pip install gensim
! unzip googles-trained-word2vec-model-in-python


Archive:  googles-trained-word2vec-model-in-python.zip
  inflating: GoogleNews-vectors-negative300.bin  
  inflating: GoogleNews-vectors-negative300.bin.gz  


In [23]:
# Loading the GoogleNews pretrained model
model_path = '/content/GoogleNews-vectors-negative300.bin'
loaded_model = KeyedVectors.load_word2vec_format(model_path, binary=True)  # Load the pretrained Word2Vec vectors
# The new model will hold 300-dimensional word embeddings, and only words that
# occur at least twice will be considered during training.
# NOTE(review): this untrained `model` is not referenced by any later cell shown
# here; the recommenders read `loaded_model` -- confirm it is still needed.
model = Word2Vec(vector_size=300, min_count=2)

In [24]:
# Inspect the pretrained vector stored for the word "porter"
loaded_model['porter']

array([-0.0390625 ,  0.05883789, -0.25195312,  0.25390625,  0.24316406,
        0.15234375,  0.04248047, -0.08349609,  0.02929688,  0.05419922,
       -0.0168457 , -0.25      ,  0.28515625, -0.13085938,  0.046875  ,
        0.00157166, -0.09570312, -0.05541992,  0.3671875 , -0.25390625,
       -0.31054688, -0.0559082 ,  0.20507812,  0.36914062,  0.08984375,
        0.03491211, -0.17382812,  0.13671875,  0.21777344, -0.20410156,
        0.140625  , -0.19335938,  0.08349609, -0.10644531, -0.12988281,
       -0.06225586,  0.08007812,  0.18359375, -0.16699219, -0.03564453,
       -0.23828125, -0.23242188,  0.26367188,  0.07958984,  0.17382812,
       -0.234375  , -0.49023438, -0.07568359, -0.14550781, -0.03881836,
        0.04443359,  0.04199219,  0.10595703, -0.20117188,  0.0145874 ,
       -0.4609375 ,  0.13476562,  0.14550781, -0.25390625, -0.3359375 ,
       -0.078125  ,  0.10791016, -0.25195312, -0.02783203,  0.29492188,
        0.140625  , -0.11572266, -0.38085938,  0.04760742,  0.18

In [37]:
# Dictionary that maps each word of the model to its index. Membership tests on
# a dict are hash lookups, whereas `word in vocabulary` scans the whole word
# list for every headline word.
word_to_index = loaded_model.key_to_index

# Get the list of words (keys) in the model (kept for reference)
vocabulary =loaded_model.index_to_key

# Represent each headline by the sum of the vectors of its in-vocabulary words
# divided by the total number of words in the headline.
w2v_headline = []
for headline in news_articles['headline']:
    headline_words = headline.split()
    w2Vec_word = np.zeros(300, dtype="float32")
    for word in headline_words:
        if word in word_to_index:
            w2Vec_word = np.add(w2Vec_word, loaded_model[word])
    # A headline left without any word keeps the zero vector: dividing by a
    # zero word count would produce NaNs and break the distance computations.
    if headline_words:
        w2Vec_word = np.divide(w2Vec_word, len(headline_words))
    w2v_headline.append(w2Vec_word)
w2v_headline = np.array(w2v_headline)



In [38]:
def avg_w2v_based_model(row_index, num_similar_items):
    """Recommend the headlines closest to a queried one in averaged-Word2Vec space.

    Args:
        row_index: Position of the queried article in `news_articles` and
            `w2v_headline`.
        num_similar_items: Size of the neighbourhood including the queried
            article itself, so `num_similar_items - 1` recommendations are
            returned.

    Returns:
        DataFrame labelled from 1, nearest first, with the publish date, the
        headline and the distance to the query. The distance column keeps its
        historical "similarity" label; a smaller value means a closer headline.
    """
    couple_dist = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1)).ravel()
    query_pos = row_index % couple_dist.size  # normalise a negative row_index
    ranked = np.argsort(couple_dist)
    # Drop the query explicitly rather than assuming it is ranked first: another
    # headline with an identical vector sits at distance 0 and can tie with it.
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # .iloc: `indices` are row positions, independent of the frame's labels.
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Euclidean similarity with the queried article': couple_dist[indices]},
                index=pd.RangeIndex(1, len(indices) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df

avg_w2v_based_model(133, 11)

headline :  yovanovitch respond trump twitter attack real time dramatic impeachment testimony



Unnamed: 0,publish_date,headline,Euclidean similarity with the queried article
1,2019-10-09,late night host find funny latest donald trump impeachment inquiry twist,0.922826
2,2018-03-18,trump attack mccabe comey mueller probe sunday tweet barrage,0.933982
3,2018-01-10,seth meyers best response trump reportedly start day 11,0.937962
4,2019-10-24,colbert hit trump selfcongratulatory speech brutal factcheck,0.942128
5,2019-10-30,colbert nail gops big problem try discredit new trump witness,0.948317
6,2018-09-28,trump kavanaugh con job senate must vote credible witness 48 hours,0.950672
7,2018-10-17,ben sasse react trump horseface comment thats way men act,0.951635
8,2021-10-15,kayleigh mcenany attack jen psaki least selfaware tweet ever write,0.954004
9,2020-09-19,expence aide trump derail covid19 meet 45 minutes fox news rant,0.957979
10,2018-05-16,pauley perrette respond cbs statement multiple assault claim,0.958562


In [39]:
#6.d Weighted similarity based on headline and category
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the category of every article (one single-column row per article).
category_column = news_articles["category"].to_numpy().reshape(-1, 1)
category_onehot_encoded = OneHotEncoder().fit_transform(category_column)

In [40]:
def avg_w2v_with_category(row_index, num_similar_items, w1,w2):
    """Recommend headlines by a weighted mix of Word2Vec and category distance.

    Args:
        row_index: Position of the queried article in `news_articles`.
        num_similar_items: Size of the neighbourhood including the queried
            article itself, so `num_similar_items - 1` recommendations are
            returned.
        w1: Weight of the averaged-Word2Vec headline distance.
        w2: Weight of the one-hot category distance.

    Returns:
        DataFrame labelled from 1, nearest first, with the weighted distance,
        its components and the category of each recommendation. The distance
        columns keep their historical "similarity" labels; smaller is closer.

    Raises:
        ValueError: If the weights sum to zero (the weighted average is undefined).
    """
    total_weight = float(w1 + w2)
    if total_weight == 0:
        raise ValueError("w1 + w2 must be non-zero")
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1)).ravel()
    # The +1 shifts every categorical distance by the same constant, so it does
    # not alter the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]).ravel() + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist)/total_weight
    query_pos = row_index % weighted_couple_dist.size  # normalise a negative row_index
    ranked = np.argsort(weighted_couple_dist)
    # Drop the query explicitly rather than assuming it is ranked first: articles
    # at the same weighted distance can tie with it.
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # .iloc: `indices` are row positions, independent of the frame's labels.
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
               'headline':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices],
                'Word2Vec based Euclidean similarity': w2v_dist[indices],
                 'Category based Euclidean similarity': category_dist[indices],
                'Categoty': news_articles['category'].iloc[indices].values},
                index=pd.RangeIndex(1, len(indices) + 1))

    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df

avg_w2v_with_category(528,10,0.1,0.8)

headline :  white house staff secretary rob porter resign abuse allegations
Categoty :  POLITICS



Unnamed: 0,publish_date,headline,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Categoty
1,2018-02-09,white house counsel know rob porter accusations year report,0.976359,0.787234,1.0,POLITICS
2,2018-02-10,another white house staffer resign domestic abuse allegations,0.978129,0.803165,1.0,POLITICS
3,2018-02-13,fbi chief dispute white house claim hear rob porter allegations,0.982633,0.8437,1.0,POLITICS
4,2018-04-27,white house know rob porter allegations year ago fbi letter,0.986051,0.874457,1.0,POLITICS
5,2018-02-16,gop lawmaker rob porter abuse allegations still talk,0.993232,0.939084,1.0,POLITICS
6,2018-02-11,white house officials cant say know porter allegations,0.999987,0.999882,1.0,POLITICS
7,2018-03-09,white house refuse house investigation request rob porter,1.000364,1.003274,1.0,POLITICS
8,2018-02-09,rob porter exwife say ask downplay abuse statement,1.000665,1.005986,1.0,POLITICS
9,2022-06-10,jar kushner dismiss white house counsel threats resign whine,1.00379,1.034108,1.0,POLITICS


In [41]:
#6.e Weighted similarity based on headline, category and author:
# One-hot encode the author string of every article (one single-column row per article).
authors_column = news_articles["authors"].to_numpy().reshape(-1, 1)
authors_onehot_encoded = OneHotEncoder().fit_transform(authors_column)

In [42]:
def avg_w2v_with_category_and_authors(row_index, num_similar_items, w1,w2,w3):
    """Recommend headlines by a weighted mix of Word2Vec, category and author distance.

    Args:
        row_index: Position of the queried article in `news_articles`.
        num_similar_items: Size of the neighbourhood including the queried
            article itself, so `num_similar_items - 1` recommendations are
            returned.
        w1: Weight of the averaged-Word2Vec headline distance.
        w2: Weight of the one-hot category distance.
        w3: Weight of the one-hot author distance.

    Returns:
        DataFrame labelled from 1, nearest first, with the weighted distance,
        its components, and the category and authors of each recommendation.
        The distance columns keep their historical "similarity" labels;
        smaller is closer.

    Raises:
        ValueError: If the weights sum to zero (the weighted average is undefined).
    """
    total_weight = float(w1 + w2 + w3)
    if total_weight == 0:
        raise ValueError("w1 + w2 + w3 must be non-zero")
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1)).ravel()
    # The +1 shifts every categorical distance by the same constant, so it does
    # not alter the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]).ravel() + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]).ravel() + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist)/total_weight
    query_pos = row_index % weighted_couple_dist.size  # normalise a negative row_index
    ranked = np.argsort(weighted_couple_dist)
    # Drop the query explicitly rather than assuming it is ranked first: articles
    # at the same weighted distance can tie with it.
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # .iloc: `indices` are row positions, independent of the frame's labels.
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
                'headline':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices],
                'Word2Vec based Euclidean similarity': w2v_dist[indices],
                'Category based Euclidean similarity': category_dist[indices],
                'Authors based Euclidean similarity': authors_dist[indices],
                'Categoty': news_articles['category'].iloc[indices].values,
                'Authors': news_articles['authors'].iloc[indices].values},
                index=pd.RangeIndex(1, len(indices) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print('Authors : ', news_articles['authors'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df


avg_w2v_with_category_and_authors(528,10,0.1,0.1,1)

headline :  white house staff secretary rob porter resign abuse allegations
Categoty :  POLITICS
Authors :  Sebastian Murdock



Unnamed: 0,publish_date,headline,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Categoty,Authors
1,2018-02-07,john kelly call aide face abuse allegations man true integrity,1.004848,1.058172,1.0,1.0,POLITICS,Sebastian Murdock
2,2018-02-08,fox news predictably quiet former white house aid abuse allegations,1.008547,1.102569,1.0,1.0,POLITICS,Sebastian Murdock
3,2018-03-17,mike huckabee celebrate fire former fbi deputy director make dead dog joke,1.014724,1.176684,1.0,1.0,POLITICS,Sebastian Murdock
4,2018-02-02,new york state senates new sex harassment policy intimidate victims senators say,1.021709,1.260511,1.0,1.0,POLITICS,Sebastian Murdock
5,2018-01-26,hillary clinton reportedly keep adviser accuse sexual harassment payroll,1.022157,1.265889,1.0,1.0,POLITICS,Sebastian Murdock
6,2018-01-09,ruth bader ginsburg law clerk line 2020,1.022268,1.26722,1.0,1.0,POLITICS,Sebastian Murdock
7,2018-02-10,president love make false accusations suddenly plead due process,1.022361,1.268332,1.0,1.0,POLITICS,Sebastian Murdock
8,2021-01-28,man arrest near capitol gun stop steal paperwork list politicians,1.024225,1.290694,1.0,1.0,POLITICS,Sebastian Murdock
9,2018-09-14,republicans stick brett kavanaugh face attempt sexual assault accusation,1.024253,1.291035,1.0,1.0,POLITICS,Sebastian Murdock


In [43]:
#6.f Weighted similarity based on headline, category, author and publishing day
# One-hot encode the "day and month" label of every article (one single-column row per article).
publishing_day_column = news_articles["day and month"].to_numpy().reshape(-1, 1)
publishingday_onehot_encoded = OneHotEncoder().fit_transform(publishing_day_column)


In [44]:
def avg_w2v_with_category_authors_and_publshing_day(row_index, num_similar_items, w1,w2,w3,w4):
    """Recommend headlines by a weighted mix of Word2Vec, category, author and publishing-day distance.

    Args:
        row_index: Position of the queried article in `news_articles`.
        num_similar_items: Size of the neighbourhood including the queried
            article itself, so `num_similar_items - 1` recommendations are
            returned.
        w1: Weight of the averaged-Word2Vec headline distance.
        w2: Weight of the one-hot category distance.
        w3: Weight of the one-hot author distance.
        w4: Weight of the one-hot "day and month" distance.

    Returns:
        DataFrame labelled from 1, nearest first, with the weighted distance,
        its components, and the category, authors and day/month label of each
        recommendation. The distance columns keep their historical
        "similarity" labels; smaller is closer.

    Raises:
        ValueError: If the weights sum to zero (the weighted average is undefined).
    """
    total_weight = float(w1 + w2 + w3 + w4)
    if total_weight == 0:
        raise ValueError("w1 + w2 + w3 + w4 must be non-zero")
    w2v_dist  = pairwise_distances(w2v_headline, w2v_headline[row_index].reshape(1,-1)).ravel()
    # The +1 shifts every categorical distance by the same constant, so it does
    # not alter the ranking.
    category_dist = pairwise_distances(category_onehot_encoded, category_onehot_encoded[row_index]).ravel() + 1
    authors_dist = pairwise_distances(authors_onehot_encoded, authors_onehot_encoded[row_index]).ravel() + 1
    publishingday_dist = pairwise_distances(publishingday_onehot_encoded, publishingday_onehot_encoded[row_index]).ravel() + 1
    weighted_couple_dist   = (w1 * w2v_dist +  w2 * category_dist + w3 * authors_dist + w4 * publishingday_dist)/total_weight
    query_pos = row_index % weighted_couple_dist.size  # normalise a negative row_index
    ranked = np.argsort(weighted_couple_dist)
    # Drop the query explicitly rather than assuming it is ranked first: articles
    # at the same weighted distance can tie with it.
    indices = ranked[ranked != query_pos][:max(num_similar_items - 1, 0)]
    # .iloc: `indices` are row positions, independent of the frame's labels.
    df = pd.DataFrame({'publish_date': news_articles['date'].iloc[indices].values,
                'headline_text':news_articles['headline'].iloc[indices].values,
                'Weighted Euclidean similarity with the queried article': weighted_couple_dist[indices],
                'Word2Vec based Euclidean similarity': w2v_dist[indices],
                'Category based Euclidean similarity': category_dist[indices],
                'Authors based Euclidean similarity': authors_dist[indices],
                'Publishing day based Euclidean similarity': publishingday_dist[indices],
                'Categoty': news_articles['category'].iloc[indices].values,
                'Authors': news_articles['authors'].iloc[indices].values,
                'Day and month': news_articles['day and month'].iloc[indices].values},
                index=pd.RangeIndex(1, len(indices) + 1))
    print("="*30,"Queried article details","="*30)
    print('headline : ',news_articles['headline'].iloc[query_pos])
    print('Categoty : ', news_articles['category'].iloc[query_pos])
    print('Authors : ', news_articles['authors'].iloc[query_pos])
    print('Day and month : ', news_articles['day and month'].iloc[query_pos])
    print("\n","="*25,"Recommended articles : ","="*23)
    return df


avg_w2v_with_category_authors_and_publshing_day(528,10,0.1,0.1,0.1,1)

headline :  white house staff secretary rob porter resign abuse allegations
Categoty :  POLITICS
Authors :  Sebastian Murdock
Day and month :  Wed_Feb



Unnamed: 0,publish_date,headline_text,Weighted Euclidean similarity with the queried article,Word2Vec based Euclidean similarity,Category based Euclidean similarity,Authors based Euclidean similarity,Publishing day based Euclidean similarity,Categoty,Authors,Day and month
1,2018-02-07,john kelly call aide face abuse allegations man true integrity,1.004475,1.058172,1.0,1.0,1.0,POLITICS,Sebastian Murdock,Wed_Feb
2,2018-02-14,colorado teacher charge allegedly force student stand pledge,1.025353,1.32959,1.0,1.0,1.0,POLITICS,Sebastian Murdock,Wed_Feb
3,2018-02-14,house oversight panel investigate allege abuser rob porter,1.113059,1.055557,1.0,2.414214,1.0,POLITICS,Sara Boboltz,Wed_Feb
4,2018-02-14,trump finally speak domestic violence wake porter scandal,1.125266,1.214239,1.0,2.414214,1.0,POLITICS,Lydia O'Connor,Wed_Feb
5,2018-02-14,washington insider white house story porter doesnt add,1.126921,1.235754,1.0,2.414214,1.0,POLITICS,"Hunter Walker, Yahoo News",Wed_Feb
6,2022-02-23,kentucky elect first openly lgbtq state house member special election,1.12861,1.257718,1.0,2.414214,1.0,POLITICS,Travis Waldron,Wed_Feb
7,2018-02-28,trump new campaign head link company fraud scandal past report,1.130792,1.286086,1.0,2.414214,1.0,POLITICS,Carla Herreria,Wed_Feb
8,2018-02-07,rnc still say wont return allege sexual predator steve wynns money,1.13135,1.293338,1.0,2.414214,1.0,POLITICS,Marina Fang,Wed_Feb
9,2018-02-21,school shoot survivors cry florida house reject talk assault weapon ban,1.13307,1.315693,1.0,2.414214,1.0,POLITICS,Doha Madani,Wed_Feb
