In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import re

In [37]:
import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.corpus import stopwords

In [38]:
!pip install lightfm
from lightfm import LightFM



In [39]:
import scipy
from scipy.spatial.distance import cdist
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd

In [40]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split

# Reading data sets

1. rated_articles - contains the articles each user interacted with, together with the user's ratings
2. news_articles - contains the raw articles, without any user data merged in

## news_articles

In [41]:
# Load the raw news articles straight from GitHub and preview the first rows.
# NOTE(review): the URL embeds a hardcoded `?token=` query parameter. Tokens
# should not be committed in notebooks, and this one may expire, which would
# break a fresh "Restart & Run All" — confirm whether it is still required.
data = pd.read_csv('https://raw.githubusercontent.com/VividhPandey003/newsRecomm/main/data/0_news_articles.csv?token=GHSAT0AAAAAACF4FNQBY6AYIOICPOBB5MVAZUEHMWQ')
data.head()

Unnamed: 0,Article_id,Title,Description,Date,Category,URL
0,0,Fire at Vaishno Devi shrine complex; cash coun...,"No one was injured in the fire, which broke ou...","June 8, 2021 7:28:32 pm",India,https://indianexpress.com/article/india/vaishn...
1,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",Uddhav Thackeray led a delegation of his cabin...,"June 8, 2021 6:56:40 pm",India,https://indianexpress.com/article/india/had-no...
2,2,Corruption case: Former Haryana I-T deputy com...,It was in 2016 that the CBI had arrested Nitin...,"June 8, 2021 6:25:24 pm",India,https://indianexpress.com/article/india/corrup...
3,3,Kannur MP K Sudhakaran appointed chief of Cong...,Sudhakaran will replace Ramachandran who had a...,"June 8, 2021 5:04:40 pm",India,https://indianexpress.com/article/india/sudhak...
4,4,"Kerala girl of Class 5 writes to CJI, lauds SC...",Chief Justice N V Ramana responded to the Clas...,"June 8, 2021 4:43:10 pm",India,https://indianexpress.com/article/india/kerala...


## Collaborative Filtering

**Need:** Collaborative filtering requires a ratings matrix, so I generated user profiles with ratings.

In [42]:
# Load the per-user article ratings from GitHub.
# NOTE(review): the URL embeds a hardcoded `?token=` query parameter; confirm
# it is still required and valid before relying on a fresh run.
rating = pd.read_csv(
    'https://raw.githubusercontent.com/VividhPandey003/newsRecomm/main/data/3_user_rated_articles.csv?token=GHSAT0AAAAAACF4FNQAMAGPEBRJTELMNCAEZUEHOPA'
)
# Shape is reported before the leading column is removed.
print(rating.shape)

# Discard the first column of the file (presumably a saved row index — verify).
first_column = rating.columns[0]
rating = rating.drop(columns=first_column)
rating.head()

(1187, 8)


Unnamed: 0,Article_id,Title,UserId,SessionId,Article Rank,Time Spent (seconds),Ratings
0,0,Fire at Vaishno Devi shrine complex; cash coun...,1,1,1,81,4
1,2,Corruption case: Former Haryana I-T deputy com...,1,1,3,49,3
2,5,Madhya Pradesh govt gets HC notice on communal...,1,1,6,19,2
3,6,Uddhav Thackeray meets PM Modi; discusses Mara...,1,1,7,33,4
4,7,"New Covid-19 vaccination guidelines out, alloc...",1,1,8,71,1


In [43]:
# Inspect the last rows of the ratings table.
rating.tail()

Unnamed: 0,Article_id,Title,UserId,SessionId,Article Rank,Time Spent (seconds),Ratings
1182,2245,"Malaika Arora shares yoga asanas for healthy, ...",2235,2230,6,38,2
1183,2246,COVID-19 diet: Khichdi is a ‘safe’ option; oth...,2236,2231,7,74,4
1184,2247,‘Keep listening. The world wants to hear your ...,2237,2232,8,60,4
1185,2248,"Forget cold drinks, switch to refreshing bael ...",2238,2233,9,65,3
1186,2249,‘Love wins’: Rita Wilson and Tom Hanks celebra...,2239,2234,10,45,4


In [44]:
# Count the distinct users and the distinct rated articles.
n_users = int(rating['UserId'].nunique())
n_article = int(rating['Article_id'].nunique())

print("Number of users: ", n_users)
print("Number of articles: ", n_article)

Number of users:  1181
Number of articles:  1187


In [45]:
# User x article ratings table: one row per UserId, one column per Article_id.
# Combinations without a rating are left as NaN at this stage.
user_pivot = rating.pivot_table(
    values='Ratings',
    index='UserId',
    columns='Article_id',
)
user_pivot.head()

Article_id,0,2,5,6,7,8,10,13,14,16,...,2235,2237,2240,2241,2242,2245,2246,2247,2248,2249
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.0,2.0,4.0,1.0,4.0,4.0,,,,...,,,,,,,,,,
3,,,,,,,,3.0,,,...,,,,,,,,,,
4,,,,,,,,,4.0,,...,,,,,,,,,,
6,,,,,,,,,,3.0,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [46]:
# Dimensions of the pivot table: (distinct UserId rows, distinct Article_id columns).
user_pivot.shape

(1181, 1187)

In [47]:
# Replace missing ratings with 0 and expose the table as a plain NumPy array.
# Note that `user_pivot` itself is rebound to the filled version here.
user_pivot = user_pivot.fillna(value=0)
user_pivot_matrix = user_pivot.to_numpy()
user_pivot_matrix

array([[4., 3., 2., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 4., 0., 0.],
       [0., 0., 0., ..., 0., 3., 0.],
       [0., 0., 0., ..., 0., 0., 4.]])

In [48]:
# Sparse (CSR) copy of the zero-filled pivot table.
# NOTE(review): user_item_pivot_sparse is not referenced by any other cell
# visible here — confirm it is still needed.
user_item_pivot_sparse = csr_matrix(user_pivot)

In [49]:
# Truncated SVD of the dense user x article matrix, keeping n_factors
# singular triplets.
n_factors = 150
U, sigma, Vt = svds(user_pivot_matrix, k = n_factors)

# Expand the vector of singular values into a square diagonal matrix so it
# can be used directly in the matrix products that rebuild the ratings.
sigma = np.diag(sigma)
sigma.shape

(150, 150)

In [50]:
# Rebuild the low-rank approximation of the ratings matrix: (U . sigma) . Vt
all_user_ratings = U @ sigma @ Vt

# Min-max scale every predicted score into the [0, 1] range.
lowest_score = all_user_ratings.min()
highest_score = all_user_ratings.max()
all_user_ratings_norm = (all_user_ratings - lowest_score) / (highest_score - lowest_score)
all_user_ratings_norm

array([[0.87153943, 0.68553789, 0.49953636, ..., 0.12753328, 0.12753328,
        0.12753328],
       [0.12753328, 0.12753328, 0.12753328, ..., 0.12753328, 0.12753328,
        0.12753328],
       [0.12753328, 0.12753328, 0.12753328, ..., 0.12753328, 0.12753328,
        0.12753328],
       ...,
       [0.12753328, 0.12753328, 0.12753328, ..., 0.12753328, 0.12753328,
        0.12753328],
       [0.12753328, 0.12753328, 0.12753328, ..., 0.12753328, 0.12753328,
        0.12753328],
       [0.12753328, 0.12753328, 0.12753328, ..., 0.12753328, 0.12753328,
        0.12753328]])

In [51]:
# Prediction table with articles as rows (Article_id index) and one column per user.
# NOTE(review): no `index=` is passed, so after the transpose the user columns are
# labelled 0..n_users-1 by row position rather than by UserId — confirm whether
# `index=user_pivot.index` was intended before looking users up by their id.
cf_preds_df = pd.DataFrame(
    all_user_ratings_norm,
    columns=user_pivot.columns,
).T

In [52]:
class Collaborative:
    """Top-N recommender backed by a precomputed table of predicted scores.

    Attributes:
        predictions: DataFrame whose index is named ``Article_id`` and which
            holds one column of predicted scores per user key.
        items: Optional DataFrame with ``Article_id`` and ``Title`` columns;
            only needed when recommendations are requested in verbose mode.
    """

    name = "Collaborative Filter"

    def __init__(self, predictions, items = None):
        """Store the prediction table and the optional article metadata."""
        self.predictions = predictions
        self.items = items

    def get_model_name(self):
        """Return the human-readable name of this model."""
        return self.name

    def recommend_items(self, user_id, items_ignore = None, topn = 10, verbose = False):
        """Return the ``topn`` highest-scored articles for one user.

        Args:
            user_id: Column label of ``predictions`` to rank.
            items_ignore: Article ids to leave out of the result. ``None``
                (the default) excludes nothing.
            topn: Maximum number of rows to return.
            verbose: When true, join the article titles from ``items`` and
                return only the ``Article_id`` and ``Title`` columns.

        Returns:
            DataFrame ordered by descending predicted score. In non-verbose
            mode it holds ``Article_id`` plus the score column labelled
            ``user_id``; in verbose mode it holds ``Article_id`` and ``Title``.

        Raises:
            KeyError: If ``user_id`` is not a column of ``predictions``.
            ValueError: If ``verbose`` is true but no ``items`` were supplied.
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        if items_ignore is None:
            items_ignore = []

        # reset_index() turns the Article_id index into a regular column.
        sorted_preds = self.predictions[user_id].sort_values(ascending = False).reset_index()

        keep = ~sorted_preds['Article_id'].isin(items_ignore)
        recommendations = sorted_preds[keep].head(topn)

        if verbose:
            if self.items is None:
                raise ValueError('"items" required in verbose mode')

            # A left join keeps the score ordering of the recommendations.
            recommendations = recommendations.merge(
                self.items, how = 'left', on = 'Article_id'
            )[['Article_id', 'Title']]

        return recommendations

In [53]:
# Wrap the prediction table and the article metadata in the recommender.
model = Collaborative(cf_preds_df, data)

In [54]:
# Top recommendations, with titles, for the prediction column labelled 224.
# NOTE(review): cf_preds_df is built without a user index, so 224 looks like a
# positional column label rather than a UserId — confirm which was intended.
model.recommend_items(user_id = 224, verbose = True)

Unnamed: 0,Article_id,Title
0,349,"Tanks, Corvette warships, light copters in neg..."
1,1445,New Zealand cricketers land for biosecure tour...
2,1399,BCCI paying for quarantine of Australian IPL p...
3,1742,Simple ways to manage COVID anxiety in the eld...
4,888,Chelsea extend Thomas Tuchel’s contract to 202...
5,742,Post COVID complication among children a new c...
6,1495,England’s IPL players unlikely to find place i...
7,927,I still possess Bajaj Platina bike: Mohd Siraj...
8,2143,Ordering takeout? Try a live concert at home
9,997,Japan rallies to support Naomi Osaka after Fre...


## Hybrid Recommender System

**Using:** LightFM

In [55]:
# Preview the pivot table again; missing ratings now appear as 0.0 because of the earlier fillna.
user_pivot.head()

Article_id,0,2,5,6,7,8,10,13,14,16,...,2235,2237,2240,2241,2242,2245,2246,2247,2248,2249
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.0,2.0,4.0,1.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Map every UserId (row label of the pivot table) to its 0-based row position,
# i.e. the row that user occupies in the interaction matrix.
u_id = list(user_pivot.index)
u_dict = {user: position for position, user in enumerate(u_id)}

# The original loop left `counter` equal to the number of users; keep that
# binding in case a later cell reads it.
counter = len(u_id)

In [57]:
# Number of users in the UserId -> row position mapping.
len(u_dict)

1181

In [58]:
# convert to csr matrix
u_interaction_csr = csr_matrix(user_pivot.values)
u_interaction_csr

<1181x1187 sparse matrix of type '<class 'numpy.float64'>'
	with 1187 stored elements in Compressed Sparse Row format>

In [59]:
# Article_id -> Title lookup for the rated articles, built in one pass
# instead of a row-by-row .loc loop. `df` keeps its original layout
# (sorted by Article_id, with the previous index preserved as a column).
df = rating[['Article_id', 'Title']].sort_values('Article_id').reset_index()

# Iterating the NumPy array keeps the keys as NumPy integers, matching what
# df.loc[i, 'Article_id'] returned in the loop-based version.
item_dict = dict(zip(df['Article_id'].to_numpy(), df['Title']))

In [60]:
# Number of articles in the Article_id -> Title lookup.
len(item_dict)

1187

### LightFM is not working as expected, so expectations for its results are low

In [61]:
# Train a WARP-loss LightFM model on the user x article interaction matrix.
# NOTE: this rebinds `model`, which previously held the Collaborative
# recommender, so that object is no longer reachable under this name.
model = LightFM(
    loss = 'warp',
    random_state = 2016,
    learning_rate = 0.90,
    no_components = 150,
    user_alpha = 0.000005,
).fit(
    u_interaction_csr,
    epochs = 100,
    num_threads = 16,
    verbose = False,
)

In [62]:
# Collect every article title into a plain Python list and preview the first ten.
title = list(data['Title'])
title[:10]

['Fire at Vaishno Devi shrine complex; cash counter damaged',
 'Had not gone to meet Nawaz Sharif, says Uddhav Thackeray as he plays down one-on-one meeting with PM Modi',
 'Corruption case: Former Haryana I-T deputy commissioner gets 4 years in prison',
 'Kannur MP K Sudhakaran appointed chief of Congress in Kerala',
 'Kerala girl of Class 5 writes to CJI, lauds SC for saving lives in fight with Covid',
 'Madhya Pradesh govt gets HC notice on communal clashes during fundraising for Ram temple',
 'Uddhav Thackeray meets PM Modi; discusses Maratha quota issue, GST compensation',
 'New Covid-19 vaccination guidelines out, allocation based on state population',
 'Dantewada: 24-year-old tribal woman killed in ‘maoist encounter’; family claims it was staged, alleges rape',
 'Fire at TMC MLA Madan Mitra’s residence in Kolkata']

In [63]:
# Summarise missing values per column of the articles table: the absolute
# count and the fraction of rows, each sorted from most to least missing.
null_mask = data.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending = False)
percent = (null_counts / null_mask.count()).sort_values(ascending = False)

missing_data = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])
missing_data.head()

Unnamed: 0,Total,Percent
Article_id,0,0.0
Title,0,0.0
Description,0,0.0
Date,0,0.0
Category,0,0.0
