In [40]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
%matplotlib inline

In [8]:
article_user_data = '../newsrecommender/data/article_user.csv'
article_user = pd.read_csv(article_user_data)
articles_data = '../newsrecommender/data/articles.csv'
articles = pd.read_csv(articles_data)

In [9]:
#articles.head().to_csv('articles_sample.csv')

In [10]:
articles.head(2)

Unnamed: 0,article_id,headline,url,published_date
0,e9d06b2bb42bd9970de43beab2d3893534d7a619,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z
1,e98b0d02b16794bc0884a61c4a45746cb9430630,Biden's DOJ must confront what to do with outs...,https://www.cnn.com/2021/01/21/politics/donald...,2021-01-22 02:11:02.000 Z


In [11]:
len(article_user.user_id.unique())

50000

In [58]:
article_user.user_id.unique()

array(['2bc424123e0a12d29c488bb6e565fe98d0a49b46',
       'd63acb7f8d2f7478fdea0c9c5a630cca455b0cc3',
       'dae20ce165bd1f86bd762c246df93efc27e16774', ...,
       '585429ca58b381fd348e7085acfe07b25400672b',
       '0141d1c0e8eb74293f08fe83d2ab8849dcbc004a',
       'a5960b8352e6605d3864bfbf39f9d0cf5a3e1d26'], dtype=object)

In [12]:
len(articles.article_id.unique())

1455

In [13]:
articles.head().to_csv('articles_sample.csv')

In [14]:
article_user.head(2)

Unnamed: 0,user_id,article_id,timestamp
0,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e9d06b2bb42bd9970de43beab2d3893534d7a619,1611291180
1,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e98b0d02b16794bc0884a61c4a45746cb9430630,1611290464


In [15]:
df = pd.merge(article_user, articles)

In [16]:
#pd.to_datetime(df['timestamp'], infer_datetime_format=True)  

In [17]:
from datetime import datetime, timedelta
def convert_time(time_str):
    ts = int(time_str)
    return datetime.utcfromtimestamp(ts)

In [18]:
df['timestamp'] = df.apply(lambda r: convert_time(r['timestamp']), axis=1)

In [19]:
datetime.strptime('2021-04-19', '%Y-%m-%d') - timedelta(days=7)

datetime.datetime(2021, 4, 12, 0, 0)

In [20]:
f1 = df['timestamp'] >=  datetime.now() - timedelta(days=120)
f2 = df['timestamp'] <  datetime.now() - timedelta(days=100)
df[f2]

Unnamed: 0,user_id,article_id,timestamp,headline,url,published_date


In [21]:
df['timestamp'].max()

Timestamp('2021-01-25 00:09:41')

In [22]:
df['timestamp'].min()

Timestamp('2021-01-20 00:00:31')

In [23]:
datetime.now() - timedelta(days=100)

datetime.datetime(2021, 1, 13, 14, 59, 9, 38280)

In [24]:
datetime.now() - timedelta(days=120)

datetime.datetime(2020, 12, 24, 14, 59, 9, 42562)

In [25]:
# Most popular score

most popular articles are something. What are they?

In [26]:
#df['article_id'].value_counts().to_csv('article_counts.csv')

In [27]:
def get_topic(url):
    pattern = 'http(s)?://(www|money).cnn.com/\d{4}/\d{2}/\d{2}/(?P<topic>[a-z0-9-]+)/'
    result = re.match(pattern, url)
    topic = None
    if result:
        topic = result.group('topic')
    if not result:
        pattern2 = 'https://www.cnn.com/(?P<topic>[a-z]*)/'
        result = re.match(pattern2, url)
        if result:
            topic = result.group('topic')
    if topic:
        return topic
    else:
        return ''

In [30]:
df['category'] = df.apply(lambda r: get_topic(r['url']), axis=1)

In [32]:
df.head(2)

Unnamed: 0,user_id,article_id,timestamp,headline,url,published_date,category
0,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e9d06b2bb42bd9970de43beab2d3893534d7a619,2021-01-22 04:53:00,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z,politics
1,b5c93b1ce2245fe8a4d59aedbd6c33a8b315bf60,e9d06b2bb42bd9970de43beab2d3893534d7a619,2021-01-21 19:05:43,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z,politics


In [56]:
# # calculate the frequency of each article
groups = df.groupby(['category', 'article_id', 'headline']).size().reset_index().rename({0: 'frequency'}, axis=1)
# n = 5
# # get the top n articles
groups = groups.sort_values('frequency', ascending=False).groupby('category')\
            .head(1).sort_values(['category', 'frequency'], ascending=False)\
            .sort_values('frequency', ascending=False).head(5)

In [None]:
pd.merge(groups, articles_

In [57]:
groups.to_csv('top_freq.csv')

In [38]:
msk1 = groups['category'] == 'world'
groups[msk1].head(2)

Unnamed: 0,category,article_id,frequency
1431,world,889f731b80b1becc703d0f550da3fe85be9251eb,1518
1444,world,d3810241ac4c40506c1392a189fd67abf0d9baf8,1407


In [39]:
categories = groups['category'].unique()
def category_exists(category: str) -> bool:
    return category in categories

def get_top_n_in_category(n, category) -> pd.DataFrame:
    n = max(5, n)
    msk1 = groups['category'] == category
    return groups[msk1].head(n)

In [42]:
dfs = []
for category in categories:
    category_df = get_top_n_in_category(n, category)
    dfs.append(category_df)

NameError: name 'n' is not defined

In [353]:
# groups.to_csv('nlargest.csv')

In [409]:
all_articles = pd.merge(article_user, articles)

# Item-Item Collaborative Filtering

In [415]:
%time
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(articles['headline'].unique())

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 13.8 µs


In [357]:
tfidf_matrix.shape

(1455, 13246)

In [412]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [413]:
all_articles['headline'][0]

'Defense Department slams brakes on border wall as it reviews Biden order'

In [414]:
uid = '2bc424123e0a12d29c488bb6e565fe98d0a49b46'
all_articles[all_articles['user_id'] == '2bc424123e0a12d29c488bb6e565fe98d0a49b46'].head(2)

Unnamed: 0,user_id,article_id,timestamp,headline,url,published_date
0,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e9d06b2bb42bd9970de43beab2d3893534d7a619,1611291180,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z
714,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e98b0d02b16794bc0884a61c4a45746cb9430630,1611290464,Biden's DOJ must confront what to do with outs...,https://www.cnn.com/2021/01/21/politics/donald...,2021-01-22 02:11:02.000 Z


In [416]:
smd = all_articles.reset_index()
headlines = articles['headline']
indices = pd.Series(articles.index, index=articles['headline'])
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return headlines.iloc[movie_indices]

In [417]:
t = 'Defense Department slams brakes on border wall as it reviews Biden order'
get_recommendations(t).head(5)#.to_csv('similar_recommendations.csv')

1223    Biden's desire to stop the border wall could b...
908     Steve Bannon, three others charged with fraud ...
483     Trump visits his border wall on the heels of d...
1239    Trump administration locks down border wall co...
169     Biden starts fast on immigration by halting bo...
Name: headline, dtype: object

In [418]:
all_articles['rating'] = 1

In [421]:
msk3 = all_articles['user_id'] == '2bc424123e0a12d29c488bb6e565fe98d0a49b46'
all_articles[msk3].head(2)

Unnamed: 0,user_id,article_id,timestamp,headline,url,published_date,rating
0,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e9d06b2bb42bd9970de43beab2d3893534d7a619,1611291180,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z,1
714,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e98b0d02b16794bc0884a61c4a45746cb9430630,1611290464,Biden's DOJ must confront what to do with outs...,https://www.cnn.com/2021/01/21/politics/donald...,2021-01-22 02:11:02.000 Z,1


In [422]:
g = df.groupby(['user_id', 'article_id'])['article_id'].count()#.to_frame()
#ratings = g.rename('rating').to_frame().reset_index()
article_counts = g.rename('rating').to_frame().reset_index()

In [423]:
ratings[ratings['rating'] > 1].head(2)

Unnamed: 0,user_id,article_id,rating
47,0006dbdc6f22fc3be0204942d416309efe82eb69,20699bd1f6dbe29c142e4fafba8e37c3145da027,4
86,000beb56a84772c98ed86f43ef605c3533224e04,9b951443de50bcf084324600050028118817ddc4,2


In [424]:
article_counts.head(10)

Unnamed: 0,user_id,article_id,rating
0,0002604d5804123dedb91819acbc93f888c54b8e,534c4076ad58bee6998339f70a018a0330cdcbf5,1
1,0002604d5804123dedb91819acbc93f888c54b8e,6ab428f68d0fa9a6405a508751acc8ef0b9ee4ff,1
2,0002604d5804123dedb91819acbc93f888c54b8e,9260080e038b6274f850d9e82712dc395e26e091,1
3,0002604d5804123dedb91819acbc93f888c54b8e,b38cb1d061da04da1b3bc14082317a2f9e9fd300,1
4,0002604d5804123dedb91819acbc93f888c54b8e,f09985cd3473960ea8c464e24ade4de4a8c1f8f0,1
5,0002604d5804123dedb91819acbc93f888c54b8e,f4bfbf1c7b245390cf57aab2445102d6d7573825,1
6,0003728d0245028f76f7a655241a07181d1be815,0dc502c9dfe4dbd472cfc5a745f8b1d3361b05ed,1
7,0003728d0245028f76f7a655241a07181d1be815,0e10125561d99d714a577dcc4bd6972e0cb70a1e,1
8,0003728d0245028f76f7a655241a07181d1be815,2f790fadb623c58c932e50774eccdd1db56205ef,1
9,0003728d0245028f76f7a655241a07181d1be815,4bfdf06da4852a06da783456cc61e9504fb8e434,1


In [425]:
number_of_visited_articles = df.groupby(['user_id'])['article_id'].count().to_frame().reset_index().rename({'article_id': 'number_of_visited_articles'}, axis=1)
number_of_visited_articles

Unnamed: 0,user_id,number_of_visited_articles
0,0002604d5804123dedb91819acbc93f888c54b8e,6
1,0003728d0245028f76f7a655241a07181d1be815,20
2,000430b60d8f1dfbc40f387c57a83989f9fa93a9,14
3,0004c5445d969cabdb55c8fa69f3fd5212723032,3
4,0005c0d4d9f01b62578a4987a7ec4fcd95fada4a,4
...,...,...
49995,fffb5887fbd6f820ca50ce3787fdd8bd2c87a3c4,9
49996,fffbacf805a9436e177d6b243883fc2c7832ab5e,9
49997,fffca37521dbe342b103e4c05a70522291240f1e,20
49998,ffff1ae26738234ebd6ccfdb2fa197a409f0e65b,7


In [426]:
article_counts_merged = pd.merge(number_of_visited_articles, article_counts)

In [427]:
article_counts_merged['normalized_rating'] = article_counts_merged['rating'] / article_counts_merged['number_of_visited_articles']

In [428]:
# add normalized ratings
df = pd.merge(df, article_counts_merged.drop(['rating'], axis=1))

In [429]:
p = ratings.groupby(['user_id', 'article_id']).sum()['rating'].to_frame()
p[p['rating'] > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
user_id,article_id,Unnamed: 2_level_1
0006dbdc6f22fc3be0204942d416309efe82eb69,20699bd1f6dbe29c142e4fafba8e37c3145da027,4
000beb56a84772c98ed86f43ef605c3533224e04,9b951443de50bcf084324600050028118817ddc4,2
00193047cbef4957c66c16e3c212817a78c542fd,7c3c7d84275bf16786d1cdc92e5ec73ef5db6277,2
001b4f187a9c5930b190b6673bd5504a558ba52a,709a1c6ea25c6783f095508e3a654c2c509ef08c,2
001b4f187a9c5930b190b6673bd5504a558ba52a,8bff315e3ecdc22d5bbdae3a1cc37f48179fdfd3,2
...,...,...
fff56268e977ba77a9321de6759887db9e11f902,964b98d1ed8f3a97d7338fdfbb9fa675bad3559a,2
fffad616517c702809ebfac8e3d1bcaf762b471a,5040fd6e29c834e8bb916cb5b2cf88dc837577a1,3
fffb5887fbd6f820ca50ce3787fdd8bd2c87a3c4,db28270c836816d3674dde7c5139de7c170bc877,2
fffca37521dbe342b103e4c05a70522291240f1e,77e2fe5aa1b34a93491af7516ae6e12fc1d37b28,2


In [430]:
import surprise
from surprise import Reader, Dataset, SVD, model_selection#, evaluate
svd = SVD()
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'article_id', 'normalized_rating']], reader)
model_selection.cross_validate(svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.88737601, 0.88714016, 0.88722031, 0.88736068, 0.88746047]),
 'test_mae': array([0.88217455, 0.88187476, 0.88196833, 0.88211519, 0.88221476]),
 'fit_time': (19.885215997695923,
  20.038259983062744,
  19.758590936660767,
  19.764885902404785,
  19.875874996185303),
 'test_time': (0.824937105178833,
  0.768557071685791,
  0.767301082611084,
  0.756016731262207,
  0.7778670787811279)}

In [431]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f89d1a61f10>

In [375]:
svd.predict('0002604d5804123dedb91819acbc93f888c54b8e', 'fffb5887fbd6f820ca50ce3787fdd8bd2c87a3c4', 3)

Prediction(uid='0002604d5804123dedb91819acbc93f888c54b8e', iid='fffb5887fbd6f820ca50ce3787fdd8bd2c87a3c4', r_ui=3, est=1, details={'was_impossible': False})

In [376]:
def get_recommendations_from_user_id_headline(user_id, headline):
    idx = indices[headline]
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    article_indices = [i[0] for i in sim_scores]
    
    articles_tmp = articles.iloc[article_indices][['headline', 'article_id']]
    articles_tmp['est'] = articles_tmp.apply(lambda x: svd.predict(user_id, x['article_id']).est, axis=1)
    articles_tmp = articles_tmp.sort_values('est', ascending=False)
    return articles.head(5)

In [377]:
articles.head(2)

Unnamed: 0,article_id,headline,url,published_date
0,e9d06b2bb42bd9970de43beab2d3893534d7a619,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z
1,e98b0d02b16794bc0884a61c4a45746cb9430630,Biden's DOJ must confront what to do with outs...,https://www.cnn.com/2021/01/21/politics/donald...,2021-01-22 02:11:02.000 Z


In [378]:
smd.head(2)

Unnamed: 0,index,user_id,article_id,timestamp,headline,url,published_date
0,0,2bc424123e0a12d29c488bb6e565fe98d0a49b46,e9d06b2bb42bd9970de43beab2d3893534d7a619,1611291180,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z
1,1,b5c93b1ce2245fe8a4d59aedbd6c33a8b315bf60,e9d06b2bb42bd9970de43beab2d3893534d7a619,1611255943,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z


In [1]:
user_id = '2bc424123e0a12d29c488bb6e565fe98d0a49b46'
headline = 'Defense Department slams brakes on border wall as it reviews Biden order'
get_recommendations_from_user_id_headline(user_id, headline)

# can't find this: 'article_id'

NameError: name 'get_recommendations_from_user_id_headline' is not defined

In [381]:
def get_user_history_headlines(user_id: str) -> set:
    msk = all_articles['user_id'] == user_id
    headlines = all_articles[msk]['headline'].unique()
    return headlines

In [382]:
get_user_history_headlines(user_id)

array(['Defense Department slams brakes on border wall as it reviews Biden order',
       "Biden's DOJ must confront what to do with outstanding cases in which it is defending Trump. The first test could come Friday.",
       'White House wants Democrats to be patient on stimulus talks as Biden pushes for bipartisan path, officials say',
       "The Supreme Court's run out of excuses to avoid controversial issues",
       'Evidence shows Capitol rioters brutally attacked police with flagpoles, fire extinguishers and fists',
       'Four presidents take a stand',
       'I went to Washington for joy, and Amanda Gorman delivered it',
       "Syroco vs SP80: The race to create the world's fastest sail boat",
       'Here are the 30 executive orders and actions Biden signed in his first three days',
       "Joe Biden grieves Covid victims on eve of his inauguration: 'To heal, we must remember'",
       'Putin presents a Russia-sized foreign-policy headache for Biden',
       'Why Republica

In [383]:
user_id

'2bc424123e0a12d29c488bb6e565fe98d0a49b46'

In [384]:
dfs = []
for i, headline in enumerate(get_user_history_headlines(user_id)):
    user_history_headlines = get_recommendations_from_user_id_headline(user_id, headline)
    for headline in user_history_headlines['headline'].unique():
        try:
            recs = get_recommendations_from_user_id_headline(user_id, headline)
            dfs.append(recs)
        except:
            print('user_id', user_id)
            print('headline', headline)

In [385]:
pd.concat(dfs)

Unnamed: 0,article_id,headline,url,published_date
0,e9d06b2bb42bd9970de43beab2d3893534d7a619,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z
1,e98b0d02b16794bc0884a61c4a45746cb9430630,Biden's DOJ must confront what to do with outs...,https://www.cnn.com/2021/01/21/politics/donald...,2021-01-22 02:11:02.000 Z
2,05f51c586ada891448ab56d9a5c8adb2a688e2fb,White House wants Democrats to be patient on s...,https://www.cnn.com/2021/01/22/politics/covid-...,2021-01-22 21:53:55.000 Z
3,91333a3c6d28c9241cbb01678a97699232b79e70,The Supreme Court's run out of excuses to avoi...,https://www.cnn.com/2021/01/24/politics/suprem...,2021-01-24 18:42:26.000 Z
4,30170d08779c54b84170d4cb3b5cbe0462df704a,Evidence shows Capitol rioters brutally attack...,https://www.cnn.com/2021/01/21/politics/capito...,2021-01-21 12:45:30.000 Z
...,...,...,...,...
0,e9d06b2bb42bd9970de43beab2d3893534d7a619,Defense Department slams brakes on border wall...,https://www.cnn.com/2021/01/21/politics/border...,2021-01-21 21:45:03.000 Z
1,e98b0d02b16794bc0884a61c4a45746cb9430630,Biden's DOJ must confront what to do with outs...,https://www.cnn.com/2021/01/21/politics/donald...,2021-01-22 02:11:02.000 Z
2,05f51c586ada891448ab56d9a5c8adb2a688e2fb,White House wants Democrats to be patient on s...,https://www.cnn.com/2021/01/22/politics/covid-...,2021-01-22 21:53:55.000 Z
3,91333a3c6d28c9241cbb01678a97699232b79e70,The Supreme Court's run out of excuses to avoi...,https://www.cnn.com/2021/01/24/politics/suprem...,2021-01-24 18:42:26.000 Z
