In [38]:
import pandas as pd


df_users_interactions = pd.read_csv('users_interactions.csv')
df_shared_articles = pd.read_csv('shared_articles.csv')


print(df_users_interactions.shape)
print(df_users_interactions.isna().sum())

print(df_shared_articles.shape)
print(df_shared_articles.isna().sum())



(72312, 8)
timestamp          0
eventType          0
contentId          0
personId           0
sessionId          0
userAgent      15394
userRegion     15405
userCountry    15394
dtype: int64
(3122, 13)
timestamp          0
eventType          0
contentId          0
authorPersonId     0
authorSessionId    0
                  ..
contentType        0
url                0
title              0
text               0
lang               0
Length: 13, dtype: int64


In [39]:
print(df_shared_articles.columns)


Index(['timestamp', 'eventType', 'contentId', 'authorPersonId',
       'authorSessionId', 'authorUserAgent', 'authorRegion', 'authorCountry',
       'contentType', 'url', 'title', 'text', 'lang'],
      dtype='object')


In [40]:
# Fill missing values in specific columns
df_shared_articles['authorPersonId'].fillna('unknown', inplace=True)
df_shared_articles['authorSessionId'].fillna('unknown', inplace=True)
df_shared_articles['authorUserAgent'] = df_shared_articles['authorUserAgent'].fillna('unknown')
df_shared_articles['authorRegion'] = df_shared_articles['authorRegion'].fillna('unknown')
df_shared_articles['authorCountry'] = df_shared_articles['authorCountry'].fillna('unknown')
df_shared_articles['contentType'].fillna('unknown', inplace=True)
df_shared_articles['url'].fillna('unknown', inplace=True)
df_shared_articles['title'].fillna('unknown', inplace=True)
df_shared_articles['text'].fillna('unknown', inplace=True)
df_shared_articles['lang'].fillna('unknown', inplace=True)

# If there are other columns like 'timestamp' or 'eventType' that may have missing values,
# you can handle them as well. For example, if we choose to fill missing timestamps with the mode:
df_shared_articles['timestamp'].fillna(df_shared_articles['timestamp'].mode()[0], inplace=True)

# If we need to handle 'eventType' missing values similarly:
df_shared_articles['eventType'].fillna(df_shared_articles['eventType'].mode()[0], inplace=True)

# Drop rows where critical data might be missing, such as contentId (since it's used for recommendations)
df_shared_articles.dropna(subset=['contentId'], inplace=True)

# Reset index after filling and dropping rows
df_shared_articles.reset_index(drop=True, inplace=True)

# Print out the missing values in each column after the replacement
print(df_shared_articles.isna().sum(), '\n')

# Display the shape of the dataframe to verify the changes
print(df_shared_articles.shape)


timestamp          0
eventType          0
contentId          0
authorPersonId     0
authorSessionId    0
                  ..
contentType        0
url                0
title              0
text               0
lang               0
Length: 13, dtype: int64 

(3122, 13)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_shared_articles['authorPersonId'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_shared_articles['authorSessionId'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

# Create a TfidfVectorizer and Remove stopwords
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data to a tfidf matrix
tfidf_matrix = tfidf.fit_transform(df_shared_articles['title'])

# Print the shape of the tfidf_matrix
print(tfidf_matrix.shape)

# Preview the matrix by placing it into a DataFrame (which we won't need later)
df_tfidf = pd.DataFrame(tfidf_matrix.T.todense(), index=tfidf.get_feature_names_out(), columns=df_shared_articles['title'])
df_tfidf.iloc[2221:2226]

(3122, 6973)


title,"Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's","Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's.1",Bitcoin Future: When GBPcoin of Branson Wins Over USDcoin of Trump,Google Data Center 360° Tour,"IBM Wants to ""Evolve the Internet"" With Blockchain Technology",...,"Conheça a Liga IoT, plataforma de inovação aberta que irá acelerar projetos ligados a Internet das Coisas",Amazon takes on Skype and GoToMeeting with its Chime video conferencing app,Code.org 2016 Annual Report,"JPMorgan Software Does in Seconds What Took Lawyers 360,000 Hours",The 2017 Acquia Partners of the Year
elevam,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
elimina,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
eliminate,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
elon,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
elopar,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0


In [42]:
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

# Assuming `tfidf_matrix` is your TF-IDF matrix and `df_shared_articles` is your DataFrame containing the article data (with 'contentId' and 'userId' columns)

# Compute the cosine similarity between each article description
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# For easier viewing, put the cosine similarity into a DataFrame with 'contentId' as both columns and index
df_results = pd.DataFrame(cosine_sim, columns=df_shared_articles['contentId'], index=df_shared_articles['contentId'])

# Now, use 'userId' as the index instead of 'contentId' for the rows
# Assuming `df_shared_articles` has 'userId' for each article, map it accordingly

# Save the DataFrame to a CSV file
df_results.to_csv('content_filtering.csv')

# Optional: Print the DataFrame to inspect it
print(df_results.head())


contentId             -6451309518266745024  -4110354420726924665  \
contentId                                                          
-6451309518266745024              1.000000              1.000000   
-4110354420726924665              1.000000              1.000000   
-7292285110016212249              0.079058              0.079058   
-6151852268067518688              0.000000              0.000000   
 2448026894306402386              0.000000              0.000000   

contentId             -7292285110016212249  -6151852268067518688  \
contentId                                                          
-6451309518266745024              0.079058                   0.0   
-4110354420726924665              0.079058                   0.0   
-7292285110016212249              1.000000                   0.0   
-6151852268067518688              0.000000                   1.0   
 2448026894306402386              0.000000                   0.0   

contentId              2448026894306402386  ..

In [43]:
df_sorted = pd.DataFrame(cosine_sim).sort_values(by=[0], ascending=False)

for id in df_sorted.index[0:5]:
  print(id, '\t', df_shared_articles.loc[id, 'title'])

df_sorted.head(10)

0 	 Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's
1 	 Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's
187 	 Microsoft Continues to Embrace Ethereum & Bitcoin - Bitcoin News
2455 	 Bitcoin Accepted! German Energy Giant Enables Payments - CCN: Financial Bitcoin & Cryptocurrency News
184 	 Ethereum and Bitcoin Are Market Leaders But Not Competitors


Unnamed: 0,0,1,2,3,4,...,3117,3118,3119,3120,3121
0,1.0,1.0,0.079058,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.079058,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
187,0.275459,0.275459,0.155049,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
2455,0.24131,0.24131,0.109661,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
184,0.236723,0.236723,0.095456,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
987,0.220653,0.220653,0.066307,0.0,0.073255,...,0.0,0.0,0.0,0.0,0.0
2176,0.213223,0.213223,0.198667,0.0,0.219486,...,0.0,0.0,0.0,0.0,0.0
1059,0.18645,0.18645,0.064837,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
18,0.182754,0.182754,0.073693,0.0,0.081416,...,0.0,0.0,0.077294,0.0,0.0
992,0.175375,0.175375,0.0,0.060295,0.0,...,0.0,0.0,0.0,0.0,0.0


In [44]:
def get_recommendations(item_id, sim_matrix, n=10, messages=True):
  if item_id > sim_matrix.shape[0]:  # Add some error checking for robustness
    print(f"Item {item_id} is not in the similarity matrix you provided")
    return

  # Get the pairwise similarity scores of all movies with that movie
  sim_scores = list(enumerate(sim_matrix[item_id]))

  # Sort the items based on the similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # Get the scores of the n most similar items; start at 1 so that it skips itself
  top_similar = sim_scores[1:n+1]

  # Put the recommended item indices and similarity scores together in a dictionary using comprehension
  rec_dict = {i[0]:i[1] for i in top_similar}

  if messages:
    print(f"The top recommended item IDs are: {list(rec_dict.keys())}")
    print(f"Their similarity scores are:\t  {list(rec_dict.values())}")

  # Return the top n most similar items
  return rec_dict

In [45]:
# Change this value to any title you'd like to get recommendations
title = "Google Data Center 360° Tour"

print(df_shared_articles.columns)

# Check if the title is valid; if not, suggest alternatives and use the last one for recommendations
if title in df_shared_articles['title'].to_list():
  id = df_shared_articles.index[df_shared_articles['title']==title][0] # Convert the title to an index (i.e. item ID)
else:
  print(f"\"{title}\" is not in the data set. Try one of these:\n")
  for row in df_shared_articles.sample(n=10).itertuples():  # Get a random 10 titles
    id = row[0]
    title = row.title
    print(f'\t{title}')

print(f"\nIf you like \"{title},\" then you may also like:\n")

# Call the function and return the dictionary; print out the dictionary if you want to see what it is
recommend_dict = get_recommendations(id, cosine_sim, n=10, messages=False)

# Add the dictionary to a new DataFrame; this isn't necessary, but it helps to see what articles are recommended
df_similarity = pd.DataFrame(data=recommend_dict.values(), columns=['similarity'], index=recommend_dict.keys())

# Create a subset of the original df DataFrame with only the recommended articles
df_recommendations = df_shared_articles.loc[df_shared_articles.index.isin(recommend_dict.keys()), ['title', 'contentId']]

# Join the original df results with the recommended article similarity scores so that we can sort the list and view it
df_recommendations.join(df_similarity).sort_values(by=['similarity'], ascending=False)

Index(['timestamp', 'eventType', 'contentId', 'authorPersonId',
       'authorSessionId', 'authorUserAgent', 'authorRegion', 'authorCountry',
       'contentType', 'url', 'title', 'text', 'lang'],
      dtype='object')

If you like "Google Data Center 360° Tour," then you may also like:



Unnamed: 0,title,contentId,similarity
142,Google shares data center security and design ...,8298709454703868984,0.337687
1142,Diane Greene wants to put the enterprise front...,1929674614667189969,0.22311
1236,360 million reasons to destroy all passwords -...,2765063319512128208,0.196657
2563,Visa inaugura Co-Creation Center em São Paulo,-4487024160266973763,0.186554
1749,Código Google: Introdução da próxima geração d...,4375556914674736641,0.180178
3120,JPMorgan Software Does in Seconds What Took La...,6607431762270322325,0.174994
1862,Portal Abrasce - Conhecendo o frequentador de ...,8541347773340490171,0.174492
2881,A Data Engineer's Guide To Non-Traditional Dat...,4702453495211185350,0.170292
2113,Opening Up Data Science with data.world - Jono...,4479576054067772847,0.16923
77,Coin Center Opens Nominations for the 2016 Blo...,-7101541512657907485,0.168267
