In [41]:
import pandas as pd
import numpy as np

# Load the datasets
posts_df = pd.read_csv('datasets/post_data.csv')
views_df = pd.read_csv('datasets/view_data.csv')
users_df = pd.read_csv('datasets/user_data.csv')

views_df_with_scores = pd.DataFrame(views_df)

# Generate random probabilities for 1, 2, and 3
probs = np.random.dirichlet(np.ones(3))

# Assign probabilities for the scores
scores = np.random.choice(
    [1, 2, 3],
    size=len(views_df),
    p=probs
)

# Add the score column to the dataframe
views_df_with_scores['score'] = scores

views_df_with_scores.head()

Unnamed: 0,user_id,post_id,time_stamp,score
0,5eece14ffc13ae660900008b,136781766,01/01/2019 01:30 PM,1
1,5eece14efc13ae660900003c,43094523,01/01/2019 01:33 PM,3
2,5eece14efc13ae6609000025,42428071,01/01/2019 01:43 PM,2
3,5eece14ffc13ae66090001d4,76472880,01/01/2019 01:54 PM,2
4,5eece14ffc13ae66090000ac,202721843,01/01/2019 02:00 PM,3


In [42]:
# Merge the views_df_with_scores with posts_df
merged_data = pd.merge(views_df_with_scores, posts_df ,on='post_id')
merged_data.head()

Unnamed: 0,user_id,post_id,time_stamp,score,title,category
0,5eece14ffc13ae660900008b,136781766,01/01/2019 01:30 PM,1,Sexy BANKING,banking
1,5eece14efc13ae660900003c,43094523,01/01/2019 01:33 PM,3,10 Ways To Immediately Start Selling PROGRAMMING,programming
2,5eece14efc13ae6609000025,42428071,01/01/2019 01:43 PM,2,DRAWING Adventures,drawing
3,5eece14ffc13ae66090001d4,76472880,01/01/2019 01:54 PM,2,The Ultimate Guide To POLITICS,politics
4,5eece14ffc13ae66090000ac,202721843,01/01/2019 02:00 PM,3,ZOOLOGY And Love Have 4 Things In Common,zoology


In [43]:
# Drop the columns that are not needed and remove the rows with missing title values
cleaned_data = merged_data.drop(['time_stamp', 'category'], axis=1).dropna(axis = 0, subset = ['title'])
cleaned_data.head()

Unnamed: 0,user_id,post_id,score,title
0,5eece14ffc13ae660900008b,136781766,1,Sexy BANKING
1,5eece14efc13ae660900003c,43094523,3,10 Ways To Immediately Start Selling PROGRAMMING
2,5eece14efc13ae6609000025,42428071,2,DRAWING Adventures
3,5eece14ffc13ae66090001d4,76472880,2,The Ultimate Guide To POLITICS
4,5eece14ffc13ae66090000ac,202721843,3,ZOOLOGY And Love Have 4 Things In Common


In [44]:
# Create a new dataframe with the total score for each post
post_total_scores = (cleaned_data.
     groupby(by = ['title'])['score'].
     count().
     reset_index().
     rename(columns = {'score': 'total_score'})
     [['title', 'total_score']]
    )
post_total_scores.head()

Unnamed: 0,title,total_score
0,10 Funny ART Quotes,15
1,10 Funny BANKING Quotes,10
2,10 Funny BUSINESS Quotes,13
3,10 Funny CRAFT Quotes,9
4,10 Funny DANCE Quotes,19


In [45]:
# Merge the two dataframes
posts_with_score_details = post_total_scores.merge(cleaned_data, left_on = 'title', right_on = 'title', how = 'left')
posts_with_score_details.head()

Unnamed: 0,title,total_score,user_id,post_id,score
0,10 Funny ART Quotes,15,5eece14ffc13ae66090001f0,436754978,2
1,10 Funny ART Quotes,15,5eece14ffc13ae660900018d,436754978,3
2,10 Funny ART Quotes,15,5eece14ffc13ae6609000162,436754978,2
3,10 Funny ART Quotes,15,5eece14ffc13ae6609000149,436754978,3
4,10 Funny ART Quotes,15,5eece14ffc13ae660900012b,436754978,1


In [46]:
from scipy.sparse import csr_matrix
unique_user_post_scores = posts_with_score_details.drop_duplicates(['user_id', 'title'])
user_post_score_matrix =  unique_user_post_scores.pivot(index = 'title', columns = 'user_id', values = 'score').fillna(0)
sparse_user_post_matrix = csr_matrix(user_post_score_matrix.values)

# Calculate the total number of elements
total_elements = sparse_user_post_matrix.shape[0] * sparse_user_post_matrix.shape[1]

# Get the number of zero elements
non_zero_elements = sparse_user_post_matrix.nnz
print(f"Number of non-zero elements: {non_zero_elements}")

# Calculate sparsity
sparsity = (total_elements - non_zero_elements) / total_elements

print(f"Sparsity: {sparsity:.2%}")

Number of non-zero elements: 70614
Sparsity: 97.65%


In [60]:

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(sparse_user_post_matrix)

In [None]:
query_index = np.random.choice(user_post_score_matrix.shape[0])

# Get top 10 recommendations
distances, indices = model_knn.kneighbors(user_post_score_matrix.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 11)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print(f"Recommendations for {user_post_score_matrix.index[query_index]}:")
    else:
        print(f"{i}: {user_post_score_matrix.index[indices.flatten()[i]]}, with distance of {distances.flatten()[i]}")
        
            


Recommendations for  How To Become Better With DANCE In 10 Minutes:
1:  What Can Instagramm Teach You About ART, with distance of 0.4865639691897298
2:  The Untold Secret To OPERATING SYSTEM In Less Than Ten Minutes, with distance of 0.5271945711553498
3:  OMG! The Best MUSIC Ever!, with distance of 0.5377498364789758
4:  How To Win Clients And Influence Markets with ZOOLOGY, with distance of 0.5532811099562212
5:  The Ultimate Secret Of GST, with distance of 0.6176404435490638
6: POLITICS Iphone Apps, with distance of 0.6225743219518014
7:  Want A Thriving Business? Focus On DANCE!, with distance of 0.6241769859985855
8:  Get Rid of PAINTING For Good, with distance of 0.6257724004081255
9: The Ultimate Guide To FASHION DESIGN, with distance of 0.6419425629802835
10:  How You Can (Do) DRAWING In 24 Hours Or Less For Free, with distance of 0.6464466094067263
