In [34]:
import pandas as pd
import numpy as np

# Load the raw datasets (paths are relative to the notebook's working directory)
posts_df = pd.read_csv('datasets/post_data.csv')
views_df = pd.read_csv('datasets/view_data.csv')
users_df = pd.read_csv('datasets/user_data.csv')

# Seeded generator so the synthetic scores are identical on every
# Restart & Run All; change SEED to draw a different sample.
SEED = 42
rng = np.random.default_rng(SEED)

# Draw one random probability vector over the three possible ratings
probs = rng.dirichlet(np.ones(3))

# Sample a synthetic rating (1, 2 or 3) for every view row using those probabilities
scores = rng.choice(
    [1, 2, 3],
    size=len(views_df),
    p=probs
)

# assign() returns a new frame, so views_df itself is left untouched.
# (pd.DataFrame(views_df) does not make an independent copy of the data.)
views_df_with_scores = views_df.assign(score=scores)

views_df_with_scores.head()

Unnamed: 0,user_id,post_id,time_stamp,score
0,5eece14ffc13ae660900008b,136781766,01/01/2019 01:30 PM,1
1,5eece14efc13ae660900003c,43094523,01/01/2019 01:33 PM,1
2,5eece14efc13ae6609000025,42428071,01/01/2019 01:43 PM,3
3,5eece14ffc13ae66090001d4,76472880,01/01/2019 01:54 PM,3
4,5eece14ffc13ae66090000ac,202721843,01/01/2019 02:00 PM,1


In [35]:
# Attach the post columns to every scored view. Rows are matched on post_id
# with an inner join, so views whose post_id is absent from posts_df are dropped.
merged_data = views_df_with_scores.merge(posts_df, how='inner', on='post_id')
merged_data.head()

Unnamed: 0,user_id,post_id,time_stamp,score,title,category
0,5eece14ffc13ae660900008b,136781766,01/01/2019 01:30 PM,1,Sexy BANKING,banking
1,5eece14efc13ae660900003c,43094523,01/01/2019 01:33 PM,1,10 Ways To Immediately Start Selling PROGRAMMING,programming
2,5eece14efc13ae6609000025,42428071,01/01/2019 01:43 PM,3,DRAWING Adventures,drawing
3,5eece14ffc13ae66090001d4,76472880,01/01/2019 01:54 PM,3,The Ultimate Guide To POLITICS,politics
4,5eece14ffc13ae66090000ac,202721843,01/01/2019 02:00 PM,1,ZOOLOGY And Love Have 4 Things In Common,zoology


In [36]:
# Discard the timestamp and category columns, then remove every row
# whose title is missing.
cleaned_data = (
    merged_data
    .drop(columns=['time_stamp', 'category'])
    .dropna(subset=['title'])
)
cleaned_data.head()

Unnamed: 0,user_id,post_id,score,title
0,5eece14ffc13ae660900008b,136781766,1,Sexy BANKING
1,5eece14efc13ae660900003c,43094523,1,10 Ways To Immediately Start Selling PROGRAMMING
2,5eece14efc13ae6609000025,42428071,3,DRAWING Adventures
3,5eece14ffc13ae66090001d4,76472880,3,The Ultimate Guide To POLITICS
4,5eece14ffc13ae66090000ac,202721843,1,ZOOLOGY And Love Have 4 Things In Common


In [37]:
# Count the score rows recorded for each title.
# NOTE: the result column is named total_score, but count() makes it the
# number of score rows per title, not the sum of the score values.
post_total_scores = (
    cleaned_data
    .groupby('title')['score']
    .count()
    .rename('total_score')
    .reset_index()
    .loc[:, ['title', 'total_score']]
)
post_total_scores.head()

Unnamed: 0,title,total_score
0,10 Funny ART Quotes,15
1,10 Funny BANKING Quotes,10
2,10 Funny BUSINESS Quotes,13
3,10 Funny CRAFT Quotes,9
4,10 Funny DANCE Quotes,19


In [38]:
# Left-join the per-title totals onto the individual score rows, so each
# (user, post, score) row also carries its title's total_score.
posts_with_score_details = pd.merge(
    post_total_scores,
    cleaned_data,
    on='title',
    how='left',
)
posts_with_score_details.head()

Unnamed: 0,title,total_score,user_id,post_id,score
0,10 Funny ART Quotes,15,5eece14ffc13ae66090001f0,436754978,1
1,10 Funny ART Quotes,15,5eece14ffc13ae660900018d,436754978,1
2,10 Funny ART Quotes,15,5eece14ffc13ae6609000162,436754978,1
3,10 Funny ART Quotes,15,5eece14ffc13ae6609000149,436754978,2
4,10 Funny ART Quotes,15,5eece14ffc13ae660900012b,436754978,3


In [39]:
from scipy.sparse import csr_matrix

# Keep the first row for each (user_id, title) pair so that every cell of
# the pivot below is filled by at most one score.
unique_user_post_scores = posts_with_score_details.drop_duplicates(subset=['user_id', 'title'])

# One row per title, one column per user; pairs with no score become 0.
user_post_score_matrix = (
    unique_user_post_scores
    .pivot(index='title', columns='user_id', values='score')
    .fillna(0)
)

# Compressed sparse row form of the same title-by-user matrix
sparse_user_post_matrix = csr_matrix(user_post_score_matrix.values)

# Total number of cells (rows * columns)
total_elements = (
    sparse_user_post_matrix.shape[0]
    * sparse_user_post_matrix.shape[1]
)

# Number of stored (non-zero) entries in the sparse matrix
non_zero_elements = sparse_user_post_matrix.nnz
print(f"Number of non-zero elements: {non_zero_elements}")

# Sparsity = share of cells that hold no score
sparsity = (total_elements - non_zero_elements) / total_elements
print(f"Sparsity: {sparsity:.2%}")

user_post_score_matrix

Number of non-zero elements: 70614
Sparsity: 97.65%


user_id,5eece14efc13ae6609000000,5eece14efc13ae6609000001,5eece14efc13ae6609000002,5eece14efc13ae6609000003,5eece14efc13ae6609000004,5eece14efc13ae6609000005,5eece14efc13ae6609000006,5eece14efc13ae6609000007,5eece14efc13ae6609000008,5eece14efc13ae6609000009,...,5eece14ffc13ae66090001ea,5eece14ffc13ae66090001eb,5eece14ffc13ae66090001ec,5eece14ffc13ae66090001ed,5eece14ffc13ae66090001ee,5eece14ffc13ae66090001ef,5eece14ffc13ae66090001f0,5eece14ffc13ae66090001f1,5eece14ffc13ae66090001f2,5eece14ffc13ae66090001f3
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Funny ART Quotes,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10 Funny BANKING Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Funny BUSINESS Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Funny CRAFT Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Funny DANCE Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZOOLOGY: Do You Really Need It? This Will Help You Decide!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZOOLOGY: The Samurai Way,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZOOLOGY: This Is What Professionals Do,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZOOLOGY: What A Mistake!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
