## Creating the score column

In [31]:
import pandas as pd
import numpy as np

# Load the datasets
post_data = pd.read_csv('post_data.csv')
view_data = pd.read_csv('view_data.csv')

# Work on an explicit copy of the raw views: pd.DataFrame(view_data) does not
# guarantee independent storage on every pandas version, so columns added to
# `dataframe` later could otherwise show up in `view_data` as well.
dataframe = view_data.copy()

# Fixed seed so that a fresh "Restart & Run All" reproduces the same synthetic
# scores (and therefore the same downstream tables and sparsity figures).
SEED = 42
rng = np.random.default_rng(SEED)

# Draw one random probability vector over the three ratings 1, 2 and 3
probs = rng.dirichlet(np.ones(3))

# Sample one rating per row of view_data according to those probabilities
scores = rng.choice(
    [1, 2, 3],
    size=len(view_data),
    p=probs
)

# Add the score column to the dataframe
dataframe['score'] = scores

dataframe.head()

Unnamed: 0,user_id,post_id,time_stamp,score
0,5eece14ffc13ae660900008b,136781766,01/01/2019 01:30 PM,3
1,5eece14efc13ae660900003c,43094523,01/01/2019 01:33 PM,1
2,5eece14efc13ae6609000025,42428071,01/01/2019 01:43 PM,2
3,5eece14ffc13ae66090001d4,76472880,01/01/2019 01:54 PM,3
4,5eece14ffc13ae66090000ac,202721843,01/01/2019 02:00 PM,1


In [32]:
# Join the scored views with post_data on the shared post_id key.
# This is an inner join (the pandas default), so only post_id values present
# in both frames are kept.
df = dataframe.merge(post_data, how='inner', on='post_id')
df.tail()

Unnamed: 0,user_id,post_id,time_stamp,score,title,category
71794,5eece14ffc13ae660900018c,615389604,12/31/2019 12:37 AM,1,5 Brilliant Ways To Teach Your Audience About ...,operating system
71795,5eece14ffc13ae660900010c,348689108,12/31/2019 12:50 PM,1,The Secrets To Finding World Class Tools For ...,GST
71796,5eece14ffc13ae6609000190,619052165,12/31/2019 12:51 AM,1,Double Your Profit With These 5 Tips on CRAFT,Craft
71797,5eece14efc13ae6609000067,426384418,12/31/2019 12:51 PM,1,It's All About (The) DANCE,dance
71798,5eece14ffc13ae6609000110,165390871,12/31/2019 12:54 AM,1,3 Ways You Can Reinvent ZOOLOGY Without Looki...,zoology


In [33]:
# time_stamp and category are not needed for now, so leave them out of `data`
unused_columns = ['time_stamp', 'category']
data = df.drop(columns=unused_columns)
data.tail()

Unnamed: 0,user_id,post_id,score,title
71794,5eece14ffc13ae660900018c,615389604,1,5 Brilliant Ways To Teach Your Audience About ...
71795,5eece14ffc13ae660900010c,348689108,1,The Secrets To Finding World Class Tools For ...
71796,5eece14ffc13ae6609000190,619052165,1,Double Your Profit With These 5 Tips on CRAFT
71797,5eece14efc13ae6609000067,426384418,1,It's All About (The) DANCE
71798,5eece14ffc13ae6609000110,165390871,1,3 Ways You Can Reinvent ZOOLOGY Without Looki...


In [34]:
# Keep only rows that have a title, then build a per-title summary frame.
# Note: `totalScore` is produced with .count(), i.e. it is the number of
# non-null score rows for each title, not the sum of the score values.
posts_with_score = data.dropna(subset=['title'])

score_count_by_title = posts_with_score.groupby('title')['score'].count()
post_totalScore = score_count_by_title.rename('totalScore').reset_index()
post_totalScore.head()

Unnamed: 0,title,totalScore
0,10 Funny ART Quotes,15
1,10 Funny BANKING Quotes,10
2,10 Funny BUSINESS Quotes,13
3,10 Funny CRAFT Quotes,9
4,10 Funny DANCE Quotes,19


In [35]:
# Left-join the per-title totalScore back onto every row of posts_with_score
score_with_totalValuableCount = pd.merge(
    posts_with_score,
    post_totalScore,
    on='title',
    how='left',
)
score_with_totalValuableCount.tail()

Unnamed: 0,user_id,post_id,score,title,totalScore
71794,5eece14ffc13ae660900018c,615389604,1,5 Brilliant Ways To Teach Your Audience About ...,14
71795,5eece14ffc13ae660900010c,348689108,1,The Secrets To Finding World Class Tools For ...,7
71796,5eece14ffc13ae6609000190,619052165,1,Double Your Profit With These 5 Tips on CRAFT,8
71797,5eece14efc13ae6609000067,426384418,1,It's All About (The) DANCE,14
71798,5eece14ffc13ae6609000110,165390871,1,3 Ways You Can Reinvent ZOOLOGY Without Looki...,12


In [36]:
from scipy.sparse import csr_matrix
# Keep one row per (user_id, title) pair so that the pivot below has a single
# value for every cell.
rating_popular_post = score_with_totalValuableCount.drop_duplicates(subset=['user_id', 'title'])

# Title x user table of scores; pairs with no score are filled with 0.
rating_popular_post_pivot = (
    rating_popular_post
    .pivot(index='title', columns='user_id', values='score')
    .fillna(0)
)
rating_popular_post_matrix = csr_matrix(rating_popular_post_pivot.values)

# Report the matrix dimensions and the total number of cells
n_rows, n_cols = rating_popular_post_matrix.shape
total_elements = n_rows * n_cols
print(f"Shape[0]: {n_rows}")
print(f"Shape[1]: {n_cols}")
print(f"Total number of elements: {total_elements}")

# Entries actually stored in the sparse matrix
non_zero_elements = rating_popular_post_matrix.nnz
print(f"Number of non-zero elements: {non_zero_elements}")

# Share of cells that hold no score
sparsity = (total_elements - non_zero_elements) / total_elements

print(f"Sparsity: {sparsity:.2%}")

rating_popular_post_pivot

Shape[0]: 6000
Shape[1]: 500
Total number of elements: 3000000
Number of non-zero elements: 70614
Sparsity: 97.65%


user_id,5eece14efc13ae6609000000,5eece14efc13ae6609000001,5eece14efc13ae6609000002,5eece14efc13ae6609000003,5eece14efc13ae6609000004,5eece14efc13ae6609000005,5eece14efc13ae6609000006,5eece14efc13ae6609000007,5eece14efc13ae6609000008,5eece14efc13ae6609000009,...,5eece14ffc13ae66090001ea,5eece14ffc13ae66090001eb,5eece14ffc13ae66090001ec,5eece14ffc13ae66090001ed,5eece14ffc13ae66090001ee,5eece14ffc13ae66090001ef,5eece14ffc13ae66090001f0,5eece14ffc13ae66090001f1,5eece14ffc13ae66090001f2,5eece14ffc13ae66090001f3
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Funny ART Quotes,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10 Funny BANKING Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Funny BUSINESS Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Funny CRAFT Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Funny DANCE Quotes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZOOLOGY: Do You Really Need It? This Will Help You Decide!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZOOLOGY: The Samurai Way,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZOOLOGY: This Is What Professionals Do,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZOOLOGY: What A Mistake!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
