# Combining collaborative filtering and content-based filtering 

https://chatgpt.com/share/5244e191-ffac-463f-96f5-de04a7c8dc7f

## Content-Based Filtering:

- Uses TfidfVectorizer to convert product descriptions into TF-IDF vectors and computes cosine similarity between products.
Recommends products similar to a given product based on these similarity scores.
Collaborative Filtering:

- Uses SVD from the surprise library to perform matrix factorization on user-item interaction data.
Recommends products based on predicted ratings for items not yet interacted with by the user.
Sequence-Based Model (RNN):

- Prepares sequential data of user interactions and pads sequences to a fixed length.
Defines and trains an RNN model to predict the next product a user might interact with.
Recommends products based on the predictions of the RNN model.
Hybrid Approach:

- Combines recommendations from content-based filtering, collaborative filtering, and the RNN model.
Uses a weighted approach (controlled by alpha, beta, and gamma) to merge the scores from all methods.
Provides final recommendations by ranking the combined scores.


In [1]:
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD, KNNBasic
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load user-item interactions
interaction_data = pd.read_csv('./user_product_interactions.csv')

# Load product features
product_data = pd.read_csv('./product_features.csv')

# Prepare Surprise dataset
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(interaction_data[['user_id', 'product_id', 'rating']], reader)
trainset = data.build_full_trainset()


In [2]:
# Print out some information about the trainset
print("Number of users:", trainset.n_users)
print("Number of items:", trainset.n_items)
print("Number of ratings:", trainset.n_ratings)

Number of users: 1000
Number of items: 500
Number of ratings: 10000


In [3]:
# To inspect the ratings, we can iterate through the trainset
print("Ratings:")
for uid, iid, rating in trainset.all_ratings():
    print(f"user: {uid}, item: {iid}, rating: {rating}")


Ratings:
user: 0, item: 0, rating: 4.0
user: 0, item: 225, rating: 3.0
user: 0, item: 198, rating: 3.0
user: 0, item: 235, rating: 1.0
user: 0, item: 189, rating: 3.0
user: 0, item: 55, rating: 5.0
user: 0, item: 169, rating: 1.0
user: 0, item: 485, rating: 5.0
user: 1, item: 1, rating: 5.0
user: 1, item: 469, rating: 2.0
user: 1, item: 427, rating: 2.0
user: 1, item: 188, rating: 2.0
user: 1, item: 330, rating: 4.0
user: 1, item: 436, rating: 3.0
user: 1, item: 52, rating: 4.0
user: 1, item: 56, rating: 4.0
user: 1, item: 379, rating: 2.0
user: 1, item: 113, rating: 2.0
user: 1, item: 282, rating: 2.0
user: 1, item: 497, rating: 1.0
user: 2, item: 2, rating: 5.0
user: 2, item: 254, rating: 2.0
user: 2, item: 361, rating: 2.0
user: 2, item: 24, rating: 5.0
user: 2, item: 128, rating: 3.0
user: 2, item: 231, rating: 4.0
user: 2, item: 381, rating: 5.0
user: 2, item: 29, rating: 1.0
user: 2, item: 12, rating: 4.0
user: 3, item: 3, rating: 3.0
user: 3, item: 9, rating: 2.0
user: 3, item: 

In [4]:
# Assume product_data has columns: 'product_id', 'description'
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(product_data['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [5]:
print(type(cosine_sim), cosine_sim.shape, cosine_sim[:5])

<class 'numpy.ndarray'> (500, 500) [[1.         0.02393411 0.02599778 ... 1.         0.02491918 0.18021567]
 [0.02393411 1.         0.0210908  ... 0.02393411 0.02021578 0.02031635]
 [0.02599778 0.0210908  1.         ... 0.02599778 0.02195885 0.02206809]
 [0.02569617 0.02084612 0.02264354 ... 0.02569617 0.0217041  0.02181207]
 [1.         0.02393411 0.02599778 ... 1.         0.02491918 0.18021567]]


In [6]:
print('product_data.index', product_data.index)

product_data.index RangeIndex(start=0, stop=500, step=1)


In [7]:
# Function to get product recommendations based on content
def get_content_recommendations(product_id, cosine_sim=cosine_sim):
    idx = product_data.index[product_data['product_id'] == product_id].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    print('sim_scores', sim_scores[:2])
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    print('sim_scores:sorted', sim_scores)
    product_indices = [i[0] for i in sim_scores]
    print('product_indices', product_indices)
    return product_data['product_id'].iloc[product_indices]

# Example usage
content_recommendations = get_content_recommendations(1)
print("Content-Based Recommendations:", content_recommendations)

sim_scores [(0, 0.9999999999999998), (1, 0.023934105217457074)]
sim_scores:sorted [(4, 0.9999999999999998), (7, 0.9999999999999998), (28, 0.9999999999999998), (34, 0.9999999999999998), (37, 0.9999999999999998), (48, 0.9999999999999998), (68, 0.9999999999999998), (80, 0.9999999999999998), (94, 0.9999999999999998), (97, 0.9999999999999998)]
product_indices [4, 7, 28, 34, 37, 48, 68, 80, 94, 97]
Content-Based Recommendations: 4      5
7      8
28    29
34    35
37    38
48    49
68    69
80    81
94    95
97    98
Name: product_id, dtype: int64


In [8]:
# Use SVD for collaborative filtering
algo_cf = SVD()
algo_cf.fit(trainset)

# Function to get collaborative filtering recommendations
def get_collaborative_recommendations(user_id, num_recommendations=10):
    all_product_ids = interaction_data['product_id'].unique()
    predictions = [algo_cf.predict(user_id, pid) for pid in all_product_ids]
    predictions = sorted(predictions, key=lambda x: x.est, reverse=True)
    top_predictions = predictions[:num_recommendations]
    top_product_ids = [pred.iid for pred in top_predictions]
    return top_product_ids

# Example usage
collaborative_recommendations = get_collaborative_recommendations(1)
print("Collaborative Filtering Recommendations:", collaborative_recommendations)


Collaborative Filtering Recommendations: [156, 63, 346, 457, 99, 434, 12, 404, 469, 3]


In [9]:
def get_hybrid_recommendations(user_id, product_id, alpha=0.5, num_recommendations=10):
    content_recs = get_content_recommendations(product_id)
    collaborative_recs = get_collaborative_recommendations(user_id)
    print('content_recs', content_recs)
    print('collaborative_recs', collaborative_recs)

    hybrid_scores = {}
    
    for pid in content_recs:
        hybrid_scores[pid] = hybrid_scores.get(pid, 0) + alpha

    for pid in collaborative_recs:
        hybrid_scores[pid] = hybrid_scores.get(pid, 0) + (1 - alpha)
    print('hybrid_scores', hybrid_scores)
    sorted_hybrid_scores = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    hybrid_recommendations = [item[0] for item in sorted_hybrid_scores[:num_recommendations]]
    return hybrid_recommendations

# Example usage
hybrid_recommendations = get_hybrid_recommendations(1, 1)
print("Hybrid Recommendations:", hybrid_recommendations)


sim_scores [(0, 0.9999999999999998), (1, 0.023934105217457074)]
sim_scores:sorted [(4, 0.9999999999999998), (7, 0.9999999999999998), (28, 0.9999999999999998), (34, 0.9999999999999998), (37, 0.9999999999999998), (48, 0.9999999999999998), (68, 0.9999999999999998), (80, 0.9999999999999998), (94, 0.9999999999999998), (97, 0.9999999999999998)]
product_indices [4, 7, 28, 34, 37, 48, 68, 80, 94, 97]
content_recs 4      5
7      8
28    29
34    35
37    38
48    49
68    69
80    81
94    95
97    98
Name: product_id, dtype: int64
collaborative_recs [156, 63, 346, 457, 99, 434, 12, 404, 469, 3]
hybrid_scores {5: 0.5, 8: 0.5, 29: 0.5, 35: 0.5, 38: 0.5, 49: 0.5, 69: 0.5, 81: 0.5, 95: 0.5, 98: 0.5, 156: 0.5, 63: 0.5, 346: 0.5, 457: 0.5, 99: 0.5, 434: 0.5, 12: 0.5, 404: 0.5, 469: 0.5, 3: 0.5}
Hybrid Recommendations: [5, 8, 29, 35, 38, 49, 69, 81, 95, 98]


In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense

In [None]:
# Prepare sequential data for RNN
user_sequences = interaction_data.groupby('user_id')['product_id'].apply(list)

# Pad sequences to the same length
max_seq_length = 10
padded_sequences = pad_sequences(user_sequences, maxlen=max_seq_length, padding='pre')

# Prepare input and output sequences for RNN training
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

# Define RNN model
model = Sequential()
model.add(Embedding(input_dim=num_products + 1, output_dim=50, input_length=max_seq_length - 1))
model.add(SimpleRNN(50))
model.add(Dense(num_products + 1, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=10, batch_size=32)

# Function to get RNN-based recommendations
def get_rnn_recommendations(user_id, num_recommendations=10):
    user_seq = user_sequences[user_id][-max_seq_length + 1:]
    padded_seq = pad_sequences([user_seq], maxlen=max_seq_length - 1, padding='pre')
    pred = model.predict(padded_seq)
    top_indices = np.argsort(pred[0])[-num_recommendations:][::-1]
    return top_indices
