Colloborative filtering model

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/recommendation/multi-category-dataset-recommendation.csv')
df = data.dropna()

In [3]:
columns_to_keep = ['category_code', 'event_type', 'event_time', 'brand', 'price', 'age', 'gender', 'location']
df = data[columns_to_keep].copy()

split_category = df['category_code'].str.split('.', 1, expand=True)
df['category'] = split_category[0]
df['product_name'] = split_category[1]

df['age'].fillna(df['age'].mean(), inplace=True)

df['event_time'] = pd.to_datetime(df['event_time'], format='%Y-%m-%d %H:%M:%S %Z', utc=True)

print(df.head())

  split_category = df['category_code'].str.split('.', 1, expand=True)


               category_code event_type                event_time   brand  \
0     electronics.smartphone       view 2019-11-01 00:00:00+00:00  xiaomi   
1  appliances.sewing_machine       view 2019-11-01 00:00:00+00:00  janome   
2                        NaN       view 2019-11-01 00:00:01+00:00   creed   
3  appliances.kitchen.washer       view 2019-11-01 00:00:01+00:00      lg   
4     electronics.smartphone       view 2019-11-01 00:00:01+00:00  xiaomi   

    price  age gender       location     category    product_name  
0  489.07   20      M       Northern  electronics      smartphone  
1  293.65   25      M        Central   appliances  sewing_machine  
2   28.31   20      F  North Central          NaN             NaN  
3  712.87   18      M   Sabaragamuwa   appliances  kitchen.washer  
4  183.27   53      M        Eastern  electronics      smartphone  


In [4]:
import pandas as pd

# Assuming you don't have a user_id, you can create user profiles based on age, gender, and location.
user_profiles = df.groupby(['age', 'gender', 'location']).agg({
    'event_type': 'count'
}).reset_index()

user_profiles.rename(columns={'event_type': 'total_interactions'}, inplace=True)

print(user_profiles.head())


   age gender       location  total_interactions
0   18      F        Central                  96
1   18      F        Eastern                  81
2   18      F  North Central                 101
3   18      F  North Western                 110
4   18      F       Northern                 122


In [5]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162726 sha256=9aeee6b252c70777edd9a2cfe15f4e0c8ecbc60bdd6769bf556e521839bbe4ae
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [6]:
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

df['event_type_binary'] = df['event_type'].map({'view': 1, 'cart': 2, 'purchase': 3})
df['pseudo_user_id'] = pd.factorize(df['product_name'])[0]

reader = Reader(rating_scale=(1, 3))  # The rating scale now accurately reflects our event type mapping
data = Dataset.load_from_df(df[['pseudo_user_id', 'product_name', 'event_type_binary']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [7]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f962db46140>

In [8]:
predictions = model.test(testset)

In [9]:
# Recommendation function adapted for the scenario without using user IDs
def collaborative_filtering_recommendation(num_recommendations=5):
    # Calculate the average rating for each product
    average_ratings = df.groupby('product_name')['event_type_binary'].mean().reset_index()

    # Sort the products by their average rating in descending order
    top_rated_products = average_ratings.sort_values(by='event_type_binary', ascending=False).head(num_recommendations)

    # Return the top N recommended products
    return top_rated_products

# Get top recommended products
top_recommendations = collaborative_filtering_recommendation()

# Display the top recommended products
print("Top Recommended Products:")
for index, row in top_recommendations.iterrows():
    print(f"Product Name: {row['product_name']}, Average Rating: {row['event_type_binary']}")


Top Recommended Products:
Product Name: shoes.sandals, Average Rating: 1.105263157894737
Product Name: accessories.radar, Average Rating: 1.1014492753623188
Product Name: components.power_supply, Average Rating: 1.0869565217391304
Product Name: environment.water_heater, Average Rating: 1.0833333333333333
Product Name: audio.headphone, Average Rating: 1.081798866855524


In [10]:
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.2644
MAE:  0.0712


In [11]:
# Print evaluation results
print(f'RMSE: {rmse}')  # Root Mean Square Error
print(f'MAE: {mae}')  # Mean Absolute Error


RMSE: 0.26441333710480214
MAE: 0.07120548019774448


In [12]:
rmse_accuracy = 100 - (rmse * 100)
mae_accuracy = 100 - (mae * 100)

# Print the accuracy results as percentages
print(f'RMSE Accuracy: {rmse_accuracy:.2f}%')
print(f'MAE Accuracy: {mae_accuracy:.2f}%')

RMSE Accuracy: 73.56%
MAE Accuracy: 92.88%


In [13]:
import pickle

# Save the model
with open('collaborative_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully.")

Model saved successfully.


Content Based Model

In [14]:
df['product_profile'] = df.apply(lambda x: f"{x['brand']}_{x['category']}_{x['product_name']}", axis=1)


In [15]:
df['user_profile'] = df.apply(lambda x: f"{x['age']}_{x['gender']}_{x['location']}", axis=1)


In [16]:
event_type_mapping = {
    'view': 1,
    'cart': 2,
    'purchase': 3
}
df['event_type_rating'] = df['event_type'].map(event_type_mapping)


In [17]:
reader = Reader(rating_scale=(1, 3))
data = Dataset.load_from_df(df[['user_profile', 'product_profile', 'event_type_rating']], reader)


In [18]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [19]:
from surprise import Reader, Dataset, NMF, accuracy


In [20]:
content_model = NMF()
content_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f962d9c8820>

In [21]:
# Making predictions
predictions = content_model.test(testset)

# Evaluation
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.2670
MAE:  0.0529


In [22]:
print(f'RMSE: {rmse:.4f}, MAE: {mae:.4f}')

# Recommendation function adapted for user profiles and product profiles
def recommend_for_profile(user_profile, num_recommendations=5):
    unique_product_profiles = df['product_profile'].unique()
    predictions = [model.predict(user_profile, product_profile).est for product_profile in unique_product_profiles]
    recommendations = sorted(zip(unique_product_profiles, predictions), key=lambda x: x[1], reverse=True)[:num_recommendations]
    return recommendations

# Example usage
recommendations = recommend_for_profile("25_M_NewYork", 5)
print("Top 5 Recommendations:")
for product_profile, score in recommendations:
    print(f"{product_profile}: {score:.4f}")

RMSE: 0.2670, MAE: 0.0529
Top 5 Recommendations:
xiaomi_electronics_smartphone: 1.0388
janome_appliances_sewing_machine: 1.0388
creed_nan_nan: 1.0388
lg_appliances_kitchen.washer: 1.0388
hp_computers_notebook: 1.0388


In [23]:
import pickle

# Save the model
with open('content_model.pkl', 'wb') as file:
    pickle.dump(content_model, file)

print("Model saved successfully.")

Model saved successfully.
