In [27]:
import os

# Project root that contains the relative "data/" and "output/" folders used below.
# The default is the original author's checkout; set RECOMMENDATION_PROJECT_DIR to
# run the notebook from another machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "RECOMMENDATION_PROJECT_DIR",
    "/home/hamza-arain/Documents/code/recmmendation",
)
os.chdir(PROJECT_DIR)

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Read the retail export, parse the invoice timestamp, derive a calendar-date
# column from it, and keep only the rows whose Country is "France".
df = (
    pd.read_csv("data/online_retail_final.csv")
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
    .assign(Date=lambda frame: frame['InvoiceDate'].dt.date)
    .loc[lambda frame: frame['Country'] == "France"]
)


print(df.shape)
df.head(2)

(8468, 10)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,Date
26,536370,22728,ALARM CLOCK BAKELIKE PINK,24,2010-12-01 08:45:00,3.75,12583.0,France,90.0,2010-12-01
27,536370,22727,ALARM CLOCK BAKELIKE RED,24,2010-12-01 08:45:00,3.75,12583.0,France,90.0,2010-12-01


In [30]:
## Association Rule

import joblib
from mlxtend.frequent_patterns import apriori, association_rules

class AssociationRule:
    """Association-rule recommender built with mlxtend's apriori.

    Rules are mined from invoice baskets (one row per invoice, one column per
    StockCode) and cached at ``output/rules.joblib``. ``recommend`` looks up the
    consequents of every rule whose antecedents contain a given product.
    """

    def __init__(self, dataframe):
        """Keep a private copy of ``dataframe`` and load or mine the rules.

        Args:
            dataframe: transactions with at least the columns InvoiceNo,
                StockCode, Description, Quantity and UnitPrice.
        """
        self.dataframe = dataframe.copy()
        self.rules_path = "output/rules.joblib"

        # Reuse cached rules when present, otherwise mine and cache them.
        # NOTE: joblib.load unpickles the file - only load caches you produced yourself.
        if os.path.exists(self.rules_path):
            self.rules = joblib.load(self.rules_path)
        else:
            self.rules = self.train_rules(min_support=0.07, min_threshold=0.01)

    def outlier_thresholds(self, dataframe, variable):
        """Return ``(low_limit, up_limit)`` for ``variable``.

        The limits sit 1.5 x the (1st-99th percentile) range below the 1st
        percentile and above the 99th percentile respectively.
        """
        lower_quantile = dataframe[variable].quantile(0.01)
        upper_quantile = dataframe[variable].quantile(0.99)
        quantile_range = upper_quantile - lower_quantile
        up_limit = upper_quantile + 1.5 * quantile_range
        # The lower fence is measured from the LOWER quantile (the original
        # subtracted from the upper one, which put the fence far too high).
        low_limit = lower_quantile - 1.5 * quantile_range
        return low_limit, up_limit

    def replace_with_thresholds(self, dataframe, variable):
        """Cap ``dataframe[variable]`` to its outlier limits (modifies ``dataframe``)."""
        low_limit, up_limit = self.outlier_thresholds(dataframe, variable)
        # clip() upcasts integer columns as needed instead of writing floats into
        # an int column element-wise.
        dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

    def create_invoice_product_df(self, dataframe, id=False):
        """Build a 0/1 invoice x product matrix.

        Args:
            dataframe: transactions with InvoiceNo, Quantity and the item column.
            id: when True the columns are StockCode values, otherwise Description.

        Returns:
            DataFrame indexed by InvoiceNo holding 1 where the summed Quantity of
            that product on that invoice is positive and 0 otherwise.
        """
        item_column = "StockCode" if id else "Description"
        quantities = (
            dataframe.groupby(['InvoiceNo', item_column])['Quantity']
            .sum()
            .unstack()
            .fillna(0)
        )
        # Vectorised replacement for applymap(lambda x: 1 if x > 0 else 0).
        return (quantities > 0).astype(int)

    def preprocessing(self, dataframe):
        """Return a cleaned copy of ``dataframe`` ready for basket building.

        Drops rows with missing values, invoices whose number contains "C"
        (presumably cancellations), stock codes containing "POST", "C2" or "M",
        and non-positive quantities/prices; then caps Quantity and UnitPrice to
        their outlier limits. The input frame is left untouched.
        """
        dataframe = dataframe.dropna()

        # Cast to str so the filters also work when a column was parsed as numeric.
        invoice_no = dataframe['InvoiceNo'].astype(str)
        stock_code = dataframe['StockCode'].astype(str)
        # NOTE(review): these are substring matches, so any stock code that merely
        # contains "M" is dropped too - confirm that is intended.
        keep = ~invoice_no.str.contains("C", regex=False)
        for unwanted in ("POST", "C2", "M"):
            keep &= ~stock_code.str.contains(unwanted, regex=False)
        keep &= dataframe['Quantity'] > 0
        keep &= dataframe['UnitPrice'] > 0

        # Explicit copy so the capping below never writes into a view.
        dataframe = dataframe[keep].copy()
        self.replace_with_thresholds(dataframe, "Quantity")
        self.replace_with_thresholds(dataframe, "UnitPrice")
        return dataframe

    def encode_units(self, x):
        """Map a quantity to a 0/1 flag: 1 when ``x >= 1``, else 0."""
        # Always returns an int (the original fell through to None for 0 < x < 1).
        return 1 if x >= 1 else 0

    def train_rules(self, min_support=0.07, min_threshold=0.01):
        """Mine association rules, cache them, and store them on the instance.

        Args:
            min_support: minimum support passed to ``apriori``.
            min_threshold: minimum lift passed to ``association_rules``.

        Returns:
            The rules DataFrame sorted by descending lift.
        """
        frequency_df = self.preprocessing(self.dataframe.copy())

        # Boolean basket matrix (apriori expects bool / 0-1 input).
        basket_df = self.create_invoice_product_df(frequency_df, id=True).astype(bool)

        frequent_itemsets = apriori(basket_df, min_support=min_support, use_colnames=True)
        # mlxtend documents num_itemsets as the number of transactions in the
        # original input data, i.e. the number of baskets.
        rules = association_rules(frequent_itemsets,
                                  metric="lift",
                                  min_threshold=min_threshold,
                                  num_itemsets=len(basket_df))
        rules = rules.sort_values("lift", ascending=False)

        cache_dir = os.path.dirname(self.rules_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        joblib.dump(rules, self.rules_path)

        # Keep the in-memory rules in sync when retraining an existing instance.
        self.rules = rules
        return rules

    def recommend(self, product_id, support=0.05, confidence=0.1, lift=5):
        """Return products implied by rules whose antecedents contain ``product_id``.

        Rules are filtered by ``support`` (>=), ``confidence`` (>=) and ``lift`` (>)
        and visited in order of descending confidence.

        Returns:
            De-duplicated list of consequent products, best rule first.
        """
        rules = self.rules
        selected = (
            (rules["support"] >= support)
            & (rules["confidence"] >= confidence)
            & (rules["lift"] > lift)
        )
        sorted_rules = rules[selected].sort_values("confidence", ascending=False)

        # A dict keeps first-seen order, so the confidence ranking survives
        # de-duplication (list(set(...)) returned an arbitrary order).
        recommendations = {}
        for antecedents, consequents in zip(sorted_rules["antecedents"], sorted_rules["consequents"]):
            if product_id in antecedents:
                # Take every consequent, in a deterministic order, rather than
                # one arbitrary element of the frozenset.
                for consequent in sorted(consequents, key=str):
                    recommendations.setdefault(consequent, None)
        return list(recommendations)

In [31]:
class CollaborativeEngine:
    """User-based and item-based collaborative filtering over purchase quantities.

    Two matrices are built from the transactions and cached with joblib:
    a CustomerID x StockCode matrix of (mean) quantities, and a
    StockCode x StockCode co-occurrence matrix derived from it.
    """

    def __init__(self, dataframe,
                    user_matrix_path="output/user_collaborative_matrix.joblib",
                    co_occurrence_matrix_path="output/co_occurrence_matrix.joblib"):
        """Keep a private copy of ``dataframe`` and load or build both matrices.

        Args:
            dataframe: transactions with CustomerID, StockCode, Quantity and
                InvoiceDate columns.
            user_matrix_path: cache file for the user x item matrix.
            co_occurrence_matrix_path: cache file for the item x item matrix.
        """
        self.dataframe = dataframe.copy()

        self.user_matrix_path = user_matrix_path
        self.co_occurrence_matrix_path = co_occurrence_matrix_path

        # Load both caches only when both exist; otherwise rebuild and re-cache.
        # NOTE: a co-occurrence cache written before the orientation fix in
        # _build_matrices is user x user and must be deleted (or retrained) once.
        # NOTE: joblib.load unpickles the files - only load caches you produced yourself.
        if os.path.exists(self.user_matrix_path) and os.path.exists(self.co_occurrence_matrix_path):
            self.user_matrix = joblib.load(self.user_matrix_path)
            self.co_occurrence_matrix = joblib.load(self.co_occurrence_matrix_path)
        else:
            self.user_matrix, self.co_occurrence_matrix = self.train_user_matrix()

    def _build_matrices(self):
        """Compute (user x item matrix, item x item co-occurrence matrix) in memory."""
        user_item_matrix = self.dataframe.pivot_table(
            index='CustomerID', columns='StockCode', values='Quantity', fill_value=0)

        # items x users  .  users x items  ->  items x items.
        # (The original multiplied in the opposite order, producing a
        # users x users matrix in which no StockCode could ever be found.)
        co_occurrence_matrix = user_item_matrix.T.dot(user_item_matrix)
        return user_item_matrix, co_occurrence_matrix

    def train_user_matrix(self):
        """Rebuild both matrices, cache them, and store them on the instance.

        Returns:
            Tuple ``(user_item_matrix, co_occurrence_matrix)``.
        """
        user_item_matrix, co_occurrence_matrix = self._build_matrices()

        for path in (self.user_matrix_path, self.co_occurrence_matrix_path):
            cache_dir = os.path.dirname(path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
        joblib.dump(user_item_matrix, self.user_matrix_path)
        joblib.dump(co_occurrence_matrix, self.co_occurrence_matrix_path)

        # Keep the in-memory state in sync when retraining an existing instance.
        self.user_matrix = user_item_matrix
        self.co_occurrence_matrix = co_occurrence_matrix
        return user_item_matrix, co_occurrence_matrix

    def apply_time_decay(self, timestamp, current_time, decay_factor=0.8):
        """
        Apply exponential time decay. Recent purchases are given higher weight.

        Returns ``exp(-decay_factor * whole_days_between(timestamp, current_time))``.
        """
        delta_days = (current_time - timestamp).days
        decay_weight = np.exp(-decay_factor * delta_days)
        return decay_weight

    def user_based_recommendation(self, user, current_time, top_n=5, time_decay=0):
        """
        User-based collaborative filtering recommendations with optional time decay.

        Args:
            user: CustomerID to recommend for.
            current_time: reference timestamp used to age each neighbour's
                most recent purchase.
            top_n: number of neighbours considered and of items returned.
            time_decay: exponential decay rate per day applied to each
                neighbour's similarity, based on that neighbour's latest
                InvoiceDate. ``0`` (the default) disables decay.

        Returns:
            Up to ``top_n`` StockCodes ranked by the summed quantities of the
            most similar users; ``[]`` for an unknown user.
        """
        if user not in self.user_matrix.index:
            return []  # Cold start user

        user_vector = self.user_matrix.loc[user].values.reshape(1, -1)
        similarities = cosine_similarity(user_vector, self.user_matrix)[0]

        if time_decay:
            # Weight each neighbour by how recently THEY purchased. The original
            # multiplied every similarity by one identical scalar (so the ranking
            # could not change) and that scalar underflowed to 0 for a distant
            # current_time, leaving an arbitrary neighbour order.
            last_purchase = (
                self.dataframe.groupby('CustomerID')['InvoiceDate'].max()
                .reindex(self.user_matrix.index)
            )
            age_days = (current_time - last_purchase).dt.days.clip(lower=0)
            # Users with no known purchase date get weight 0.
            weights = np.exp(-time_decay * age_days).fillna(0.0).to_numpy()
            similarities = similarities * weights

        # Drop the query user by label instead of assuming it sorts first.
        neighbour_scores = pd.Series(similarities, index=self.user_matrix.index).drop(user)
        similar_users = neighbour_scores.sort_values(ascending=False).index[:top_n]

        recommended_items = self.user_matrix.loc[similar_users].sum().sort_values(ascending=False).index
        return list(recommended_items[:top_n])

    def item_based_recommendation(self, item, top_n=5):
        """
        Item-based collaborative filtering using the cached co-occurrence matrix.

        The matrix covers the full purchase history in ``self.dataframe`` (the
        original computed a 30-day window but never used it).

        Returns:
            Up to ``top_n`` StockCodes with the highest co-occurrence score with
            ``item``; ``[]`` when ``item`` is not in the matrix.
        """
        co_occurrence_matrix = self.co_occurrence_matrix
        if item not in co_occurrence_matrix.index:
            return []  # If item is not in the matrix

        # Remove the item's own (diagonal) entry explicitly: it is not
        # guaranteed to be the largest value in the row.
        item_scores = co_occurrence_matrix.loc[item].drop(item, errors='ignore')
        similar_items = item_scores.sort_values(ascending=False).index[:top_n]
        return list(similar_items)


In [32]:
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

class ContentBasedRecommedationSystem:
    """Content-based recommender using sentence embeddings of product descriptions.

    One description per StockCode is embedded with a SentenceTransformer model;
    recommendations are the items whose embeddings have the highest cosine
    similarity to the query item's embedding. Descriptions and embeddings are
    cached with joblib.
    """

    def __init__(self, dataframe, model_name='all-MiniLM-L6-v2', embeds_path="output/content_embeddings.joblib", description_path="output/content_descriptions.joblib"):
        """Load the embedding model and load or compute the embeddings.

        Args:
            dataframe: transactions with StockCode and Description columns.
            model_name: SentenceTransformer model identifier.
            embeds_path: cache file for the embeddings.
            description_path: cache file for the StockCode -> Description series.
        """
        self.model = self.load_model(model_name)
        self.dataframe = dataframe.copy()

        self.embeds_path = embeds_path
        self.description_path = description_path
        # Use the caches only when both exist so descriptions and embeddings stay aligned.
        # NOTE: joblib.load unpickles the files - only load caches you produced yourself.
        if os.path.exists(self.embeds_path) and os.path.exists(self.description_path):
            self.descriptions = joblib.load(self.description_path)
            self.embeds = joblib.load(self.embeds_path)
        else:
            self.descriptions, self.embeds = self.train_embeddings()

    def load_model(self, model_name):
        """Instantiate and return the SentenceTransformer named ``model_name``."""
        embedding_model = SentenceTransformer(model_name)
        return embedding_model

    def train_embeddings(self):
        """Embed one description per StockCode, cache the result, and store it.

        Returns:
            Tuple ``(descriptions, embeddings)`` where ``descriptions`` is a
            Series indexed by StockCode and ``embeddings`` is row-aligned with it.
        """
        descriptions = self.dataframe.drop_duplicates('StockCode').set_index('StockCode')['Description'].fillna("")
        description_texts = descriptions.tolist()
        embeddings = self.model.encode(description_texts, convert_to_tensor=True)

        for path in (self.embeds_path, self.description_path):
            cache_dir = os.path.dirname(path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
        joblib.dump(embeddings, self.embeds_path)
        joblib.dump(descriptions, self.description_path)

        # Keep the in-memory state in sync when retraining an existing instance.
        self.descriptions = descriptions
        self.embeds = embeddings
        return descriptions, embeddings

    @staticmethod
    def _as_numpy(embeddings):
        """Return ``embeddings`` as a NumPy array, moving tensors to the CPU first."""
        # Tensor-like objects expose detach()/cpu(); sklearn needs a host array.
        if hasattr(embeddings, "detach"):
            embeddings = embeddings.detach().cpu().numpy()
        return np.asarray(embeddings)

    def recommend(self, item, top_n=5):
        """Return up to ``top_n`` StockCodes most similar in description to ``item``.

        Returns ``[]`` when ``item`` has no stored description.
        """
        if item not in self.descriptions.index:
            return []  # If item description is missing
        item_index = self.descriptions.index.get_loc(item)

        embeddings = self._as_numpy(self.embeds)

        # Cosine similarity between the target item and every stored item.
        similarities = cosine_similarity(embeddings[item_index].reshape(1, -1), embeddings)[0]

        # Exclude the query item by position rather than assuming it ranks
        # first (another item with an identical description ties at 1.0).
        ranked_indices = np.argsort(-similarities)
        top_indices = ranked_indices[ranked_indices != item_index][:top_n]
        similar_items = self.descriptions.index[top_indices]
        return list(similar_items)




In [33]:
class HybridRecommendationSystem:
    """Blend association-rule, collaborative and content-based recommendations.

    Each sub-recommender returns a ranked list; items receive
    ``weight / rank`` from every list they appear in, plus a boost for items
    that were rarely purchased recently, and the highest totals win.
    """

    def __init__(self, dataframe):
        """Keep a private copy of ``dataframe`` and build the three sub-recommenders."""
        self.dataframe = dataframe.copy()

        ## Association Rule
        self.association_rule = AssociationRule(dataframe=dataframe)

        ## CollaborativeEngine for (user/item)
        self.collaborative_engine = CollaborativeEngine(dataframe=dataframe)

        ## Content based Recommendation
        self.content_based_recommendation = ContentBasedRecommedationSystem(dataframe=dataframe)

    def check_id(self, stock_code):
        """Return the first Description recorded for ``stock_code``.

        Raises:
            IndexError: if ``stock_code`` does not occur in the data.
        """
        product_name = self.dataframe[self.dataframe["StockCode"] == stock_code][["Description"]].values[0].tolist()
        return product_name[0]

    def update_weights(self):
        """Retrain every sub-recommender and refresh its in-memory artefacts.

        The train methods return the new artefacts; they are assigned back here
        so that later ``recommend`` calls use them (the original discarded the
        return values and kept serving the stale models).
        """
        ## Association Rule
        self.association_rule.rules = self.association_rule.train_rules(min_support=0.07, min_threshold=0.01)

        ## CollaborativeEngine for (user/item)
        engine = self.collaborative_engine
        engine.user_matrix, engine.co_occurrence_matrix = engine.train_user_matrix()

        ## Content based Recommendation
        content = self.content_based_recommendation
        content.descriptions, content.embeds = content.train_embeddings()

    def _recent_purchase_counts(self, recent_days=30):
        """Return (rows-per-StockCode, total rows) for the last ``recent_days`` of data."""
        cutoff = self.dataframe['InvoiceDate'].max() - pd.Timedelta(days=recent_days)
        recent_data = self.dataframe[self.dataframe['InvoiceDate'] >= cutoff]
        return recent_data['StockCode'].value_counts(), len(recent_data)

    @staticmethod
    def _boost_from_counts(item, item_purchases, total_purchases):
        """Share of recent rows that are NOT ``item``; 0.0 when there are no recent rows."""
        if total_purchases == 0:
            return 0.0  # avoid dividing by zero on an empty frame
        return (total_purchases - item_purchases.get(item, 0)) / total_purchases

    def calculate_item_boost(self, item, recent_days=30):
        """
        Boost score for items with low purchase frequency in recent time.

        Returns a value in [0, 1]: the fraction of transaction rows from the
        last ``recent_days`` days (relative to the newest InvoiceDate) that do
        not involve ``item``.
        """
        item_purchases, total_purchases = self._recent_purchase_counts(recent_days)
        return self._boost_from_counts(item, item_purchases, total_purchases)

    def recommend(self, user, item, current_time, top_n=5):
        """Return up to ``top_n`` StockCodes recommended for ``user`` viewing ``item``.

        Args:
            user: CustomerID; unknown users skip the user-based recommender.
            item: StockCode the recommendations are based on.
            current_time: reference timestamp forwarded to the user-based recommender.
            top_n: number of recommendations to return.
        """
        # Define base weights for each recommendation method
        base_weights = {
            "apriori": 0.4,
            "user": 0.3,
            "content": 0.2,
            "item": 0.1
        }

        # Check if the user is new
        is_new_user = user not in self.collaborative_engine.user_matrix.index

        # Adjust weights dynamically: redistribute the user-based share
        if is_new_user:
            base_weights['content'] += 0.2  # Favor content for new users
            base_weights['item'] += 0.1     # Favor item co-occurrence for new users
            base_weights['user'] = 0.0      # Ignore user-based recommendations for new users

        # Get recommendations from each method
        apriori_based = self.association_rule.recommend(item)
        user_based = [] if is_new_user else self.collaborative_engine.user_based_recommendation(
            user, current_time, top_n=top_n * 2, time_decay=0)
        item_based = self.collaborative_engine.item_based_recommendation(item, top_n * 2)
        content_based = self.content_based_recommendation.recommend(item, top_n)

        # Combine recommendations with reciprocal-rank weighted scores
        ranked_sources = (
            ("apriori", apriori_based),
            ("user", user_based),
            ("content", content_based),
            ("item", item_based),
        )
        scores = {}
        for source, ranked_items in ranked_sources:
            for rank, recommendation in enumerate(ranked_items, start=1):
                scores[recommendation] = scores.get(recommendation, 0) + base_weights[source] / rank

        # Never recommend the item the request is about.
        scores.pop(item, None)

        # Apply boost for less frequent items in recent data. The recent
        # purchase table is computed once instead of once per candidate.
        item_purchases, total_purchases = self._recent_purchase_counts()
        for recommendation in scores:
            scores[recommendation] += self._boost_from_counts(recommendation, item_purchases, total_purchases)

        # Sort items by their final weighted scores
        sorted_recommendations = sorted(scores.items(), key=lambda x: x[1], reverse=True)

        # Return the top_n recommendations
        return [entry[0] for entry in sorted_recommendations[:top_n]]


In [34]:
hbs = HybridRecommendationSystem(dataframe=df)
# hbs.update_weights()  # uncomment to retrain the sub-models and overwrite their cached files

# Example Usage
user_id = 12345  # Replace with actual user
item_code = '85123A'  # Replace with actual item
# Reference "now" for time decay. Anchor it to the newest invoice in the data:
# the rows shown above date from 2010, and a hard-coded 2024 date makes the
# exponential decay exp(-0.8 * days) underflow to 0 over a multi-year gap.
current_time = df['InvoiceDate'].max()


recommendations = hbs.recommend(user_id, item_code, current_time, top_n=5)

print(f"Selected Item: {hbs.check_id(item_code)}")

print()

for recommended_code in recommendations:
    print(f"Recommended Item: {hbs.check_id(recommended_code)}")


Selected Item: WHITE HANGING HEART T-LIGHT HOLDER

Recommended Item: RED HANGING HEART T-LIGHT HOLDER
Recommended Item: HANGING HEART ZINC T-LIGHT HOLDER
Recommended Item: HANGING HEART JAR T-LIGHT HOLDER
Recommended Item: T-LIGHT HOLDER HANGING LOVE BIRD
Recommended Item: HANGING  BUTTERFLY T-LIGHT HOLDER
