In [38]:
# pip install torch transformers faiss-cpu pandas numpy matplotlib seaborn scikit-learn sentence-transformers


In [39]:
from sentence_transformers import SentenceTransformer
import torch
import faiss
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
# import sys

### Helper Classes

In [40]:
class VisitorExhibitor:
    def __init__(self):
        pass
    
    def __get_data_frame_from_csv(self,file_path):
        """
        This function is used to create and return the data frame from csv file
        parameters: file_path
        return: Panda data frame
        """
        return pd.read_csv(file_path)
    def get_embedding(self,text_data,model):
        """
        This model used for creating embeddings of text data

        Args:
            text_data (List): List of text
            model (Sentence Transforme): Sentence Transformer model

        Returns:
            Sentence transformer: Instance of Sentence transformer
        """
        return model.encode(text_data, convert_to_numpy=True)
        
    def get_final_visitor_df(self,clean_visitor_data_path,group_by_col_name,concate_col_name):
        df = self.__get_data_frame_from_csv(clean_visitor_data_path)
        df = df.groupby(group_by_col_name)[concate_col_name].agg(lambda x: " ".join(x)).reset_index()
        df["answer"] = df["answer"].apply(lambda x : x.lower().replace("unknown"," "))
        return df
    def get_final_exhibitor_data_frame(self,clean_exhibitor_data_path,model):
        df = self.__get_data_frame_from_csv(clean_exhibitor_data_path)
        df["categoryName"] = df["categoryName"].apply(lambda x: x.lower())
        df["Name"] = df["Name"].apply(lambda x: x.lower())
        df_cat_name = df["categoryName"] + " " + df["Name"]
        df["name_cat_embedding"] = list(self.get_embedding(df_cat_name.tolist(),model))
        return df
        
        

In [41]:
class SemanticModel:
    def __init__(self):
        pass
    def get_llm_details(self):
        """_summary_

        Returns:
            _type_: _description_
        """
        model = SentenceTransformer("all-MiniLM-L6-v2")
        return model

In [42]:
class VectorDB:
    def __init__(self):
        pass
    def getIndexFlatL2(self,embedding_dim):
        return faiss.IndexFlatL2(embedding_dim)
    def add_embedding_data(self,data,db_index):
        db_index.add(data)
        

In [43]:
class Recommendation:
    def __init__(self):
        pass
    def recommend_exhibitors(self,visitor_email,visitor_df,exhibitor_df,db_index,visitorExhibitor,semanticModel,top_k_exhibitor_name = 5):
        """
        This function used to get top k recommendation on the basis of visitor email
        Args:
            visitor_email (String): Visitor email
            visitor_df (Pandas data frame): Visitor data frame
            exhibitor_df (Pandas Data Frame): Exhinitor Data frame
            top_k_exhibitor_name (int, optional): Number of top exhibitor. Defaults to 5.

        Returns:
            _type_: _description_
        """
        visitor_answers = visitor_df[visitor_df["email"] == visitor_email]["answer"].tolist()
        visitor_embedding = visitorExhibitor.get_embedding([" ".join(visitor_answers)],semanticModel)[0]
        distances, indices = db_index.search(np.array([visitor_embedding]), k=top_k_exhibitor_name)
        recommendations = exhibitor_df.iloc[indices[0]][["exhibitorid", "Name", "categoryName"]]
        return recommendations["Name"].to_list()
    def hybrid_recommend_exhibitors(self,email,visitor_df_final,exhibitor_df_final,tfidf,exhibitor_knn_model,db_index,alpha=0.5):
        """
        This function return the combined recommendation of exhibitor. It uses TF-IDF vectorizer and sentence transformer 
        Args:
            email (string): Email of user
            visitor_df_final (Pandas Data frame): Visitor data frame
            exhibitor_df_final (Pandas data frame): Exhibitor data frame
            alpha (float, optional): Control the TF-IDF recommendation. Defaults to 0.5.

        Returns:
            List: List of exhibitor Name
        """
        # Visitor data matching email
        visitor_row = visitor_df_final[visitor_df_final['email'] == email]
        if visitor_row.empty:
            return "No available answers for this visitor."
        # Get the visitor vector
        visitor_vector = tfidf.transform(visitor_row['answer'].fillna(''))
        
        # KNN based Recommendations
        distance, ndices = exhibitor_knn_model.kneighbors(visitor_vector)
        scores = 1 - distance  # Convert distances to similarity scores
        tfidf_recommendations = exhibitor_df_final.iloc[ndices[0]][['exhibitorid', 'Name', 'categoryName']]
        tfidf_recommendations['score'] = alpha * scores.flatten()
        
        # Semantic search based Recommendations
        vdb_distances, vdb_indices = db_index.search(visitor_vector.toarray(), 5)
        vdb_scores = 1 / (1 + vdb_distances)  # Convert L2 distances to similarity scores
        recommendations_semantic_search_model = exhibitor_df_final.iloc[vdb_indices[0]][['exhibitorid', 'Name', 'categoryName']]
        recommendations_semantic_search_model['score'] = (1 - alpha) * vdb_scores.flatten()
        
        # Combine Nearest Neighbour model and semantic search model
        final_recommendations = pd.concat([tfidf_recommendations, recommendations_semantic_search_model])
        final_recommendations = final_recommendations.groupby(['exhibitorid', 'Name']).sum().reset_index()
        final_recommendations = final_recommendations.sort_values(by='score', ascending=False)
        
        return list(set(final_recommendations.head(5)["Name"].to_list()))  # Return top 5 recommendations
    

In [44]:
class Vectorizer:
    def __init__(self):
        pass
    def get_tf_idf_vectorizer(self,):
        return TfidfVectorizer(stop_words='english')
    def fit_transform(self,data,vectorizer):
        return vectorizer.fit_transform(data)
    def transform(self,data,vectorizer):
        return vectorizer.transform(data)
    
class ClassicaMlModel:
    def __init__(self):
        pass
    def get_nearest_neighbour_model(self):
        return NearestNeighbors(n_neighbors=5, metric='cosine')

### Exihibitor Recommendation using Vistors Email

#### Logic to implement the Exhibitor Recommendation using visitor email
- Filter the data points where visitor email matched
- Get the answer given
- Created embeddings for exhibitor using Name and category.
- Create embedding for answer
- Semantic search and return Top k exhibitor 

In [45]:
# visitorExhibitor object
visitorExhibitor = VisitorExhibitor()
# SemanticModel object
model = SemanticModel().get_llm_details()
# VectorDB object
vector_db = VectorDB()
# Recommendation object
recommendation = Recommendation()
# Vectorizer Object
vectorizer = Vectorizer()
# ClassicaMlModel object
classicaMlModel = ClassicaMlModel()

In [46]:
visitor_df_final = visitorExhibitor.get_final_visitor_df("../../source/clean_visitor_data.csv","email","answer")
exhibitor_df_final = visitorExhibitor.get_final_exhibitor_data_frame("../../source/clean_exhibitor.csv",model)

In [47]:
# Create FAISS Index
embedding_dim = exhibitor_df_final["name_cat_embedding"].iloc[0].shape[0]
db_index = vector_db.getIndexFlatL2(embedding_dim)
vector_db.add_embedding_data(np.array(exhibitor_df_final["name_cat_embedding"].tolist()),db_index)

In [48]:
# recommend_exhibitors("3990147_SeNs@gmail.com",visitor_df_final,exhibitor_df_final)
recommendation.recommend_exhibitors("3990147_SeNs@gmail.com",visitor_df_final,exhibitor_df_final,db_index,visitorExhibitor,model)

['indian travel company',
 'exotic europe travels',
 'exotic tours tours',
 'global tours voyages',
 'exotic tours journeys']

### Hybrid Model for recommendations
- Use TF-IDF vectorizer to encode the text 
- Use Sentence tranformer embedding and Vector db for semantic search

In [49]:

tfidf = vectorizer.get_tf_idf_vectorizer()

In [50]:
visitor_tfidf = vectorizer.fit_transform(visitor_df_final["answer"],tfidf)
exhibitor_cat_name = exhibitor_df_final["categoryName"] + " " + exhibitor_df_final["Name"]
exhibitor_tfidf = vectorizer.transform(exhibitor_cat_name,tfidf)

In [51]:
exhibitor_knn_model = classicaMlModel.get_nearest_neighbour_model()
exhibitor_knn_model.fit(exhibitor_tfidf)

visitor_knn_model = classicaMlModel.get_nearest_neighbour_model()
visitor_knn_model.fit(visitor_tfidf)

In [52]:
db_index = vector_db.getIndexFlatL2(visitor_tfidf.shape[1])
vector_db.add_embedding_data(visitor_tfidf.toarray(),db_index)


In [53]:
recommendation.hybrid_recommend_exhibitors("3990147_SeNs@gmail.com",visitor_df_final,exhibitor_df_final,tfidf,exhibitor_knn_model,db_index)

['global holidays expeditions',
 'dreamtravel company ',
 'exotic tours holidays',
 'exotic europe travels',
 'turkey travels']