In [21]:
pip install torch transformers faiss-cpu pandas numpy matplotlib seaborn scikit-learn sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.0.1-py3-none-any.whl (340 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sentence_transformers import SentenceTransformer
import torch
import faiss
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt


### Helper functions

In [40]:
def get_data_frame_from_csv(file_path):
    """
    This function is used to create and return the data frame from csv file
    parameters: file_path
    return: Panda data frame
    """
    return pd.read_csv(file_path)
def get_final_answer_data_frame(df, group_by_col_name,concate_col_name):
    """_summary_

    Args:
        df (_type_): _description_
        group_by_col_name (_type_): _description_
        concate_col_name (_type_): _description_

    Returns:
        _type_: _description_
    """
    return df.groupby(group_by_col_name)[concate_col_name].agg(lambda x: " ".join(x)).reset_index()
def get_llm_details():
    """_summary_

    Returns:
        _type_: _description_
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
def get_embedding(text_data,model):
    """
    This model used for creating embeddings of text data

    Args:
        text_data (List): List of text
        model (Sentence Transforme): Sentence Transformer model

    Returns:
        Sentence transformer: Instance of Sentence transformer
    """
    return model.encode(text_data, convert_to_numpy=True)

def hybrid_recommend_exhibitors(visitor_email,visitor_df,exhibitor_df,top_k_exhibitor_name = 5):
    """
    This function used to get top k recommendation on the basis of visitor email
    Args:
        visitor_email (String): Visitor email
        visitor_df (Pandas data frame): Visitor data frame
        exhibitor_df (Pandas Data Frame): Exhinitor Data frame
        top_k_exhibitor_name (int, optional): Number of top exhibitor. Defaults to 5.

    Returns:
        _type_: _description_
    """
    visitor_answers = visitor_df[visitor_df["email"] == visitor_email]["answer"].tolist()
    visitor_embedding = get_embedding([" ".join(visitor_answers)],model)[0]
    distances, indices = faiss_index.search(np.array([visitor_embedding]), k=top_k_exhibitor_name)
    recommendations = exhibitor_df.iloc[indices[0]][["exhibitorid", "Name", "categoryName"]]
    return recommendations["Name"].to_list()



### Exihibitor Recommendation using Vistors Email

#### Logic to implement the Exhibitor Recommendation using visitor email
- Filter the data points where visitor email matched
- Get the answer given
- Created embeddings for exhibitor using Name and category.
- Create embedding for answer
- Semantic search and return Top k exhibitor 

In [6]:
visitor_df = get_data_frame_from_csv("../../source/clean_visitor_data.csv")
visitor_df.head()

Unnamed: 0,email,gender,id,stepId,questionId,answerValue,answerId,answerTypeId,questionTypeId,question,answer,stepId_int
0,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f73100,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To obtain general information,1
1,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73225,5c8a78336d41a10da4f73227,,5c8a78336d41a10da4f73244,Answer,5bf7c399b82beb7a182cc3de,Which of the following best describes your job...,Media,2
2,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73252,5c8a78336d41a10da4f73253,,5c8a78336d41a10da4f73291,Answer,5bf7c399b82beb7a182cc3de,Please indicate your company's main area of bu...,Travel Agent,3
3,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f7336c,5c8a78336d41a10da4f7336d,,5c8a78336d41a10da4f73371,Answer,5bf7c399b82beb7a182cc3de,What role do you play in the purchasing decisi...,No influence,4
4,aleksandar.dimkov+mitt1_n5eA@bss.com.mk,M,67ada1ee197e604dd2722d1b,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f730ff,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To source products and services,1


In [7]:
visitor_df_final = get_final_answer_data_frame(visitor_df, "email","answer").head()
visitor_df_final["answer"] = visitor_df_final["answer"].apply(lambda x : x.lower().replace("unknown"," "))
visitor_df_final.head()

Unnamed: 0,email,answer
0,3990147_SeNs@gmail.com,to source products and services tour operato...
1,3990147_SeNs_09Hr@gmail.com,to source products and services tour operato...
2,3990147_SeNs_mVZi@gmail.com,to source products and services tour operato...
3,aleksandar.dimkov+mb1_Xc8j@bss.com.mk,to obtain general information sales event mana...
4,aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk,to obtain general information sales event mana...


In [8]:
exhibitor_df = get_data_frame_from_csv("../../source/clean_exhibitor.csv")
exhibitor_df.head()

Unnamed: 0.1,Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,0,90556,Turkey Travels,52276,1.5 Resort hotel
1,1,90556,Turkey Travels,52280,2.1 Inbound tour operator
2,2,90556,Turkey Travels,52281,2.2 Outbound tour operator
3,3,92462,Russian Travel Company,52273,1.2 Apartments Residential hotel
4,4,92462,Russian Travel Company,52283,2.4 Mass market tour operators


In [9]:

model = get_llm_details()

In [10]:
exhibitor_df.head()

Unnamed: 0.1,Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,0,90556,Turkey Travels,52276,1.5 Resort hotel
1,1,90556,Turkey Travels,52280,2.1 Inbound tour operator
2,2,90556,Turkey Travels,52281,2.2 Outbound tour operator
3,3,92462,Russian Travel Company,52273,1.2 Apartments Residential hotel
4,4,92462,Russian Travel Company,52283,2.4 Mass market tour operators


In [11]:
# Exhibitor category and name embedding
exhibitor_df["categoryName"] = exhibitor_df["categoryName"].apply(lambda x: x.lower())
exhibitor_df["Name"] = exhibitor_df["Name"].apply(lambda x: x.lower())
exhibitor_cat_name = exhibitor_df["categoryName"] + " " + exhibitor_df["Name"]
exhibitor_df_final = exhibitor_df.copy()
exhibitor_df_final["name_cat_embedding"] = list(get_embedding(exhibitor_cat_name.tolist(),model))

# Create FAISS Index
embedding_dim = exhibitor_df_final["name_cat_embedding"].iloc[0].shape[0]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(np.array(exhibitor_df_final["name_cat_embedding"].tolist()))

In [59]:
# hybrid_recommend_exhibitors_by_visitor_email("3990147_SeNs@gmail.com",visitor_df_final,exhibitor_df_final)

### Hybrid Mode
- Use TF-IDF vectorizer to encode the text 
- Use Sentence tranformer embedding and Vector db for semantic search

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

In [13]:
tfidf = TfidfVectorizer(stop_words='english')

In [35]:
visitor_tfidf = tfidf.fit_transform(visitor_df_final["answer"])
exhibitor_tfidf = tfidf.transform(exhibitor_cat_name)

In [36]:
exhibitor_knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
exhibitor_knn_model.fit(exhibitor_tfidf)

visitor_knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
visitor_knn_model.fit(visitor_tfidf)

In [37]:
faiss_index = faiss.IndexFlatL2(visitor_tfidf.shape[1])
faiss_index.add(visitor_tfidf.toarray())


In [42]:
def hybrid_recommend_exhibitors(email,visitor_df_final,exhibitor_df_final,alpha=0.5):
    visitor_row = visitor_df_final[visitor_df_final['email'] == email]
    if visitor_row.empty:
        return "No available answers for this visitor."
    visitor_vector = tfidf.transform(visitor_row['answer'].fillna(''))
    
    # Nearesrt Neighbour based Recommendations
    knn_distances, knn_indices = exhibitor_knn_model.kneighbors(visitor_vector)
    knn_scores = 1 - knn_distances  # Convert distances to similarity scores
    knn_recommendations = exhibitor_df_final.iloc[knn_indices[0]][['exhibitorid', 'Name', 'categoryName']]
    knn_recommendations['score'] = alpha * knn_scores.flatten()
    
    # Semantic search based Recommendations
    faiss_distances, faiss_indices = faiss_index.search(visitor_vector.toarray(), 5)
    faiss_scores = 1 / (1 + faiss_distances)  # Convert L2 distances to similarity scores
    recommendations_semantic_search_model = exhibitor_df_final.iloc[faiss_indices[0]][['exhibitorid', 'Name', 'categoryName']]
    recommendations_semantic_search_model['score'] = (1 - alpha) * faiss_scores.flatten()
    
    # Combine Nearest Neighbour model and semantic search model
    final_recommendations = pd.concat([knn_recommendations, recommendations_semantic_search_model])
    final_recommendations = final_recommendations.groupby(['exhibitorid', 'Name', 'categoryName']).sum().reset_index()
    final_recommendations = final_recommendations.sort_values(by='score', ascending=False)
    
    return final_recommendations.head(5)  # Return top 5 recommendations

In [44]:
hybrid_recommend_exhibitors("3990147_SeNs@gmail.com",visitor_df_final,exhibitor_df_final)

Unnamed: 0,exhibitorid,Name,categoryName,score
3,90556,turkey travels,2.1 inbound tour operator,0.788675
2,90556,turkey travels,1.5 resort hotel,0.5
4,90556,turkey travels,2.2 outbound tour operator,0.5
0,17729,sunny travel journeys,2.6 specialized tour operator,0.288675
1,30134,global holidays expeditions,2.6 specialized tour operator,0.288675
