### Importing Packages

In [21]:
from sentence_transformers import SentenceTransformer
import torch
import faiss
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt


### Helper functions

In [22]:
def get_data_frame_from_csv(file_path):
    """
    This function is used to create and return the data frame from csv file
    parameters: file_path
    return: Panda data frame
    """
    return pd.read_csv(file_path)
def get_final_answer_data_frame(df, group_by_col_name,concate_col_name):
    """_summary_

    Args:
        df (_type_): _description_
        group_by_col_name (_type_): _description_
        concate_col_name (_type_): _description_

    Returns:
        _type_: _description_
    """
    return df.groupby(group_by_col_name)[concate_col_name].agg(lambda x: " ".join(x)).reset_index()
def get_llm_details():
    """_summary_

    Returns:
        _type_: _description_
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
def get_embedding(text_data,model):
    """
    This model used for creating embeddings of text data

    Args:
        text_data (List): List of text
        model (Sentence Transforme): Sentence Transformer model

    Returns:
        Sentence transformer: Instance of Sentence transformer
    """
    return model.encode(text_data, convert_to_numpy=True)

def get_embedding_dimention(df,embedding_column):
    """
    This function get the dimention of embedding column
    Args:
        df (Pands Data frame): Data frame
        embedding_column (String): Embedding column name

    Returns:
        Int: Number of data points
    """
    return df[embedding_column].iloc[0].shape[0]

def visitors_recommend(df,exhibitor_id,visitor_df):
    """
    This function return the list of visitors using semantic search
    Args:
        df (Pandas Data frame): Data frame
        exhibitor_id (Int): Exhibitor Id 
        visitor_df (Pandas Data frame): Visitor Data frame

    Returns:
        List: List of vistors
    """
    row_data = df[df["exhibitorid"] == exhibitor_id]
    if row_data.empty:
        return "Exhibitor not found."
    # Exhibitor embedding
    exhibitor_embedding = row_data["name_cat_embedding"].values[0]
    # Distance and indices
    distances, indices = faiss_index_visitors.search(np.array([exhibitor_embedding]), k=7)
    # Reccommeded visitors
    recommendations = visitor_df.iloc[indices[0]][["email", "answer"]]
    return recommendations["email"].to_list()

def get_file_path(base_folder_path,file_name):
    """
    This function takes the parameter base_folder_path and file_name
    Return final path of file
    """
    return base_folder_path + file_name

def get_vistors_details_using_hybrid_recommendation(exhibitor_id,exhibitor_df_final, visitor_df_final,alpha=0.5):
    """
    This function gives the list of visitors using TF-IDF KNN and semantic search based recommedition
    Args:
        exhibitor_id (Int): Unique Id of Exhibitor
        exhibitor_df_final (Pandas Dataframe): Exhibitor Data frame
        visitor_df_finalalpha (float, optional): Control the contribution of TF-IDF recommended visitors. Defaults to 0.5.

    Returns:
        List: List of visitors email
    """
    # get the exhibitor using exhibitor Id
    exhibitor_row = exhibitor_df_final[exhibitor_df_final['exhibitorid'] == exhibitor_id]
    if exhibitor_row.empty:
        return "No available data for this exhibitor."
    # Exhibitor vector of categoryName and Name
    exhibitor_vector = tfidf.transform(exhibitor_row['categoryName'] + ' ' + exhibitor_row['Name'])
    
    # Classical KNN-based Recommendations
    distances, indices = visitor_knn_model.kneighbors(exhibitor_vector)

    scores = (1 - distances).flatten()  # Convert distances to similarity scores
    
    knn_tfidf_recommendations = visitor_df_final.iloc[indices.flatten()][['email', 'answer']]
    knn_tfidf_recommendations['score'] = alpha * scores

    # FAISS-based Recommendations
    vdb_distances, vdb_indices = faiss_index.search(exhibitor_vector.toarray(), 7)
    vdb_scores = (1 / (1 + vdb_distances)).flatten()  # Convert L2 distances to similarity scores
    
    semantic_recommendations = visitor_df_final.iloc[vdb_indices.flatten()][['email', 'answer']]
    semantic_recommendations['score'] = (1 - alpha) * vdb_scores 
    
    # Combine and Rank Recommendations
    final_recommendations = pd.concat([knn_tfidf_recommendations, semantic_recommendations])
    final_recommendations = final_recommendations.groupby(['email', 'answer']).sum().reset_index()
    final_recommendations = final_recommendations.sort_values(by='score', ascending=False)
    
    return final_recommendations.head(7)["email"].to_list()  # Return top 7 recommendations



In [23]:
visitor_df = get_data_frame_from_csv("../../source/clean_visitor_data.csv")
visitor_df.head()

Unnamed: 0,email,gender,id,stepId,questionId,answerValue,answerId,answerTypeId,questionTypeId,question,answer,stepId_int
0,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f73100,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To obtain general information,1
1,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73225,5c8a78336d41a10da4f73227,,5c8a78336d41a10da4f73244,Answer,5bf7c399b82beb7a182cc3de,Which of the following best describes your job...,Media,2
2,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73252,5c8a78336d41a10da4f73253,,5c8a78336d41a10da4f73291,Answer,5bf7c399b82beb7a182cc3de,Please indicate your company's main area of bu...,Travel Agent,3
3,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f7336c,5c8a78336d41a10da4f7336d,,5c8a78336d41a10da4f73371,Answer,5bf7c399b82beb7a182cc3de,What role do you play in the purchasing decisi...,No influence,4
4,aleksandar.dimkov+mitt1_n5eA@bss.com.mk,M,67ada1ee197e604dd2722d1b,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f730ff,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To source products and services,1


In [24]:
visitor_df_final = get_final_answer_data_frame(visitor_df, ["email","id"],"answer").head()
visitor_df_final["answer"] = visitor_df_final["answer"].apply(lambda x : x.lower().replace("unknown"," "))
visitor_df_final.head()

Unnamed: 0,email,id,answer
0,3990147_SeNs@gmail.com,67b5e0f7774d9e718c7541db,to source products and services tour operato...
1,3990147_SeNs_09Hr@gmail.com,sl0pqtnqavydiqidf8nxzrea,to source products and services tour operato...
2,3990147_SeNs_mVZi@gmail.com,wgf8glx8axdaq94290uynav9,to source products and services tour operato...
3,aleksandar.dimkov+mb1_Xc8j@bss.com.mk,67b484a9197e604dd2722d72,to obtain general information sales event mana...
4,aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk,xodwfemcnbuqzq632xnlrxt6,to obtain general information sales event mana...


In [25]:
exhibitor_df = get_data_frame_from_csv("../../source/clean_exhibitor.csv")
exhibitor_df.head()

Unnamed: 0.1,Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,0,90556,Turkey Travels,52276,1.5 Resort hotel
1,1,90556,Turkey Travels,52280,2.1 Inbound tour operator
2,2,90556,Turkey Travels,52281,2.2 Outbound tour operator
3,3,92462,Russian Travel Company,52273,1.2 Apartments Residential hotel
4,4,92462,Russian Travel Company,52283,2.4 Mass market tour operators


In [26]:
exhibitor_df = exhibitor_df.groupby(["exhibitorid","Name"],as_index=False).agg(
    {
        "categoryName": lambda x: " ".join(set(x))
    }
)

In [27]:
model = get_llm_details()

In [28]:
# Exhibitor category and name embedding
exhibitor_df["categoryName"] = exhibitor_df["categoryName"].apply(lambda x: x.lower())
exhibitor_df["Name"] = exhibitor_df["Name"].apply(lambda x: x.lower())
exhibitor_cat_name = exhibitor_df["categoryName"] + " " + exhibitor_df["Name"]
exhibitor_df_final = exhibitor_df.copy()
exhibitor_df_final["name_cat_embedding"] = list(get_embedding(exhibitor_cat_name.tolist(),model))

In [29]:
visitor_danswer = visitor_df_final.groupby("email")["answer"].apply(lambda x: " ".join(x)).reset_index()
visitor_danswer["answer_embedding"] = list(get_embedding(visitor_danswer["answer"].tolist(), model))


In [30]:
# Create FAISS Index for Exhibitors
faiss_index_exhibitors = faiss.IndexFlatL2(get_embedding_dimention(exhibitor_df_final,"name_cat_embedding"))
faiss_index_exhibitors.add(np.array(exhibitor_df_final["name_cat_embedding"].tolist()))

# Create FAISS Index for Visitors
faiss_index_visitors = faiss.IndexFlatL2(get_embedding_dimention(visitor_danswer,"answer_embedding"))
faiss_index_visitors.add(np.array(visitor_danswer["answer_embedding"].tolist()))

In [31]:
visitors_recommend(exhibitor_df_final,90556,visitor_danswer)

['3990147_SeNs@gmail.com',
 '3990147_SeNs_09Hr@gmail.com',
 '3990147_SeNs_mVZi@gmail.com',
 'aleksandar.dimkov+mb1_Xc8j@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk']

In [32]:
exhibitor_df_final[exhibitor_df_final["exhibitorid"] == 90556]

Unnamed: 0,exhibitorid,Name,categoryName,name_cat_embedding
23,90556,turkey travels,2.2 outbound tour operator 1.5 resort hotel 2....,"[0.08213275, -0.0033600826, 0.023591066, 0.058..."


### Hybrid Model for recommendations
- Use TF-IDF vectorizer to encode the text 
- Use Sentence tranformer embedding and Vector db for semantic search

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

In [34]:
tfidf = TfidfVectorizer(stop_words='english')

In [35]:
visitor_tfidf = tfidf.fit_transform(visitor_df_final["answer"])
exhibitor_tfidf = tfidf.transform(exhibitor_cat_name)

exhibitor_tfidf = tfidf.fit_transform(exhibitor_cat_name)
visitor_tfidf = tfidf.transform(visitor_df_final["answer"])

In [36]:
exhibitor_knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
exhibitor_knn_model.fit(exhibitor_tfidf)

visitor_knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
visitor_knn_model.fit(visitor_tfidf)

In [37]:
faiss_index = faiss.IndexFlatL2(visitor_tfidf.shape[1])
faiss_index.add(visitor_tfidf.toarray())

In [38]:
visitor_df_final.head()

Unnamed: 0,email,id,answer
0,3990147_SeNs@gmail.com,67b5e0f7774d9e718c7541db,to source products and services tour operato...
1,3990147_SeNs_09Hr@gmail.com,sl0pqtnqavydiqidf8nxzrea,to source products and services tour operato...
2,3990147_SeNs_mVZi@gmail.com,wgf8glx8axdaq94290uynav9,to source products and services tour operato...
3,aleksandar.dimkov+mb1_Xc8j@bss.com.mk,67b484a9197e604dd2722d72,to obtain general information sales event mana...
4,aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk,xodwfemcnbuqzq632xnlrxt6,to obtain general information sales event mana...


In [39]:
# visitors_recommend(90556,)
visitors_recommend(exhibitor_df_final,92462,visitor_df_final)

['3990147_SeNs@gmail.com',
 '3990147_SeNs_09Hr@gmail.com',
 '3990147_SeNs_mVZi@gmail.com',
 'aleksandar.dimkov+mb1_Xc8j@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk']

In [40]:
get_vistors_details_using_hybrid_recommendation(92462,exhibitor_df_final, visitor_df_final,.2)

['3990147_SeNs@gmail.com',
 '3990147_SeNs_09Hr@gmail.com',
 '3990147_SeNs_mVZi@gmail.com',
 'aleksandar.dimkov+mb1_Xc8j@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk']