In [1]:
from sentence_transformers import SentenceTransformer
import torch
import faiss
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt


### Helper functions

In [26]:
def get_data_frame_from_csv(file_path):
    """
    This function is used to create and return the data frame from csv file
    parameters: file_path
    return: Panda data frame
    """
    return pd.read_csv(file_path)
def get_final_answer_data_frame(df, group_by_col_name,concate_col_name):
    """_summary_

    Args:
        df (_type_): _description_
        group_by_col_name (_type_): _description_
        concate_col_name (_type_): _description_

    Returns:
        _type_: _description_
    """
    return df.groupby(group_by_col_name)[concate_col_name].agg(lambda x: " ".join(x)).reset_index()
def get_llm_details():
    """_summary_

    Returns:
        _type_: _description_
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
def get_embedding(text_data,model):
    """
    This model used for creating embeddings of text data

    Args:
        text_data (List): List of text
        model (Sentence Transforme): Sentence Transformer model

    Returns:
        Sentence transformer: Instance of Sentence transformer
    """
    return model.encode(text_data, convert_to_numpy=True)

def get_embedding_dimention(df,embedding_column):
    return df[embedding_column].iloc[0].shape[0]

def visitors_recommend(df,exhibitor_id,visitor_df):
    row_data = df[df["exhibitorid"] == exhibitor_id]
    if row_data.empty:
        return "Exhibitor not found."
    exhibitor_embedding = row_data["name_cat_embedding"].values[0]
    distances, indices = faiss_index_visitors.search(np.array([exhibitor_embedding]), k=5)
    recommendations = visitor_df.iloc[indices[0]][["email", "answer"]]
    return recommendations["email"].to_list()



In [4]:
visitor_df = get_data_frame_from_csv("../../source/clean_visitor_data.csv")
visitor_df.head()

Unnamed: 0,email,gender,id,stepId,questionId,answerValue,answerId,answerTypeId,questionTypeId,question,answer,stepId_int
0,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f73100,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To obtain general information,1
1,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73225,5c8a78336d41a10da4f73227,,5c8a78336d41a10da4f73244,Answer,5bf7c399b82beb7a182cc3de,Which of the following best describes your job...,Media,2
2,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73252,5c8a78336d41a10da4f73253,,5c8a78336d41a10da4f73291,Answer,5bf7c399b82beb7a182cc3de,Please indicate your company's main area of bu...,Travel Agent,3
3,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f7336c,5c8a78336d41a10da4f7336d,,5c8a78336d41a10da4f73371,Answer,5bf7c399b82beb7a182cc3de,What role do you play in the purchasing decisi...,No influence,4
4,aleksandar.dimkov+mitt1_n5eA@bss.com.mk,M,67ada1ee197e604dd2722d1b,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f730ff,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To source products and services,1


In [5]:
visitor_df_final = get_final_answer_data_frame(visitor_df, "email","answer").head()
visitor_df_final["answer"] = visitor_df_final["answer"].apply(lambda x : x.lower().replace("unknown"," "))
visitor_df_final.head()

Unnamed: 0,email,answer
0,3990147_SeNs@gmail.com,to source products and services tour operato...
1,3990147_SeNs_09Hr@gmail.com,to source products and services tour operato...
2,3990147_SeNs_mVZi@gmail.com,to source products and services tour operato...
3,aleksandar.dimkov+mb1_Xc8j@bss.com.mk,to obtain general information sales event mana...
4,aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk,to obtain general information sales event mana...


In [6]:
exhibitor_df = get_data_frame_from_csv("../../source/clean_exhibitor.csv")
exhibitor_df.head()

Unnamed: 0.1,Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,0,90556,Turkey Travels,52276,1.5 Resort hotel
1,1,90556,Turkey Travels,52280,2.1 Inbound tour operator
2,2,90556,Turkey Travels,52281,2.2 Outbound tour operator
3,3,92462,Russian Travel Company,52273,1.2 Apartments Residential hotel
4,4,92462,Russian Travel Company,52283,2.4 Mass market tour operators


In [7]:
model = get_llm_details()

In [8]:
# Exhibitor category and name embedding
exhibitor_df["categoryName"] = exhibitor_df["categoryName"].apply(lambda x: x.lower())
exhibitor_df["Name"] = exhibitor_df["Name"].apply(lambda x: x.lower())
exhibitor_cat_name = exhibitor_df["categoryName"] + " " + exhibitor_df["Name"]
exhibitor_df_final = exhibitor_df.copy()
exhibitor_df_final["name_cat_embedding"] = list(get_embedding(exhibitor_cat_name.tolist(),model))

In [11]:
visitor_danswer = visitor_df_final.groupby("email")["answer"].apply(lambda x: " ".join(x)).reset_index()
visitor_danswer["answer_embedding"] = list(get_embedding(visitor_danswer["answer"].tolist(), model))


In [27]:
# Create FAISS Index for Exhibitors
faiss_index_exhibitors = faiss.IndexFlatL2(get_embedding_dimention(exhibitor_df_final,"name_cat_embedding"))
faiss_index_exhibitors.add(np.array(exhibitor_df_final["name_cat_embedding"].tolist()))

# Create FAISS Index for Visitors
faiss_index_visitors = faiss.IndexFlatL2(get_embedding_dimention(visitor_danswer,"answer_embedding"))
faiss_index_visitors.add(np.array(visitor_danswer["answer_embedding"].tolist()))

In [28]:
# visitors_recommend(exhibitor_df_final,90556,visitor_danswer)

['3990147_SeNs@gmail.com',
 '3990147_SeNs_09Hr@gmail.com',
 '3990147_SeNs_mVZi@gmail.com',
 'aleksandar.dimkov+mb1_Xc8j@bss.com.mk',
 'aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk']