In [56]:
from sentence_transformers import SentenceTransformer
import torch
import faiss
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt


ModuleNotFoundError: No module named 'vdb'

### Helper functions

In [None]:
def get_data_frame_from_csv(file_path):
    """
    This function is used to create and return the data frame from csv file
    parameters: file_path
    return: Panda data frame
    """
    return pd.read_csv(file_path)
def get_final_answer_data_frame(df, group_by_col_name,concate_col_name):
    """_summary_

    Args:
        df (_type_): _description_
        group_by_col_name (_type_): _description_
        concate_col_name (_type_): _description_

    Returns:
        _type_: _description_
    """
    return df.groupby(group_by_col_name)[concate_col_name].agg(lambda x: " ".join(x)).reset_index()
def get_llm_details():
    """_summary_

    Returns:
        _type_: _description_
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model
def get_embedding(text_data,model):
    """
    This model used for creating embeddings of text data

    Args:
        text_data (List): List of text
        model (Sentence Transforme): Sentence Transformer model

    Returns:
        Sentence transformer: Instance of Sentence transformer
    """
    return model.encode(text_data, convert_to_numpy=True)

def get_embedding_dimention(df,embedding_column):
    """
    Get embedding dimention

    Args:
        df (Pandas Data frame): Pandas Data frame
        embedding_column (String): Embedding Column name

    Returns:
        int: Return number of row
    """
    return df[embedding_column].iloc[0].shape[0]

def recommend_exhibitors(answer_text,df):
    """
    This function return the recommend exhibitor name
    Args:
        answer_text (string): Answer text
        df (Pandas data frame): Pandas data frame

    Returns:
        List: Return the list of exhibitor name
    """
    # Create embedding  for answer text
    answer_embedding = get_embedding([answer_text], model)[0]
    # get the distance and index from vector db
    distances, indices = vdb_index_exhibitors.search(np.array([answer_embedding]), k=5)
    # get recommendation data frame using matching indices
    recommendations = df.iloc[indices[0]][["exhibitorid", "Name", "categoryName"]]
    return recommendations["Name"].to_list()

def hybrid_recommendation(answer_text, alpha=0.5):
    """
    This function return the recommendation from tfidf vectorization and semantic search recommendation

    Args:
        answer_text (String): Answer text
        alpha (float, optional): This alpha value used to control the recommendation contributation from Nearest neghbour model. Defaults to 0.5.

    Returns:
        List: List of exhibitor on the basis of answer
    """
    # Answer tf-idf vector using the answer text
    answer_tf_vector = tfidf.transform([answer_text])
    
    # Distance and index on the basis exhibitor knn model
    distances, indices = exhibitor_knn_model.kneighbors(answer_tf_vector)
    scores = 1 - distances  # Convert distances to similarity scores
    # Recommenndation of exibitor data frame using indices
    recommendations_tfidf = exhibitor_df_final.iloc[indices[0]][['exhibitorid', 'Name', 'categoryName']]
    # Add score featire using alpha value
    recommendations_tfidf['score'] = alpha * scores.flatten()
    
    # vdb-based Recommendations
    vdb_distances, vdb_indices = vdb_index.search(answer_tf_vector.toarray(), 5)
    vdb_scores = 1 / (1 + vdb_distances)  # Convert L2 distances to similarity scores
    vdb_recommendations = exhibitor_df_final.iloc[vdb_indices[0]][['exhibitorid', 'Name', 'categoryName']]
    vdb_recommendations['score'] = (1 - alpha) * vdb_scores.flatten()
    
    # Combine and Rank Recommendations
    final_recommendations = pd.concat([recommendations_tfidf, vdb_recommendations])
    final_recommendations = final_recommendations.groupby(['exhibitorid', 'Name', 'categoryName']).sum().reset_index()
    final_recommendations = final_recommendations.sort_values(by='score', ascending=False)
    
    return list(set(final_recommendations.head(10)["Name"].to_list()))  # Return top 5 recommendations



In [None]:
visitor_df = get_data_frame_from_csv("../../source/clean_visitor_data.csv")
visitor_df.head()

Unnamed: 0,email,gender,id,stepId,questionId,answerValue,answerId,answerTypeId,questionTypeId,question,answer,stepId_int
0,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f73100,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To obtain general information,1
1,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73225,5c8a78336d41a10da4f73227,,5c8a78336d41a10da4f73244,Answer,5bf7c399b82beb7a182cc3de,Which of the following best describes your job...,Media,2
2,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f73252,5c8a78336d41a10da4f73253,,5c8a78336d41a10da4f73291,Answer,5bf7c399b82beb7a182cc3de,Please indicate your company's main area of bu...,Travel Agent,3
3,emilija+100_L8gA@bss.mk,F,67b70a9f2d21f543a1096602,5c8a78336d41a10da4f7336c,5c8a78336d41a10da4f7336d,,5c8a78336d41a10da4f73371,Answer,5bf7c399b82beb7a182cc3de,What role do you play in the purchasing decisi...,No influence,4
4,aleksandar.dimkov+mitt1_n5eA@bss.com.mk,M,67ada1ee197e604dd2722d1b,5c8a78336d41a10da4f730fd,5c8a78336d41a10da4f730fe,,5c8a78336d41a10da4f730ff,Answer,5bf7c399b82beb7a182cc3de,Reason for Attending the Event,To source products and services,1


In [None]:
visitor_df_final = get_final_answer_data_frame(visitor_df, "email","answer").head()
visitor_df_final["answer"] = visitor_df_final["answer"].apply(lambda x : x.lower().replace("unknown"," "))
visitor_df_final.head()

Unnamed: 0,email,answer
0,3990147_SeNs@gmail.com,to source products and services tour operato...
1,3990147_SeNs_09Hr@gmail.com,to source products and services tour operato...
2,3990147_SeNs_mVZi@gmail.com,to source products and services tour operato...
3,aleksandar.dimkov+mb1_Xc8j@bss.com.mk,to obtain general information sales event mana...
4,aleksandar.dimkov+mb1_Xc8j_kuh8@bss.com.mk,to obtain general information sales event mana...


In [None]:
exhibitor_df = get_data_frame_from_csv("../../source/clean_exhibitor.csv")
exhibitor_df.head()

Unnamed: 0.1,Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,0,90556,Turkey Travels,52276,1.5 Resort hotel
1,1,90556,Turkey Travels,52280,2.1 Inbound tour operator
2,2,90556,Turkey Travels,52281,2.2 Outbound tour operator
3,3,92462,Russian Travel Company,52273,1.2 Apartments Residential hotel
4,4,92462,Russian Travel Company,52283,2.4 Mass market tour operators


In [None]:
model = get_llm_details()

In [None]:
# Exhibitor category and name embedding
exhibitor_df["categoryName"] = exhibitor_df["categoryName"].apply(lambda x: x.lower())
exhibitor_df["Name"] = exhibitor_df["Name"].apply(lambda x: x.lower())
exhibitor_cat_name = exhibitor_df["categoryName"] + " " + exhibitor_df["Name"]
exhibitor_df_final = exhibitor_df.copy()
exhibitor_df_final["name_cat_embedding"] = list(get_embedding(exhibitor_cat_name.tolist(),model))

In [None]:
visitor_danswer = visitor_df_final.groupby("email")["answer"].apply(lambda x: " ".join(x)).reset_index()
visitor_danswer["answer_embedding"] = list(get_embedding(visitor_danswer["answer"].tolist(), model))


In [None]:
# Create vdb Index for Exhibitors
vdb_index_exhibitors = vdb.IndexFlatL2(get_embedding_dimention(exhibitor_df_final,"name_cat_embedding"))
vdb_index_exhibitors.add(np.array(exhibitor_df_final["name_cat_embedding"].tolist()))

# Create vdb Index for Visitors
vdb_index_visitors = vdb.IndexFlatL2(get_embedding_dimention(visitor_danswer,"answer_embedding"))
vdb_index_visitors.add(np.array(visitor_danswer["answer_embedding"].tolist()))

In [None]:
recommend_exhibitors("To source products and services",exhibitor_df_final)

['elite travel services',
 'dream getaways expeditions',
 'dreamtravel company ',
 'global tours voyages',
 'exotic tours journeys']

### Hybrid Model for recommendations
- Use TF-IDF vectorizer to encode the text 
- Use Sentence tranformer embedding and Vector db for semantic search

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
visitor_tfidf = tfidf.fit_transform(visitor_df_final["answer"])
exhibitor_tfidf = tfidf.transform(exhibitor_cat_name)

In [None]:
exhibitor_knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
exhibitor_knn_model.fit(exhibitor_tfidf)

visitor_knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
visitor_knn_model.fit(visitor_tfidf)

In [None]:
vdb_index = faiss.IndexFlatL2(visitor_tfidf.shape[1])
vdb_index.add(visitor_tfidf.toarray())

In [None]:
hybrid_recommendation("To source products and services")

['global tours voyages',
 'dreamtravel company ',
 'turkey travels',
 'elite travel services',
 'russian travel company']