In [28]:
import os
import time
import json
from sklearn.metrics.pairwise import cosine_similarity
import polars as pl
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file
load_dotenv("../.env")
client = OpenAI()

json_file_path = "../data/inputs/embeddings.json"
embeddings = pl.read_json(json_file_path)

def get_embedding(text):
	embedding = (
		client.embeddings.create(input=text, model="text-embedding-3-small")
		.data[0]
		.embedding
	)
	return embedding

def get_similarity(search_embedding, embeddings):
	embeddings = embeddings.with_columns(
		embeddings["embedding"]
		.apply(lambda e: cosine_similarity([e], [search_embedding])[0][0])
		.alias("cosine_similarity")
	)
	embeddings = embeddings.sort("cosine_similarity", descending=True)
	return embeddings

# Function to measure execution time
def measure_execution_time(func, *args, iterations=1000):
    start_time = time.time()
    sorted_embeddings = None
    for _ in range(iterations):
        sorted_embeddings = func(*args)
    end_time = time.time()
    mean_time = (end_time - start_time) / iterations
    print(f"Execution time: {mean_time:.5f} seconds")
    return sorted_embeddings

In [31]:
search_embedding = get_embedding("外国人におすすめ")
sorted_embeddings = measure_execution_time(get_similarity, search_embedding, embeddings, iterations=10)
sorted_embeddings.head(5)

  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]
  embeddings["embedding"]


Execution time: 12.87 seconds


place_id,review_id,feature,key,embedding,cosine_similarity
str,str,str,i64,list[f64],f64
"""ChIJK9584NuMGGARVKfZjpYoQDs""","""ChdDSUhNMG9nS0VJQ0FnSUR6c3N2aD…","""外国人にも人気の観光スポット""",693,"[0.03074, 0.017347, … 0.000512]",0.680851
"""ChIJCewJkL2LGGAR3Qmk0vCTGkg""","""ChdDSUhNMG9nS0VJQ0FnSURUd1ppQ2…","""外国人にも人気の観光スポット""",236,"[0.030738, 0.017356, … 0.000534]",0.680817
"""ChIJM9xJEHeLGGARAshMWUgC_ls""","""ChdDSUhNMG9nS0VJQ0FnSURqNjdqLX…","""外国人にも優しい環境""",518,"[0.019292, -0.009432, … 0.002982]",0.673975
"""ChIJM9xJEHeLGGARAshMWUgC_ls""","""ChdDSUhNMG9nS0VJQ0FnSURqNjdqLX…","""外国人観光客にも人気""",509,"[0.050723, 0.005674, … -0.007751]",0.669858
"""ChIJP9eKBdeMGGAR0zzBXJNVj5A""","""ChdDSUhNMG9nS0VJQ0FnSUM5dU9TQ2…","""外国人との交流が楽しめる""",104,"[-0.002273, -0.005705, … -0.004685]",0.645741


In [33]:
search_embedding2 = get_embedding("子供が楽しめる場所。できれば歴史に触れられると嬉しい。")
sorted_embeddings = get_similarity(search_embedding2, embeddings)
sorted_embeddings.head(5)

  embeddings["embedding"]
  embeddings["embedding"]


place_id,review_id,feature,key,embedding,cosine_similarity
str,str,str,i64,list[f64],f64
"""ChIJAQCTx8uMGGARpNjDI6maHdc""","""ChZDSUhNMG9nS0VJQ0FnSUNOeHI2M2…","""子供たちの楽しさが感じられる場所""",2590,"[0.06447, 0.009691, … 0.002378]",0.736199
"""ChIJHeBCIOyMGGAR7BktETlUP_w""","""ChdDSUhNMG9nS0VJQ0FnSUNON3ZDYi…","""子供と一緒に楽しむことができる場所""",3887,"[0.051534, -0.014964, … 0.013816]",0.688365
"""ChIJf4O1UMCMGGARZ6nQzxTLK1E""","""ChZDSUhNMG9nS0VJQ0FnSUN6bVlPdV…","""子供たちにとって刺激的な場所である。""",3646,"[0.05418, -0.00057, … 0.001293]",0.661715
"""ChIJAQCTx8uMGGARpNjDI6maHdc""","""ChdDSUhNMG9nS0VJQ0FnSUN6OHNUcn…","""子供と一緒に楽しめる""",2615,"[0.057801, -0.011662, … 0.018633]",0.653211
"""ChIJHeBCIOyMGGAR7BktETlUP_w""","""ChdDSUhNMG9nS0VJQ0FnSUNENFByU3…","""子供たちが存分に楽しめる施設""",3918,"[0.052711, 0.015642, … 0.014437]",0.648514


In [17]:
search_embedding3 = get_embedding("クリエイティブな体験ができる場所")
sorted_embeddings = get_similarity(search_embedding3, embeddings)
sorted_embeddings.head(5)

  embeddings["embedding"]
  embeddings["embedding"]


place_id,review_id,feature,key,embedding,cosine_similarity
str,str,str,i64,list[f64],f64
"""ChIJzRzI3HSNGGARRwZW6AtJfi0""","""ChZDSUhNMG9nS0VJQ0FnSUN6cVppSl…","""場所柄、エキサイティングな体験ができる""",451,"[0.023017, 0.038835, … 0.009937]",0.74084
"""ChIJjy2U41iLGGAREMoF6FatXt4""","""ChZDSUhNMG9nS0VJQ0FnSURVb2ZETF…","""クリエイティブな空間""",2315,"[0.002878, 0.032346, … -0.003183]",0.690786
"""ChIJM9xJEHeLGGARAshMWUgC_ls""","""ChZDSUhNMG9nS0VJQ0FnSURkLWFyRE…","""都会の中でラグジュアリーかつ多様な体験ができる場所""",558,"[0.054874, 0.027104, … -0.008433]",0.661875
"""ChIJJRmFiCOLGGARyLhUN8vGRXs""","""ChZDSUhNMG9nS0VJQ0FnSUNMLXN6UE…","""インタラクティブな展示が多く、楽しい体験ができる""",2202,"[0.00723, 0.029636, … -0.037694]",0.65893
"""ChIJD1M2GbCMGGARmUFziof73JM""","""ChdDSUhNMG9nS0VJQ0FnSURoLVozVG…","""非日常的な体験ができる場所である""",2102,"[0.00231, 0.043781, … 0.009976]",0.650934


In [34]:
import os
import json
import numpy as np
import polars as pl
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file
load_dotenv("../.env")
client = OpenAI()

json_file_path = "../data/inputs/embeddings.json"
embeddings = pl.read_json(json_file_path)

# Extract embeddings to a numpy array for PCA
embedding_matrix = np.array(embeddings["embedding"].to_list())

# Perform PCA to reduce dimensionality
pca = PCA(n_components=200)  # Adjust the number of components as needed
reduced_embeddings = pca.fit_transform(embedding_matrix)

# Replace the original embeddings in the DataFrame with the reduced embeddings
embeddings = embeddings.with_columns(pl.Series("reduced_embedding", reduced_embeddings.tolist()))

def get_comp_similarity(search_embedding, embeddings, pca):
    # Reduce the dimensionality of the sample embedding
    reduced_sample_embedding = pca.transform([search_embedding])[0]

    # Compute cosine similarity for each reduced embedding
    embeddings = embeddings.with_columns(
        embeddings["reduced_embedding"]
        .apply(lambda e: cosine_similarity([e], [reduced_sample_embedding])[0][0])
        .alias("cosine_similarity")
    )

    # Sort the DataFrame by the cosine similarity in descending order
    sorted_embeddings = embeddings.sort("cosine_similarity", descending=True)
    return sorted_embeddings

In [35]:
embedding_matrix.shape

(4093, 1536)

In [36]:
reduced_embeddings.shape

(4093, 200)

In [37]:
# Example usage
sorted_embeddings = measure_execution_time(get_comp_similarity, search_embedding, embeddings, pca, iterations=10)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


Execution time: 12.65 seconds


place_id,review_id,feature,key,embedding,reduced_embedding,cosine_similarity
str,str,str,i64,list[f64],list[f64],f64
"""ChIJM9xJEHeLGGARAshMWUgC_ls""","""ChdDSUhNMG9nS0VJQ0FnSURqNjdqLX…","""外国人観光客にも人気""",509,"[0.050723, 0.005674, … -0.007751]","[-0.169436, 0.392744, … 0.019206]",0.625686
"""ChIJK9584NuMGGARVKfZjpYoQDs""","""ChdDSUhNMG9nS0VJQ0FnSUR6c3N2aD…","""外国人にも人気の観光スポット""",693,"[0.03074, 0.017347, … 0.000512]","[-0.179647, 0.439145, … 0.030647]",0.622217
"""ChIJCewJkL2LGGAR3Qmk0vCTGkg""","""ChdDSUhNMG9nS0VJQ0FnSURUd1ppQ2…","""外国人にも人気の観光スポット""",236,"[0.030738, 0.017356, … 0.000534]","[-0.179668, 0.43921, … 0.030658]",0.622142
"""ChIJM9xJEHeLGGARAshMWUgC_ls""","""ChdDSUhNMG9nS0VJQ0FnSURqNjdqLX…","""外国人にも優しい環境""",518,"[0.019292, -0.009432, … 0.002982]","[0.082765, 0.11864, … 0.025674]",0.615436
"""ChIJ4Rr2JWiLGGARcyRSHuZ-9G8""","""ChZDSUhNMG9nS0VJQ0FnSUN6M3BLek…","""海外からの観光客に喜んでもらえる""",1774,"[0.02009, -0.005817, … -0.002197]","[-0.078679, 0.29397, … -0.01713]",0.57767


In [38]:
# Example usage
sorted_embeddings = get_comp_similarity(search_embedding2, embeddings, pca)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,key,embedding,reduced_embedding,cosine_similarity
str,str,str,i64,list[f64],list[f64],f64
"""ChIJAQCTx8uMGGARpNjDI6maHdc""","""ChZDSUhNMG9nS0VJQ0FnSUNOeHI2M2…","""子供たちの楽しさが感じられる場所""",2590,"[0.06447, 0.009691, … 0.002378]","[0.247693, 0.112016, … 0.011193]",0.638628
"""ChIJHeBCIOyMGGAR7BktETlUP_w""","""ChdDSUhNMG9nS0VJQ0FnSUNON3ZDYi…","""子供と一緒に楽しむことができる場所""",3887,"[0.051534, -0.014964, … 0.013816]","[0.133147, 0.111245, … -0.013014]",0.562721
"""ChIJf4O1UMCMGGARZ6nQzxTLK1E""","""ChZDSUhNMG9nS0VJQ0FnSUN6bVlPdV…","""子供たちにとって刺激的な場所である。""",3646,"[0.05418, -0.00057, … 0.001293]","[0.06807, 0.023603, … 0.018761]",0.55961
"""ChIJf4O1UMCMGGARZ6nQzxTLK1E""","""ChZDSUhNMG9nS0VJQ0FnSUN6bVlPdV…","""子供たちにとって学びがある場所である。""",3635,"[0.014487, -0.017748, … 0.004655]","[0.009517, -0.004672, … 0.002989]",0.548029
"""ChIJzRzI3HSNGGARRwZW6AtJfi0""","""ChdDSUhNMG9nS0VJQ0FnSUR6cFpubj…","""子供たちにとって刺激的な場所""",432,"[0.064909, 0.004485, … 0.005009]","[0.076191, 0.017012, … 0.007103]",0.54049


In [39]:
import os
import json
import numpy as np
import polars as pl
from sklearn.decomposition import FastICA
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file
load_dotenv("../.env")
client = OpenAI()

json_file_path = "../data/inputs/embeddings.json"
embeddings = pl.read_json(json_file_path)

# Extract embeddings to a numpy array for ICA
embedding_matrix = np.array(embeddings["embedding"].to_list())

# Perform ICA to reduce dimensionality
ica = FastICA(n_components=200)  # Adjust the number of components as needed
reduced_embeddings = ica.fit_transform(embedding_matrix)

# Replace the original embeddings in the DataFrame with the reduced embeddings
embeddings = embeddings.with_columns(pl.Series("reduced_embedding", reduced_embeddings.tolist()))


In [None]:
# Example usage
sorted_embeddings = measure_execution_time(get_comp_similarity, search_embedding, embeddings, ica, iterations=10)
sorted_embeddings.head(5)

  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,key,embedding,reduced_embedding,cosine_similarity
str,str,str,i64,list[f64],list[f64],f64
"""ChIJP9eKBdeMGGAR0zzBXJNVj5A""","""ChdDSUhNMG9nS0VJQ0FnSUM5dU9TQ2…","""外国人との交流が楽しめる""",104,"[-0.002273, -0.005705, … -0.004685]","[-0.582492, -0.182544, … 0.11698]",0.548342
"""ChIJM9xJEHeLGGARAshMWUgC_ls""","""ChdDSUhNMG9nS0VJQ0FnSURqNjdqLX…","""外国人にも優しい環境""",518,"[0.019292, -0.009432, … 0.002982]","[-0.428731, 0.524611, … -0.076985]",0.537634
"""ChIJCewJkL2LGGAR3Qmk0vCTGkg""","""ChdDSUhNMG9nS0VJQ0FnSURUd1ppQ2…","""外国人にも人気の観光スポット""",236,"[0.030738, 0.017356, … 0.000534]","[0.060362, -0.532813, … 0.56915]",0.499647
"""ChIJK9584NuMGGARVKfZjpYoQDs""","""ChdDSUhNMG9nS0VJQ0FnSUR6c3N2aD…","""外国人にも人気の観光スポット""",693,"[0.03074, 0.017347, … 0.000512]","[0.060375, -0.534344, … 0.569095]",0.49962
"""ChIJP9eKBdeMGGAR0zzBXJNVj5A""","""ChdDSUhNMG9nS0VJQ0FnSUM5dU9TQ2…","""外国の味が楽しめる""",113,"[-0.030375, -0.002719, … -0.028685]","[-0.067865, -0.211163, … -0.248195]",0.499428


In [None]:
# Example usage
sorted_embeddings = get_comp_similarity(search_embedding2, embeddings, ica)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,key,embedding,reduced_embedding,cosine_similarity
str,str,str,i64,list[f64],list[f64],f64
"""ChIJMZVITVyNGGARSNITQ5VS1yU""","""ChZDSUhNMG9nS0VJQ0FnSUNqb3Fuek…","""暑い日には噴水周辺が涼しくて過ごしやすい""",1333,"[0.02521, 0.008119, … 0.027153]","[-0.428004, 1.209492, … 0.909708]",0.368601
"""ChIJK9584NuMGGARVKfZjpYoQDs""","""ChdDSUhNMG9nS0VJQ0FnSURqb28yam…","""運動不足や暑さに慣れていない人には辛いかもしれない""",645,"[0.032247, -0.009529, … -0.000242]","[1.415757, -1.530714, … -0.376637]",0.360406
"""ChIJbWVFlTCKGGARr5l6ox-7_CA""","""ChdDSUhNMG9nS0VJQ0FnSURqLUlleX…","""暑い日でも肌寒いため、薄手の長袖を着用することがおすすめ""",3058,"[0.052504, 0.019623, … -0.012547]","[0.082357, 0.322924, … -0.132739]",0.346356
"""ChIJMwpiebSMGGARPr_454zHvDQ""","""ChZDSUhNMG9nS0VJQ0FnSUNUMUszRU…","""公園内は30度の予報にもかかわらず涼しく、快適に過ごせる環境""",155,"[0.030293, -0.007434, … 0.000526]","[0.489064, 0.343984, … -0.714974]",0.329417
"""ChIJK9584NuMGGARVKfZjpYoQDs""","""ChdDSUhNMG9nS0VJQ0FnSURqb28yam…","""夏には暑くなることがある""",643,"[-0.010203, 0.0531, … 0.004278]","[-0.090706, 0.326859, … 0.080392]",0.317768


In [None]:
# Example usage
sorted_embeddings = get_comp_similarity(search_embedding3, embeddings, ica)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,key,embedding,reduced_embedding,cosine_similarity
str,str,str,i64,list[f64],list[f64],f64
"""ChIJ6xA0pKSMGGAREB8mFuuaPYo""","""ChdDSUhNMG9nS0VJQ0FnSUNMd3FYcn…","""地元の伝統工芸品が販売されている""",4044,"[0.036521, 0.051619, … -0.02104]","[-1.039584, -0.101258, … -0.35438]",0.318612
"""ChIJPd37MMGOGGARvJ2hfxoiNVE""","""ChZDSUhNMG9nS0VJQ0FnSUNEbEtxZm…","""伝統工芸品などの販売も行われている""",1099,"[0.050246, 0.047429, … -0.03082]","[-0.960863, 0.127202, … -0.590703]",0.310075
"""ChIJK9584NuMGGARVKfZjpYoQDs""","""ChdDSUhNMG9nS0VJQ0FnSUR6c3N2aD…","""地元のお姉さまたちが伝統的な盆踊りを品よく踊る""",670,"[0.041023, 0.035715, … -0.027211]","[-0.166318, -0.99667, … -0.527394]",0.250188
"""ChIJtW7K92qNGGARBBLnvRhDQPM""","""ChdDSUhNMG9nS0VJQ0FnSURUMV95aj…","""伝統と現代が共存している雰囲気がする""",2548,"[0.029849, 0.011282, … -0.006944]","[0.123089, 0.146152, … -0.571432]",0.240777
"""ChIJxU48-2OLGGARHMtB5RAhytA""","""ChdDSUhNMG9nS0VJQ0FnSUNMeE9XST…","""伝統美と現代美の融合が特徴的""",3485,"[0.010537, 0.015855, … -0.010211]","[2.066813, -0.197476, … -0.518551]",0.234959
