In [20]:
import os
import time
import json
from sklearn.metrics.pairwise import cosine_similarity
import polars as pl
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file
load_dotenv("../.env")
client = OpenAI()

json_file_path = "../data/inputs/embeddings.json"
embeddings = pl.read_json(json_file_path)

def get_similarity(text, embeddings):
	embedding = (
		client.embeddings.create(input=text, model="text-embedding-3-small")
		.data[0]
		.embedding
	)
	embeddings = embeddings.with_columns(
		embeddings["embedding"]
		.apply(lambda e: cosine_similarity([e], [embedding])[0][0])
		.alias("cosine_similarity")
	)
	embeddings = embeddings.sort("cosine_similarity", descending=True)
	return embeddings

shape: (5, 5)
┌────────────────────┬────────────────────┬────────────────────┬──────────────┬────────────────────┐
│ place_id           ┆ review_id          ┆ feature            ┆ embedding    ┆ cosine_similarity  │
│ ---                ┆ ---                ┆ ---                ┆ ---          ┆ ---                │
│ str                ┆ str                ┆ str                ┆ list[f64]    ┆ f64                │
╞════════════════════╪════════════════════╪════════════════════╪══════════════╪════════════════════╡
│ ChIJoTcat9SMGGAR6G ┆ ChdDSUhNMG9nS0VJQ0 ┆ Step 3: 外国からの ┆ [0.001227,   ┆ 0.512461           │
│ GG8zdcZvE          ┆ FnSUNEb3FlMj…      ┆ 観光客も多く訪れる ┆ 0.012029, …  ┆                    │
│                    ┆                    ┆                    ┆ 0.01594…     ┆                    │
│ ChIJoTcat9SMGGAR6G ┆ ChdDSUhNMG9nS0VJQ0 ┆ 都民や観光客に人気 ┆ [0.06745,    ┆ 0.454456           │
│ GG8zdcZvE          ┆ FnSURqby0tTm…      ┆ がある             ┆ -0.008885, … ┆                    │
│ 

  embeddings["embedding"]
  embeddings["embedding"]


In [23]:
sorted_embeddings = get_similarity("外国人におすすめ", embeddings)
sorted_embeddings.head(5)

  embeddings["embedding"]
  embeddings["embedding"]


place_id,review_id,feature,embedding,cosine_similarity
str,str,str,list[f64],f64
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSUNEb3FlMj…","""Step 3: 外国からの観光客も多く訪れる""","[0.001227, 0.012029, … 0.015946]",0.512493
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""都民や観光客に人気がある""","[0.06745, -0.008885, … 0.01348]",0.454484
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""インバウンドの影響で外国人観光客も多く、国際色豊かな雰囲気""","[0.011898, -0.014897, … 0.0003]",0.453831
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""観光客としてだけでなく、地元の方々の生活スポットとしても人気…","[0.047373, 0.005483, … 0.004243]",0.418255
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""無料で観光できるため、多くの人が訪れる人気観光地""","[0.037475, -0.017462, … 0.03031]",0.392539


In [22]:
sorted_embeddings = get_similarity("子供が楽しめる場所。できれば歴史に触れられると嬉しい。", embeddings)
sorted_embeddings.head(5)

  embeddings["embedding"]
  embeddings["embedding"]


place_id,review_id,feature,embedding,cosine_similarity
str,str,str,list[f64],f64
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSUNEb3FlMj…","""Step 2: 子供から大人まで楽しめるイベントが多い""","[0.050672, 0.017772, … 0.017475]",0.554717
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""静かな空間で、リラックスしながら景色を楽しめる場所""","[0.027346, 0.00253, … 0.00994]",0.43679
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""地元の方々と触れ合う機会も多く、地元の文化や生活に触れること…","[0.00899, 0.020318, … -0.0034]",0.415958
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""季節ごとに異なる景色を楽しめるため、何度訪れても飽きない""","[0.027305, -0.00142, … 0.002894]",0.399547
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""夜景も美しく、ロマンチックな雰囲気を楽しめる""","[0.0059, -0.013158, … 0.005091]",0.382516


In [8]:
import os
import json
import numpy as np
import polars as pl
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file
load_dotenv("../.env")
client = OpenAI()

json_file_path = "../data/inputs/embeddings.json"
embeddings = pl.read_json(json_file_path)

# Extract embeddings to a numpy array for PCA
embedding_matrix = np.array(embeddings["embedding"].to_list())

# Perform PCA to reduce dimensionality
pca = PCA(n_components=50)  # Adjust the number of components as needed
reduced_embeddings = pca.fit_transform(embedding_matrix)

# Replace the original embeddings in the DataFrame with the reduced embeddings
embeddings = embeddings.with_columns(pl.Series("reduced_embedding", reduced_embeddings.tolist()))

def get_similarity(text, embeddings, pca):
    # Get the sample embedding
    sample_embedding = (
        client.embeddings.create(input=text, model="text-embedding-ada-002")
        .data[0]
        .embedding
    )
    
    # Reduce the dimensionality of the sample embedding
    reduced_sample_embedding = pca.transform([sample_embedding])[0]

    # Compute cosine similarity for each reduced embedding
    embeddings = embeddings.with_columns(
        embeddings["reduced_embedding"]
        .apply(lambda e: cosine_similarity([e], [reduced_sample_embedding])[0][0])
        .alias("cosine_similarity")
    )

    # Sort the DataFrame by the cosine similarity in descending order
    sorted_embeddings = embeddings.sort("cosine_similarity", descending=True)
    return sorted_embeddings

In [10]:
# Example usage
text = "外国人におすすめ"
sorted_embeddings = get_similarity(text, embeddings, pca)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,embedding,reduced_embedding,cosine_similarity
str,str,str,list[f64],list[f64],f64
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""現在は「ゴジラ」バージョンのプロジェクションマッピング""","[-0.011986, 0.032607, … -0.007424]","[0.528753, 0.027889, … -0.014807]",0.348922
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""上映時間は10分間で、30分毎に通常版も上映される""","[-0.001462, 0.077193, … -0.016847]","[0.315831, -0.218939, … -0.102579]",0.343878
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUNUN3ZPOU…","""6. 都の職員が東京都島の焼酎やクラフトビールを販売している""","[0.001604, 0.030078, … 0.027133]","[-0.052379, -0.290941, … 0.014298]",0.341943
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSUNEb3FlMj…","""Step 1: ピアノの飛び入り演奏が行われる""","[0.012912, -0.003128, … -0.019431]","[0.355002, -0.05761, … -0.028673]",0.31963
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUNUN3ZPOU…","""11. ゴジラの登場にワクワクする""","[0.040052, 0.065419, … 0.007181]","[0.345285, 0.010216, … -0.05019]",0.247015


In [11]:
# Example usage
text = "子供が楽しめる場所。できれば歴史に触れられると嬉しい。"
sorted_embeddings = get_similarity(text, embeddings, pca)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,embedding,reduced_embedding,cosine_similarity
str,str,str,list[f64],list[f64],f64
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""現在は「ゴジラ」バージョンのプロジェクションマッピング""","[-0.011986, 0.032607, … -0.007424]","[0.528753, 0.027889, … -0.014807]",0.376838
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""上映時間は10分間で、30分毎に通常版も上映される""","[-0.001462, 0.077193, … -0.016847]","[0.315831, -0.218939, … -0.102579]",0.372013
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSUNEb3FlMj…","""Step 1: ピアノの飛び入り演奏が行われる""","[0.012912, -0.003128, … -0.019431]","[0.355002, -0.05761, … -0.028673]",0.336002
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUNUN3ZPOU…","""11. ゴジラの登場にワクワクする""","[0.040052, 0.065419, … 0.007181]","[0.345285, 0.010216, … -0.05019]",0.299695
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""アニメや映画のキャラクターなども上映される""","[0.049511, 0.057963, … 0.011408]","[0.148426, -0.208527, … 0.044004]",0.259371


In [14]:
import os
import json
import numpy as np
import polars as pl
from sklearn.decomposition import FastICA
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from openai import OpenAI

# Load the .env file
load_dotenv("../.env")
client = OpenAI()

json_file_path = "../data/inputs/embeddings.json"
embeddings = pl.read_json(json_file_path)

# Extract embeddings to a numpy array for ICA
embedding_matrix = np.array(embeddings["embedding"].to_list())

# Perform ICA to reduce dimensionality
ica = FastICA(n_components=50)  # Adjust the number of components as needed
reduced_embeddings = ica.fit_transform(embedding_matrix)

# Replace the original embeddings in the DataFrame with the reduced embeddings
embeddings = embeddings.with_columns(pl.Series("reduced_embedding", reduced_embeddings.tolist()))


In [16]:
# Example usage
text = "外国人におすすめ"
sorted_embeddings = get_similarity(text, embeddings, ica)
sorted_embeddings.head(5)

  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,embedding,reduced_embedding,cosine_similarity
str,str,str,list[f64],list[f64],f64
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUM5Ni1xUE…","""観光案内所が整備されており、次のプランを立てやすい""","[-0.010807, 0.004569, … -0.005498]","[0.33665, -0.053008, … 0.019674]",0.297206
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChZDSUhNMG9nS0VJQ0FnSUNUN3ZPOU…","""6. 都の職員が東京都島の焼酎やクラフトビールを販売している""","[0.001604, 0.030078, … 0.027133]","[0.079421, -0.362306, … -0.183997]",0.285655
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""上映時間は10分間で、30分毎に通常版も上映される""","[-0.001462, 0.077193, … -0.016847]","[7.866113, 0.353614, … -0.188921]",0.281957
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""都心の喧騒を忘れさせてくれる""","[0.061906, 0.012897, … -0.001959]","[0.121442, -0.200072, … -0.191471]",0.268356
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""音楽と映像のシンクロが見どころ""","[0.000199, 0.026349, … -0.015605]","[0.152197, -1.924673, … -0.096128]",0.206174


In [17]:
# Example usage
text = "子供が楽しめる場所。できれば歴史に触れられると嬉しい。"
sorted_embeddings = get_similarity(text, embeddings, pca)
sorted_embeddings.head(5)


  embeddings["reduced_embedding"]
  embeddings["reduced_embedding"]


place_id,review_id,feature,embedding,reduced_embedding,cosine_similarity
str,str,str,list[f64],list[f64],f64
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""都庁周辺の公園も散策に最適""","[0.034885, -0.004204, … 0.003314]","[1.126503, -1.038356, … 0.13657]",0.336068
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""上映時間は10分間で、30分毎に通常版も上映される""","[-0.001462, 0.077193, … -0.016847]","[7.866113, 0.353614, … -0.188921]",0.334017
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""見どころがいっぱい詰まった観光スポット""","[0.035952, -0.001157, … 0.012021]","[-0.293518, -7.251588, … -0.102457]",0.278763
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSUNEb3FlMj…","""Step 5: 雨の日でも楽しめる施設がある""","[-0.021625, 0.038027, … 0.014852]","[0.173428, 0.252447, … -0.138414]",0.262288
"""ChIJoTcat9SMGGAR6GGG8zdcZvE""","""ChdDSUhNMG9nS0VJQ0FnSURqby0tTm…","""土日祝日の夕方にはプロジェクションマッピングが開催される""","[0.024845, 0.017236, … -0.013928]","[-0.406029, -0.014703, … -0.118068]",0.249338
