## 文字轉換成數據

In [9]:
import os
import json
import torch
from transformers import BertTokenizer, BertModel
from ckiptagger import WS


ws = WS("C:\\Users\\user\\OneDrive\\桌面\\data")
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_model = BertModel.from_pretrained('bert-base-chinese')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().tolist()  


def extract_text_features(item):
    address_text = ' '.join([str(a).strip() for a in item['positionround'].get('address', [])])
    address_tokens = ws([address_text])  
    VW_address = get_bert_embedding(' '.join(address_tokens[0]))  

    pattern_text = item['houseinfo']['pattern']
    pattern_tokens = ws([pattern_text])  
    VW_pattern = get_bert_embedding(' '.join(pattern_tokens[0]))  

    VW_size = get_bert_embedding(item['houseinfo']['size'])  
    VW_layer = get_bert_embedding(item['houseinfo']['layer']) 

    VW_servicelist_items = []
    for service_item in item['servicelist']:
        if isinstance(service_item, dict):
            VW_servicelist_items.append(service_item.get('service', ''))
        else:
            VW_servicelist_items.append(str(service_item))
    
    VW_servicelist = get_bert_embedding(' '.join(VW_servicelist_items))

    return {
        "hid": item.get('hid'),
        "VW_address": VW_address,
        "VW_pattern": VW_pattern,
        "VW_size": VW_size,
        "VW_layer": VW_layer,
        "VW_servicelist": VW_servicelist
    }

def extract_and_save_text_features(json_data, output_text_json):
    text_features = []
    
    for item in json_data:
        hid = item.get('hid')
        if not hid:
            print(f"跳過缺少 HID 的項目: {item}")
            continue

        text_feature = extract_text_features(item)
        text_features.append(text_feature)

        print(f"已經寫完 HID: {hid}")
    

    if text_features:
        print(f"提取數量: {len(text_features)}")
        print(f"正在寫入: {output_text_json}")  


        os.makedirs(os.path.dirname(output_text_json), exist_ok=True)

        with open(output_text_json, 'w', encoding='utf-8') as f:
            json.dump(text_features, f, ensure_ascii=False, indent=4)
        print(f"文字特徵已成功存入 {output_text_json}")
    else:
        print("無法寫入JSON 文件。")


def main():
    json_file = "C:\\Users\\user\\OneDrive\\桌面\\detail.json"
    output_text_json = "C:\\Users\\user\\OneDrive\\桌面\\text_features.json"
    
    if os.path.exists(json_file):
        print(f"正在讀取: {json_file}")
        with open(json_file, 'r', encoding='utf-8') as f:
            json_data = json.load(f) 
    else:
        print(f"文件 {json_file} 不存在，請檢查。")
        return
    
    if isinstance(json_data, list):
        extract_and_save_text_features(json_data, output_text_json)
    else:
        print("json_data 無效。")

if __name__ == "__main__":
    main()


正在读取文件: C:\Users\user\OneDrive\桌面\detail.json
已經寫完 HID: 16356457
已經寫完 HID: 16368467
已經寫完 HID: 16328872
已經寫完 HID: 16209324
已經寫完 HID: 16368496
已經寫完 HID: 16332031
已經寫完 HID: 16292443
已經寫完 HID: 16230221
已經寫完 HID: 16347023
已經寫完 HID: 16362448
已經寫完 HID: 16367525
已經寫完 HID: 16359486
已經寫完 HID: 16323247
已經寫完 HID: 16364931
已經寫完 HID: 16342509
已經寫完 HID: 16357221
已經寫完 HID: 16354184
已經寫完 HID: 16357531
已經寫完 HID: 16349728
已經寫完 HID: 16353229
已經寫完 HID: 16346123
已經寫完 HID: 16320210
已經寫完 HID: 16332647
已經寫完 HID: 16360405
已經寫完 HID: 16356245
已經寫完 HID: 16255419
已經寫完 HID: 16339599
已經寫完 HID: 16334970
已經寫完 HID: 16367559
已經寫完 HID: 16360598
已經寫完 HID: 16352315
已經寫完 HID: 16352070
已經寫完 HID: 16301216
已經寫完 HID: 16327833
已經寫完 HID: 16327850
已經寫完 HID: 16330263
已經寫完 HID: 16310650
已經寫完 HID: 16366763
已經寫完 HID: 16368439
已經寫完 HID: 16355795
已經寫完 HID: 16369078
已經寫完 HID: 16367837
已經寫完 HID: 16262527
已經寫完 HID: 16301885
已經寫完 HID: 16280586
已經寫完 HID: 16253662
已經寫完 HID: 16333540
已經寫完 HID: 16321795
已經寫完 HID: 16358674
已經寫完 HID: 16206303
已經寫完

## 圖片轉換成數據

In [1]:
import os
import json
import numpy as np
from ultralytics import YOLO
from PIL import Image
from collections import Counter
from transformers import BertTokenizer, BertModel
import torch

# 初始化 YOLO 模型
yolo_model = YOLO("yolov8n.pt")

# 初始化 BERT 模型和 Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# 使用 YOLO 偵測物件
def detect_objects(image_path):
    results = yolo_model(image_path)
    image = Image.open(image_path)
    
    # 使用 names 屬性來獲取物品名稱，而非索引
    labels = [yolo_model.names[int(cls)] for cls in results[0].boxes.cls.tolist()]  # YOLO 偵測到的物件名稱
    return labels, image

# 提取圖片的主色
def get_dominant_color(image):
    try:
        image = image.resize((50, 50))
        pixels = np.array(image)
        
        if pixels.ndim == 3 and pixels.shape[2] == 3:
            pixels = pixels.reshape(-1, 3)  # 將圖像像素展開
        else:
            print("不是 RGB 格式，跳過此圖片")
            return None
        
        counter = Counter(map(tuple, pixels))
        dominant_color = counter.most_common(1)[0][0]
        return tuple(map(int, dominant_color))  # 確保轉換為整數類型
    except Exception as e:
        print(f"提取主要顏色時出錯: {e}，跳過此圖片")
        return None

# 使用 BERT 將物件標籤轉換為特徵數據
def bert_process_label(label):
    inputs = tokenizer(label, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    
    # 使用 BERT 最後一層的 [CLS] 標籤嵌入
    last_hidden_state = outputs.last_hidden_state
    label_embedding = last_hidden_state[:, 0, :]  # [CLS] token 的嵌入
    return label_embedding.squeeze().tolist()

# 提取圖片描述（紀錄物品、顏色和 BERT 嵌入）
def generate_descriptions(image, labels):
    descriptions = []
    dominant_color = get_dominant_color(image)
    
    if dominant_color is not None:
        color_text = f"RGB({dominant_color[0]}, {dominant_color[1]}, {dominant_color[2]})"
        
        for label in labels:
            bert_features = bert_process_label(label)
            descriptions.append({
                "object_label": label,  # YOLO 偵測到的物件名稱
                "color_text": color_text,  # 物品顏色
                "bert_features": bert_features  # BERT 提取的嵌入數據
            })
    else:
        print("無法提取主要顏色，跳過此圖片。")
    
    return descriptions

# 提取圖片特徵並儲存至JSON
def extract_image_features(hid, image_folder):
    hid_folder = os.path.join(image_folder, str(hid))
    if not os.path.exists(hid_folder):
        print(f"跳過沒有圖片的 HID: {hid}")
        return None

    images = os.listdir(hid_folder)
    if not images:
        print(f"跳過沒有圖片的 HID: {hid}")
        return None
    
    image_features = []
    for img in images:
        image_path = os.path.join(hid_folder, img)
        labels, image = detect_objects(image_path)
        VP_image = generate_descriptions(image, labels)

        image_features.append({
            "image_name": img,
            "objects": VP_image
        })
    
    return {
        "hid": hid,
        "VP_images": image_features
    }

# 提取前五筆圖片特徵並儲存至 JSON
def extract_and_save_image_features(image_folder, output_image_json):
    image_features = []
    hids = os.listdir(image_folder)

    for hid in hids:
        hid_folder_path = os.path.join(image_folder, hid)
        if os.path.isdir(hid_folder_path):
            image_feature = extract_image_features(hid, image_folder)
            if image_feature:
                image_features.append(image_feature)
            print(f"已經處理完 HID: {hid}")
    
    if image_features:
        print(f"提取到的圖片數量: {len(image_features)}")
        print(f"正在寫入文件: {output_image_json}")

        os.makedirs(os.path.dirname(output_image_json), exist_ok=True)
        with open(output_image_json, 'w', encoding='utf-8') as f:
            json.dump(image_features, f, ensure_ascii=False, indent=4)
        print(f"圖片特徵已存入 {output_image_json}")
    else:
        print("沒有提取任何圖片，無法寫入 JSON 文件。")

def main():
    image_folder = "C:\\Users\\user\\OneDrive\\桌面\\gold_house"
    output_image_json = "C:\\Users\\user\\OneDrive\\桌面\\image_features.json"
    
    extract_and_save_image_features(image_folder, output_image_json)

if __name__ == "__main__":
    main()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


image 1/1 C:\Users\user\OneDrive\\gold_house\10305143\image1.jpg: 480x640 1 chair, 2 dining tables, 1 tv, 1 remote, 1 cell phone, 266.5ms
Speed: 14.7ms preprocess, 266.5ms inference, 25.9ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 C:\Users\user\OneDrive\\gold_house\10305143\image2.jpg: 640x608 1 refrigerator, 221.4ms
Speed: 5.0ms preprocess, 221.4ms inference, 12.3ms postprocess per image at shape (1, 3, 640, 608)

image 1/1 C:\Users\user\OneDrive\\gold_house\10305143\image3.jpg: 480x640 3 refrigerators, 175.3ms
Speed: 5.4ms preprocess, 175.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 C:\Users\user\OneDrive\\gold_house\10305143\image4.jpg: 640x480 1 toilet, 1 sink, 191.8ms
Speed: 5.0ms preprocess, 191.8ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 C:\Users\user\OneDrive\\gold_house\10305143\image5.jpg: 640x480 1 toilet, 1 remote, 2 sinks, 176.4ms
Speed: 4.0ms preprocess, 176.4ms inference, 2.7ms post

## yolo and clip

In [None]:
import os
import json
import numpy as np
from ultralytics import YOLO
from PIL import Image
from collections import Counter
from transformers import BertTokenizer, BertModel
import torch

# 初始化 YOLO 模型
yolo_model = YOLO("yolov8n.pt")

# 初始化 BERT 模型和 Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# 使用 YOLO 偵測物件
def detect_objects(image_path):
    results = yolo_model(image_path)
    image = Image.open(image_path)
    
    # 使用 names 屬性來獲取物品名稱，而非索引
    labels = [yolo_model.names[int(cls)] for cls in results[0].boxes.cls.tolist()]  # YOLO 偵測到的物件名稱
    return labels, image

# 提取圖片的主色
def get_dominant_color(image):
    try:
        image = image.resize((50, 50))
        pixels = np.array(image)
        
        if pixels.ndim == 3 and pixels.shape[2] == 3:
            pixels = pixels.reshape(-1, 3)  # 將圖像像素展開
        else:
            print("不是 RGB 格式，跳過此圖片")
            return None
        
        counter = Counter(map(tuple, pixels))
        dominant_color = counter.most_common(1)[0][0]
        return tuple(map(int, dominant_color))  # 確保轉換為整數類型
    except Exception as e:
        print(f"提取主要顏色時出錯: {e}，跳過此圖片")
        return None

# 使用 BERT 將物件標籤轉換為特徵數據
def bert_process_label(label):
    inputs = tokenizer(label, return_tensors="pt")
    with torch.no_grad():
        outputs = bert_model(**inputs)
    
    # 使用 BERT 最後一層的 [CLS] 標籤嵌入
    last_hidden_state = outputs.last_hidden_state
    label_embedding = last_hidden_state[:, 0, :]  # [CLS] token 的嵌入
    return label_embedding.squeeze().tolist()

# 提取圖片描述（紀錄物品、顏色和 BERT 嵌入）
def generate_descriptions(image, labels):
    descriptions = []
    dominant_color = get_dominant_color(image)
    
    if dominant_color is not None:
        color_text = f"RGB({dominant_color[0]}, {dominant_color[1]}, {dominant_color[2]})"
        
        for label in labels:
            bert_features = bert_process_label(label)
            descriptions.append({
                "object_label": label,  # YOLO 偵測到的物件名稱
                "color_text": color_text,  # 物品顏色
                "bert_features": bert_features  # BERT 提取的嵌入數據
            })
    else:
        print("無法提取主要顏色，跳過此圖片。")
    
    return descriptions

# 提取圖片特徵並儲存至JSON
def extract_image_features(hid, image_folder):
    hid_folder = os.path.join(image_folder, str(hid))
    if not os.path.exists(hid_folder):
        print(f"跳過沒有圖片的 HID: {hid}")
        return None

    images = os.listdir(hid_folder)
    if not images:
        print(f"跳過沒有圖片的 HID: {hid}")
        return None
    
    image_features = []
    for img in images:
        image_path = os.path.join(hid_folder, img)
        labels, image = detect_objects(image_path)
        VP_image = generate_descriptions(image, labels)

        image_features.append({
            "image_name": img,
            "objects": VP_image
        })
    
    return {
        "hid": hid,
        "VP_images": image_features
    }

# 提取前五筆圖片特徵並儲存至 JSON
def extract_and_save_image_features(image_folder, output_image_json):
    image_features = []
    hids = os.listdir(image_folder)

    for hid in hids:
        hid_folder_path = os.path.join(image_folder, hid)
        if os.path.isdir(hid_folder_path):
            image_feature = extract_image_features(hid, image_folder)
            if image_feature:
                image_features.append(image_feature)
            print(f"已經處理完 HID: {hid}")
    
    if image_features:
        print(f"提取到的圖片數量: {len(image_features)}")
        print(f"正在寫入文件: {output_image_json}")

        os.makedirs(os.path.dirname(output_image_json), exist_ok=True)
        with open(output_image_json, 'w', encoding='utf-8') as f:
            json.dump(image_features, f, ensure_ascii=False, indent=4)
        print(f"圖片特徵已存入 {output_image_json}")
    else:
        print("沒有提取任何圖片，無法寫入 JSON 文件。")

def main():
    image_folder = "C:\\Users\\user\\OneDrive\\桌面\\gold_house"
    output_image_json = "C:\\Users\\user\\OneDrive\\桌面\\image_features.json"
    
    extract_and_save_image_features(image_folder, output_image_json)

if __name__ == "__main__":
    main()


## 數據合併

In [2]:
import os
import json

def merge_text_and_image_features(text_json_path, image_json_path, output_json_path):
    # 讀取文字特徵的 JSON 檔案
    with open(text_json_path, 'r', encoding='utf-8') as f:
        text_data = json.load(f)

    # 讀取圖片特徵的 JSON 檔案
    with open(image_json_path, 'r', encoding='utf-8') as f:
        image_data = json.load(f)

    # 根據 text_data 中的 hid，篩選出有相同 hid 的圖片特徵
    image_features_by_hid = {item['hid']: item['VP_images'] for item in image_data if item['hid'] in {text_item['hid'] for text_item in text_data}}

    # 將圖片特徵合併到相應的文字特徵中
    for text_item in text_data:
        hid = text_item.get('hid')
        if hid in image_features_by_hid:
            text_item['VP_images'] = image_features_by_hid[hid]
        else:
            text_item['VP_images'] = []  # 若無對應圖片，則設置為空列表

    # 將合併後的資料寫入到新的 JSON 檔案
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(text_data, f, ensure_ascii=False, indent=4)

    print(f"已經將結果存到 {output_json_path}")

def main():
    # 定義 JSON 檔案的路徑
    text_json_path = "C:\\Users\\user\\OneDrive\\桌面\\text_features_1.json"
    image_json_path = "C:\\Users\\user\\OneDrive\\桌面\\image_features.json"
    output_json_path = "C:\\Users\\user\\OneDrive\\桌面\\merged_features.json"

    # 合併 JSON 檔案
    merge_text_and_image_features(text_json_path, image_json_path, output_json_path)

if __name__ == "__main__":
    main()


已經將結果存到 C:\Users\user\OneDrive\桌面\merged_features.json


## 使用上面數據進行比對

In [12]:
import json
import uuid
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# 加載提取的特徵 JSON 文件
def load_features(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# 計算兩個向量的餘弦相似度
def cosine_similarity(v1, v2):
    v1 = torch.tensor(v1, dtype=torch.float32)
    v2 = torch.tensor(v2, dtype=torch.float32)

    # 檢查兩個向量的形狀是否相同
    if v1.shape != v2.shape:
        raise ValueError(f"向量形狀不匹配: v1 {v1.shape}, v2 {v2.shape}")

    # 將向量變成 2D 張量，以符合 cosine_similarity 的要求
    v1 = v1.unsqueeze(0)  # 添加一個維度，使其變為 (1, num_features)
    v2 = v2.unsqueeze(0)  # 添加一個維度，使其變為 (1, num_features)

    # 計算批量餘弦相似度並返回平均值
    return F.cosine_similarity(v1, v2, dim=1).mean().item()

def compare_text_features(item1, item2, text_threshold=1):
    try:
        # 比較 VW_address
        address_sim = cosine_similarity(item1['VW_address'], item2['VW_address'])
        if address_sim < text_threshold:
            return False

        # 比較其他文字特徵
        pattern_sim = cosine_similarity(item1['VW_pattern'], item2['VW_pattern']) > 0.9
        size_sim = cosine_similarity(item1['VW_size'], item2['VW_size']) >= 1 
        layer_sim = cosine_similarity(item1['VW_layer'], item2['VW_layer']) > 0.95

        return pattern_sim and size_sim and layer_sim
    except ValueError as e:
        print(f"錯誤: {e}")
        return False

# 計算兩個顏色的相似度（允許一定範圍內的誤差）
def compare_colors(color1, color2, tolerance=50):
    r1, g1, b1 = map(int, color1[4:-1].split(','))
    r2, g2, b2 = map(int, color2[4:-1].split(','))
    return abs(r1 - r2) <= tolerance and abs(g1 - g2) <= tolerance and abs(b1 - b2) <= tolerance

# 比較兩張圖片的物品和顏色
def compare_images(img1_objects, img2_objects, image_threshold=0.5):
    matched_items = 0
    max_items = max(len(img1_objects), len(img2_objects))
    
    # 如果 max_items 為 0，直接返回 False，避免除以 0
    if max_items == 0:
        return False
    
    # 比對物品名稱和顏色，如果相似數達到 image_threshold 則返回 True
    for obj1 in img1_objects:
        for obj2 in img2_objects:
            if obj1['object_label'] == obj2['object_label'] and compare_colors(obj1['color_text'], obj2['color_text']):
                matched_items += 1
                break  # 如果找到相似物品，則不再比對該物品與其他物品
    
    # 確保相似物件數量超過總物品數量的60%
    return matched_items / max_items >= image_threshold

# 比較兩張圖片的物品和顏色
def compare_images(img1_objects, img2_objects, bert_threshold=0.6, color_tolerance=50):
    matched_items = 0
    max_items = max(len(img1_objects), len(img2_objects))

    # 如果 max_items 為 0，直接返回 False，避免除以 0
    if max_items == 0:
        return False

    # 比對物品的 BERT 向量相似性和顏色相似性
    for obj1 in img1_objects:
        for obj2 in img2_objects:
            # 計算 BERT 向量的餘弦相似度
            bert_similarity = cosine_similarity(obj1['bert_features'], obj2['bert_features'])

            # 比較 RGB 顏色的相似性
            color_similarity = compare_colors(obj1['color_text'], obj2['color_text'], tolerance=color_tolerance)

            # 如果 BERT 向量相似度和顏色相似性都達標，則認為物件相似
            if bert_similarity >= bert_threshold and color_similarity:
                matched_items += 1
                break  # 找到相似物品後不再進行後續比對

    # 確保相似物件數量超過總物品數量的60%
    return matched_items / max_items >= 0.6

def find_similar_items(data, text_threshold=0.8, image_threshold=0.5):
    same_map = {}  # 用來存儲每個房子的 same 編號
    similar_pairs = []  # 存儲相同的房屋 (hid1, hid2)
    same_id_counter = 1  # 自增數字編號起始值

    # 遍歷所有房屋，進行兩兩比對
    for i in range(len(data)):  
        item1 = data[i]
        
        # 如果該房屋已經有相同編號，則跳過比對
        if item1['hid'] in same_map and same_map[item1['hid']] != "none":
            continue
        
        print(f"正在比對房屋 {item1['hid']}")  # 檢查點：列印當前比對的房屋 ID
        found_similar = False

        for j in range(i + 1, len(data)):
            item2 = data[j]

            # 如果該房屋已經有相同編號，則跳過比對
            if item2['hid'] in same_map and same_map[item2['hid']] != "none":
                continue

            # 先進行文字比對
            if compare_text_features(item1, item2, text_threshold):
                
                # 如果文字比對成功，檢查圖片部分
                if 'VP_images' not in item1 or 'VP_images' not in item2:
                    # 如果其中一個房屋沒有圖片，則直接判定為不同
                    continue
                
                # 檢查圖片是否有物件，如果圖片中沒有物件則跳過該圖片繼續比對其他圖片
                for img1 in item1['VP_images']:
                    for img2 in item2['VP_images']:
                        if 'objects' not in img1 or not img1['objects'] or 'objects' not in img2 or not img2['objects']:
                            # 如果某張圖片沒有物件，則跳過該圖片繼續比對其他圖片
                            continue

                        # 如果圖片有物件，進行物件比對
                        if compare_images(img1['objects'], img2['objects'], image_threshold):
                            found_similar = True
                            similar_pairs.append((item1['hid'], item2['hid']))  # 記錄相同的房屋
                            
                            # 檢查是否已有 same_id，沒有則生成
                            if item1['hid'] not in same_map and item2['hid'] not in same_map:
                                same_map[item1['hid']] = same_id_counter
                                same_map[item2['hid']] = same_id_counter
                                print(f"為房屋 {item1['hid']} 和 {item2['hid']} 分配 same_id: {same_id_counter}")  # 列印分配的 same_id
                                same_id_counter += 1  # 編號自增
                            elif item1['hid'] in same_map:
                                same_map[item2['hid']] = same_map[item1['hid']]
                                print(f"為房屋 {item2['hid']} 分配 same_id: {same_map[item1['hid']]}")  # 列印分配的 same_id
                            else:
                                same_map[item1['hid']] = same_map[item2['hid']]
                                print(f"為房屋 {item1['hid']} 分配 same_id: {same_map[item2['hid']]}")  # 列印分配的 same_id
            
        # 如果沒有找到相似房屋，設置 same 為 "none"
        if not found_similar:
            same_map[item1['hid']] = "none"

    # 確保每個 hid 都有 same 編號，若沒有相似房屋則設置為 "none"
    for item in data:
        if item['hid'] not in same_map:
            same_map[item['hid']] = "none"

    return same_map, similar_pairs

# 將結果存入新的 JSON 文件
def save_similarities_to_json(same_map, output_json):
    # 構建只包含 hid 和 same 的結果
    result = [{"hid": hid, "same": same} for hid, same in same_map.items()]

    # 將結果寫入新的 JSON 文件
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)

    print(f"結果已成功寫入 {output_json}")

# 打印相同房屋對
def print_similar_pairs(similar_pairs):
    print("相同的房屋對：")
    for pair in similar_pairs:
        print(f"房屋 {pair[0]} 和 房屋 {pair[1]} 是相似的")

def main():
    # 加載提取的特徵
    json_file = "C:\\Users\\user\\OneDrive\\桌面\\merged_features.json"
    output_json = "C:\\Users\\user\\OneDrive\\桌面\\similar_houses.json"
    
    data = load_features(json_file)
    
    # 找出相似房屋
    same_map, similar_pairs = find_similar_items(data)

    # 將結果保存到新的 JSON 文件
    save_similarities_to_json(same_map, output_json)

    # 打印相同的房屋對
    print_similar_pairs(similar_pairs)

if __name__ == "__main__":
    main()


正在比對房屋 16356457
為房屋 16356457 和 16325563 分配 same_id: 1
為房屋 16325563 分配 same_id: 1
為房屋 16325563 分配 same_id: 1
為房屋 16325563 分配 same_id: 1
為房屋 16325563 分配 same_id: 1
正在比對房屋 16368467
為房屋 16368467 和 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為房屋 16368496 分配 same_id: 2
為

## json匯入新資料庫

In [None]:
import mysql.connector
import json

# 創建資料庫和表
def create_database_and_table():

    # 連接到 MySQL 伺服器
    connection = mysql.connector.connect(
        host="localhost",  
        user="root",  
        password="ntubGH113402"  
    )
    cursor = connection.cursor()

    # 創建新的資料庫
    cursor.execute("CREATE DATABASE IF NOT EXISTS vmvp")

    # 使用新創建的資料庫
    cursor.execute("USE vmvp")

    # 創建存儲 BERT 和 YOLO 結果的資料表
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS house_data (
            id INT AUTO_INCREMENT PRIMARY KEY,  -- 自動遞增的主鍵
            hid VARCHAR(255) NOT NULL,  -- 房屋的唯一識別碼
            VW_address JSON,  -- 地址的 BERT 向量，使用 JSON 類型存儲
            VW_pattern JSON,  -- 房屋類型的 BERT 向量，使用 JSON 類型存儲
            VW_size JSON,  -- 房屋大小的 BERT 向量
            VW_layer JSON,  -- 樓層的 BERT 向量
            VW_servicelist JSON,  -- 服務列表的 BERT 向量
            VP_images JSON  -- YOLO 檢測結果，包括物品名稱和顏色等
        )
    """)
    print("已創建")

    # 提交更改
    connection.commit()
    return connection, cursor

# 將 JSON 數據插入到 MySQL 資料庫
def insert_data_into_mysql(json_file, connection):
    cursor = connection.cursor()

    # 定義插入數據的 SQL 語句
    sql = """
        INSERT INTO house_data (hid, VW_address, VW_pattern, VW_size, VW_layer, VW_servicelist, VP_images)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
    """

    # 打開並讀取 JSON 文件
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 遍歷 JSON 數據並插入到資料庫
    for item in data:
        hid = item.get('hid')
        VW_address = json.dumps(item.get('VW_address'))  # 將 BERT 向量轉為 JSON 字串
        VW_pattern = json.dumps(item.get('VW_pattern'))
        VW_size = json.dumps(item.get('VW_size'))
        VW_layer = json.dumps(item.get('VW_layer'))
        VW_servicelist = json.dumps(item.get('VW_servicelist'))
        VP_images = json.dumps(item.get('VP_images'))  # YOLO 結果轉為 JSON 字串

        # 執行插入操作
        cursor.execute(sql, (hid, VW_address, VW_pattern, VW_size, VW_layer, VW_servicelist, VP_images))

    # 提交更改
    connection.commit()
    print("數據已成功插入到資料庫中")

# 主函數
def main():
    # 創建新資料庫和表
    connection, cursor = create_database_and_table()

    # JSON 文件路徑
    json_file = "C:\\Users\\user\\OneDrive\\桌面\\merged_features.json"

    # 插入數據
    insert_data_into_mysql(json_file, connection)

    # 關閉連接
    cursor.close()
    connection.close()

if __name__ == "__main__":
    main()


## json匯入舊資料庫 same

In [None]:
import mysql.connector
import json

# 連接到現有的資料庫
def connect_to_mysql():
    connection = mysql.connector.connect(
        host='localhost',
        user='root',
        password='ntubGH113402',
        database='ghdetail'  # 現有的資料庫名稱
    )
    return connection

# 將 JSON 中的 same 編號更新到資料庫中的相應欄位
def update_same_in_mysql(json_file, connection):
    cursor = connection.cursor()

    # 定義 SQL 更新語句
    sql = """
        UPDATE house_data
        SET same = %s
        WHERE hid = %s
    """

    # 讀取 JSON 文件
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 遍歷 JSON 數據，根據 hid 更新資料庫中的 same 欄位
    for item in data:
        hid = item.get('hid')  # 從 JSON 文件中取得 hid
        same = item.get('same')  # 從 JSON 文件中取得 same 編號

        if hid and same:
            # 執行 SQL 更新操作
            cursor.execute(sql, (same, hid))

    # 提交更改
    connection.commit()
    print("數據已成功更新到資料庫中")

# 主函數
def main():
    # JSON 文件路徑，這裡保存了比對後的結果
    json_file = "C:\\Users\\user\\OneDrive\\桌面\\similar_houses.json"

    # 連接到 MySQL 資料庫
    connection = connect_to_mysql()

    # 更新資料庫中的 same 欄位
    update_same_in_mysql(json_file, connection)

    # 關閉連接
    connection.close()

if __name__ == "__main__":
    main()
