In [1]:
import faiss
import numpy as np
import json
import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from tqdm import tqdm

In [2]:
# Prepare the images dataset in the two forms used by the cells below:
#   image_database - the parsed JSON document (read by position and key later on)
#   data           - the CSV table as a DataFrame, source of every TF-IDF corpus
# The JSON file is decoded explicitly as UTF-8 so the result does not depend on
# the platform's locale encoding (e.g. cp1252 on Windows).
with open('../sources/metadata/image_database.json', 'r', encoding='utf-8') as f:
    image_database = json.load(f)
data = pd.read_csv('../data mining/image_database.csv')

In [3]:
# One shared TF-IDF vectorizer for the whole notebook. Every index-building
# cell below calls fit_transform() on it again, so its vocabulary always
# belongs to the most recently built index - run each query cell directly
# after its matching build cell.
vectorizer = TfidfVectorizer()

In [None]:
# Background-class index: TF-IDF over the `background_class` column, stored in
# a flat inner-product FAISS index and written to disk.
clean_data = data.dropna(subset=['background_class'])  # rows without a label are left out
start_time = time.time()

# The sparse TF-IDF output is converted to a dense float32 matrix before it is
# handed to FAISS.
tfidf_matrix = vectorizer.fit_transform(clean_data['background_class'])
tfidf_dense = tfidf_matrix.toarray().astype('float32')
print(f"feature extraction duration: {time.time() - start_time}")

n_features = tfidf_dense.shape[1]
bg_index = faiss.IndexFlatIP(n_features)
bg_index.add(tfidf_dense)
faiss.write_index(bg_index, r'D:\image_search_engine_ai-end\sources\metadata\faiss_background_features_index.idx')

In [None]:
# Smoke-test the background index, using the first database entry as the query.
# Relies on `vectorizer`, `bg_index` and `clean_data` still being the objects
# produced by the background-class build cell.
query = image_database[0]['background_class']
query_tfidf = vectorizer.transform([query]).toarray().astype('float32')

k = 10
_, I = bg_index.search(query_tfidf, k)

print("Query:", query)
for idx in I[0]:
    # FAISS pads the id list with -1 when it cannot return k neighbours.
    if idx < 0:
        continue
    # Ids are row positions in `clean_data` (the NaN-filtered frame the index
    # was built from), not positions in `image_database`.
    print(clean_data.iloc[idx]['path'])

In [None]:
# Keep only the rows that actually carry a caption.
clean_data = data[data['caption'].notna()]

In [None]:
# Caption index: TF-IDF over the `caption` column, stored in a flat L2 FAISS
# index and written to disk.
start_time = time.time()

# Each caption is vectorised without its first 19 characters
# (presumably a fixed boilerplate prefix - TODO confirm against the captioning step).
caption_corpus = clean_data['caption'].str[19:]
tfidf_matrix = vectorizer.fit_transform(caption_corpus)
tfidf_dense = tfidf_matrix.toarray().astype('float32')
print(f"feature extraction duration: {time.time() - start_time}")

n_features = tfidf_dense.shape[1]
caption_index = faiss.IndexFlatL2(n_features)
caption_index.add(tfidf_dense)
faiss.write_index(caption_index, r'D:\image_search_engine_ai-end\sources\metadata\faiss_context_features_index.idx')

In [None]:
# Smoke-test the caption index, using the first database entry as the query.
# Relies on `vectorizer`, `caption_index` and `clean_data` still being the
# objects produced by the caption build cells.
query = image_database[0]['caption']
# The corpus was fitted on captions with their first 19 characters removed, so
# the query receives the same preprocessing.
# NOTE(review): assumes the JSON caption has the same format as the CSV one - confirm.
query_tfidf = vectorizer.transform([query[19:]]).toarray().astype('float32')

k = 30
_, I = caption_index.search(query_tfidf, k)

print("Query:", query)
for idx in I[0]:
    # FAISS pads the id list with -1 when it cannot return k neighbours.
    if idx < 0:
        continue
    # Ids are row positions in `clean_data` (the NaN-filtered frame the index
    # was built from), not positions in `image_database`.
    print(clean_data.iloc[idx]['path'])


In [None]:
# Objects index: TF-IDF over the `objects_label` column, stored in a flat L2
# FAISS index, written to disk, then queried with the first database entry.
clean_data = data.dropna(subset=['objects_label'])
start_time = time.time()
# Each row holds its labels as one comma-separated string; the commas are
# turned into spaces before the text is handed to the vectorizer.
tfidf_matrix = vectorizer.fit_transform(
    clean_data['objects_label'].str.replace(',', ' ', regex=False)
)
tfidf_dense = tfidf_matrix.toarray().astype('float32')
print(f"feature extraction duration: {time.time() - start_time}")

objects_index = faiss.IndexFlatL2(tfidf_dense.shape[1])
objects_index.add(tfidf_dense)
faiss.write_index(objects_index, r'D:\image_search_engine_ai-end\sources\metadata\faiss_objects_features_index.idx')

# In the JSON database the labels are a sequence, so they are joined directly.
query = ' '.join(image_database[0]['objects_label'])
query_tfidf = vectorizer.transform([query]).toarray().astype('float32')

k = 50              # number of neighbours requested from FAISS
max_distance = 0.5  # only hits closer than this are reported as matches
D, I = objects_index.search(query_tfidf, k)
print("Query:", query)
for idx, score in zip(I[0], D[0]):
    # FAISS pads the id list with -1 when it cannot return k neighbours.
    if idx < 0:
        continue
    print(score)
    if score < max_distance:
        # Ids are row positions in `clean_data` (the NaN-filtered frame the
        # index was built from), not positions in `image_database`.
        print(clean_data.iloc[idx]['path'])

In [None]:
# Shape of the subset whose `text` value is exactly "ken", ignoring case.
clean_data = data.dropna(subset=['text'])
is_ken = clean_data['text'].str.lower() == 'ken'
clean_data[is_ken].shape

In [None]:
# Preview the path/text columns with a fresh 0..n-1 index.
clean_data[['path', 'text']].reset_index(drop=True)

In [None]:
# Text index: TF-IDF over the lower-cased `text` column, stored in a flat L2
# FAISS index and written to disk together with a JSON list that maps each
# index position back to its path/text pair.
clean_data = data.dropna(subset=['text'])[['path', 'text']].reset_index(drop=True)
start_time = time.time()
tfidf_matrix = vectorizer.fit_transform(clean_data['text'].str.lower())
tfidf_dense = tfidf_matrix.toarray().astype('float32')
print(f"feature extraction duration: {time.time() - start_time}")

text_index = faiss.IndexFlatL2(tfidf_dense.shape[1])
text_index.add(tfidf_dense)
faiss.write_index(text_index, r'D:\image_search_engine_ai-end\sources\metadata\faiss_text_features_index.idx')

# Record i of this list describes vector i of `text_index`.
docs_n_boks = clean_data.to_dict('records')

query = 'healthy tip'
query_tfidf = vectorizer.transform([query]).toarray().astype('float32')

k = 20
_, I = text_index.search(query_tfidf, k)
print("Query:", query)
for idx in I[0]:
    # FAISS pads the id list with -1 when it cannot return k neighbours; without
    # this guard a -1 would silently select the last record.
    if idx < 0:
        continue
    print(idx)
    # Both lookups are printed so a mismatch between the record list and the
    # frame would be visible.
    print(docs_n_boks[idx]['path'])
    print(clean_data.iloc[idx]['path'])

with open(r'D:\image_search_engine_ai-end\sources\metadata\docs_n_boks.json', 'w') as f:
    json.dump(docs_n_boks, f, indent=4, default=vars)

In [6]:
# Faces index: TF-IDF over the lower-cased `faces_label` column of the rows
# whose `folder` is 'camera', stored in a flat L2 FAISS index, written to disk,
# then queried with a single name.
clean_data = (
    data[data['folder'] == 'camera'][['path', 'faces_label']]
    .dropna(subset=['faces_label'])
    .reset_index(drop=True)
)
display(clean_data)
start_time = time.time()
tfidf_matrix = vectorizer.fit_transform(clean_data['faces_label'].str.lower())
tfidf_dense = tfidf_matrix.toarray().astype('float32')
print(f"feature extraction duration: {time.time() - start_time}")

# NOTE(review): this reuses the name `text_index`, replacing the index built
# from the `text` column; a dedicated name such as `faces_index` would avoid
# the shadowing - confirm no other cell depends on the current name first.
text_index = faiss.IndexFlatL2(tfidf_dense.shape[1])
text_index.add(tfidf_dense)
faiss.write_index(text_index, r'D:\image_search_engine_ai-end\sources\metadata\faiss_faces_features_index.idx')
query = 'lily'
query_tfidf = vectorizer.transform([query]).toarray().astype('float32')

k = 20
_, I = text_index.search(query_tfidf, k)
print("Query:", query)
for idx in I[0]:
    # FAISS pads the id list with -1 when it cannot return k neighbours; without
    # this guard a -1 would silently select the last row.
    if idx < 0:
        continue
    hit = clean_data.iloc[idx]
    print(hit['faces_label'])
    print(hit['path'])


Unnamed: 0,path,faces_label
0,D:\image_search_engine_ai-end\sources\photos\c...,robin
1,D:\image_search_engine_ai-end\sources\photos\c...,robin
2,D:\image_search_engine_ai-end\sources\photos\c...,robin
3,D:\image_search_engine_ai-end\sources\photos\c...,robin
4,D:\image_search_engine_ai-end\sources\photos\c...,robin
...,...,...
590,D:\image_search_engine_ai-end\sources\photos\c...,"robin, barney"
591,D:\image_search_engine_ai-end\sources\photos\c...,lily
592,D:\image_search_engine_ai-end\sources\photos\c...,"barney, robin, marshel, lily"
593,D:\image_search_engine_ai-end\sources\photos\c...,robin


feature extraction duration: 0.16283392906188965
Query: lily
lily
D:\image_search_engine_ai-end\sources\photos\camera\000100.jpg
lily
D:\image_search_engine_ai-end\sources\photos\camera\000704.jpg
lily
D:\image_search_engine_ai-end\sources\photos\camera\000768.jpg
lily
D:\image_search_engine_ai-end\sources\photos\camera\000814.jpg
lily
D:\image_search_engine_ai-end\sources\photos\camera\000993.jpg
lily
D:\image_search_engine_ai-end\sources\photos\camera\001025.jpg
robin, lily
D:\image_search_engine_ai-end\sources\photos\camera\000032.jpg
robin, lily
D:\image_search_engine_ai-end\sources\photos\camera\000038.jpg
robin, lily
D:\image_search_engine_ai-end\sources\photos\camera\000650.jpg
robin, lily
D:\image_search_engine_ai-end\sources\photos\camera\000718.jpg
lily, robin
D:\image_search_engine_ai-end\sources\photos\camera\000795.jpg
lily, robin
D:\image_search_engine_ai-end\sources\photos\camera\000825.jpg
lily, robin
D:\image_search_engine_ai-end\sources\photos\camera\000843.jpg
robin,