In [6]:
comments = [
    (1, "This one looked too perfect to be real!"),
    (2, "I thought it was real until I saw the stem."),
    (3, "It tasted too good to be fake!"),
    (4, "The texture felt off, so I'm guessing fake."),
    (5, "I've never seen this vegetable before, so I'm guessing fake."),
    (6, "It smelled strange, so I'm guessing fake."),
    (7, "The color seemed artificial, so I'm guessing fake."),
    (8, "It felt too firm to be real."),
    (9, "The taste was a bit bland, so I'm guessing fake."),
    (10, "It looked too shiny to be real."),
    (11, "The price was too low for it to be real."),
    (12, "It looked too wrinkled to be real."),
    (13, "It looked too perfect to be fake!"),
    (14, "It tasted exactly like the real thing!"),
    (15, "It had a natural smell, so I'm guessing real."),
    (16, "The texture felt just right, so I'm guessing real."),
    (17, "I recognized the vegetable immediately, so I'm guessing real."),
    (18, "It had imperfections, so I'm guessing real."),
    (19, "The color looked natural, so I'm guessing real."),
    (20, "It felt soft to the touch, so I'm guessing real."),
    (21, "It tasted fresh, so I'm guessing real."),
    (22, "It had soil residue, so I'm guessing real."),
    (23, "The price was high, so I'm guessing real."),
    (24, "It looked like it was just picked, so I'm guessing real."),
    (25, "It had irregularities, so I'm guessing real."),
    (26, "The texture felt natural, so I'm guessing real."),
    (27, "It tasted organic, so I'm guessing real."),
    (28, "It looked like it was grown locally, so I'm guessing real."),
    (29, "It had a stem attached, so I'm guessing real."),
    (30, "It looked like it was recently harvested, so I'm guessing real."),
    (31, "It tasted earthy, so I'm guessing real."),
    (32, "It felt like it was from a farmer's market, so I'm guessing real."),
    (33, "It had a leafy smell, so I'm guessing real."),
    (34, "It had a rich flavor, so I'm guessing real."),
    (35, "It had natural ridges, so I'm guessing real."),
    (36, "The seeds looked genuine, so I'm guessing real."),
    (37, "It looked like it was grown in a garden, so I'm guessing real."),
    (38, "It tasted juicy, so I'm guessing real."),
    (39, "It looked like it was grown with care, so I'm guessing real.")
]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
def search_comments(query):
    results = []
    
    vectorizer = TfidfVectorizer()

    query_tfidf = vectorizer.fit_transform([query])
    tfidf_matrix_batch = vectorizer.transform([comment[1] for comment in comments])

    similarity = cosine_similarity(tfidf_matrix_batch, query_tfidf)

    batch_results = [(comments[i][0], similarity[i][0]) for i in range(len(comments))]

    results = sorted(batch_results, key=lambda x: x[1], reverse=True)
    return results

results = search_comments("looked too perfect")
print(results)


[(1, 1.0000000000000002), (13, 1.0000000000000002), (10, 0.8164965809277261), (12, 0.8164965809277261), (3, 0.5773502691896258), (8, 0.5773502691896258), (11, 0.5773502691896258), (19, 0.5773502691896258), (24, 0.5773502691896258), (28, 0.5773502691896258), (30, 0.5773502691896258), (36, 0.5773502691896258), (37, 0.5773502691896258), (39, 0.5773502691896258), (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (9, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (29, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (38, 0.0)]


In [9]:
def search_comments_batch(query):
    batch_size = 10 
    results = []
    
    vectorizer = TfidfVectorizer()

    query_tfidf = vectorizer.fit_transform([query])

    for i in range(0, len(comments), batch_size):
        batch_comments = [comment[1] for comment in comments[i:i+batch_size]]
        tfidf_matrix_batch = vectorizer.transform(batch_comments)

        similarity = cosine_similarity(tfidf_matrix_batch, query_tfidf)

        batch_results = [(comments[i+j][0], similarity[j][0]) for j in range(len(batch_comments))]

        results.extend(batch_results)

    results = sorted(results, key=lambda x: x[1], reverse=True)
    return results

results = search_comments_batch("looked too perfect")
print(results)


[(1, 1.0000000000000002), (13, 1.0000000000000002), (10, 0.8164965809277261), (12, 0.8164965809277261), (3, 0.5773502691896258), (8, 0.5773502691896258), (11, 0.5773502691896258), (19, 0.5773502691896258), (24, 0.5773502691896258), (28, 0.5773502691896258), (30, 0.5773502691896258), (36, 0.5773502691896258), (37, 0.5773502691896258), (39, 0.5773502691896258), (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (9, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (29, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (38, 0.0)]


In [13]:
import time

start_time = time.time()
results = search_comments("looked too perfect")
end_time = time.time()
print("Time taken for search_comments:", end_time - start_time, "seconds")

start_time = time.time()
results_batch = search_comments_batch("looked too perfect")
end_time = time.time()
print("Time taken for search_comments_batch:", end_time - start_time, "seconds")


Time taken for search_comments: 0.03183102607727051 seconds
Time taken for search_comments_batch: 0.019866466522216797 seconds
