In [1]:
import pandas as pd

# Load the CSV file
csv_file_path = '/kaggle/input/ir-assignment-2/A2_Data.csv'
df = pd.read_csv(csv_file_path)

# Convert string representation of list to actual list
df['Image'] = df['Image'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))

# Create a new DataFrame to store the split rows
new_rows = []

# Iterate over each row in the original DataFrame
for index, row in df.iterrows():
    # Extract image URLs from the row
    image_urls = row['Image']
    
    # Iterate over each image URL
    for image_url in image_urls:
        # Create a new row with the same review text and serial number
        new_row = {
            'Image': image_url,
            'Review Text': row['Review Text']
        }
        # Append the new row to the list
        new_rows.append(new_row)

# Create a new DataFrame from the list of new rows
new_df = pd.DataFrame(new_rows)

# Add a new column for serial number
new_df['S.No'] = range(1, len(new_df) + 1)

# Reorder columns to match the original DataFrame
new_df = new_df[['S.No', 'Image', 'Review Text']]

# Define the path for the new CSV file
new_csv_file_path = '/kaggle/working/new_data.csv'

# Save the new DataFrame as a CSV file
new_df.to_csv(new_csv_file_path, index=False)

print(f"New CSV file saved at: {new_csv_file_path}")


New CSV file saved at: /kaggle/working/new_data.csv


In [64]:
!pip install beautifulsoup4


  pid, fd = os.forkpty()




In [67]:
import pandas as pd
import numpy as np  # Import numpy for handling NaN values
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import spacy

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

import os

def preprocess_text(text):
    if pd.isnull(text):  # Check for NaN values
        return ''

    # Check if the input text is a filename
    if os.path.isfile(text):
        # Read the file contents
        with open(text, 'r') as file:
            text = file.read()

    # Remove HTML tags using BeautifulSoup
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Removing punctuations
    tokens = [token for token in tokens if token not in string.punctuation]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization using spaCy
    doc = nlp(" ".join(tokens))
    tokens = [token.lemma_ for token in doc]

    # Joining tokens back to text
    processed_text = ' '.join(tokens)

    return processed_text


# Load CSV file
df = pd.read_csv('/kaggle/working/new_data.csv')

# Apply preprocessing to the 'Review Text' column
df['review_text_processed'] = df['Review Text'].apply(preprocess_text)
# Drop the 'Review Text' column
df.drop(columns=['Review Text'], inplace=True)

# Save the preprocessed data to a new CSV file
df.to_csv('preprocessed_data.csv', index=False)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


  text = BeautifulSoup(text, 'html.parser').get_text()


In [2]:
import pandas as pd
import numpy as np
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.models import Model
import pickle

# Load the CSV file
csv_file_path = '/kaggle/working/new_data.csv'
df = pd.read_csv(csv_file_path)

# Convert string representation of list to actual list
df['Image'] = df['Image'].apply(lambda x: x.strip("[]").replace("'", "").split(", "))

# Function to download and preprocess image with random augmentation
def preprocess_image(url):
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))

    # Resize
    img = img.resize((224, 224))

    # Random flips
    if np.random.rand() < 0.5:
        img = img.transpose(Image.FLIP_LEFT_RIGHT)
    if np.random.rand() < 0.5:
        img = img.transpose(Image.FLIP_TOP_BOTTOM)

    # Brightness adjustment
    enhancer = ImageEnhance.Brightness(img)
    img = enhancer.enhance(np.random.uniform(0.5, 1.5))

    # Exposure adjustment
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(np.random.uniform(0.5, 1.5))

    # Convert to numpy array
    img = np.array(img)

    # Preprocess for ResNet50 model
    img = preprocess_input(img)

    return img

# Load pre-trained ResNet50 model
base_model = ResNet50(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

# Function to extract features from image
def extract_features(urls):
    features_list = []
    for url in urls:
        try:
            img = preprocess_image(url)
            img = np.expand_dims(img, axis=0)
            features = model.predict(img)
            features_list.append(features.flatten())
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            features_list.append(None)
    return np.array(features_list)  # Convert to NumPy array

# Extract features for each list of URLs in the 'URL' column and store in a list
df['Features'] = df['Image'].apply(extract_features)

# Save the NumPy array
with open("features_resnet50.pkl", "wb") as f:
    pickle.dump(df['Features'], f)


2024-03-14 05:08:00.032641: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 05:08:00.032771: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 05:08:00.212579: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5
[1m102967424/102967424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

# Image Based Retrieval

In [125]:
import numpy as np
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
import math
import pickle
import os

# Function to preprocess image with random augmentation
def preprocess_image(url):
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))

    # Resize
    img = img.resize((224, 224))

    # Random flips
    if np.random.rand() < 0.5:
        img = img.transpose(Image.FLIP_LEFT_RIGHT)
    if np.random.rand() < 0.5:
        img = img.transpose(Image.FLIP_TOP_BOTTOM)

    # Brightness adjustment
    enhancer = ImageEnhance.Brightness(img)
    img = enhancer.enhance(np.random.uniform(0.5, 1.5))

    # Exposure adjustment
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(np.random.uniform(0.5, 1.5))

    # Convert to numpy array
    img = np.array(img)

    # Preprocess for ResNet50 model
    img = preprocess_input(img)

    return img

# Load pre-trained ResNet50 model
base_model = ResNet50(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

# Function to extract features from image
def extract_features(urls):
    features_list = []
    for url in urls:
        try:
            img = preprocess_image(url)
            img = np.expand_dims(img, axis=0)
            features = model.predict(img)
            features_list.append(features.flatten())
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            features_list.append(None)
    return features_list


# Ask the user to provide a URL
url = input("Please enter the URL of the image: ")

# Extract features for the provided URL
image_features = extract_features([url])

# Check if features were successfully extracted
if image_features[0] is not None and len(image_features[0]) > 0:
    
    # Convert the list of arrays into a single NumPy array
    image_features_array = np.concatenate(image_features)

    # Save image features array to a file
    np.save("extracted_image_features.npy", image_features_array)
    print("\nImage features array saved successfully.")

Please enter the URL of the image:  https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step

Image features array saved successfully.


In [133]:
def preprocess_text(text):
    if pd.isnull(text):  # Check for NaN values
        return ''

    # Check if the input text is a filename
    if os.path.isfile(text):
        # Read the file contents
        with open(text, 'r') as file:
            text = file.read()

    # Remove HTML tags using BeautifulSoup
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Lowercasing
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Removing punctuations
    tokens = [token for token in tokens if token not in string.punctuation]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization using spaCy
    doc = nlp(" ".join(tokens))
    tokens = [token.lemma_ for token in doc]

    # Joining tokens back to text
    processed_text = ' '.join(tokens)

    return processed_text

text = input("Please enter the text: ")
    
    # Preprocess text
processed_text = preprocess_text(text)
print("Preprocessed Text:")
print(processed_text)
    


# Function to load TF-IDF matrix from pickle
def load_tfidf_matrix(file_path):
    with open(file_path, 'rb') as f:
        tfidf_matrix = pickle.load(f)
    return tfidf_matrix

# Function to calculate TF-IDF for a given text
def calculate_tfidf(text, tfidf_matrix):
    tfidf = {}
    words = text.split()
    for word in words:
        tfidf[word] = 0  # Initialize TF-IDF value for the word
        for doc in tfidf_matrix:
            if word in doc:
                tfidf[word] = doc[word]
                break  # Exit the loop once TF-IDF value is found for the word
    return tfidf

# Load TF-IDF matrix from pickle file
file_path = "tfidf_matrix.pkl"  # Replace with the actual file path
tfidf_matrix = load_tfidf_matrix(file_path)

tfidf = calculate_tfidf(processed_text, tfidf_matrix)
print(tfidf)


# Function to save TF-IDF to a pickle file
def save_tfidf_to_pickle(tfidf_values, file_path):
    with open(file_path, 'wb') as f:
        pickle.dump(tfidf_values, f)

# Calculate TF-IDF for the user's text
tfidf = calculate_tfidf(processed_text, tfidf_matrix)

# Save TF-IDF values to a pickle file
output_file_path = "user_text_tfidf.pkl"  # Choose the desired output file path
save_tfidf_to_pickle(tfidf, output_file_path)

print("\nTF-IDF values saved to:", output_file_path)


Please enter the text:  I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.


Preprocessed Text:
use fender lock tuner five year various strat tele definitely help tune stability way fast restring break
{'use': 0.049737861063236466, 'fender': 0.03748448064597976, 'lock': 0.037177528986696746, 'tuner': 0.12808072443146698, 'five': 0.03773459475270186, 'year': 0.03795969546950136, 'various': 0.04608509155389261, 'strat': 0.1999994321942258, 'tele': 0.2163617823249111, 'definitely': 0.04882289859499056, 'help': 0.1646600498299718, 'tune': 0.1068204912713258, 'stability': 0.35515127923318696, 'way': 0.17083490958126993, 'fast': 0.03929961906824721, 'restring': 0.12809396529910105, 'break': 0.22507151633351044}

TF-IDF values saved to: user_text_tfidf.pkl


In [141]:
import numpy as np
import csv
import pickle

# Define a function to compute cosine similarity from scratch
def cosine_similarity_scratch(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

# Load the pickle file
with open('features_resnet50.pkl', 'rb') as f:
    data = pickle.load(f)

# Load the numpy array
your_numpy_array = np.load('/kaggle/working/extracted_image_features.npy')

# Ensure that your_numpy_array is 1-dimensional
your_numpy_array = your_numpy_array.ravel()

# Compute cosine similarity for each key
similarities = []

for key in data.keys():
    feature_vector = data[key]

    # Skip if feature vector is None
    if feature_vector is None:
        print(f"Skipping key '{key}' as feature vector is None")
        continue

    # Check if feature vector contains numeric values
    if not np.issubdtype(feature_vector.dtype, np.number):
        continue

    # Ensure that feature_vector is 1-dimensional
    feature_vector = feature_vector.ravel()

    # Compute cosine similarity if both arrays are not empty
    if len(your_numpy_array) > 0 and len(feature_vector) > 0:
        similarity = cosine_similarity_scratch(feature_vector, your_numpy_array)
        similarities.append((key, similarity))

# Sort similarities in descending order
similarities.sort(key=lambda x: x[1], reverse=True)

# Store the top 3 highest similarities
top_similarities = similarities[:3]

# Load CSV and match keys with serial numbers
with open('/kaggle/working/preprocessed_data.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip the first row
    for row in reader:
        for key, similarity in top_similarities:
            if int(row[0]) == int(key):  # Assuming the serial numbers are in the first column
                print("Image Number:", row[0])
                print("Image:", row[1])
                print("Text:", row[2])
                print("Cosine Similarity of Image:", similarity)
                print()  # Add an empty line for readability



Image Number: 887
Image: https://images-na.ssl-images-amazon.com/images/I/5134EWdp6lL._SY88.jpg
Text: love guitar honestly never hold squire brand strat compare everything ' ve hold play hear past guitar really give good bang buck especially be like pretty much poor dirt think guitar cheap would probably want get small upgrade ' m pretty content get first 10 minute play recommend guitar pretty much anyone want strat incredible quality price
Cosine Similarity of Image: 0.7026417

Image Number: 1099
Image: https://images-na.ssl-images-amazon.com/images/I/61Wqw4GwL8L._SY88.jpg
Text: ' ve bough work good sure else say cheap - look hold violin bow wall way
Cosine Similarity of Image: 0.83370674

Image Number: 1244
Image: https://images-na.ssl-images-amazon.com/images/I/71N0t6HU37L._SY88.jpg
Text: man word say catalina maple drum set place head first hit sound like cannon perfect sound make want play day make want start dance eye set product church product know would next purchase market new

In [16]:
import math
import pickle
import csv

def cosine_similarity(vector1, vector2):
    dot_product = sum(vector1[key] * vector2.get(key, 0) for key in vector1)
    
    magnitude_vector1 = math.sqrt(sum(value ** 2 for value in vector1.values()))
    magnitude_vector2 = math.sqrt(sum(value ** 2 for value in vector2.values()))
    
    if magnitude_vector1 == 0 or magnitude_vector2 == 0:
        return 0
    
    cosine_similarity = dot_product / (magnitude_vector1 * magnitude_vector2)
    return cosine_similarity

# Load dictionaries from the first pickle file
with open('/kaggle/working/tfidf_matrix.pkl', 'rb') as f:
    dictionaries = pickle.load(f)

# Load user dictionary from the second pickle file
with open('/kaggle/working/user_text_tfidf.pkl', 'rb') as f:
    user_dictionary = pickle.load(f)

# Function to get the full row from a CSV file based on the serial number
def get_row(serial_number):
    with open('/kaggle/working/preprocessed_data.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if row[0] == serial_number:
                return row
    return None

# Compute cosine similarity for each dictionary in the first pickle file and the user dictionary
similarities = []
for i, vector in enumerate(dictionaries):
    similarity = cosine_similarity(user_dictionary, vector)
    similarities.append((i, similarity))

# Sort similarities by cosine similarity in descending order
similarities.sort(key=lambda x: x[1], reverse=True)

# Get three indices from the user
user_entered_indices = [int(index) for index in input("Enter three indices separated by spaces: ").split()[:3]]

# Print similarities for the entered indices
for idx, (index, similarity) in enumerate(similarities):
    serial_number = index + 1  # Assuming serial numbers start from 1
    if serial_number in user_entered_indices:
        full_row = get_row(str(serial_number))
        if full_row:
            print(f"Top {idx+1}: Cosine Similarity between user dictionary and vector {index+1}: {similarity}")
        else:
            print(f"Top {idx+1}: Cosine Similarity between user dictionary and vector {index+1}: {similarity}")
            print("No corresponding row found in the CSV for serial number", serial_number)


Enter three indices separated by spaces:  887 1099 1244


Top 86: Cosine Similarity between user dictionary and vector 887: 0.07959818016311464
Top 148: Cosine Similarity between user dictionary and vector 1099: 0.0548740674436874
Top 1487: Cosine Similarity between user dictionary and vector 1244: 0.0


In [17]:
# Define weights for cosine similarity components
image_similarity_weight = 0.7
review_similarity_weight = 0.3

# Define cosine similarity values for each pair of (image, review)
cosine_similarity_values = {
    "image1_review1": (0.7026417, 0.07959818016311464),
    "image2_review2": (0.83370674, 0.0548740674436874),
    "image3_review3": (0.8736638, 0)
}

# Calculate composite score for each pair
composite_scores = {}
for pair, similarities in cosine_similarity_values.items():
    image_similarity, review_similarity = similarities
    composite_score = (image_similarity_weight * image_similarity) + (review_similarity_weight * review_similarity)
    composite_scores[pair] = composite_score

# Print composite scores
for pair, composite_score in composite_scores.items():
    print(f"Composite score for {pair}: {composite_score}")


Composite score for image1_review1: 0.5157286440489344
Composite score for image2_review2: 0.6000569382331061
Composite score for image3_review3: 0.61156466


In [18]:
# Define weights for cosine similarity components
image_similarity_weight = 0.7
review_similarity_weight = 0.3

# Get input from the user for three pairs of (image, review)
user_input_pairs = []
for i in range(3):
    image_similarity, review_similarity = input(f"Enter the cosine similarities for pair {i+1} (image, review) separated by space: ").split()
    user_input_pairs.append((float(image_similarity), float(review_similarity)))

# Calculate composite score for each user input pair
composite_scores = {}
for i, similarities in enumerate(user_input_pairs, start=1):
    image_similarity, review_similarity = similarities
    composite_score = (image_similarity_weight * image_similarity) + (review_similarity_weight * review_similarity)
    composite_scores[f"image{i}_review{i}"] = composite_score

# Sort pairs based on composite scores in descending order
ranked_pairs = sorted(composite_scores.items(), key=lambda x: x[1], reverse=True)

# Print the ranked pairs
for rank, (pair, composite_score) in enumerate(ranked_pairs, start=1):
    print(f"Rank {rank}: {pair} - Composite Score: {composite_score}")


Enter the cosine similarities for pair 1 (image, review) separated by space:  0.7026417 0.07959
Enter the cosine similarities for pair 2 (image, review) separated by space:  0.83370 0.05487
Enter the cosine similarities for pair 3 (image, review) separated by space:  0.87366 0


Rank 1: image3_review3 - Composite Score: 0.6115619999999999
Rank 2: image2_review2 - Composite Score: 0.6000509999999999
Rank 3: image1_review1 - Composite Score: 0.51572619


# TEXT BASED RETRIEVAL

In [111]:
import pandas as pd
import numpy as np
import math

# Read the CSV file
df = pd.read_csv('/kaggle/working/preprocessed_data.csv')

# Assuming 'review_text_processed' is the name of the column containing text data
text_column = 'review_text_processed'  # Change to the actual name of your text column

# Drop rows with NaN values in the text column
df = df.dropna(subset=[text_column])

# Get text data
text_data = df[text_column].tolist()

# Function to calculate term frequency (TF)
def calculate_tf(text):
    words = text.split()
    word_count = len(words)
    term_frequency = {}
    for word in words:
        term_frequency[word] = term_frequency.get(word, 0) + 1
    for word, freq in term_frequency.items():
        term_frequency[word] = freq / word_count
    return term_frequency

# Function to calculate inverse document frequency (IDF)
def calculate_idf(corpus):
    num_documents = len(corpus)
    word_document_count = {}
    for document in corpus:
        unique_words = set(document.split())
        for word in unique_words:
            word_document_count[word] = word_document_count.get(word, 0) + 1
    inverse_document_frequency = {}
    for word, doc_count in word_document_count.items():
        inverse_document_frequency[word] = math.log(num_documents / (1 + doc_count))
    return inverse_document_frequency

# Function to calculate TF-IDF
def calculate_tfidf(text, idf):
    tfidf = {}
    tf = calculate_tf(text)
    for word, tf_value in tf.items():
        if word in idf:
            tfidf[word] = tf_value * idf[word]
    return tfidf

# Calculate IDF
idf = calculate_idf(text_data)

# Calculate TF-IDF for each document
tfidf_matrix = []
for document in text_data:
    tfidf_matrix.append(calculate_tfidf(document, idf))

# Convert TF-IDF matrix to a numpy array
tfidf_matrix = np.array(tfidf_matrix)

# Print the TF-IDF matrix
print("TF-IDF Matrix:")
print(tfidf_matrix)

# Save the TF-IDF matrix as a pickle file
with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)


TF-IDF Matrix:
[{'love': 0.13095286797705646, 'vintage': 0.5173800274081972, 'spring': 0.6945981537121895, 'strat': 0.1999994321942258, 'good': 0.08983916897305623, 'tension': 0.3227840915144069, 'great': 0.07533781605573306, 'stability': 0.35515127923318696, 'float': 0.3339210304919513, 'bridge': 0.2230845146420814, 'want': 0.13858897075920396, 'way': 0.17083490958126993, 'go': 0.12055258474528437}
 {'work': 0.06045305669636706, 'great': 0.047086135034833165, 'guitar': 0.05017396720790491, 'bench': 0.2415530340726475, 'mat': 0.2415530340726475, 'rugge': 0.23395630253956604, 'enough': 0.10331071020918482, 'abuse': 0.22196954952074183, 'take': 0.18588008313365093, 'care': 0.339542185000203, 'make': 0.058553993842353755, 'organization': 0.2508506820440729, 'workspace': 0.2508506820440729, 'much': 0.08130790061749744, 'easy': 0.07905509973123596, 'screw': 0.1077678818571918, 'will': 0.10182775416323887, 'not': 0.04084259469666456, 'roll': 0.19577752204480958, 'around': 0.10612076357566533

In [161]:
import math
import pickle
import csv

def cosine_similarity(vector1, vector2):
    dot_product = sum(vector1[key] * vector2.get(key, 0) for key in vector1)
    
    magnitude_vector1 = math.sqrt(sum(value ** 2 for value in vector1.values()))
    magnitude_vector2 = math.sqrt(sum(value ** 2 for value in vector2.values()))
    
    if magnitude_vector1 == 0 or magnitude_vector2 == 0:
        return 0
    
    cosine_similarity = dot_product / (magnitude_vector1 * magnitude_vector2)
    return cosine_similarity

# Load dictionaries from the first pickle file
with open('/kaggle/working/tfidf_matrix.pkl', 'rb') as f:
    dictionaries = pickle.load(f)

# Load user dictionary from the second pickle file
with open('/kaggle/working/user_text_tfidf.pkl', 'rb') as f:
    user_dictionary = pickle.load(f)

# Function to get the full row from a CSV file based on the serial number
def get_row(serial_number):
    with open('/kaggle/working/preprocessed_data.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            if row[0] == serial_number:
                return row
    return None

# Compute cosine similarity for each dictionary in the first pickle file and the user dictionary
similarities = []
for i, vector in enumerate(dictionaries):
    similarity = cosine_similarity(user_dictionary, vector)
    similarities.append((i, similarity))

# Sort similarities by cosine similarity in descending order
similarities.sort(key=lambda x: x[1], reverse=True)

# Print the top 3 highest cosine similarity values and their corresponding indices
for idx, (index, similarity) in enumerate(similarities[:3]):
    serial_number = index + 1  # Assuming serial numbers start from 1
    full_row = get_row(str(serial_number))
    if full_row:
        print(f"Top {idx+1}: Cosine Similarity between user dictionary and vector {index+1}: {similarity}")
        print("Full row from CSV:", full_row)
    else:
        print(f"Top {idx+1}: Cosine Similarity between user dictionary and vector {index+1}: {similarity}")
        print("No corresponding row found in the CSV for serial number", serial_number)


Top 1: Cosine Similarity between user dictionary and vector 1245: 0.8306238893131533
Full row from CSV: ['1245', 'https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg', 'use fender lock tuner five year various strat tele definitely help tune stability way fast restring break']
Top 2: Cosine Similarity between user dictionary and vector 1: 0.2811710011960442
Full row from CSV: ['1', 'https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg', 'love vintage spring vintage strat good tension great stability float bridge want spring way go']
Top 3: Cosine Similarity between user dictionary and vector 621: 0.2418052370248125
Full row from CSV: ['621', 'https://images-na.ssl-images-amazon.com/images/I/61Np-qH9ZVL._SY88.jpg', 'fit tele build perfectly look great']


In [5]:
import numpy as np
import csv
import pickle

# Define a function to compute cosine similarity from scratch
def cosine_similarity_scratch(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

# Load the pickle file
with open('features_resnet50.pkl', 'rb') as f:
    data = pickle.load(f)

# Print the number of keys present in the dataset
num_keys = len(data)
print(f"Number of keys present in the dataset: {num_keys}")

# Load the numpy array
your_numpy_array = np.load('/kaggle/working/extracted_image_features.npy')

# Ensure that your_numpy_array is 1-dimensional
your_numpy_array = your_numpy_array.ravel()

user_indices = []

# Get indices from the user one by one
for i in range(3):
    while True:
        try:
            index = int(input(f"Enter index {i+1} (between 0 and {num_keys - 1}): "))
            if 0 <= index < num_keys:
                user_indices.append(index)
                break
            else:
                print(f"Index must be between 0 and {num_keys - 1}. Try again.")
        except ValueError:
            print("Invalid input. Please enter a valid integer index.")

# Compute cosine similarity for user-provided indices
for index in user_indices:
    feature_vector = data[index]

    # Skip if feature vector is None
    if feature_vector is None:
        print(f"Skipping index '{index}' as feature vector is None")
        continue

    # Check if feature vector contains numeric values
    if not np.issubdtype(feature_vector.dtype, np.number):
        print(f"Feature vector for index '{index}' does not contain numeric values")
        continue

    # Ensure that feature_vector is 1-dimensional
    feature_vector = feature_vector.ravel()

    # Compute cosine similarity if both arrays are not empty
    if len(your_numpy_array) > 0 and len(feature_vector) > 0:
        similarity = cosine_similarity_scratch(feature_vector, your_numpy_array)
        print(f"Index: {index}, Cosine Similarity of image: {similarity}")
    else:
        print(f"At least one of the arrays for index '{index}' is empty")


Number of keys present in the dataset: 1648


Enter index 1 (between 0 and 1647):  1245
Enter index 2 (between 0 and 1647):  1
Enter index 3 (between 0 and 1647):  621


Index: 1245, Cosine Similarity of image: 0.4557837247848511
Index: 1, Cosine Similarity of image: 0.505005419254303
Index: 621, Cosine Similarity of image: 0.39874082803726196


In [19]:
# Define weights for cosine similarity components
image_similarity_weight = 0.4
review_similarity_weight = 0.4

# Get input from the user for three pairs of (image, review)
user_input_pairs = []
for i in range(3):
    image_similarity, review_similarity = input(f"Enter the cosine similarities for pair {i+1} (image, review) separated by space: ").split()
    user_input_pairs.append((float(image_similarity), float(review_similarity)))

# Calculate composite score for each user input pair
composite_scores = {}
for i, similarities in enumerate(user_input_pairs, start=1):
    image_similarity, review_similarity = similarities
    composite_score = (image_similarity_weight * image_similarity) + (review_similarity_weight * review_similarity)
    composite_scores[f"image{i}_review{i}"] = composite_score

# Sort pairs based on composite scores in descending order
ranked_pairs = sorted(composite_scores.items(), key=lambda x: x[1], reverse=True)

# Print the ranked pairs
for rank, (pair, composite_score) in enumerate(ranked_pairs, start=1):
    print(f"Rank {rank}: {pair} - Composite Score: {composite_score}")


Enter the cosine similarities for pair 1 (image, review) separated by space:  0.45578 0.83062
Enter the cosine similarities for pair 2 (image, review) separated by space:  0.50500 0.28117
Enter the cosine similarities for pair 3 (image, review) separated by space:  0.39874 0.24180


Rank 1: image1_review1 - Composite Score: 0.5145600000000001
Rank 2: image2_review2 - Composite Score: 0.314468
Rank 3: image3_review3 - Composite Score: 0.256216
