<a href="https://colab.research.google.com/github/okayrahul/CSE508_Winter2024_A2_2021083/blob/main/IR_Assignment2_2021083.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive; every dataset and pickle path used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


PART 1: Image feature extraction (ResNet50)

In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import numpy as np
from skimage import io
from skimage.transform import resize
from skimage import exposure
import ast  # Import this to handle the string representation of lists
import pandas as pd
import pickle  # Import pickle for saving the features

# Build the feature extractor: load ResNet50 with ImageNet weights and expose
# the output of its 'avg_pool' layer instead of the final classification layer.
base_model = ResNet50(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

# Location of the assignment CSV on the mounted Drive; adjust to your own folder.
file_path = '/content/drive/My Drive/IR ASSIGNMENT 2/A2_Data.csv'

# Load the dataset (the 'Image' column is read by the extraction step below).
df = pd.read_csv(file_path)

# Function to preprocess and extract features from a single image URL
def preprocess_and_extract_features(img_list_as_string):
    """Extract an L2-normalized ResNet50 feature vector for a row's first image.

    Args:
        img_list_as_string: String containing a Python-literal list of image
            URLs (parsed with ``ast.literal_eval``). Only the first URL is used.

    Returns:
        A flattened NumPy array holding the ``avg_pool`` activations of the
        module-level ``model``, scaled to unit L2 norm. A zero vector of
        length 2048 is returned when the list is empty or when any step
        (parsing, download, preprocessing, inference) fails; failures are
        reported with ``print`` and never raised to the caller.
    """
    try:
        # Convert the string representation of a list into an actual list
        img_list = ast.literal_eval(img_list_as_string)

        # Nothing to download for an empty list
        if not img_list:
            return np.zeros((2048,))
        img_url = img_list[0]

        # Load the image from the URL
        img = io.imread(img_url)

        # Resize to the 224x224 input used here, then enhance contrast (CLAHE)
        img = resize(img, (224, 224), anti_aliasing=True)
        img = exposure.equalize_adapthist(img)
        img = np.array(img) * 255  # Scale the image back to 0-255 range
        img = img.astype(np.uint8)

        # Force a 3-channel layout. A grayscale image can arrive as a plain
        # 2-D array (no channel axis at all); indexing shape[2] on it would
        # raise IndexError and the image would be silently replaced by zeros.
        if img.ndim == 2 or img.shape[2] == 1:
            img = np.dstack([img, img, img])
        elif img.shape[2] == 4:
            img = img[:, :, :3]  # drop the alpha channel

        img = image.img_to_array(img)
        img = np.expand_dims(img, axis=0)  # add the batch axis
        img = preprocess_input(img)

        # Extract features
        features = model.predict(img)

        # Normalize to unit length; skip the division for an all-zero vector
        # so the result stays zeros instead of becoming NaN.
        magnitude = np.linalg.norm(features)
        if magnitude > 0:
            features = features / magnitude

        return features.flatten()
    except Exception as e:
        # Best-effort by design: report the failure and fall back to zeros so
        # one bad URL does not abort the whole extraction run.
        print(f"Error processing image: {e}")
        return np.zeros((2048,))

# Extract one feature vector per dataset row (only the first URL listed in
# each row's 'Image' cell is used; failed rows become zero vectors).
features = df['Image'].apply(preprocess_and_extract_features)

# Destination of the pickled feature list on the mounted Drive
features_filepath = '/content/drive/My Drive/IR ASSIGNMENT 2/extracted_features.pkl'

# Persist as a plain list so list position matches the row order of df
with open(features_filepath, 'wb') as file:
    pickle.dump(features.tolist(), file)

print(f"Feature extraction completed and saved to {features_filepath}.")



Error processing image: HTTP Error 404: Not Found
Error processing image: HTTP Error 404: Not Found
Error processing image: HTTP Error 404: Not Found
Error processing image: HTTP Error 404: Not Found
Error processing image: HTTP Error 404: Not Found
Error processing image: HTTP Error 404: Not Found
Feature extraction completed and saved to /content/drive/My Drive/IR ASSIGNMENT 2/extracted_features.pkl.


PART 2: Review text preprocessing and TF-IDF

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math
import nltk
import pickle

# Download the NLTK resources used by preprocess_text below
# (word_tokenize, the English stopword list, and the WordNet lemmatizer).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Location of the assignment CSV on the mounted Drive; adjust to your own folder.
file_path = '/content/drive/My Drive/IR ASSIGNMENT 2/A2_Data.csv'

# Load the dataset (the 'Review Text' column is processed below).
df = pd.read_csv(file_path)


def preprocess_text(text):
    """Turn a raw review into a list of normalized tokens.

    Non-string input (for example a missing value) yields an empty list.
    Strings are lowercased, stripped of every character that is neither a
    word character nor whitespace, tokenized, filtered against NLTK's English
    stopword list, and finally each token is Porter-stemmed and the stem is
    passed through the WordNet lemmatizer.
    """
    if isinstance(text, str):
        # Lowercase and drop punctuation in one pass
        normalized = re.sub(r'[^\w\s]', '', text.lower())
        raw_tokens = word_tokenize(normalized)

        english_stopwords = set(stopwords.words('english'))
        porter = PorterStemmer()
        wordnet = WordNetLemmatizer()

        # Stem first, then lemmatize the stem, skipping stopwords
        return [
            wordnet.lemmatize(porter.stem(token))
            for token in raw_tokens
            if token not in english_stopwords
        ]

    # Anything that is not a string has no tokens
    return []

# Tokenize every review into a new 'Processed_Review' column of token lists;
# non-string entries become empty lists (see preprocess_text).
df['Processed_Review'] = df['Review Text'].apply(preprocess_text)

# Function to calculate TF
def compute_tf(text):
    """Return the relative frequency of each token in a tokenized document.

    ``text`` is a sequence of tokens. The result maps every distinct token to
    its occurrence count divided by the total number of tokens; an empty
    sequence produces an empty dict.
    """
    occurrences = {}
    for token in text:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1

    doc_length = len(text)
    return {token: seen / doc_length for token, seen in occurrences.items()}

# Function to calculate IDF
def compute_idf(documents):
    """Compute the inverse document frequency of every term in a corpus.

    ``documents`` is a sequence of token lists. For each term the result holds
    ``log(N / df)``, where ``N`` is the number of documents and ``df`` is the
    number of documents containing the term at least once.
    """
    corpus_size = len(documents)

    # Count each term once per document it appears in
    doc_frequency = {}
    for tokens in documents:
        for term in set(tokens):
            doc_frequency.setdefault(term, 0)
            doc_frequency[term] += 1

    return {
        term: math.log(corpus_size / containing_docs)
        for term, containing_docs in doc_frequency.items()
    }

# Per-review term frequencies: one {token: relative frequency} dict per row
tf_scores = df['Processed_Review'].apply(compute_tf)

# Corpus-wide inverse document frequencies computed over all processed reviews
idfs = compute_idf(df['Processed_Review'].tolist())

# Calculate TF-IDF for each document
def compute_tfidf(tf, idfs):
    """Weight a document's term frequencies by corpus IDF values.

    ``tf`` maps term -> term frequency for one document and ``idfs`` maps
    term -> IDF. Terms missing from ``idfs`` are weighted by 0. Returns a new
    dict with the same terms as ``tf``.
    """
    weighted = {}
    for term in tf:
        weighted[term] = tf.get(term, 0) * idfs.get(term, 0)
    return weighted

# One TF-IDF dict per review, in the same order as the rows of df
tfidf_scores = [compute_tfidf(tf, idfs) for tf in tf_scores]

# Destination of the pickled TF-IDF list on the mounted Drive
tfidf_scores_filename = '/content/drive/My Drive/IR ASSIGNMENT 2/tfidf_scores.pkl'

# Save the TF-IDF scores to a file
with open(tfidf_scores_filename, 'wb') as file:
    pickle.dump(tfidf_scores, file)

print(f"TF-IDF scores saved to {tfidf_scores_filename}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF scores saved to /content/drive/My Drive/IR ASSIGNMENT 2/tfidf_scores.pkl


PART 3: Image and review retrieval (cosine similarity)

In [25]:
import numpy as np
import pandas as pd
import pickle
from numpy.linalg import norm

# Inputs: the original CSV plus the two pickles written by Parts 1 and 2
dataset_path = '/content/drive/My Drive/IR ASSIGNMENT 2/A2_Data.csv'
image_features_path = '/content/drive/My Drive/IR ASSIGNMENT 2/extracted_features.pkl'
tfidf_scores_filename = '/content/drive/My Drive/IR ASSIGNMENT 2/tfidf_scores.pkl'

# Load your dataset and preprocessed data
df = pd.read_csv(dataset_path)

# List of per-row image feature vectors (saved via features.tolist() in Part 1)
with open(image_features_path, 'rb') as f:
    image_features = pickle.load(f)

# List of per-row {term: tf-idf} dicts (saved in Part 2)
with open(tfidf_scores_filename, 'rb') as f:
    tfidf_data = pickle.load(f)

# Cosine Similarity for numpy arrays (image features), with zero division check
def cosine_similarity_np(arr_a, arr_b):
    """Cosine similarity of two NumPy vectors.

    Returns the dot product divided by the product of the two norms, or 0
    when either vector has zero norm (so no division by zero occurs).
    """
    magnitude_a, magnitude_b = norm(arr_a), norm(arr_b)

    # Only divide when both vectors have a non-zero length
    if not (magnitude_a != 0 and magnitude_b != 0) and (magnitude_a == 0 or magnitude_b == 0):
        return 0

    return np.dot(arr_a, arr_b) / (magnitude_a * magnitude_b)

# Cosine Similarity for dictionaries (TF-IDF vectors)
def cosine_similarity_dict(vec_a, vec_b):
    """Cosine similarity of two sparse vectors stored as {term: weight} dicts.

    Only terms present in both dicts contribute to the dot product; each
    norm is taken over all of that dict's values. Returns 0.0 when either
    vector has zero length.
    """
    shared_terms = set(vec_a.keys()) & set(vec_b.keys())
    dot_product = sum(vec_a[term] * vec_b[term] for term in shared_terms)

    squared_len_a = sum(weight ** 2 for weight in vec_a.values())
    squared_len_b = sum(weight ** 2 for weight in vec_b.values())
    magnitude = np.sqrt(squared_len_a) * np.sqrt(squared_len_b)

    if magnitude == 0:
        # At least one vector is empty or all zeros
        return 0.0
    return float(dot_product) / magnitude

# Find Similar Images
def find_similar_images(input_feature, image_features, top_k=3):
    """Return the ``top_k`` dataset images most similar to ``input_feature``.

    Every vector in ``image_features`` is scored against ``input_feature``
    with ``cosine_similarity_np``. The position of a vector in
    ``image_features`` is used as the row position in the module-level ``df``
    to look up that row's 'Image' value.

    Returns:
        List of ``(Image cell value, similarity)`` tuples, best match first.
    """
    scored = []
    for row_pos, candidate in enumerate(image_features):
        scored.append((row_pos, cosine_similarity_np(input_feature, candidate)))

    # Highest similarity first; ties keep their original row order
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    best = ranked[:top_k]
    return [(df.iloc[row_pos]['Image'], similarity) for row_pos, similarity in best]

# Find Similar Reviews
def find_similar_reviews(input_tfidf, tfidf_data, top_k=3):
    """Return the ``top_k`` dataset reviews most similar to ``input_tfidf``.

    Every TF-IDF dict in ``tfidf_data`` is scored against ``input_tfidf``
    with ``cosine_similarity_dict``. The position of a dict in ``tfidf_data``
    is used as the row position in the module-level ``df`` to look up that
    row's 'Review Text' value.

    Returns:
        List of ``(Review Text, similarity)`` tuples, best match first.
    """
    scored = [
        (row_pos, cosine_similarity_dict(input_tfidf, candidate))
        for row_pos, candidate in enumerate(tfidf_data)
    ]

    # Highest similarity first; ties keep their original row order
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    best = ranked[:top_k]
    return [(df.iloc[row_pos]['Review Text'], similarity) for row_pos, similarity in best]

# Example query: reuse the precomputed features / TF-IDF of dataset row 0.
# Because the query is itself part of the dataset, row 0 also appears among
# the candidates being scored.
input_image_feature = image_features[0]
input_review_tfidf = tfidf_data[0]

# Rank images and reviews independently of each other
similar_images_data = find_similar_images(input_image_feature, image_features)
similar_reviews_data = find_similar_reviews(input_review_tfidf, tfidf_data)

# Save both ranked lists next to the dataset CSV (same Drive folder)
with open(f'{dataset_path.rsplit("/", 1)[0]}/similar_images_data.pkl', 'wb') as f:
    pickle.dump(similar_images_data, f)

with open(f'{dataset_path.rsplit("/", 1)[0]}/similar_reviews_data.pkl', 'wb') as f:
    pickle.dump(similar_reviews_data, f)


PART 4: Combined retrieval (composite similarity ranking)


In [26]:
import pickle

# Ranked retrieval results written by Part 3
similar_images_data_path = "/content/drive/My Drive/IR ASSIGNMENT 2/similar_images_data.pkl"
similar_reviews_data_path = "/content/drive/My Drive/IR ASSIGNMENT 2/similar_reviews_data.pkl"

# Load the saved data for images and reviews
with open(similar_images_data_path, 'rb') as f:
    similar_images_data = pickle.load(f)  # List of (Image cell value, similarity score)

with open(similar_reviews_data_path, 'rb') as f:
    similar_reviews_data = pickle.load(f)  # List of (Review Text, similarity score)

# Number of leading entries from each list to combine
top_k = 3

# Calculate composite scores for the top-k pairs
def calculate_composite_scores(image_data, review_data, top_k=3):
    """Average image and review similarities for the leading result pairs.

    The i-th entry of ``image_data`` is paired with the i-th entry of
    ``review_data`` purely by rank position; the two entries are not checked
    to belong to the same dataset row.

    Args:
        image_data: Sequence of ``(image identifier, similarity)`` tuples.
        review_data: Sequence of ``(review identifier, similarity)`` tuples.
        top_k: Maximum number of leading pairs to combine.

    Returns:
        List of ``(composite_score, image identifier, review identifier)``
        tuples in input order. Contains fewer than ``top_k`` entries when
        either input is shorter than ``top_k`` (instead of raising
        ``IndexError``).
    """
    # A non-positive top_k selects nothing, matching range(top_k) semantics
    limit = max(top_k, 0)

    composite_scores = []
    # zip stops at the shorter input, so short result lists are handled safely
    for (image_id, image_similarity), (review_id, review_similarity) in zip(
        image_data[:limit], review_data[:limit]
    ):
        composite_score = (image_similarity + review_similarity) / 2
        composite_scores.append((composite_score, image_id, review_id))
    return composite_scores

composite_scores = calculate_composite_scores(similar_images_data, similar_reviews_data, top_k)

# Rank the pairs based on the composite score (highest first)
ranked_pairs = sorted(composite_scores, reverse=True, key=lambda x: x[0])

# Display the ranked results.
# The values printed as "Image ID" / "Review ID" are the raw 'Image' cell
# value and the review text stored in the pickles, not numeric identifiers.
print("Ranked Combined Retrieval Results:")
for rank, (comp_score, img_id, rev_id) in enumerate(ranked_pairs, start=1):
    print(f"Rank: {rank}, Image ID: {img_id}, Review ID: {rev_id}, Composite Score: {comp_score:.4f}")



Ranked Combined Retrieval Results:
Rank: 1, Image ID: ['https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg'], Review ID: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go., Composite Score: 1.0000
Rank: 2, Image ID: ['https://images-na.ssl-images-amazon.com/images/I/71edIGOwydL._SY88.jpg'], Review ID: Nice solid springs and defeinitely more silent. Easy installation and the black looks cool.

Pictured with some old uninstalled springs next to them., Composite Score: 0.5370
Rank: 3, Image ID: ['https://images-na.ssl-images-amazon.com/images/I/71-PDjqCBcL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71bxnwWGCYL._SY88.jpg'], Review ID: Fits great but the only complaint I have is that the tremolo springs are cheap quality and so is the block that attaches to the bridge. I took it off and put my own on. When you

PART 5: Results display for a sample query


In [6]:
import pandas as pd
import pickle

# File paths: the dataset CSV and the ranked results pickled by Part 3
dataset_path = "/content/drive/My Drive/IR ASSIGNMENT 2/A2_Data.csv"
similar_images_data_path = "/content/drive/My Drive/IR ASSIGNMENT 2/similar_images_data.pkl"
similar_reviews_data_path = "/content/drive/My Drive/IR ASSIGNMENT 2/similar_reviews_data.pkl"

# Load dataset and pre-saved similarity data
df = pd.read_csv(dataset_path)
with open(similar_images_data_path, 'rb') as f:
    similar_images_data = pickle.load(f)  # Format: [(Image, image_similarity_score)]
with open(similar_reviews_data_path, 'rb') as f:
    similar_reviews_data = pickle.load(f)  # Format: [(Review Text, review_similarity_score)]

# User inputs.
# NOTE(review): these are passed to find_top_similar_items below, which does
# not read them; the displayed results come from the pre-saved pickles.
user_input_image_url = "https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg"
user_input_review = "I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break."

def find_top_similar_items(user_input, saved_data, data_type="review", top_k=3):
    """Return the highest-scoring entries of a precomputed similarity list.

    Args:
        user_input: Accepted for interface compatibility; currently NOT used.
            No similarity is computed against it.
        saved_data: Iterable of ``(item, similarity_score)`` tuples.
        data_type: Accepted for interface compatibility; currently NOT used.
        top_k: Maximum number of entries to return (defaults to 3, the value
            that was previously hard-coded).

    Returns:
        A new list with up to ``top_k`` tuples from ``saved_data``, sorted by
        similarity score in descending order. Ties keep their input order.
    """
    # Similarities are precomputed; this only re-sorts and truncates them.
    ranked = sorted(saved_data, key=lambda pair: pair[1], reverse=True)
    # Clamp so a negative top_k yields nothing instead of slicing from the end
    return ranked[:max(top_k, 0)]

# Take the top entries of each pre-saved ranking (the user inputs passed here
# are not used by find_top_similar_items).
top_similar_images = find_top_similar_items(user_input_image_url, similar_images_data, "image")
top_similar_reviews = find_top_similar_items(user_input_review, similar_reviews_data, "review")

# Display results
print("USING IMAGE RETRIEVAL")
for i, (item, score) in enumerate(top_similar_images, start=1):
    print(f"{i}) Image URL: {item}")  # item is the raw 'Image' cell value stored in the pickle
    # Look up the review of the first row whose 'Image' cell equals item;
    # .iloc[0] raises IndexError if no row matches.
    print(f"Review: {df.loc[df['Image'] == item, 'Review Text'].iloc[0]}")  # Match and fetch corresponding review
    print(f"Cosine similarity of images - {score:.4f}")
    # NOTE(review): the text score is taken from the review at the same rank
    # position, which is not necessarily the review printed above.
    print(f"Cosine similarity of text - {top_similar_reviews[i-1][1]:.4f}")  # Assuming parallel structure for simplicity
    print(f"Composite similarity score: {(score + top_similar_reviews[i-1][1]) / 2:.4f}\n")

print("—----------------------------------------------------------------------------------------------")

print("USING TEXT RETRIEVAL")
for i, (review, score) in enumerate(top_similar_reviews, start=1):
    # Look up the image of the first row whose 'Review Text' equals review;
    # .iloc[0] raises IndexError if no row matches.
    print(f"{i}) Image URL: {df.loc[df['Review Text'] == review, 'Image'].iloc[0]}")  # Fetch corresponding image URL
    print(f"Review: {review}")  # Directly the review text
    # NOTE(review): the image score is taken from the image at the same rank
    # position, which is not necessarily the image printed above.
    print(f"Cosine similarity of images - {top_similar_images[i-1][1]:.4f}")  # Assuming parallel structure for simplicity
    print(f"Cosine similarity of text - {score:.4f}")
    print(f"Composite similarity score: {(score + top_similar_images[i-1][1]) / 2:.4f}\n")


USING IMAGE RETRIEVAL
1) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg']
Review: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Cosine similarity of images - 1.0000
Cosine similarity of text - 1.0000
Composite similarity score: 1.0000

2) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/71edIGOwydL._SY88.jpg']
Review: i was expecting extreme brutality but got a pleasant surprise. this pup is super clean no. 1 and distortion is very smooth not at all what expected its warm in tone but has some bite to it. overall i really like this pup. to be honest I was expecting this pickup to be really mid focused but it has a tight punch in the low and the highs are not over bearing either. great pickup would get again!
Cosine similarity of images - 0.7469
Cosine similarity of text - 0.3271
Composite s