<a href="https://colab.research.google.com/github/ommeh404/CSE508_Winter2024_A2_2021404/blob/main/IR_A2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***ANSWER 1***

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from PIL import Image
import requests
from io import BytesIO
from sklearn.preprocessing import normalize
import ast
import pickle

# Load dataset
data_path = '/content/drive/MyDrive/IR_2/A2_Data.csv'
df = pd.read_csv(data_path)

# Clean the 'Image' column to ensure it contains usable lists of URLs
df['Image'] = df['Image'].apply(lambda x: ast.literal_eval(x))

# Initialize ResNet50 model
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Function to extract features from an image URL
def extract_features(img_url):
    try:
        response = requests.get(img_url, timeout=10)
        img = Image.open(BytesIO(response.content))
        img = img.resize((224, 224))
        img_array = image.img_to_array(img)
        img_array_expanded = np.expand_dims(img_array, axis=0)
        img_preprocessed = preprocess_input(img_array_expanded)
        features = model.predict(img_preprocessed)
        return features.flatten()
    except Exception as e:
        print(f"Error processing image {img_url}: {e}")
        return np.zeros((model.output_shape[1],))  # Return zero vector if error

# Function to extract and normalize features for a list of image URLs
def extract_and_normalize_features(urls):
    all_features = [extract_features(url) for url in urls]
    if len(all_features) > 0:
        all_features = np.array(all_features)
        all_features = normalize(all_features, axis=1, norm='l2')
        return np.mean(all_features, axis=0)
    else:
        return np.zeros((model.output_shape[1],))

# Preprocess and vectorize the text data
df['Review Text'] = df['Review Text'].fillna('').apply(lambda x: x.lower())  # Lower-casing
vectorizer = TfidfVectorizer(stop_words='english')  # Considering stop words removal
tfidf_matrix = vectorizer.fit_transform(df['Review Text'])

# Serialize the TF-IDF vectorizer and the matrix for later use
with open('/content/drive/MyDrive/IR_2/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('/content/drive/MyDrive/IR_2/tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Example of processing the first row's images
urls = df['Image'].iloc[0]
features = extract_and_normalize_features(urls)
print(features)

# Serialize extracted features for the first row as an example
with open('/content/drive/MyDrive/IR_2/image_features_example.pkl', 'wb') as f:
    pickle.dump(features, f)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[0.         0.09461115 0.         ... 0.00275598 0.0181075  0.09889498]


***ANSWER 2***

In [2]:
import ast
import numpy as np
import pandas as pd
import pickle
from PIL import Image
from io import BytesIO
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
import string

# Initialize NLTK lemmatizer and download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Function for preprocessing text
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in string.punctuation]  # Removing punctuations
    stop_words = set(stopwords.words('english'))  # Stop word removal
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return ' '.join(tokens)

# Load dataset
data_path = '/content/drive/MyDrive/IR_2/A2_Data.csv'
df = pd.read_csv(data_path)

# Preprocess text data
df['Processed Review'] = df['Review Text'].fillna('').apply(preprocess_text)

# Calculate TF-IDF scores
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Processed Review'])

# Initialize ResNet50 model for image feature extraction
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Function to extract features from an image URL
def extract_features(img_url):
    try:
        response = requests.get(img_url, timeout=10)
        img = Image.open(BytesIO(response.content))
        img = img.resize((224, 224))
        img_array = image.img_to_array(img)
        img_array_expanded = np.expand_dims(img_array, axis=0)
        img_preprocessed = preprocess_input(img_array_expanded)
        features = model.predict(img_preprocessed)
        return features.flatten()
    except Exception as e:
        print(f"Error processing image {img_url}: {e}")
        return np.zeros((2048,))

# Adjusted function for handling multiple URLs in 'Image' column
def extract_and_aggregate_features(urls):
    # Ensure URLs are in list format
    urls = ast.literal_eval(urls) if isinstance(urls, str) else urls
    features_list = [extract_features(url) for url in urls if url]
    if features_list:
        features_array = np.array(features_list)
        aggregated_features = np.mean(features_array, axis=0)
        return aggregated_features
    else:
        return np.zeros((2048,))

# Apply the adjusted function to each row in the dataframe
df['Image Features'] = df['Image'].apply(lambda x: extract_and_aggregate_features(x) if pd.notna(x) else np.zeros((2048,)))

# Save processed data and features using pickle
with open('/content/drive/MyDrive/IR_2/processed_reviews.pkl', 'wb') as f:
    pickle.dump(df['Processed Review'].tolist(), f)

with open('/content/drive/MyDrive/IR_2/tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

with open('/content/drive/MyDrive/IR_2/image_features.pkl', 'wb') as f:
    pickle.dump(df['Image Features'].tolist(), f)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Error processing image https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x78439cd21b20>
Error processing image https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x78439cd79c10>
Error processing image https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x78439cd4bc40>
Error processing image https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7843a421bf60>
Error processing image https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7843a665a6b0>
Error processing image https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg: cannot identify image file <_io.BytesIO object at 0x7843a6519440>
Error processing image https://ima

***ANSWER 3***

In [3]:
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load dataset for result presentation
data_path = '/content/drive/MyDrive/IR_2/A2_Data.csv'
df = pd.read_csv(data_path)

# Load the precomputed image features and TF-IDF vectors
with open('/content/drive/MyDrive/IR_2/image_features.pkl', 'rb') as f:
    image_features = pickle.load(f)
with open('/content/drive/MyDrive/IR_2/tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)

# Assuming the first entry is the query
input_image_features = np.array(image_features[0]).reshape(1, -1)  # Ensure compatibility with cosine_similarity
input_text_features = tfidf_matrix[0:1]  # Reshape for compatibility

# Calculate cosine similarity for the input image and text
image_similarities = cosine_similarity(input_image_features, np.array(image_features))
text_similarities = cosine_similarity(input_text_features, tfidf_matrix)

# Get the indices of the top 3 similar images, excluding the first one (itself)
top_image_indices = np.argsort(image_similarities[0])[::-1][1:4]

# Get the indices of the top 3 similar texts, excluding the first one (itself)
top_text_indices = np.argsort(text_similarities[0])[::-1][1:4]

# Presenting the results with similarity scores
print("Top 3 similar images:")
for index in top_image_indices:
    similarity_score = image_similarities[0][index]
    print(f"Index: {index}, Similarity Score: {similarity_score:.4f}")

print("\nTop 3 similar reviews:")
for index in top_text_indices:
    similarity_score = text_similarities[0][index]
    print(f"Index: {index}, Similarity Score: {similarity_score:.4f}")

# Optionally, save these results using pickle for future analysis
with open('/content/drive/MyDrive/IR_2/similar_images_indices.pkl', 'wb') as f:
    pickle.dump(top_image_indices, f)
with open('/content/drive/MyDrive/IR_2/similar_reviews_indices.pkl', 'wb') as f:
    pickle.dump(top_text_indices, f)

# For detailed presentation: displaying the image URLs and the corresponding reviews
for index in top_image_indices:
    print(f"\nImage Index: {index}")
    print(f"Image URL(s): {df.loc[index, 'Image']}")
    print(f"Review: {df.loc[index, 'Review Text']}")


Top 3 similar images:
Index: 853, Similarity Score: 0.7401
Index: 244, Similarity Score: 0.7392
Index: 62, Similarity Score: 0.7380

Top 3 similar reviews:
Index: 271, Similarity Score: 0.2898
Index: 390, Similarity Score: 0.2477
Index: 805, Similarity Score: 0.2165

Image Index: 853
Image URL(s): ['https://images-na.ssl-images-amazon.com/images/I/71mRyDr3LuL._SY88.jpg']
Review: This is a great Les Paul like guitar for the novice player. Very minimum setup was needed...6th string buzzed a little. Everyone loves the look. Fun to play at a very affordable price.

Image Index: 244
Image URL(s): ['https://images-na.ssl-images-amazon.com/images/I/41wkxBIHFHL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41GMv9v5xBL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41ho924kuQL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41JzN+hpYlL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41pni7hkHEL._SY88.jpg', 'https://images-na.ssl-images-

***ANSWER 4***

In [4]:
import pickle
import numpy as np
import pandas as pd

# Assuming the previous steps have been executed

# Calculate composite similarity scores
composite_similarities = (image_similarities[0] + text_similarities[0]) / 2

# Rank the pairs based on the composite similarity score, excluding the first one (itself)
composite_indices = np.argsort(composite_similarities)[::-1][1:]  # Start from 1 to exclude the input itself

# Print top 3 overall similar items based on composite score
print("Top 3 overall similar items indices:", composite_indices[:3])

# Enhanced presentation
for index in composite_indices[:3]:
    similarity_score = composite_similarities[index]
    print(f"Index: {index}, Composite Similarity Score: {similarity_score:.4f}")

# Optionally, load dataset for detailed presentation
data_path = '/content/drive/MyDrive/IR_2/A2_Data.csv'
df = pd.read_csv(data_path)

# Display detailed information for the top similar items
for index in composite_indices[:3]:
    print(f"\nIndex: {index}")
    print(f"Image URL(s): {df.loc[index, 'Image']}")
    print(f"Review: {df.loc[index, 'Review Text']}")

# Optionally, save these ranked indices using pickle for later use
with open('/content/drive/MyDrive/IR_2/composite_similar_items_indices.pkl', 'wb') as f:
    pickle.dump(composite_indices[:3], f)


Top 3 overall similar items indices: [390 750 244]
Index: 390, Composite Similarity Score: 0.4489
Index: 750, Composite Similarity Score: 0.4113
Index: 244, Composite Similarity Score: 0.4081

Index: 390
Image URL(s): ['https://images-na.ssl-images-amazon.com/images/I/71YEX7X28kL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71SUXEwHegL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71UMds34gxL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71t4gm+RcYL._SY88.jpg']
Review: All I can say is I'm loving it.

Index: 750
Image URL(s): ['https://images-na.ssl-images-amazon.com/images/I/81U3GJsTjNL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71TDWb-prbL._SY88.jpg']
Review: Great Quality, adjustable tension. Well made.

Index: 244
Image URL(s): ['https://images-na.ssl-images-amazon.com/images/I/41wkxBIHFHL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/41GMv9v5xBL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/

***ANSWER 5***

In [5]:
# Assuming `composite_indices` contains indices of the items sorted by their composite similarity score
# And `df` is the DataFrame containing the dataset

# Display top 3 overall similar items with improved formatting and potential handling for missing data
for rank, index in enumerate(composite_indices[:3], start=1):
    image_url = df.loc[index, 'Image'] if pd.notna(df.loc[index, 'Image']) else "No image available"
    review_text = df.loc[index, 'Review Text'] if pd.notna(df.loc[index, 'Review Text']) else "No review text available"
    print(f"Rank: {rank}")
    print(f"Image URL: {image_url}")
    print(f"Review Text: {review_text}")
    print(f"Image Similarity Score: {image_similarities[0][index]:.4f}")
    print(f"Text Similarity Score: {text_similarities[0][index]:.4f}")
    print(f"Composite Similarity Score: {composite_similarities[index]:.4f}")
    print("----------------------------------------------------")


Rank: 1
Image URL: ['https://images-na.ssl-images-amazon.com/images/I/71YEX7X28kL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71SUXEwHegL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71UMds34gxL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71t4gm+RcYL._SY88.jpg']
Review Text: All I can say is I'm loving it.
Image Similarity Score: 0.6501
Text Similarity Score: 0.2477
Composite Similarity Score: 0.4489
----------------------------------------------------
Rank: 2
Image URL: ['https://images-na.ssl-images-amazon.com/images/I/81U3GJsTjNL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71TDWb-prbL._SY88.jpg']
Review Text: Great Quality, adjustable tension. Well made.
Image Similarity Score: 0.6459
Text Similarity Score: 0.1767
Composite Similarity Score: 0.4113
----------------------------------------------------
Rank: 3
Image URL: ['https://images-na.ssl-images-amazon.com/images/I/41wkxBIHFHL._SY88.jpg', 'https://images-na.s

***ANSWER 6***

In [7]:
# Prompt user for input image URL and review text
input_image_url = input("Enter the image URL: ")
input_review_text = input("Enter the review text: ")

# Preprocess input review text and extract features from input image URL
input_image_features = extract_features(input_image_url).reshape(1, -1)  # Ensure the function is defined correctly
input_text_features = vectorizer.transform([preprocess_text(input_review_text)])  # Ensure the function is defined correctly

# Calculate cosine similarity for the input image and text
image_similarities = cosine_similarity(input_image_features, np.array(image_features))
text_similarities = cosine_similarity(input_text_features, tfidf_matrix)

# Calculate composite similarity scores for each item
composite_similarities = (image_similarities[0] + text_similarities[0]) / 2

# Function to format and print the results for both image and text retrieval
def format_and_print_results(df, image_similarities, text_similarities, composite_similarities, type="IMAGE"):
    print(f"\nUSING {type} RETRIEVAL")
    top_indices = np.argsort(composite_similarities)[::-1][1:4]  # Top 3 indices excluding the query itself
    for index in top_indices:
        image_url = df.iloc[index]['Image']
        review = df.iloc[index]['Review Text']
        image_sim = image_similarities[0][index]
        text_sim = text_similarities[0][index]
        composite_sim = composite_similarities[index]
        print(f"\nImage URL: {image_url}")
        print(f"Review: {review}")
        print(f"Cosine similarity of images - {image_sim:.4f}")
        print(f"Cosine similarity of text - {text_sim:.4f}")
        print(f"Composite similarity score: {composite_sim:.4f}")
    print("—--------------------------------------------------------------------------------------------")

# Display top similar items based on composite similarity scores for Image Retrieval
format_and_print_results(df, image_similarities, text_similarities, composite_similarities, type="IMAGE")

# Display top similar items based on composite similarity scores for Text Retrieval
format_and_print_results(df, image_similarities, text_similarities, composite_similarities, type="TEXT")


Enter the image URL: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Enter the review text: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.

USING IMAGE RETRIEVAL

Image URL: ['https://images-na.ssl-images-amazon.com/images/I/719-SDMiOoL._SY88.jpg']
Review: These locking tuners look great and keep tune.  Good quality materials and construction.  Excellent upgrade to any guitar.  I had to drill additions holes for installation.  If your neck already comes with pre-drilled holes, then they should drop right in, otherwise you will need to buy a guitar tuner pin drill jig, also available from Amazon.
Cosine similarity of images - 0.8370
Cosine similarity of text - 0.1092
Composite similarity score: 0.4731

Image URL: ['https://images-na.ssl-images-amazon.com/images/I/61clqkZnKxL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/