In [115]:
import pickle
import numpy as np
import pandas as pd
import requests
from PIL import Image, ImageEnhance
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from sklearn.preprocessing import normalize
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
import re
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
import math
from io import BytesIO
import string
import random
import os

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Neev
[nltk_data]     vero\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [116]:
df = pd.read_csv('A2_data.csv')
images = df["Image"]
text = df["Review Text"]
index = df["Unnamed: 0"]
vgg_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

In [120]:
# Define the preprocess_image function
def preprocess_image(image_url):
    response = requests.get(image_url)
    if response.status_code == 200:
        img = Image.open(BytesIO(response.content))
        img = img.resize((224, 224))  # Resize image to match VGG16 input size
        img = img.convert('RGB')  # Convert image to RGB mode (in case of transparency)
        img = np.array(img)  # Convert image to numpy array
        img = preprocess_input(img)  # Preprocess the image for VGG16
        return img
    else:
        print(f"Failed to fetch image from URL: {image_url}")
        return None

# Function to preprocess and extract features from images
def extract_image_features(image_url):
    response = requests.get(image_url)
    if response.status_code != 200:
        return None
    image = Image.open(BytesIO(response.content))
    image = preprocess_image(image)
    image = image.resize((224, 224))  # Resize image to VGG16 input size
    image_array = np.array(image)
    image_array = np.expand_dims(image_array, axis=0)
    image_array = preprocess_input(image_array)
    features = vgg_model.predict(image_array)
    features = features.flatten()  # Flatten features
    return features

# Apply preprocess_image function to each row in the DataFrame
preprocessed_images = []
for index, row in df.iterrows():
    image_urls = eval(row['Image'])  # Convert string representation of list to actual list
    for image_url in image_urls:
        preprocessed_image = preprocess_image(image_url)
        if preprocessed_image is not None:
            preprocessed_images.append(preprocessed_image)

# Save preprocessed images using pickle
with open('preprocessed_images.pickle', 'wb') as f:
    pickle.dump(preprocessed_images, f)

# Load preprocessed images
with open('preprocessed_images.pickle', 'rb') as f:
    preprocessed_images_loaded = pickle.load(f)

print("Preprocessed images saved successfully.")

Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/710a2Pyh5lL._SY88.jpg
Failed to fetch image from URL: https://images-na.ssl-images-amazon.com/images/I/816NMd0LexL._SY88.jpg
Preprocessed images saved successfully.


In [121]:
# Function to preprocess text
def preprocess_text(text):
    # Check if text is NaN, and replace with empty string if NaN
    if pd.isna(text):
        return ''
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    tokens = [token for token in tokens if token not in stopwords]
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [token for token in tokens if token.strip() != '']
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

with open("stopwords.txt", 'r') as stopword_file:
    stopwords = stopword_file.read().splitlines() 
    
# Apply preprocessing to each review in the dataframe
df['Preprocessed_Text'] = df['Review Text'].apply(preprocess_text)

# Function to calculate TF-IDF scores and TF-IDF matrix
def calculate_tf_idf(corpus):
    # Calculate TF for each document
    tf = {}
    for i, doc in enumerate(corpus):
        tf_doc = {}
        for word in doc:
            tf_doc[word] = tf_doc.get(word, 0) + 1
        for word in tf_doc:
            tf_doc[word] = tf_doc[word] / len(doc)
        tf[i] = tf_doc
    
    # Calculate IDF
    idf = {}
    for doc in corpus:
        for word in set(doc):
            idf[word] = idf.get(word, 0) + 1
    
    for word in idf:
        idf[word] = math.log(len(corpus) / idf[word])
    
    # Calculate TF-IDF
    tfidf = {}
    for i, doc in enumerate(corpus):
        tfidf_doc = {}
        for word in doc:
            tfidf_doc[word] = tf[i][word] * idf[word]
        tfidf[i] = tfidf_doc
    
    # Create TF-IDF matrix
    terms = sorted(idf.keys())
    tfidf_matrix = np.zeros((len(corpus), len(terms)))
    for i, doc in enumerate(corpus):
        for j, term in enumerate(terms):
            tfidf_matrix[i, j] = tfidf[i].get(term, 0)
    
    return tfidf, tfidf_matrix

# Calculate TF-IDF scores and TF-IDF matrix for the preprocessed text
tfidf_scores, tfidf_matrix = calculate_tf_idf(df['Preprocessed_Text'])

with open('tfidf_matrix.pickle', 'rb') as f_matrix:
    tfidf_matrix = pickle.load(f_matrix)

# Load TF-IDF scores from pickle file
with open('tfidf_scores.pickle', 'rb') as f_scores:
    tfidf_scores = pickle.load(f_scores)

# Print TF-IDF matrix
print("\nTF-IDF Matrix:")
for doc_num, word_scores in tfidf_scores.items():
    print(f"Item {doc_num + 1}/{len(tfidf_scores)}:")
    print("{")
    for word, score in word_scores.items():
        print(f"    '{word}': {score},")
    print("}")


TF-IDF Matrix:
Item 1/1000:
{
    'love': 0.20714733720306594,
    'vintag': 0.8034767042171945,
    'spring': 0.9923690259853647,
    'strat': 0.3123565645063876,
    'tension': 0.48283137373023016,
    'stabil': 0.48283137373023016,
    'float': 0.5809142990314028,
    'bridg': 0.3381394754365976,
}
Item 2/1000:
{
    'guitar': 0.08391873605472873,
    'bench': 0.4143072065614794,
    'mat': 0.4143072065614794,
    'rug': 0.31403538010972787,
    'abus': 0.3872761993542685,
    'care': 0.4631690765995988,
    'make': 0.2097703442192383,
    'organ': 0.35322115776986907,
    'workspac': 0.4605170185988091,
    'easier': 0.26080153369520975,
    'screw': 0.1763383601293881,
    't': 0.07152963612796878,
    'roll': 0.3218875824868201,
    'color': 0.19216023921646586,
}
Item 3/1000:
{
    'acoust': 0.07383554260028136,
    'bass': 0.06297798576049575,
    'ukulel': 0.0868966366895394,
    'model': 0.08599805745185257,
    'avail': 0.10340014098858571,
    'uke': 0.10163566546111616,
 

In [122]:
tfidf_df = pd.DataFrame.from_dict(tfidf_scores, orient='index')

# Fill NaN values with 0
tfidf_df.fillna(0, inplace=True)

# Convert the DataFrame to a numpy array
tfidf_matrix = tfidf_df.to_numpy()

print(tfidf_matrix)

[[0.20714734 0.8034767  0.99236903 ... 0.         0.         0.        ]
 [0.24370275 0.         0.         ... 0.         0.         0.        ]
 [0.03186882 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [123]:
def cosine_similarity_manual(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    similarity = dot_product / (norm_vector1 * norm_vector2)
    return similarity

def normalize_array(arr):
    min_val = min(arr)
    max_val = max(arr)
    normalized_arr = [(x - min_val) / (max_val - min_val) for x in arr]
    return normalized_arr

In [131]:
# Input image and review
url = input("Enter the image URL: ")
review = input("Enter the review")

# Function to extract features from an image given its URL
def extract_image_features(url):
    # Download the image from the URL
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))

    # Perform feature extraction (example: converting image to grayscale and resizing)
    img_gray = img.convert('L')  # Convert to grayscale
    img_resized = img_gray.resize((64, 64))  # Resize the image
    img_array = np.array(img_resized)  # Convert image to numpy array

    # Flatten the 2D array into a 1D array (vector)
    img_vector = img_array.flatten()

    return img_vector

review = review.replace(' ','')
input_tf_idf = []
for i in range(len(text)):
    temp = str(text[i])
    temp = temp.replace(' ','')
    temp = temp.replace('\n','')

    if review == temp:
        input_tf_idf = tfidf_matrix[i]
        break       

# Validate the URL before processing
if url.startswith(('http://', 'https://')):
    # Process the image features if the URL is valid
    image_features = extract_image_features(url)
    url_extracted_features = normalize_array(image_features)
    print(url_extracted_features)
else:
    print("Invalid URL. Please enter a valid URL starting with 'http://' or 'https://'.")

[0.20588235294117646, 0.20168067226890757, 0.19327731092436976, 0.18907563025210083, 0.18907563025210083, 0.18487394957983194, 0.18487394957983194, 0.18067226890756302, 0.17647058823529413, 0.17647058823529413, 0.18067226890756302, 0.1722689075630252, 0.1638655462184874, 0.07983193277310924, 0.06722689075630252, 0.07983193277310924, 0.07983193277310924, 0.07563025210084033, 0.1092436974789916, 0.06302521008403361, 0.06302521008403361, 0.058823529411764705, 0.058823529411764705, 0.29831932773109243, 0.7521008403361344, 0.8067226890756303, 0.47058823529411764, 0.15126050420168066, 0.23529411764705882, 0.20168067226890757, 0.20168067226890757, 0.20168067226890757, 0.18907563025210083, 0.20168067226890757, 0.21428571428571427, 0.21428571428571427, 0.2689075630252101, 0.5672268907563025, 0.680672268907563, 0.6764705882352942, 0.6764705882352942, 0.6680672268907563, 0.6638655462184874, 0.6512605042016807, 0.6512605042016807, 0.6428571428571429, 0.634453781512605, 0.6302521008403361, 0.638655

In [132]:
import ast

def image_extractor(image):
    # Convert the string to a list using ast.literal_eval
    url_list = ast.literal_eval(image)
    url = url_list[0]
    return url

In [145]:
# Function to extract features from an image given its URL
def extract_image_features(url):
    try:
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img_gray = img.convert('L')  # Convert to grayscale
        img_resized = img_gray.resize((64, 64))  # Resize the image
        img_array = np.array(img_resized)  # Convert image to numpy array
        img_vector = img_array.flatten()
        return img_vector
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

# Function to extract features from a list of image URLs for a single product
def extract_product_image_features(image_urls):
    product_image_features = []
    for url in image_urls.split(', '):
        url.strip("'")
        print(url)
        image_features = extract_image_features(url)
        if image_features is not None:
            product_image_features.append(image_features)
    product_image_features = np.array(product_image_features)
    if len(product_image_features) != 0:    
        product_image_features_normalized = normalize(product_image_features)  # Normalize features
    else:
        return None
    return product_image_features_normalized

extracted_image_features = []
index = {}

if os.path.exists("extracted_features_images.pickle") and os.path.exists("index.pickle"):
    with open('extracted_features_images.pickle', 'rb') as f:
        extracted_image_features = pickle.load(f)
    with open('index.pickle', 'rb') as f:
        index = pickle.load(f)
else:
    for i, url in enumerate(images):
        image_features = extract_image_features(url)
        if image_features is not None:
            extracted_image_features.append(image_features)
            index[i] = url

    with open('extracted_features_images.pickle', 'wb') as f:
        pickle.dump(extracted_image_features, f)
    
    with open('index.pickle', 'wb') as f:
        pickle.dump(index, f)

In [147]:
def image_retrieval(input_image_features, tf_idf_input):
    top_urls = ['', '', '']
    vals_img = [-1, -1, -1]

    top_texts = ['', '', '']
    vals_text = [-1, -1, -1]

    # Adjust initial current_max based on expected range of cosine similarity scores
    current_max = -1  # Initialize with a reasonable value

    for i in range(len(extracted_image_features)):
        if extracted_image_features[i] is None:
            continue

        current_urls = images[i].split(', ')
        for c in range(len(current_urls)):
            current_urls[c].strip('"')

        max_url = ''
        for j in range(len(extracted_image_features[i])):
            cos = cosine_similarity_manual(input_image_features, extracted_image_features[i][j])
            if cos > current_max:
                current_max = cos
                max_url = current_urls[j]

        current_review = cosine_similarity_manual(tfidf_matrix[i], tf_idf_input)

        # Update top URLs and similarity scores if the current values are higher
        if current_max > vals_img[0]:
            vals_img[2], vals_img[1], vals_img[0] = vals_img[1], vals_img[0], current_max
            top_urls[2], top_urls[1], top_urls[0] = top_urls[1], top_urls[0], max_url

            vals_text[2], vals_text[1], vals_text[0] = vals_text[1], vals_text[0], current_review
            top_texts[2], top_texts[1], top_texts[0] = top_texts[1], top_texts[0], text[i]

        elif current_max > vals_img[1]:
            vals_img[2], vals_img[1] = vals_img[1], current_max
            top_urls[2], top_urls[1] = top_urls[1], max_url

            vals_text[2], vals_text[1] = vals_text[1], current_review
            top_texts[2], top_texts[1] = top_texts[1], text[i]

        elif current_max > vals_img[2]:
            vals_img[2] = current_max
            top_urls[2] = max_url

            vals_text[2] = current_review
            top_texts[2] = text[i]

    return top_urls, top_texts, vals_img, vals_text

# Use the image_retrieval function with appropriate input parameters
top_urls, top_texts, vals_img, vals_text = image_retrieval(url_extracted_features, input_tf_idf)

# Print the results
print("USING IMAGE RETRIEVAL\n")

for i in range(3):
    print(f'{i+1}) Image URL: {top_urls[i]}')
    print(f'   Review: {top_texts[i]}')
    print(f'   Cosine similarity of images: {vals_img[i]}')
    print(f'   Cosine similarity of texts: {vals_text[i]}')
    print(f'   Composite similarity score: {(vals_img[i] + vals_text[i]) / 2}\n')


USING IMAGE RETRIEVAL

1) Image URL: 
   Review: 
   Cosine similarity of images: -1
   Cosine similarity of texts: -1
   Composite similarity score: -1.0

2) Image URL: 
   Review: 
   Cosine similarity of images: -1
   Cosine similarity of texts: -1
   Composite similarity score: -1.0

3) Image URL: 
   Review: 
   Cosine similarity of images: -1
   Cosine similarity of texts: -1
   Composite similarity score: -1.0



In [148]:
def text_retrieval(input_image_features,tf_idf_input):
    top_urls = ['','','']
    vals_img = [-1,-1,-1]
    
    top_texts = ['','','']
    vals_text = [-1,-1,-1]
    for i in range(len(tfidf_matrix)):
        if extracted_image_features[i] is None:
            continue
        current_urls = images[i].split(', ')
        for c in range(len(current_urls)):
            current_urls[c].strip('"')
            
        current_max = -10000000
        max_url = ''
        for j in range(len(extracted_image_features[i])):
            cos = cosine_similarity_manual(input_image_features,extracted_image_features[i][j])
            if cos > current_max:
                current_max = cos
                max_url = current_urls[j]
        
        current_review = cosine_similarity_manual(tfidf_matrix[i],tf_idf_input)
        if current_review > vals_text[0]:
            vals_img[2] = vals_img[1]
            top_urls[2] = top_urls[1]
            vals_img[1] = vals_img[0]
            top_urls[1] = top_urls[0]
            vals_img[0] = current_max
            top_urls[0] = max_url
            
            vals_text[2] = vals_text[1]
            top_texts[2] = top_texts[1]
            vals_text[1] = vals_text[0]
            top_texts[1] = top_texts[0]
            vals_text[0] = current_review
            top_texts[0] = text[i]
            
        elif current_review > vals_text[1]:
            vals_img[2] = vals_img[1]
            top_urls[2] = top_urls[1]
            vals_img[1] = current_max
            top_urls[1] = max_url
            
            vals_text[2] = vals_text[1]
            top_texts[2] = top_texts[1]
            vals_text[1] = current_review
            top_texts[1] = text[i]
            
        elif current_review > vals_text[2]:
            vals_img[2] = current_max
            top_urls[2] = max_url
            
            vals_text[2] = current_review
            top_texts[2] = text[i]
    return top_urls,top_texts,vals_img,vals_text

top_urls,top_texts,vals_img,vals_text = text_retrieval(url_extracted_features,input_tf_idf)

print("USING TEXT RETRIEVAL\n")

print(f'1) Image URL: {top_urls[0]}')
print(f'   Review: {top_texts[0]}')
print(f'   Cosine similarity of images: {vals_img[0]}')
print(f'   Cosine similarity of texts: {vals_text[0]}') 
print(f'   Composite similarity score: {(vals_img[0]+vals_text[0])/2}')

print(f'2) Image URL: {top_urls[1]}')
print(f'   Review: {top_texts[1]}')
print(f'   Cosine similarity of images: {vals_img[1]}')
print(f'   Cosine similarity of texts: {vals_text[1]}')
print(f'   Composite similarity score: {(vals_img[1]+vals_text[1])/2}')

print(f'3) Image URL: {top_urls[2]}')
print(f'   Review: {top_texts[2]}')
print(f'   Cosine similarity of images: {vals_img[2]}')
print(f'   Cosine similarity of texts: {vals_text[2]}')
print(f'   Composite similarity score: {(vals_img[2]+vals_text[2])/2}')
  

Length of extracted_image_features: 0
Length of images: 2
Length of tfidf_matrix: 996
Index out of range at i = 0
