## Get necessary Libraries

In [44]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import os
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_absolute_percentage_error,
    root_mean_squared_error,
)

# from google.colab import drive

## Load Data

In [2]:
# Mount to drive
# drive.mount('/Drive')

Mounted at /Drive


In [3]:
# Get the csv
df = pd.read_csv("merged_data.csv")

In [4]:
df

Unnamed: 0.1,Unnamed: 0,id,product title,image_url,price,image_name
0,5,B07XSCD2R4,Maxlite 5 Softside Expandable Luggage with 4 S...,https://m.media-amazon.com/images/I/61LnBNsSBS...,144.49,B07XSCD2R4.jpg
1,6,B07MXF4G8K,"Hard Shell Carry on Luggage Airline Approved, ...",https://m.media-amazon.com/images/I/71CghLYrnA...,169.99,B07MXF4G8K.jpg
2,7,B07H515VCZ,"Maxporter II 30"" Hardside Spinner Trunk Luggag...",https://m.media-amazon.com/images/I/81f3h+YHOX...,299.99,B07H515VCZ.jpg
3,8,B08BXBCNMQ,Omni 2 Hardside Expandable Luggage with Spinne...,https://m.media-amazon.com/images/I/91eOWP4myS...,112.63,B08BXBCNMQ.jpg
4,9,B0B9K44XTS,Luggage Sets Expandable Lightweight Suitcases ...,https://m.media-amazon.com/images/I/81dsv5GrCL...,209.99,B0B9K44XTS.jpg
...,...,...,...,...,...,...
9985,9995,B0BR617B8P,Mens Athletic Workout Shorts with Compression ...,https://m.media-amazon.com/images/I/61MFjWYxa5...,34.99,B0BR617B8P.jpg
9986,9996,B07VWSP5HD,Men's Knitted Regular Fit Full Zip Cardigan Sw...,https://m.media-amazon.com/images/I/91Hm4RVlu8...,49.99,B07VWSP5HD.jpg
9987,9997,B0CD6MBV8T,Men's Jacket Windproof Qulited Bomber Jackets ...,https://m.media-amazon.com/images/I/61sVoWGbg3...,46.98,B0CD6MBV8T.jpg
9988,9998,B08XW98F22,Men's Dry Franchise Polo,https://m.media-amazon.com/images/I/51yJ8ZYPcs...,46.87,B08XW98F22.jpg


In [5]:
# checking for null values
df.isnull().sum()

Unnamed: 0       0
id               0
product title    0
image_url        0
price            0
image_name       0
dtype: int64

In [6]:
# Checking if the dataset contains products with 0 price
df[df["price"] == 0]

Unnamed: 0.1,Unnamed: 0,id,product title,image_url,price,image_name
172,177,B0BH6N9WMT,Airconic Hardside Expandable Luggage with Spin...,https://m.media-amazon.com/images/I/81lQQrl0By...,0.0,B0BH6N9WMT.jpg
175,180,B0BH9DL82Y,4 KIX 2.0 Softside Expandable Luggage with Spi...,https://m.media-amazon.com/images/I/91quX26fgZ...,0.0,B0BH9DL82Y.jpg
246,252,B0CFL93Y9M,OCEAN HOLIDAY Luggage with Double Spinner Whee...,https://m.media-amazon.com/images/I/61bvKvJaQU...,0.0,B0CFL93Y9M.jpg
322,328,B081TNY17M,Merge Short Trip Expandable Packing Case Mediu...,https://m.media-amazon.com/images/I/81UsxCRqUs...,0.0,B081TNY17M.jpg
439,445,B0BTYZR67H,"Suitcase Caster, 1 Pair Luggage Swivel Wheels ...",https://m.media-amazon.com/images/I/61Al5DVttX...,0.0,B0BTYZR67H.jpg
...,...,...,...,...,...,...
9758,9768,B08FC5NRYB,Men's Undershirt Cotton Stretch Crew Neck T-Sh...,https://m.media-amazon.com/images/I/41a5AlpJZM...,0.0,B08FC5NRYB.jpg
9776,9786,B07Z8QMK4M,Men's Outdoor Waterproof Soft Shell Hooded Mil...,https://m.media-amazon.com/images/I/71Udno3Zzx...,0.0,B07Z8QMK4M.jpg
9789,9799,B08W2R1SBR,Scooby Doo Men's Ruh-Roh! Scooby Character Adu...,https://m.media-amazon.com/images/I/815lIP3nxS...,0.0,B08W2R1SBR.jpg
9956,9966,B083TTV36F,"Men's 8"" Performance Breaker Short",https://m.media-amazon.com/images/I/71PEnlDwPW...,0.0,B083TTV36F.jpg


In [7]:
# creating a new dataframe with products that have prices not null
new_df = df.drop(df.index[df.price == 0], axis=0)
# new_df = df.loc[df['price']!=0.00]
new_df

Unnamed: 0.1,Unnamed: 0,id,product title,image_url,price,image_name
0,5,B07XSCD2R4,Maxlite 5 Softside Expandable Luggage with 4 S...,https://m.media-amazon.com/images/I/61LnBNsSBS...,144.49,B07XSCD2R4.jpg
1,6,B07MXF4G8K,"Hard Shell Carry on Luggage Airline Approved, ...",https://m.media-amazon.com/images/I/71CghLYrnA...,169.99,B07MXF4G8K.jpg
2,7,B07H515VCZ,"Maxporter II 30"" Hardside Spinner Trunk Luggag...",https://m.media-amazon.com/images/I/81f3h+YHOX...,299.99,B07H515VCZ.jpg
3,8,B08BXBCNMQ,Omni 2 Hardside Expandable Luggage with Spinne...,https://m.media-amazon.com/images/I/91eOWP4myS...,112.63,B08BXBCNMQ.jpg
4,9,B0B9K44XTS,Luggage Sets Expandable Lightweight Suitcases ...,https://m.media-amazon.com/images/I/81dsv5GrCL...,209.99,B0B9K44XTS.jpg
...,...,...,...,...,...,...
9984,9994,B09V11RVFC,Techwear Graphic Fashion Bomber Jacket,https://m.media-amazon.com/images/I/71v8BGUIS-...,209.50,B09V11RVFC.jpg
9985,9995,B0BR617B8P,Mens Athletic Workout Shorts with Compression ...,https://m.media-amazon.com/images/I/61MFjWYxa5...,34.99,B0BR617B8P.jpg
9986,9996,B07VWSP5HD,Men's Knitted Regular Fit Full Zip Cardigan Sw...,https://m.media-amazon.com/images/I/91Hm4RVlu8...,49.99,B07VWSP5HD.jpg
9987,9997,B0CD6MBV8T,Men's Jacket Windproof Qulited Bomber Jackets ...,https://m.media-amazon.com/images/I/61sVoWGbg3...,46.98,B0CD6MBV8T.jpg


In [8]:
#!unzip /Drive/MyDrive/Detect_LLM/'Copy of images.zip' -d /Drive/MyDrive/Detect_LLM/

## Get features


In [9]:
os.environ["HUGGINGFACE_TOKEN"] = "HF TOKEN"

In [10]:
# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

2024-02-15 01:33:09.963067: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Define transformation for images
image_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

In [12]:
# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, csv_file, root_dir, clip_model, clip_processor, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.clip_model = clip_model
        self.clip_processor = clip_processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = self.data.iloc[idx, 5]
        img_path = os.path.join(self.root_dir, img_name)
        image = Image.open(img_path).convert("RGB")
        description = self.data.iloc[idx, 2]
        price = self.data.iloc[idx, 4]
        # Preprocess image
        if self.transform:
            image = self.transform(image)
        # Extract image features using CLIP model
        inputs = self.clip_processor(images=image, return_tensors="pt", padding=True)
        with torch.no_grad():
            image_features = self.clip_model.get_image_features(**inputs)
        # Extract text features using CLIP model
        inputs = self.clip_processor(
            text=[description], return_tensors="pt", padding=True
        )
        with torch.no_grad():
            text_features = self.clip_model.get_text_features(**inputs)
        sample = {
            "image": image,
            "description": description,
            "price": price,
            "image_features": image_features,
            "text_features": text_features,
        }
        return sample

In [13]:
# Paths and file names
new_df.to_csv("new_data.csv", index=False)
csv_file = "new_data.csv"
image_folder = "images"

# Create dataset instance
dataset = CustomDataset(
    csv_file=csv_file,
    root_dir=image_folder,
    clip_model=clip_model,
    clip_processor=clip_processor,
    transform=image_transform,
)

In [14]:
dataset.__len__()

9808

In [21]:
sample = dataset[1]

In [22]:
sample

{'image': tensor([[[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.9882, 0.9882, 0.9843,  ..., 0.9804, 0.9765, 0.9765],
          [0.9804, 0.9765, 0.9725,  ..., 0.9804, 0.9804, 0.9804],
          [0.9882, 0.9882, 0.9804,  ..., 0.9804, 0.9843, 0.9843]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.9882, 0.9882, 0.9843,  ..., 0.9804, 0.9765, 0.9765],
          [0.9804, 0.9765, 0.9725,  ..., 0.9804, 0.9804, 0.9804],
          [0.9882, 0.9882, 0.9804,  ..., 0.9804, 0.9843, 0.9843]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.00

In [37]:
# Create DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [38]:
# Extract features and prices
image_features = []
text_features = []
prices = []

for batch in dataloader:
    images = batch["image"]
    descriptions = batch["description"]
    batch_text_features = batch["text_features"]
    batch_image_features = batch["image_features"]
    batch_prices = batch["price"]

    # Convert text features to numpy array
    batch_text_features_np = batch_text_features.cpu().numpy()

    # Flatten image features
    batch_image_features_np = batch_image_features.cpu().numpy()

    image_features.append(batch_image_features_np)
    text_features.append(batch_text_features_np)
    prices.append(batch_prices.numpy())

image_features = np.concatenate(image_features)
text_features = np.concatenate(text_features)
prices = np.concatenate(prices)

In [40]:
# Save image features
np.save("image_features.npy", image_features)

# Save text features
np.save("text_features.npy", text_features)

# Save prices
np.save("prices.npy", prices)

In [41]:
# Combine text features and prices
combined_features = np.concatenate((text_features, image_features), axis=1)

In [42]:
combined_features_2d = combined_features.reshape(combined_features.shape[0], -1)

In [43]:
prices

array([144.49, 169.99, 299.99, ...,  49.99,  46.98,  46.87])

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [34]:
from xgboost import XGBRegressor

In [36]:
combined_features_2d.shape

(9808, 1024)

In [46]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    combined_features_2d, prices, test_size=0.2, random_state=42
)

# Initialize and train the Random Forest regressor
forestmodel = RandomForestRegressor()
forestmodel.fit(X_train, y_train)

# Predict on the test set
y_pred = forestmodel.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("Mean Absolute Error:", mae)
print("Mean Absolute Percentage Error:", mape)
print("The R2:", r2)

Mean Squared Error: 1741.2318452481402
Root Mean Squared Error: 41.72807023153767
Mean Absolute Error: 21.19601418298316
Mean Absolute Percentage Error: 0.5424376712163212
The R2: 0.5507796262435776


In [47]:
def evaluate_model(true, predicted):
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", rmse)
    print("Mean Absolute Error:", mae)
    print("Mean Absolute Percentage Error:", mape)
    print("The R2:", r2)

In [48]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    combined_features_2d, prices, test_size=0.2, random_state=42
)

# Initialize and train the XGBoost regressor
xgb_regressor = XGBRegressor()
xgb_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_regressor.predict(X_test)

# Evaluate the model
evaluate_model(y_test, y_pred)

Mean Squared Error: 1664.1589680833083
Root Mean Squared Error: 40.794104575089136
Mean Absolute Error: 21.26601756257019
Mean Absolute Percentage Error: 0.5032389566606228
The R2: 0.5706636564954678


In [49]:
from sklearn.neighbors import KNeighborsRegressor

In [54]:
# Initialize and train the KNN regressor
knn_regressor = KNeighborsRegressor(n_neighbors=8)
knn_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_regressor.predict(X_test)

# Evaluate the model
evaluate_model(y_test, y_pred)

Mean Squared Error: 1776.1683123104613
Root Mean Squared Error: 42.14461190129126
Mean Absolute Error: 18.61250254841998
Mean Absolute Percentage Error: 0.35563174191160296
The R2: 0.5417663677080791


In [55]:
from sklearn import svm

In [56]:
# Initialize and train the Support vector regressor
svm_regressor = svm.SVR()
svm_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_regressor.predict(X_test)

# Evaluate the model
evaluate_model(y_test, y_pred)

Mean Squared Error: 3347.409803195469
Root Mean Squared Error: 57.85680429470219
Mean Absolute Error: 24.858882208328218
Mean Absolute Percentage Error: 0.439202075979662
The R2: 0.13640180254508993
