In [None]:
import pandas as pd
# Load the books DataFrame from a local pickle.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a
# trusted source. The final cell of this notebook writes the processed frame back to
# this same path, so after one full run the input here is no longer the raw dataset.
books = pd.read_pickle('Pickle/books.pkl')   
from tqdm import tqdm
# Register tqdm's pandas integration; `progress_apply` is used further down.
tqdm.pandas() 

Replacing image URLs

In [None]:
# Snapshot of every value in the 'url' column as a plain Python list,
# taken before any of the filtering in the cells below.
url_column = books['url']
urls = url_column.tolist()

In [None]:
# Keep only the books whose language code is English.
books = books[books['language_code'] == 'eng']
# Down-sample to at most 10,000 rows with a fixed seed so the subset is reproducible.
# Capping n at len(books) avoids the ValueError that DataFrame.sample raises when n
# exceeds the number of rows (sampling without replacement). That situation occurs
# as soon as this notebook is re-run, because the final cell writes the reduced
# frame back to the pickle that the first cell loads.
books = books.sample(n=min(10000, len(books)), random_state=42)

In [None]:
# Convert the rating column to floats so it can be compared numerically below.
ratings_as_float = books['average_rating'].astype(float)
books['average_rating'] = ratings_as_float

In [None]:
# Keep only the books whose average rating is at least 3.
has_min_rating = books['average_rating'] >= 3
books = books[has_min_rating]

In [None]:
# The filtering and sampling above leave gaps in the index; renumber the rows
# from 0 and discard the old index instead of keeping it as a column.
books = books.reset_index(drop=True)

In [7]:
import requests
from bs4 import BeautifulSoup
import time

# Register tqdm's pandas integration so `progress_apply` (used by process_urls below)
# is available. The first cell already makes this same call.
tqdm.pandas()

def get_image_url(goodreads_url, retries=2, backoff_factor=0.3, timeout=10):
    """Scrape a book page and return the URL of its main image.

    The page is requested with a browser-like User-Agent and parsed with
    BeautifulSoup; the ``src`` attribute of the first ``<img>`` element carrying
    the ``ResponsiveImage`` class is returned.

    Args:
        goodreads_url: Address of the page to scrape. Missing values
            (``None``, NaN, empty string) are skipped without any network call.
        retries: Maximum number of fetch attempts.
        backoff_factor: Base delay in seconds between attempts; the wait after
            attempt ``k`` (0-based) is ``backoff_factor * 2 ** k``.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The image URL as a string, or ``None`` if the URL is missing or no
        image could be found within ``retries`` attempts.
    """
    # A missing URL can never succeed; return right away instead of spending
    # every retry (and its back-off sleep) on a guaranteed failure.
    if not isinstance(goodreads_url, (str, bytes)) or not goodreads_url:
        return None

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    for attempt in range(retries):
        try:
            response = requests.get(goodreads_url, headers=headers, timeout=timeout)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            image_tag = soup.find('img', {'class': 'ResponsiveImage'})
            # .get() tolerates a matching tag that has no src attribute.
            image_src = image_tag.get('src') if image_tag else None
            if image_src:
                return image_src
        except Exception as e:
            # Deliberately best-effort: one bad page must not abort a long
            # scraping run, so the error is reported and the attempt retried.
            print(f"Error: {e}")
        # Back off only when another attempt will actually follow.
        if attempt < retries - 1:
            time.sleep(backoff_factor * (2 ** attempt))  # Exponential backoff
    return None

def process_urls(urls):
    """Look up the image URL for every entry in ``urls`` with a progress bar.

    Args:
        urls: pandas object holding page URLs. It must expose
            ``progress_apply``, which ``tqdm.pandas()`` registers.

    Returns:
        The result of applying ``get_image_url`` to each element of ``urls``.
    """
    image_urls = urls.progress_apply(get_image_url)
    return image_urls

# Scrape an image URL for every book page and keep the result in a new column.
page_urls = books['url']
books['new_image_url'] = process_urls(page_urls)

KeyboardInterrupt: 

In [None]:
import os
import requests
from PIL import Image
import io

def create_directory(directory):
    """Create ``directory`` (including missing parents) if it does not exist yet.

    Uses ``exist_ok=True`` rather than a separate existence check, so there is
    no window between "check" and "create" in which another process could
    create the directory and make ``os.makedirs`` fail.

    Args:
        directory: Path of the directory to create.
    """
    os.makedirs(directory, exist_ok=True)

def download_image(url, book_id, image_directory='Covers', timeout=10):
    """Download an image and save it as ``<image_directory>/<book_id>.jpg``.

    The response body is opened with Pillow, converted to RGB and written to
    disk. Failures are reported on stdout and never raised, so the function can
    be applied to a whole DataFrame without one bad row aborting the run.

    Args:
        url: Address of the image. Missing values (``None``, NaN, empty string),
            such as those left behind by a failed scrape, are skipped without
            any network call.
        book_id: Identifier used as the file name (without extension).
        image_directory: Directory the image is written to; created on demand.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The path of the saved file as a string, or ``None`` if the URL is
        missing, the server did not answer with status 200, or any error
        occurred while downloading, decoding or saving.
    """
    # Nothing to download for rows whose image URL could not be resolved.
    if not isinstance(url, (str, bytes)) or not url:
        return None

    create_directory(image_directory)
    try:
        # No stream=True: the whole body is needed anyway, and a streamed
        # response whose body is never read (the non-200 branch) would keep its
        # connection checked out of the pool.
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            img = Image.open(io.BytesIO(response.content)).convert('RGB')
            image_path = f'{image_directory}/{book_id}.jpg'
            img.save(image_path)
            return image_path
        else:
            print(f"Failed to download image from {url}: Status code {response.status_code}")
            return None
    except Exception as e:
        # Best-effort by design: report the problem and move on to the next row.
        print(f"Error occurred while downloading image from {url}: {e}")
        return None

# Download the image for every row and record where it was saved
# (download_image returns None for rows whose download failed).
def _download_cover_for_row(row):
    """Download the image referenced by one DataFrame row; return its saved path."""
    return download_image(row['new_image_url'], row['book_id'])

books['image_path'] = books.apply(_download_cover_for_row, axis=1)


In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms

# Pre-trained ResNet-50 (torchvision's default weights), switched to evaluation mode.
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).eval()

# Preprocessing applied to every image before it is passed to the network:
# resize, centre-crop to 224 pixels, convert to a tensor, then normalise each
# channel with the fixed mean/std values below.
preprocessing_steps = [
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

def extract_image_features(image_path):
    """Run one image through the module-level ``resnet`` and return its output.

    The image is loaded as RGB, preprocessed with the module-level ``transform``
    and passed through the network with gradient tracking disabled.

    Args:
        image_path: Location of the image file. Missing values (``None`` or
            NaN), such as those left behind by a failed download, are skipped.

    Returns:
        The network output as a NumPy array with the leading batch dimension
        (size 1) kept, or ``None`` if the path is missing or any error occurred
        while loading or processing the image.

    NOTE(review): ``resnet`` is the complete ResNet-50 model, so this looks like
    it returns the final layer's output rather than an intermediate embedding;
    confirm that this is what downstream code expects.
    """
    # Skip rows without an image instead of logging a spurious error for each.
    # `x != x` is true only for NaN.
    path_is_missing = image_path is None or (
        isinstance(image_path, float) and image_path != image_path
    )
    if path_is_missing:
        return None

    try:
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied into the converted image.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')
        image = transform(image).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            features = resnet(image).numpy()
        return features
    except Exception as e:
        # Best-effort by design: report the problem and move on to the next row.
        print(f"Error occurred while extracting features from {image_path}: {e}")
        return None

# Compute the network output for every saved image and store it per row
# (extract_image_features returns None for rows it could not process).
saved_image_paths = books['image_path']
books['image_features'] = saved_image_paths.apply(extract_image_features)


In [None]:
# Preview the first five rows of the processed frame.
books.head(n=5)

In [None]:
# Inspect the extracted feature column.
books.loc[:, 'image_features']

In [None]:
# Persist the processed frame (filtered rows plus the image URL, path and feature columns).
# NOTE(review): this is the same path the first cell reads from, so the source
# dataset is overwritten by the reduced subset and a re-run starts from already
# processed data. Consider writing to a separate file if the raw data is still needed.
books.to_pickle('Pickle/books.pkl')