In [None]:
# img_link,product_link
# <div id="imgTagWrapperId" class="imgTagWrapper" role="button" tabindex="0" style="height: 408px;" bis_skin_checked="1">
#                         <img alt="Wayona [Apple MFi Certified Charger Lightning to USB Charging Cable Cord Compatible iPhone 14/13/12/11 Pro/11/XS MAX/XR/8/7/6s Plus,iPad Pro/Air/Mini" src="https://m.media-amazon.com/images/I/71ojkmsYe8L._SX522_.jpg" data-old-hires="https://m.media-amazon.com/images/I/71ojkmsYe8L._SL1100_.jpg" onload="markFeatureRenderForImageBlock(); this.onload='';setCSMReq('af');if(typeof addlongPoleTag === 'function'){ addlongPoleTag('af','desktop-image-atf-marker');};setCSMReq('cf')" data-a-image-name="landingImage" class="a-dynamic-image a-stretch-vertical" id="landingImage" data-a-dynamic-image="{&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SX425_.jpg&quot;:[425,425],&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SX522_.jpg&quot;:[522,522],&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SY450_.jpg&quot;:[450,450],&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SX569_.jpg&quot;:[569,569],&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SY355_.jpg&quot;:[355,355],&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SX466_.jpg&quot;:[466,466],&quot;https://m.media-amazon.com/images/I/71ojkmsYe8L._SX679_.jpg&quot;:[679,679]}" style="max-width: 408px; max-height: 408px;"> </div>


In [11]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
from tqdm import tqdm

# --- CONFIGURATION ---
INPUT_FILE_PATH = '../data/raw/process_amazon.csv'
OUTPUT_FILE_PATH = '../data/raw/process_amazon.csv'  # Overwrite the old file

# Simulate browser headers to avoid being blocked
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Connection': 'keep-alive'
}

def check_image_link(img_url: str) -> bool:
    if not isinstance(img_url, str) or not img_url.startswith('http'):
        return False
    try:
        # Use HEAD method to get only headers, not download the image
        response = requests.head(img_url, headers=HEADERS, timeout=10, allow_redirects=True)
        # If status code is 200 (OK), the link is valid
        if response.status_code == 200:
            return True
    except requests.exceptions.RequestException:
        # Any network error is considered as a broken link
        return False
    return False

def get_correct_image_url(product_url: str) -> str:
    """
    Truy cập URL của sản phẩm và trích xuất link ảnh chính xác từ thẻ div#imgTagWrapperId.
    """
    if not isinstance(product_url, str) or not product_url.startswith('http'):
        return None
        
    try:
        response = requests.get(product_url, headers=HEADERS, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            img_wrapper = soup.find('div', id='imgTagWrapperId')
            if img_wrapper:
                image_tag = img_wrapper.find('img')
                if image_tag and 'src' in image_tag.attrs:
                    return image_tag['src']
    except requests.exceptions.RequestException as e:
        print(f"  Error when scraping data from URL: {product_url} - {e}")
    return None

def main():
    """
    Hàm chính để đọc file, chỉ cào lại các link ảnh bị lỗi và lưu file.
    """
    print(f"Reading data from file: {INPUT_FILE_PATH}")
    try:
        df = pd.read_csv(INPUT_FILE_PATH)
    except FileNotFoundError:
        print(f"Error: File not found '{INPUT_FILE_PATH}'.")
        return

    # Find indices of rows with broken image links
    print("Starting to check existing image links...")
    broken_link_indices = []
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Checking links"):
        if not check_image_link(row['img_link']):
            broken_link_indices.append(index)
        # Add a small delay to avoid sending too many HEAD requests at once
        time.sleep(0.1)

    print(f"\nFound {len(broken_link_indices)} broken image links. Starting to re-scrape...")

    updated_count = 0
    if not broken_link_indices:
        print("All image links are working fine. No update needed.")
    else:
        # Only iterate and re-scrape rows with broken links
        for index in tqdm(broken_link_indices, desc="Re-scraping broken links"):
            product_link = df.at[index, 'product_link']
            
            # Get new image link
            new_url = get_correct_image_url(product_link)
            
            if new_url:
                # Update directly in DataFrame
                df.at[index, 'img_link'] = new_url
                updated_count += 1
            
            # Pause to avoid being blocked
            time.sleep(random.uniform(0.8, 2.0))

    print("\nProcess completed!")
    print(f"Total number of image links successfully updated: {updated_count}/{len(broken_link_indices)}")

    # Save the file
    try:
        df.to_csv(OUTPUT_FILE_PATH, index=False)
        print(f"Successfully saved the updated file at: {OUTPUT_FILE_PATH}")
    except Exception as e:
        print(f"Error when saving file: {e}")

if __name__ == "__main__":
    main()

Reading data from file: ../data/raw/process_amazon.csv
Starting to check existing image links...


Checking links: 100%|██████████| 1465/1465 [08:45<00:00,  2.79it/s]



Found 32 broken image links. Starting to re-scrape...


Re-scraping broken links: 100%|██████████| 32/32 [01:05<00:00,  2.05s/it]


Process completed!
Total number of image links successfully updated: 0/32
Successfully saved the updated file at: ../data/raw/process_amazon.csv



