In [8]:
from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv
import os

assert load_dotenv()

host = os.getenv('DB_HOST')
database = os.getenv('DB_NAME')
user = os.getenv('DB_USER')
password = os.getenv('DB_PASS')
port = os.getenv('DB_PORT')

connection_string  = f'postgresql://{user}:{password}@{host}:{port}/{database}'
engine =  create_engine(connection_string)

query = """
SELECT * FROM html_data limit 10
"""

df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,reference_id,domain_start_id,response_url,response_text
0,23884,3292,https://bianchi-industrial.it/products/linear-...,<!doctypehtml><html class=no-js lang=en-GB><me...
1,65092,1265,https://vilagrancha.com/en/1292-complete-clutc...,<!doctypehtml><html lang=en><meta charset=utf-...
2,193015,278,https://www.groupe-lechevalier.com/produit/rou...,<!doctypehtml><html lang=fr xml:lang=fr xmlns:...
3,84150,1187,https://dachnikof.ru/products/Hozblok_Greensto...,"<!doctypehtml><html lang=ru-RU><meta content=""..."
4,72410,3308,https://www.bell.si/Lepila-in-tesnilne-mase/cy...,<!doctypehtml><html lang=sl><meta charset=utf-...


In [10]:
import re
import json

def extract_image_links(html):
    # Use regular expressions to find all image links in the HTML
    # This pattern matches both single and double quoted src attributes
    image_links = re.findall(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', html)
    
    # Find all background images in inline styles
    background_images = re.findall(r'style=[\'"][^\'"]*background-image:\s*url\([\'"]?([^\)]+?)[\'"]?\)', html)
    
    # Find all images in CSS url() functions
    css_images = re.findall(r'url\([\'"]?([^\)]+?)[\'"]?\)', html)
    
    # Return a dictionary with separate categories
    return {
        'image_links': list(dict.fromkeys(image_links)),
        'background_images': list(dict.fromkeys(background_images)),
        'css_images': list(dict.fromkeys(css_images))
    }

# Extract all image links from the response_text of all rows
all_image_links = {
    'image_links': [],
    'background_images': [],
    'css_images': []
}

for html in df['response_text']:
    links = extract_image_links(html)
    all_image_links['image_links'].extend(links['image_links'])
    all_image_links['background_images'].extend(links['background_images'])
    all_image_links['css_images'].extend(links['css_images'])

# Remove duplicates from each category
for category in all_image_links:
    all_image_links[category] = list(dict.fromkeys(all_image_links[category]))

# Create 'data' folder if it doesn't exist
if not os.path.exists('data'):
    os.makedirs('data')

# Save the extracted links as a JSON file
with open('data/all_image_links.json', 'w') as f:
    json.dump(all_image_links, f, indent=4)

print("Image links saved in 'data/all_image_links.json'")

# Print the first 5 links from each category
print("Image links:", all_image_links['image_links'][:5])
print("Background images:", all_image_links['background_images'][:5])
print("CSS images:", all_image_links['css_images'][:5])

Image links saved in 'data/all_image_links.json'
Image links: ['data:image/svg+xml,%3Csvg xmlns=', 'data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAJUAAABGCAAAAADgY4kSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAAEnRFWHRTb2Z0d2FyZQBCYXJjb2RlNEryjnYuAAADnElEQVR4Xu2UW0hUURSGd6OV3a/bsAwqsrIHicqXoiJEx0KDCqS0e+QU1YNZUlBUGt2Q7hTlBekilohUYmHZxShMyQjLikplwsSSLpqOjk5/+5yzz54ZnYfoef0Ps9fa+1//+dhzOAwe+s0epk8TawljBdmBCGeMNaLQH9btYnMXYy0rxQ7bXc3qNXdsYgN7KdbROXCw0hMhCDlRylh+DmPleMkaEoW1GPWsGk/YT6CFPcYbsWWFfyEaRRGOwOyCfvpzp6U/ZL/dGADzbIiKqIiKqHQRFVERFVERFVERlRRRERVREZUuoiIqoiIqoiIqopIiKqIiKqLSRVRERVRERVRERVRSJtUjzotyQxHFOW9CcRDi9ojNg5y32sQOT3vLP2vuVcmNvFqsU/Pg4GU6VRnnt/I4f4Fq3pgsrPfxmb9FOf8FtPJneC+24hBUjCZRRCE0t2ic/tx/o/of6VQ9N/9Rvqlat17RG0nlOh297p3Wn7LZbKmGT1ourbAmavcDVG2wbmkCfuxbHH9f6yVV99V1i3d+8ZpRFhWnChnim2rbuI16I6lSw0rTxvwQfcTmjIwCwyctx67f2zNMe+jzEcdKztUBi5aV5w6pgKJqXph1NyHE4TmjLCrOLMwQn1RPFuzwpHKNegDMOSP6iHzTpixCk25rR6eMZuALIOYkvP7B7sGV2mLOKIuKMwszxBeVI+xdsifVJ9YB7Fwj+ojoNUd/aQduS9u3/CBx552WO5u2PRP9kpS21+Or4

In [None]:
''